1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This file is part of Gromacs Copyright (c) 1991-2008
5 * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
12 * To help us fund GROMACS development, we humbly ask that you cite
13 * the research papers on the package. Check out http://www.gromacs.org
16 * Gnomes, ROck Monsters And Chili Sauce
32 #include "domdec_network.h"
35 #include "chargegroup.h"
44 #include "pull_rotation.h"
45 #include "gmx_wallcycle.h"
49 #include "mtop_util.h"
51 #include "gmx_ga2la.h"
61 #define DDRANK(dd,rank) (rank)
62 #define DDMASTERRANK(dd) (dd->masterrank)
64 typedef struct gmx_domdec_master
66 /* The cell boundaries */
68 /* The global charge group division */
69 int *ncg; /* Number of home charge groups for each node */
70 int *index; /* Index of nnodes+1 into cg */
71 int *cg; /* Global charge group index */
72 int *nat; /* Number of home atoms for each node. */
73 int *ibuf; /* Buffer for communication */
74 rvec *vbuf; /* Buffer for state scattering and gathering */
75 } gmx_domdec_master_t;
79 /* The numbers of charge groups to send and receive for each cell
80 * that requires communication, the last entry contains the total
81 * number of atoms that needs to be communicated.
83 int nsend[DD_MAXIZONE+2];
84 int nrecv[DD_MAXIZONE+2];
85 /* The charge groups to send */
88 /* The atom range for non-in-place communication */
89 int cell2at0[DD_MAXIZONE];
90 int cell2at1[DD_MAXIZONE];
95 int np; /* Number of grid pulses in this dimension */
96 int np_dlb; /* For dlb, for use with edlbAUTO */
97 gmx_domdec_ind_t *ind; /* The indices to communicate, size np */
99 gmx_bool bInPlace; /* Can we communicate in place? */
100 } gmx_domdec_comm_dim_t;
104 gmx_bool *bCellMin; /* Temp. var.: is this cell size at the limit */
105 real *cell_f; /* State var.: cell boundaries, box relative */
106 real *old_cell_f; /* Temp. var.: old cell size */
107 real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
108 real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
109 real *bound_min; /* Temp. var.: lower limit for cell boundary */
110 real *bound_max; /* Temp. var.: upper limit for cell boundary */
111 gmx_bool bLimited; /* State var.: is DLB limited in this dim and row */
112 real *buf_ncd; /* Temp. var. */
115 #define DD_NLOAD_MAX 9
117 /* Here floats are accurate enough, since these variables
118 * only influence the load balancing, not the actual MD results.
142 gmx_cgsort_t *sort1,*sort2;
144 gmx_cgsort_t *sort_new;
156 /* This enum determines the order of the coordinates.
157 * ddnatHOME and ddnatZONE should be first and second,
158 * the others can be ordered as wanted.
160 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
162 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
163 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
167 int dim; /* The dimension */
168 gmx_bool dim_match;/* Tells if DD and PME dims match */
169 int nslab; /* The number of PME slabs in this dimension */
170 real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB */
171 int *pp_min; /* The minimum pp node location, size nslab */
172 int *pp_max; /* The maximum pp node location,size nslab */
173 int maxshift; /* The maximum shift for coordinate redistribution in PME */
178 real min0; /* The minimum bottom of this zone */
179 real max1; /* The maximum top of this zone */
180 real mch0; /* The maximum bottom communicaton height for this zone */
181 real mch1; /* The maximum top communicaton height for this zone */
182 real p1_0; /* The bottom value of the first cell in this zone */
183 real p1_1; /* The top value of the first cell in this zone */
186 typedef struct gmx_domdec_comm
188 /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
189 * unless stated otherwise.
192 /* The number of decomposition dimensions for PME, 0: no PME */
194 /* The number of nodes doing PME (PP/PME or only PME) */
198 /* The communication setup including the PME only nodes */
199 gmx_bool bCartesianPP_PME;
202 int *pmenodes; /* size npmenodes */
203 int *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
204 * but with bCartesianPP_PME */
205 gmx_ddpme_t ddpme[2];
207 /* The DD particle-particle nodes only */
208 gmx_bool bCartesianPP;
209 int *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
211 /* The global charge groups */
214 /* Should we sort the cgs */
216 gmx_domdec_sort_t *sort;
218 /* Are there bonded and multi-body interactions between charge groups? */
219 gmx_bool bInterCGBondeds;
220 gmx_bool bInterCGMultiBody;
222 /* Data for the optional bonded interaction atom communication range */
229 /* Are we actually using DLB? */
230 gmx_bool bDynLoadBal;
232 /* Cell sizes for static load balancing, first index cartesian */
235 /* The width of the communicated boundaries */
238 /* The minimum cell size (including triclinic correction) */
240 /* For dlb, for use with edlbAUTO */
241 rvec cellsize_min_dlb;
242 /* The lower limit for the DD cell size with DLB */
244 /* Effectively no NB cut-off limit with DLB for systems without PBC? */
245 gmx_bool bVacDLBNoLimit;
247 /* tric_dir is only stored here because dd_get_ns_ranges needs it */
249 /* box0 and box_size are required with dim's without pbc and -gcom */
253 /* The cell boundaries */
257 /* The old location of the cell boundaries, to check cg displacements */
261 /* The communication setup and charge group boundaries for the zones */
262 gmx_domdec_zones_t zones;
264 /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
265 * cell boundaries of neighboring cells for dynamic load balancing.
267 gmx_ddzone_t zone_d1[2];
268 gmx_ddzone_t zone_d2[2][2];
270 /* The coordinate/force communication setup and indices */
271 gmx_domdec_comm_dim_t cd[DIM];
272 /* The maximum number of cells to communicate with in one dimension */
275 /* Which cg distribution is stored on the master node */
276 int master_cg_ddp_count;
278 /* The number of cg's received from the direct neighbors */
279 int zone_ncg1[DD_MAXZONE];
281 /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
284 /* Communication buffer for general use */
288 /* Communication buffer for general use */
291 /* Communication buffers only used with multiple grid pulses */
296 /* Communication buffers for local redistribution */
298 int cggl_flag_nalloc[DIM*2];
300 int cgcm_state_nalloc[DIM*2];
302 /* Cell sizes for dynamic load balancing */
303 gmx_domdec_root_t **root;
307 real cell_f_max0[DIM];
308 real cell_f_min1[DIM];
310 /* Stuff for load communication */
311 gmx_bool bRecordLoad;
312 gmx_domdec_load_t *load;
314 MPI_Comm *mpi_comm_load;
317 /* Maximum DLB scaling per load balancing step in percent */
321 float cycl[ddCyclNr];
322 int cycl_n[ddCyclNr];
323 float cycl_max[ddCyclNr];
324 /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
328 /* Have often have did we have load measurements */
330 /* Have often have we collected the load measurements */
334 double sum_nat[ddnatNR-ddnatZONE];
344 /* The last partition step */
345 gmx_large_int_t globalcomm_step;
353 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
356 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
357 #define DD_FLAG_NRCG 65535
358 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
359 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
361 /* Zone permutation required to obtain consecutive charge groups
362 * for neighbor searching.
364 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
366 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
367 * components see only j zones with that component 0.
370 /* The DD zone order */
371 static const ivec dd_zo[DD_MAXZONE] =
372 {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
377 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
382 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
387 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
389 /* Factors used to avoid problems due to rounding issues */
390 #define DD_CELL_MARGIN 1.0001
391 #define DD_CELL_MARGIN2 1.00005
392 /* Factor to account for pressure scaling during nstlist steps */
393 #define DD_PRES_SCALE_MARGIN 1.02
395 /* Allowed performance loss before we DLB or warn */
396 #define DD_PERF_LOSS 0.05
398 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
400 /* Use separate MPI send and receive commands
401 * when nnodes <= GMX_DD_NNODES_SENDRECV.
402 * This saves memory (and some copying for small nnodes).
403 * For high parallelization scatter and gather calls are used.
405 #define GMX_DD_NNODES_SENDRECV 4
409 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
411 static void index2xyz(ivec nc,int ind,ivec xyz)
413 xyz[XX] = ind % nc[XX];
414 xyz[YY] = (ind / nc[XX]) % nc[YY];
415 xyz[ZZ] = ind / (nc[YY]*nc[XX]);
419 /* This order is required to minimize the coordinate communication in PME
420 * which uses decomposition in the x direction.
422 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
424 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
426 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
427 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
428 xyz[ZZ] = ind % nc[ZZ];
431 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
436 ddindex = dd_index(dd->nc,c);
437 if (dd->comm->bCartesianPP_PME)
439 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
441 else if (dd->comm->bCartesianPP)
444 MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
455 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
457 return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
460 int ddglatnr(gmx_domdec_t *dd,int i)
470 if (i >= dd->comm->nat[ddnatNR-1])
472 gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
474 atnr = dd->gatindex[i] + 1;
480 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
482 return &dd->comm->cgs_gl;
485 static void vec_rvec_init(vec_rvec_t *v)
491 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
495 v->nalloc = over_alloc_dd(n);
496 srenew(v->v,v->nalloc);
500 void dd_store_state(gmx_domdec_t *dd,t_state *state)
504 if (state->ddp_count != dd->ddp_count)
506 gmx_incons("The state does not the domain decomposition state");
509 state->ncg_gl = dd->ncg_home;
510 if (state->ncg_gl > state->cg_gl_nalloc)
512 state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
513 srenew(state->cg_gl,state->cg_gl_nalloc);
515 for(i=0; i<state->ncg_gl; i++)
517 state->cg_gl[i] = dd->index_gl[i];
520 state->ddp_count_cg_gl = dd->ddp_count;
523 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
525 return &dd->comm->zones;
528 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
529 int *jcg0,int *jcg1,ivec shift0,ivec shift1)
531 gmx_domdec_zones_t *zones;
534 zones = &dd->comm->zones;
537 while (icg >= zones->izone[izone].cg1)
546 else if (izone < zones->nizone)
548 *jcg0 = zones->izone[izone].jcg0;
552 gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
553 icg,izone,zones->nizone);
556 *jcg1 = zones->izone[izone].jcg1;
558 for(d=0; d<dd->ndim; d++)
561 shift0[dim] = zones->izone[izone].shift0[dim];
562 shift1[dim] = zones->izone[izone].shift1[dim];
563 if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
565 /* A conservative approach, this can be optimized */
572 int dd_natoms_vsite(gmx_domdec_t *dd)
574 return dd->comm->nat[ddnatVSITE];
577 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
579 *at_start = dd->comm->nat[ddnatCON-1];
580 *at_end = dd->comm->nat[ddnatCON];
583 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
585 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
587 gmx_domdec_comm_t *comm;
588 gmx_domdec_comm_dim_t *cd;
589 gmx_domdec_ind_t *ind;
590 rvec shift={0,0,0},*buf,*rbuf;
591 gmx_bool bPBC,bScrew;
595 cgindex = dd->cgindex;
600 nat_tot = dd->nat_home;
601 for(d=0; d<dd->ndim; d++)
603 bPBC = (dd->ci[dd->dim[d]] == 0);
604 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
607 copy_rvec(box[dd->dim[d]],shift);
610 for(p=0; p<cd->np; p++)
617 for(i=0; i<ind->nsend[nzone]; i++)
619 at0 = cgindex[index[i]];
620 at1 = cgindex[index[i]+1];
621 for(j=at0; j<at1; j++)
623 copy_rvec(x[j],buf[n]);
630 for(i=0; i<ind->nsend[nzone]; i++)
632 at0 = cgindex[index[i]];
633 at1 = cgindex[index[i]+1];
634 for(j=at0; j<at1; j++)
636 /* We need to shift the coordinates */
637 rvec_add(x[j],shift,buf[n]);
644 for(i=0; i<ind->nsend[nzone]; i++)
646 at0 = cgindex[index[i]];
647 at1 = cgindex[index[i]+1];
648 for(j=at0; j<at1; j++)
651 buf[n][XX] = x[j][XX] + shift[XX];
653 * This operation requires a special shift force
654 * treatment, which is performed in calc_vir.
656 buf[n][YY] = box[YY][YY] - x[j][YY];
657 buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
669 rbuf = comm->vbuf2.v;
671 /* Send and receive the coordinates */
672 dd_sendrecv_rvec(dd, d, dddirBackward,
673 buf, ind->nsend[nzone+1],
674 rbuf, ind->nrecv[nzone+1]);
678 for(zone=0; zone<nzone; zone++)
680 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
682 copy_rvec(rbuf[j],x[i]);
687 nat_tot += ind->nrecv[nzone+1];
693 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
695 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
697 gmx_domdec_comm_t *comm;
698 gmx_domdec_comm_dim_t *cd;
699 gmx_domdec_ind_t *ind;
703 gmx_bool bPBC,bScrew;
707 cgindex = dd->cgindex;
712 nzone = comm->zones.n/2;
713 nat_tot = dd->nat_tot;
714 for(d=dd->ndim-1; d>=0; d--)
716 bPBC = (dd->ci[dd->dim[d]] == 0);
717 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
718 if (fshift == NULL && !bScrew)
722 /* Determine which shift vector we need */
728 for(p=cd->np-1; p>=0; p--) {
730 nat_tot -= ind->nrecv[nzone+1];
737 sbuf = comm->vbuf2.v;
739 for(zone=0; zone<nzone; zone++)
741 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
743 copy_rvec(f[i],sbuf[j]);
748 /* Communicate the forces */
749 dd_sendrecv_rvec(dd, d, dddirForward,
750 sbuf, ind->nrecv[nzone+1],
751 buf, ind->nsend[nzone+1]);
753 /* Add the received forces */
757 for(i=0; i<ind->nsend[nzone]; i++)
759 at0 = cgindex[index[i]];
760 at1 = cgindex[index[i]+1];
761 for(j=at0; j<at1; j++)
763 rvec_inc(f[j],buf[n]);
770 for(i=0; i<ind->nsend[nzone]; i++)
772 at0 = cgindex[index[i]];
773 at1 = cgindex[index[i]+1];
774 for(j=at0; j<at1; j++)
776 rvec_inc(f[j],buf[n]);
777 /* Add this force to the shift force */
778 rvec_inc(fshift[is],buf[n]);
785 for(i=0; i<ind->nsend[nzone]; i++)
787 at0 = cgindex[index[i]];
788 at1 = cgindex[index[i]+1];
789 for(j=at0; j<at1; j++)
791 /* Rotate the force */
792 f[j][XX] += buf[n][XX];
793 f[j][YY] -= buf[n][YY];
794 f[j][ZZ] -= buf[n][ZZ];
797 /* Add this force to the shift force */
798 rvec_inc(fshift[is],buf[n]);
809 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
811 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
813 gmx_domdec_comm_t *comm;
814 gmx_domdec_comm_dim_t *cd;
815 gmx_domdec_ind_t *ind;
820 cgindex = dd->cgindex;
822 buf = &comm->vbuf.v[0][0];
825 nat_tot = dd->nat_home;
826 for(d=0; d<dd->ndim; d++)
829 for(p=0; p<cd->np; p++)
834 for(i=0; i<ind->nsend[nzone]; i++)
836 at0 = cgindex[index[i]];
837 at1 = cgindex[index[i]+1];
838 for(j=at0; j<at1; j++)
851 rbuf = &comm->vbuf2.v[0][0];
853 /* Send and receive the coordinates */
854 dd_sendrecv_real(dd, d, dddirBackward,
855 buf, ind->nsend[nzone+1],
856 rbuf, ind->nrecv[nzone+1]);
860 for(zone=0; zone<nzone; zone++)
862 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
869 nat_tot += ind->nrecv[nzone+1];
875 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
877 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
879 gmx_domdec_comm_t *comm;
880 gmx_domdec_comm_dim_t *cd;
881 gmx_domdec_ind_t *ind;
886 cgindex = dd->cgindex;
888 buf = &comm->vbuf.v[0][0];
891 nzone = comm->zones.n/2;
892 nat_tot = dd->nat_tot;
893 for(d=dd->ndim-1; d>=0; d--)
896 for(p=cd->np-1; p>=0; p--) {
898 nat_tot -= ind->nrecv[nzone+1];
905 sbuf = &comm->vbuf2.v[0][0];
907 for(zone=0; zone<nzone; zone++)
909 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
916 /* Communicate the forces */
917 dd_sendrecv_real(dd, d, dddirForward,
918 sbuf, ind->nrecv[nzone+1],
919 buf, ind->nsend[nzone+1]);
921 /* Add the received forces */
923 for(i=0; i<ind->nsend[nzone]; i++)
925 at0 = cgindex[index[i]];
926 at1 = cgindex[index[i]+1];
927 for(j=at0; j<at1; j++)
938 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
940 fprintf(fp,"zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
942 zone->min0,zone->max1,
943 zone->mch0,zone->mch0,
944 zone->p1_0,zone->p1_1);
947 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
948 int ddimind,int direction,
949 gmx_ddzone_t *buf_s,int n_s,
950 gmx_ddzone_t *buf_r,int n_r)
952 rvec vbuf_s[5*2],vbuf_r[5*2];
957 vbuf_s[i*2 ][0] = buf_s[i].min0;
958 vbuf_s[i*2 ][1] = buf_s[i].max1;
959 vbuf_s[i*2 ][2] = buf_s[i].mch0;
960 vbuf_s[i*2+1][0] = buf_s[i].mch1;
961 vbuf_s[i*2+1][1] = buf_s[i].p1_0;
962 vbuf_s[i*2+1][2] = buf_s[i].p1_1;
965 dd_sendrecv_rvec(dd, ddimind, direction,
971 buf_r[i].min0 = vbuf_r[i*2 ][0];
972 buf_r[i].max1 = vbuf_r[i*2 ][1];
973 buf_r[i].mch0 = vbuf_r[i*2 ][2];
974 buf_r[i].mch1 = vbuf_r[i*2+1][0];
975 buf_r[i].p1_0 = vbuf_r[i*2+1][1];
976 buf_r[i].p1_1 = vbuf_r[i*2+1][2];
980 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
981 rvec cell_ns_x0,rvec cell_ns_x1)
983 int d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
984 gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
985 rvec extr_s[2],extr_r[2];
988 gmx_domdec_comm_t *comm;
993 for(d=1; d<dd->ndim; d++)
996 zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
997 zp->min0 = cell_ns_x0[dim];
998 zp->max1 = cell_ns_x1[dim];
999 zp->mch0 = cell_ns_x0[dim];
1000 zp->mch1 = cell_ns_x1[dim];
1001 zp->p1_0 = cell_ns_x0[dim];
1002 zp->p1_1 = cell_ns_x1[dim];
1005 for(d=dd->ndim-2; d>=0; d--)
1008 bPBC = (dim < ddbox->npbcdim);
1010 /* Use an rvec to store two reals */
1011 extr_s[d][0] = comm->cell_f0[d+1];
1012 extr_s[d][1] = comm->cell_f1[d+1];
1016 /* Store the extremes in the backward sending buffer,
1017 * so the get updated separately from the forward communication.
1019 for(d1=d; d1<dd->ndim-1; d1++)
1021 /* We invert the order to be able to use the same loop for buf_e */
1022 buf_s[pos].min0 = extr_s[d1][1];
1023 buf_s[pos].max1 = extr_s[d1][0];
1024 buf_s[pos].mch0 = 0;
1025 buf_s[pos].mch1 = 0;
1026 /* Store the cell corner of the dimension we communicate along */
1027 buf_s[pos].p1_0 = comm->cell_x0[dim];
1028 buf_s[pos].p1_1 = 0;
1032 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1035 if (dd->ndim == 3 && d == 0)
1037 buf_s[pos] = comm->zone_d2[0][1];
1039 buf_s[pos] = comm->zone_d1[0];
1043 /* We only need to communicate the extremes
1044 * in the forward direction
1046 npulse = comm->cd[d].np;
1049 /* Take the minimum to avoid double communication */
1050 npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1054 /* Without PBC we should really not communicate over
1055 * the boundaries, but implementing that complicates
1056 * the communication setup and therefore we simply
1057 * do all communication, but ignore some data.
1059 npulse_min = npulse;
1061 for(p=0; p<npulse_min; p++)
1063 /* Communicate the extremes forward */
1064 bUse = (bPBC || dd->ci[dim] > 0);
1066 dd_sendrecv_rvec(dd, d, dddirForward,
1067 extr_s+d, dd->ndim-d-1,
1068 extr_r+d, dd->ndim-d-1);
1072 for(d1=d; d1<dd->ndim-1; d1++)
1074 extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1075 extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1081 for(p=0; p<npulse; p++)
1083 /* Communicate all the zone information backward */
1084 bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1086 dd_sendrecv_ddzone(dd, d, dddirBackward,
1093 for(d1=d+1; d1<dd->ndim; d1++)
1095 /* Determine the decrease of maximum required
1096 * communication height along d1 due to the distance along d,
1097 * this avoids a lot of useless atom communication.
1099 dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1101 if (ddbox->tric_dir[dim])
1103 /* c is the off-diagonal coupling between the cell planes
1104 * along directions d and d1.
1106 c = ddbox->v[dim][dd->dim[d1]][dim];
1112 det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1115 dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1119 /* A negative value signals out of range */
1125 /* Accumulate the extremes over all pulses */
1126 for(i=0; i<buf_size; i++)
1130 buf_e[i] = buf_r[i];
1136 buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1137 buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1140 if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1148 if (bUse && dh[d1] >= 0)
1150 buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1151 buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1154 /* Copy the received buffer to the send buffer,
1155 * to pass the data through with the next pulse.
1157 buf_s[i] = buf_r[i];
1159 if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1160 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1162 /* Store the extremes */
1165 for(d1=d; d1<dd->ndim-1; d1++)
1167 extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1168 extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1172 if (d == 1 || (d == 0 && dd->ndim == 3))
1176 comm->zone_d2[1-d][i] = buf_e[pos];
1182 comm->zone_d1[1] = buf_e[pos];
1196 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1198 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1199 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1211 print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1213 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1214 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1218 for(d=1; d<dd->ndim; d++)
1220 comm->cell_f_max0[d] = extr_s[d-1][0];
1221 comm->cell_f_min1[d] = extr_s[d-1][1];
1224 fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1225 d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1230 static void dd_collect_cg(gmx_domdec_t *dd,
1231 t_state *state_local)
1233 gmx_domdec_master_t *ma=NULL;
1234 int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1237 if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1239 /* The master has the correct distribution */
1243 if (state_local->ddp_count == dd->ddp_count)
1245 ncg_home = dd->ncg_home;
1247 nat_home = dd->nat_home;
1249 else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1251 cgs_gl = &dd->comm->cgs_gl;
1253 ncg_home = state_local->ncg_gl;
1254 cg = state_local->cg_gl;
1256 for(i=0; i<ncg_home; i++)
1258 nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1263 gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1266 buf2[0] = dd->ncg_home;
1267 buf2[1] = dd->nat_home;
1277 /* Collect the charge group and atom counts on the master */
1278 dd_gather(dd,2*sizeof(int),buf2,ibuf);
1283 for(i=0; i<dd->nnodes; i++)
1285 ma->ncg[i] = ma->ibuf[2*i];
1286 ma->nat[i] = ma->ibuf[2*i+1];
1287 ma->index[i+1] = ma->index[i] + ma->ncg[i];
1290 /* Make byte counts and indices */
1291 for(i=0; i<dd->nnodes; i++)
1293 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1294 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1298 fprintf(debug,"Initial charge group distribution: ");
1299 for(i=0; i<dd->nnodes; i++)
1300 fprintf(debug," %d",ma->ncg[i]);
1301 fprintf(debug,"\n");
1305 /* Collect the charge group indices on the master */
1307 dd->ncg_home*sizeof(int),dd->index_gl,
1308 DDMASTER(dd) ? ma->ibuf : NULL,
1309 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1310 DDMASTER(dd) ? ma->cg : NULL);
1312 dd->comm->master_cg_ddp_count = state_local->ddp_count;
1315 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1318 gmx_domdec_master_t *ma;
1319 int n,i,c,a,nalloc=0;
1328 MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1329 dd->rank,dd->mpi_comm_all);
1332 /* Copy the master coordinates to the global array */
1333 cgs_gl = &dd->comm->cgs_gl;
1335 n = DDMASTERRANK(dd);
1337 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1339 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1341 copy_rvec(lv[a++],v[c]);
1345 for(n=0; n<dd->nnodes; n++)
1349 if (ma->nat[n] > nalloc)
1351 nalloc = over_alloc_dd(ma->nat[n]);
1355 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1356 n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1359 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1361 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1363 copy_rvec(buf[a++],v[c]);
1372 static void get_commbuffer_counts(gmx_domdec_t *dd,
1373 int **counts,int **disps)
1375 gmx_domdec_master_t *ma;
1380 /* Make the rvec count and displacment arrays */
1382 *disps = ma->ibuf + dd->nnodes;
1383 for(n=0; n<dd->nnodes; n++)
1385 (*counts)[n] = ma->nat[n]*sizeof(rvec);
1386 (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1390 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1393 gmx_domdec_master_t *ma;
1394 int *rcounts=NULL,*disps=NULL;
1403 get_commbuffer_counts(dd,&rcounts,&disps);
1408 dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1412 cgs_gl = &dd->comm->cgs_gl;
1415 for(n=0; n<dd->nnodes; n++)
1417 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1419 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1421 copy_rvec(buf[a++],v[c]);
1428 void dd_collect_vec(gmx_domdec_t *dd,
1429 t_state *state_local,rvec *lv,rvec *v)
1431 gmx_domdec_master_t *ma;
1432 int n,i,c,a,nalloc=0;
1435 dd_collect_cg(dd,state_local);
1437 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1439 dd_collect_vec_sendrecv(dd,lv,v);
1443 dd_collect_vec_gatherv(dd,lv,v);
1448 void dd_collect_state(gmx_domdec_t *dd,
1449 t_state *state_local,t_state *state)
1453 nh = state->nhchainlength;
1457 state->lambda = state_local->lambda;
1458 state->veta = state_local->veta;
1459 state->vol0 = state_local->vol0;
1460 copy_mat(state_local->box,state->box);
1461 copy_mat(state_local->boxv,state->boxv);
1462 copy_mat(state_local->svir_prev,state->svir_prev);
1463 copy_mat(state_local->fvir_prev,state->fvir_prev);
1464 copy_mat(state_local->pres_prev,state->pres_prev);
1467 for(i=0; i<state_local->ngtc; i++)
1469 for(j=0; j<nh; j++) {
1470 state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j];
1471 state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
1473 state->therm_integral[i] = state_local->therm_integral[i];
1475 for(i=0; i<state_local->nnhpres; i++)
1477 for(j=0; j<nh; j++) {
1478 state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j];
1479 state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
1483 for(est=0; est<estNR; est++)
1485 if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1489 dd_collect_vec(dd,state_local,state_local->x,state->x);
1492 dd_collect_vec(dd,state_local,state_local->v,state->v);
1495 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1498 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1501 if (state->nrngi == 1)
1505 for(i=0; i<state_local->nrng; i++)
1507 state->ld_rng[i] = state_local->ld_rng[i];
1513 dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1514 state_local->ld_rng,state->ld_rng);
1518 if (state->nrngi == 1)
1522 state->ld_rngi[0] = state_local->ld_rngi[0];
1527 dd_gather(dd,sizeof(state->ld_rngi[0]),
1528 state_local->ld_rngi,state->ld_rngi);
1531 case estDISRE_INITF:
1532 case estDISRE_RM3TAV:
1533 case estORIRE_INITF:
1537 gmx_incons("Unknown state entry encountered in dd_collect_state");
1543 static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
1547 fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1549 fr->cg_nalloc = over_alloc_dd(nalloc);
1550 srenew(fr->cg_cm,fr->cg_nalloc);
1551 srenew(fr->cginfo,fr->cg_nalloc);
1554 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1560 fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1563 state->nalloc = over_alloc_dd(nalloc);
1565 for(est=0; est<estNR; est++)
1567 if (EST_DISTR(est) && (state->flags & (1<<est)))
1571 srenew(state->x,state->nalloc);
1574 srenew(state->v,state->nalloc);
1577 srenew(state->sd_X,state->nalloc);
1580 srenew(state->cg_p,state->nalloc);
1584 case estDISRE_INITF:
1585 case estDISRE_RM3TAV:
1586 case estORIRE_INITF:
1588 /* No reallocation required */
1591 gmx_incons("Unknown state entry encountered in dd_realloc_state");
1598 srenew(*f,state->nalloc);
1602 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1605 gmx_domdec_master_t *ma;
1606 int n,i,c,a,nalloc=0;
1613 for(n=0; n<dd->nnodes; n++)
1617 if (ma->nat[n] > nalloc)
1619 nalloc = over_alloc_dd(ma->nat[n]);
1622 /* Use lv as a temporary buffer */
1624 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1626 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1628 copy_rvec(v[c],buf[a++]);
1631 if (a != ma->nat[n])
1633 gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1638 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1639 DDRANK(dd,n),n,dd->mpi_comm_all);
1644 n = DDMASTERRANK(dd);
1646 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1648 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1650 copy_rvec(v[c],lv[a++]);
1657 MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1658 MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1663 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1666 gmx_domdec_master_t *ma;
1667 int *scounts=NULL,*disps=NULL;
1668 int n,i,c,a,nalloc=0;
1675 get_commbuffer_counts(dd,&scounts,&disps);
1679 for(n=0; n<dd->nnodes; n++)
1681 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1683 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1685 copy_rvec(v[c],buf[a++]);
1691 dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1694 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1696 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1698 dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1702 dd_distribute_vec_scatterv(dd,cgs,v,lv);
1706 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1707 t_state *state,t_state *state_local,
1710 int i,j,ngtch,ngtcp,nh;
1712 nh = state->nhchainlength;
1716 state_local->lambda = state->lambda;
1717 state_local->veta = state->veta;
1718 state_local->vol0 = state->vol0;
1719 copy_mat(state->box,state_local->box);
1720 copy_mat(state->box_rel,state_local->box_rel);
1721 copy_mat(state->boxv,state_local->boxv);
1722 copy_mat(state->svir_prev,state_local->svir_prev);
1723 copy_mat(state->fvir_prev,state_local->fvir_prev);
1724 for(i=0; i<state_local->ngtc; i++)
1726 for(j=0; j<nh; j++) {
1727 state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j];
1728 state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
1730 state_local->therm_integral[i] = state->therm_integral[i];
1732 for(i=0; i<state_local->nnhpres; i++)
1734 for(j=0; j<nh; j++) {
1735 state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j];
1736 state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
1740 dd_bcast(dd,sizeof(real),&state_local->lambda);
1741 dd_bcast(dd,sizeof(real),&state_local->veta);
1742 dd_bcast(dd,sizeof(real),&state_local->vol0);
1743 dd_bcast(dd,sizeof(state_local->box),state_local->box);
1744 dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1745 dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1746 dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1747 dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1748 dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1749 dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1750 dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1751 dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1752 dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1754 if (dd->nat_home > state_local->nalloc)
1756 dd_realloc_state(state_local,f,dd->nat_home);
1758 for(i=0; i<estNR; i++)
1760 if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1764 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1767 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1770 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1773 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1776 if (state->nrngi == 1)
1779 state_local->nrng*sizeof(state_local->ld_rng[0]),
1780 state->ld_rng,state_local->ld_rng);
1785 state_local->nrng*sizeof(state_local->ld_rng[0]),
1786 state->ld_rng,state_local->ld_rng);
1790 if (state->nrngi == 1)
1792 dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1793 state->ld_rngi,state_local->ld_rngi);
1797 dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1798 state->ld_rngi,state_local->ld_rngi);
1801 case estDISRE_INITF:
1802 case estDISRE_RM3TAV:
1803 case estORIRE_INITF:
1805 /* Not implemented yet */
1808 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1814 static char dim2char(int dim)
1820 case XX: c = 'X'; break;
1821 case YY: c = 'Y'; break;
1822 case ZZ: c = 'Z'; break;
1823 default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1829 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1830 gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1832 rvec grid_s[2],*grid_r=NULL,cx,r;
1833 char fname[STRLEN],format[STRLEN],buf[22];
1839 copy_rvec(dd->comm->cell_x0,grid_s[0]);
1840 copy_rvec(dd->comm->cell_x1,grid_s[1]);
1844 snew(grid_r,2*dd->nnodes);
1847 dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1851 for(d=0; d<DIM; d++)
1853 for(i=0; i<DIM; i++)
1861 if (d < ddbox->npbcdim && dd->nc[d] > 1)
1863 tric[d][i] = box[i][d]/box[i][i];
1872 sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1873 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1874 out = gmx_fio_fopen(fname,"w");
1875 gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1877 for(i=0; i<dd->nnodes; i++)
1879 vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1880 for(d=0; d<DIM; d++)
1882 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1890 cx[XX] = grid_r[i*2+x][XX];
1891 cx[YY] = grid_r[i*2+y][YY];
1892 cx[ZZ] = grid_r[i*2+z][ZZ];
1894 fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1895 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1899 for(d=0; d<DIM; d++)
1905 case 0: y = 1 + i*8 + 2*x; break;
1906 case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1907 case 2: y = 1 + i*8 + x; break;
1909 fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1913 gmx_fio_fclose(out);
1918 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1919 gmx_mtop_t *mtop,t_commrec *cr,
1920 int natoms,rvec x[],matrix box)
1922 char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1925 char *atomname,*resname;
1932 natoms = dd->comm->nat[ddnatVSITE];
1935 sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
1937 sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1938 sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");
1940 out = gmx_fio_fopen(fname,"w");
1942 fprintf(out,"TITLE %s\n",title);
1943 gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1944 for(i=0; i<natoms; i++)
1946 ii = dd->gatindex[i];
1947 gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
1948 if (i < dd->comm->nat[ddnatZONE])
1951 while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1957 else if (i < dd->comm->nat[ddnatVSITE])
1959 b = dd->comm->zones.n;
1963 b = dd->comm->zones.n + 1;
1965 fprintf(out,strlen(atomname)<4 ? format : format4,
1966 "ATOM",(ii+1)%100000,
1967 atomname,resname,' ',resnr%10000,' ',
1968 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
1970 fprintf(out,"TER\n");
1972 gmx_fio_fclose(out);
1975 real dd_cutoff_mbody(gmx_domdec_t *dd)
1977 gmx_domdec_comm_t *comm;
1984 if (comm->bInterCGBondeds)
1986 if (comm->cutoff_mbody > 0)
1988 r = comm->cutoff_mbody;
1992 /* cutoff_mbody=0 means we do not have DLB */
1993 r = comm->cellsize_min[dd->dim[0]];
1994 for(di=1; di<dd->ndim; di++)
1996 r = min(r,comm->cellsize_min[dd->dim[di]]);
1998 if (comm->bBondComm)
2000 r = max(r,comm->cutoff_mbody);
2004 r = min(r,comm->cutoff);
2012 real dd_cutoff_twobody(gmx_domdec_t *dd)
2016 r_mb = dd_cutoff_mbody(dd);
2018 return max(dd->comm->cutoff,r_mb);
2022 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2026 nc = dd->nc[dd->comm->cartpmedim];
2027 ntot = dd->comm->ntot[dd->comm->cartpmedim];
2028 copy_ivec(coord,coord_pme);
2029 coord_pme[dd->comm->cartpmedim] =
2030 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2033 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2035 /* Here we assign a PME node to communicate with this DD node
2036 * by assuming that the major index of both is x.
2037 * We add cr->npmenodes/2 to obtain an even distribution.
2039 return (ddindex*npme + npme/2)/ndd;
2042 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2044 return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2047 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2049 return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2052 static int *dd_pmenodes(t_commrec *cr)
2057 snew(pmenodes,cr->npmenodes);
2059 for(i=0; i<cr->dd->nnodes; i++) {
2060 p0 = cr_ddindex2pmeindex(cr,i);
2061 p1 = cr_ddindex2pmeindex(cr,i+1);
2062 if (i+1 == cr->dd->nnodes || p1 > p0) {
2064 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2065 pmenodes[n] = i + 1 + n;
2073 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2076 ivec coords,coords_pme,nc;
2081 if (dd->comm->bCartesian) {
2082 gmx_ddindex2xyz(dd->nc,ddindex,coords);
2083 dd_coords2pmecoords(dd,coords,coords_pme);
2084 copy_ivec(dd->ntot,nc);
2085 nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2086 coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2088 slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2090 slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2096 slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2101 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2103 gmx_domdec_comm_t *comm;
2105 int ddindex,nodeid=-1;
2107 comm = cr->dd->comm;
2112 if (comm->bCartesianPP_PME)
2115 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2120 ddindex = dd_index(cr->dd->nc,coords);
2121 if (comm->bCartesianPP)
2123 nodeid = comm->ddindex2simnodeid[ddindex];
2129 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2141 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2144 gmx_domdec_comm_t *comm;
2145 ivec coord,coord_pme;
2152 /* This assumes a uniform x domain decomposition grid cell size */
2153 if (comm->bCartesianPP_PME)
2156 MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2157 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2159 /* This is a PP node */
2160 dd_cart_coord2pmecoord(dd,coord,coord_pme);
2161 MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2165 else if (comm->bCartesianPP)
2167 if (sim_nodeid < dd->nnodes)
2169 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2174 /* This assumes DD cells with identical x coordinates
2175 * are numbered sequentially.
2177 if (dd->comm->pmenodes == NULL)
2179 if (sim_nodeid < dd->nnodes)
2181 /* The DD index equals the nodeid */
2182 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2188 while (sim_nodeid > dd->comm->pmenodes[i])
2192 if (sim_nodeid < dd->comm->pmenodes[i])
2194 pmenode = dd->comm->pmenodes[i];
2202 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2204 gmx_bool bPMEOnlyNode;
2206 if (DOMAINDECOMP(cr))
2208 bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2212 bPMEOnlyNode = FALSE;
2215 return bPMEOnlyNode;
2218 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2219 int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2223 ivec coord,coord_pme;
2227 snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2230 for(x=0; x<dd->nc[XX]; x++)
2232 for(y=0; y<dd->nc[YY]; y++)
2234 for(z=0; z<dd->nc[ZZ]; z++)
2236 if (dd->comm->bCartesianPP_PME)
2241 dd_cart_coord2pmecoord(dd,coord,coord_pme);
2242 if (dd->ci[XX] == coord_pme[XX] &&
2243 dd->ci[YY] == coord_pme[YY] &&
2244 dd->ci[ZZ] == coord_pme[ZZ])
2245 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2249 /* The slab corresponds to the nodeid in the PME group */
2250 if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2252 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2259 /* The last PP-only node is the peer node */
2260 *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2264 fprintf(debug,"Receive coordinates from PP nodes:");
2265 for(x=0; x<*nmy_ddnodes; x++)
2267 fprintf(debug," %d",(*my_ddnodes)[x]);
2269 fprintf(debug,"\n");
2273 static gmx_bool receive_vir_ener(t_commrec *cr)
2275 gmx_domdec_comm_t *comm;
2276 int pmenode,coords[DIM],rank;
2280 if (cr->npmenodes < cr->dd->nnodes)
2282 comm = cr->dd->comm;
2283 if (comm->bCartesianPP_PME)
2285 pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2287 MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2288 coords[comm->cartpmedim]++;
2289 if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2291 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2292 if (dd_simnode2pmenode(cr,rank) == pmenode)
2294 /* This is not the last PP node for pmenode */
2302 pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2303 if (cr->sim_nodeid+1 < cr->nnodes &&
2304 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2306 /* This is not the last PP node for pmenode */
2315 static void set_zones_ncg_home(gmx_domdec_t *dd)
2317 gmx_domdec_zones_t *zones;
2320 zones = &dd->comm->zones;
2322 zones->cg_range[0] = 0;
2323 for(i=1; i<zones->n+1; i++)
2325 zones->cg_range[i] = dd->ncg_home;
2329 static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
2331 int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2334 dd_cg_gl = dd->index_gl;
2335 cgindex = dd->cgindex;
2338 for(i=0; i<state->ncg_gl; i++)
2342 dd_cg_gl[i] = cg_gl;
2343 nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2347 dd->ncg_home = state->ncg_gl;
2350 set_zones_ncg_home(dd);
2353 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2355 while (cg >= cginfo_mb->cg_end)
2360 return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2363 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2364 t_forcerec *fr,char *bLocalCG)
2366 cginfo_mb_t *cginfo_mb;
2372 cginfo_mb = fr->cginfo_mb;
2373 cginfo = fr->cginfo;
2375 for(cg=cg0; cg<cg1; cg++)
2377 cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2381 if (bLocalCG != NULL)
2383 for(cg=cg0; cg<cg1; cg++)
2385 bLocalCG[index_gl[cg]] = TRUE;
2390 static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
2392 int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
2393 int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2397 bLocalCG = dd->comm->bLocalCG;
2399 if (dd->nat_tot > dd->gatindex_nalloc)
2401 dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2402 srenew(dd->gatindex,dd->gatindex_nalloc);
2405 nzone = dd->comm->zones.n;
2406 zone2cg = dd->comm->zones.cg_range;
2407 zone_ncg1 = dd->comm->zone_ncg1;
2408 index_gl = dd->index_gl;
2409 gatindex = dd->gatindex;
2411 if (zone2cg[1] != dd->ncg_home)
2413 gmx_incons("dd->ncg_zone is not up to date");
2416 /* Make the local to global and global to local atom index */
2417 a = dd->cgindex[cg_start];
2418 for(zone=0; zone<nzone; zone++)
2426 cg0 = zone2cg[zone];
2428 for(cg=cg0; cg<zone2cg[zone+1]; cg++)
2431 if (cg - cg0 >= zone_ncg1[zone])
2433 /* Signal that this cg is from more than one zone away */
2436 cg_gl = index_gl[cg];
2437 for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2440 ga2la_set(dd->ga2la,a_gl,a,zone1);
2447 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2453 if (bLocalCG == NULL)
2457 for(i=0; i<dd->ncg_tot; i++)
2459 if (!bLocalCG[dd->index_gl[i]])
2462 "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2467 for(i=0; i<ncg_sys; i++)
2474 if (ngl != dd->ncg_tot)
2476 fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2483 static void check_index_consistency(gmx_domdec_t *dd,
2484 int natoms_sys,int ncg_sys,
2487 int nerr,ngl,i,a,cell;
2492 if (dd->comm->DD_debug > 1)
2494 snew(have,natoms_sys);
2495 for(a=0; a<dd->nat_tot; a++)
2497 if (have[dd->gatindex[a]] > 0)
2499 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2503 have[dd->gatindex[a]] = a + 1;
2509 snew(have,dd->nat_tot);
2512 for(i=0; i<natoms_sys; i++)
2514 if (ga2la_get(dd->ga2la,i,&a,&cell))
2516 if (a >= dd->nat_tot)
2518 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2524 if (dd->gatindex[a] != i)
2526 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2533 if (ngl != dd->nat_tot)
2536 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2537 dd->rank,where,ngl,dd->nat_tot);
2539 for(a=0; a<dd->nat_tot; a++)
2544 "DD node %d, %s: local atom %d, global %d has no global index\n",
2545 dd->rank,where,a+1,dd->gatindex[a]+1);
2550 nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2553 gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2554 dd->rank,where,nerr);
2558 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2565 /* Clear the whole list without searching */
2566 ga2la_clear(dd->ga2la);
2570 for(i=a_start; i<dd->nat_tot; i++)
2572 ga2la_del(dd->ga2la,dd->gatindex[i]);
2576 bLocalCG = dd->comm->bLocalCG;
2579 for(i=cg_start; i<dd->ncg_tot; i++)
2581 bLocalCG[dd->index_gl[i]] = FALSE;
2585 dd_clear_local_vsite_indices(dd);
2587 if (dd->constraints)
2589 dd_clear_local_constraint_indices(dd);
2593 static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
2595 real grid_jump_limit;
2597 /* The distance between the boundaries of cells at distance
2598 * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2599 * and by the fact that cells should not be shifted by more than
2600 * half their size, such that cg's only shift by one cell
2601 * at redecomposition.
2603 grid_jump_limit = comm->cellsize_limit;
2604 if (!comm->bVacDLBNoLimit)
2606 grid_jump_limit = max(grid_jump_limit,
2607 comm->cutoff/comm->cd[dim_ind].np);
2610 return grid_jump_limit;
2613 static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2615 gmx_domdec_comm_t *comm;
2621 for(d=1; d<dd->ndim; d++)
2624 limit = grid_jump_limit(comm,d);
2625 bfac = ddbox->box_size[dim];
2626 if (ddbox->tric_dir[dim])
2628 bfac *= ddbox->skew_fac[dim];
2630 if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit ||
2631 (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2634 gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
2635 gmx_step_str(step,buf),
2636 dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2641 static int dd_load_count(gmx_domdec_comm_t *comm)
2643 return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2646 static float dd_force_load(gmx_domdec_comm_t *comm)
2653 if (comm->eFlop > 1)
2655 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2660 load = comm->cycl[ddCyclF];
2661 if (comm->cycl_n[ddCyclF] > 1)
2663 /* Subtract the maximum of the last n cycle counts
2664 * to get rid of possible high counts due to other soures,
2665 * for instance system activity, that would otherwise
2666 * affect the dynamic load balancing.
2668 load -= comm->cycl_max[ddCyclF];
2675 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2677 gmx_domdec_comm_t *comm;
2682 snew(*dim_f,dd->nc[dim]+1);
2684 for(i=1; i<dd->nc[dim]; i++)
2686 if (comm->slb_frac[dim])
2688 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2692 (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2695 (*dim_f)[dd->nc[dim]] = 1;
2698 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2700 int pmeindex,slab,nso,i;
2703 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2709 ddpme->dim = dimind;
2711 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2713 ddpme->nslab = (ddpme->dim == 0 ?
2714 dd->comm->npmenodes_x :
2715 dd->comm->npmenodes_y);
2717 if (ddpme->nslab <= 1)
2722 nso = dd->comm->npmenodes/ddpme->nslab;
2723 /* Determine for each PME slab the PP location range for dimension dim */
2724 snew(ddpme->pp_min,ddpme->nslab);
2725 snew(ddpme->pp_max,ddpme->nslab);
2726 for(slab=0; slab<ddpme->nslab; slab++) {
2727 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2728 ddpme->pp_max[slab] = 0;
2730 for(i=0; i<dd->nnodes; i++) {
2731 ddindex2xyz(dd->nc,i,xyz);
2732 /* For y only use our y/z slab.
2733 * This assumes that the PME x grid size matches the DD grid size.
2735 if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2736 pmeindex = ddindex2pmeindex(dd,i);
2738 slab = pmeindex/nso;
2740 slab = pmeindex % ddpme->nslab;
2742 ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2743 ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2747 set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2750 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2752 if (dd->comm->ddpme[0].dim == XX)
2754 return dd->comm->ddpme[0].maxshift;
2762 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2764 if (dd->comm->ddpme[0].dim == YY)
2766 return dd->comm->ddpme[0].maxshift;
2768 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2770 return dd->comm->ddpme[1].maxshift;
2778 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2779 gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2781 gmx_domdec_comm_t *comm;
2784 real range,pme_boundary;
2788 nc = dd->nc[ddpme->dim];
2791 if (!ddpme->dim_match)
2793 /* PP decomposition is not along dim: the worst situation */
2796 else if (ns <= 3 || (bUniform && ns == nc))
2798 /* The optimal situation */
2803 /* We need to check for all pme nodes which nodes they
2804 * could possibly need to communicate with.
2806 xmin = ddpme->pp_min;
2807 xmax = ddpme->pp_max;
2808 /* Allow for atoms to be maximally 2/3 times the cut-off
2809 * out of their DD cell. This is a reasonable balance between
2810 * between performance and support for most charge-group/cut-off
2813 range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2814 /* Avoid extra communication when we are exactly at a boundary */
2820 /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2821 pme_boundary = (real)s/ns;
2824 cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) ||
2826 cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2830 pme_boundary = (real)(s+1)/ns;
2833 cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) ||
2835 cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary)))
2842 ddpme->maxshift = sh;
2846 fprintf(debug,"PME slab communication range for dim %d is %d\n",
2847 ddpme->dim,ddpme->maxshift);
2851 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2855 for(d=0; d<dd->ndim; d++)
2858 if (dim < ddbox->nboundeddim &&
2859 ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2860 dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2862 gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2863 dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2864 dd->nc[dim],dd->comm->cellsize_limit);
2869 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2870 gmx_bool bMaster,ivec npulse)
2872 gmx_domdec_comm_t *comm;
2875 real *cell_x,cell_dx,cellsize;
2879 for(d=0; d<DIM; d++)
2881 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2883 if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2886 cell_dx = ddbox->box_size[d]/dd->nc[d];
2889 for(j=0; j<dd->nc[d]+1; j++)
2891 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2896 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx;
2897 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2899 cellsize = cell_dx*ddbox->skew_fac[d];
2900 while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
2904 cellsize_min[d] = cellsize;
2908 /* Statically load balanced grid */
2909 /* Also when we are not doing a master distribution we determine
2910 * all cell borders in a loop to obtain identical values
2911 * to the master distribution case and to determine npulse.
2915 cell_x = dd->ma->cell_x[d];
2919 snew(cell_x,dd->nc[d]+1);
2921 cell_x[0] = ddbox->box0[d];
2922 for(j=0; j<dd->nc[d]; j++)
2924 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2925 cell_x[j+1] = cell_x[j] + cell_dx;
2926 cellsize = cell_dx*ddbox->skew_fac[d];
2927 while (cellsize*npulse[d] < comm->cutoff &&
2928 npulse[d] < dd->nc[d]-1)
2932 cellsize_min[d] = min(cellsize_min[d],cellsize);
2936 comm->cell_x0[d] = cell_x[dd->ci[d]];
2937 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2941 /* The following limitation is to avoid that a cell would receive
2942 * some of its own home charge groups back over the periodic boundary.
2943 * Double charge groups cause trouble with the global indices.
2945 if (d < ddbox->npbcdim &&
2946 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2948 gmx_fatal_collective(FARGS,NULL,dd,
2949 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2950 dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
2952 dd->nc[d],dd->nc[d],
2953 dd->nnodes > dd->nc[d] ? "cells" : "processors");
2957 if (!comm->bDynLoadBal)
2959 copy_rvec(cellsize_min,comm->cellsize_min);
2962 for(d=0; d<comm->npmedecompdim; d++)
2964 set_pme_maxshift(dd,&comm->ddpme[d],
2965 comm->slb_frac[dd->dim[d]]==NULL,ddbox,
2966 comm->ddpme[d].slb_dim_f);
2971 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2972 int d,int dim,gmx_domdec_root_t *root,
2974 gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
2976 gmx_domdec_comm_t *comm;
2977 int ncd,i,j,nmin,nmin_old;
2978 gmx_bool bLimLo,bLimHi;
2980 real fac,halfway,cellsize_limit_f_i,region_size;
2981 gmx_bool bPBC,bLastHi=FALSE;
2982 int nrange[]={range[0],range[1]};
2984 region_size= root->cell_f[range[1]]-root->cell_f[range[0]];
2990 bPBC = (dim < ddbox->npbcdim);
2992 cell_size = root->buf_ncd;
2996 fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
2999 /* First we need to check if the scaling does not make cells
3000 * smaller than the smallest allowed size.
3001 * We need to do this iteratively, since if a cell is too small,
3002 * it needs to be enlarged, which makes all the other cells smaller,
3003 * which could in turn make another cell smaller than allowed.
3005 for(i=range[0]; i<range[1]; i++)
3007 root->bCellMin[i] = FALSE;
3013 /* We need the total for normalization */
3015 for(i=range[0]; i<range[1]; i++)
3017 if (root->bCellMin[i] == FALSE)
3019 fac += cell_size[i];
3022 fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3023 /* Determine the cell boundaries */
3024 for(i=range[0]; i<range[1]; i++)
3026 if (root->bCellMin[i] == FALSE)
3028 cell_size[i] *= fac;
3029 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3031 cellsize_limit_f_i = 0;
3035 cellsize_limit_f_i = cellsize_limit_f;
3037 if (cell_size[i] < cellsize_limit_f_i)
3039 root->bCellMin[i] = TRUE;
3040 cell_size[i] = cellsize_limit_f_i;
3044 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3047 while (nmin > nmin_old);
3050 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3051 /* For this check we should not use DD_CELL_MARGIN,
3052 * but a slightly smaller factor,
3053 * since rounding could get use below the limit.
3055 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3058 gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3059 gmx_step_str(step,buf),
3060 dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3061 ncd,comm->cellsize_min[dim]);
3064 root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3068 /* Check if the boundary did not displace more than halfway
3069 * each of the cells it bounds, as this could cause problems,
3070 * especially when the differences between cell sizes are large.
3071 * If changes are applied, they will not make cells smaller
3072 * than the cut-off, as we check all the boundaries which
3073 * might be affected by a change and if the old state was ok,
3074 * the cells will at most be shrunk back to their old size.
3076 for(i=range[0]+1; i<range[1]; i++)
3078 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3079 if (root->cell_f[i] < halfway)
3081 root->cell_f[i] = halfway;
3082 /* Check if the change also causes shifts of the next boundaries */
3083 for(j=i+1; j<range[1]; j++)
3085 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3086 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3089 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3090 if (root->cell_f[i] > halfway)
3092 root->cell_f[i] = halfway;
3093 /* Check if the change also causes shifts of the next boundaries */
3094 for(j=i-1; j>=range[0]+1; j--)
3096 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3097 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3103 /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3104 /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3105 * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3106 * for a and b nrange is used */
3109 /* Take care of the staggering of the cell boundaries */
3112 for(i=range[0]; i<range[1]; i++)
3114 root->cell_f_max0[i] = root->cell_f[i];
3115 root->cell_f_min1[i] = root->cell_f[i+1];
3120 for(i=range[0]+1; i<range[1]; i++)
3122 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3123 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3124 if (bLimLo && bLimHi)
3126 /* Both limits violated, try the best we can */
3127 /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3128 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3131 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3135 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3141 /* root->cell_f[i] = root->bound_min[i]; */
3142 nrange[1]=i; /* only store violation location. There could be a LimLo violation following with an higher index */
3145 else if (bLimHi && !bLastHi)
3148 if (nrange[1] < range[1]) /* found a LimLo before */
3150 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3151 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3152 nrange[0]=nrange[1];
3154 root->cell_f[i] = root->bound_max[i];
3156 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3161 if (nrange[1] < range[1]) /* found last a LimLo */
3163 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3164 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3165 nrange[0]=nrange[1];
3167 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3169 else if (nrange[0] > range[0]) /* found at least one LimHi */
3171 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3178 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3179 int d,int dim,gmx_domdec_root_t *root,
3180 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3181 gmx_bool bUniform,gmx_large_int_t step)
3183 gmx_domdec_comm_t *comm;
3186 real load_aver,load_i,imbalance,change,change_max,sc;
3187 real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3191 int range[] = { 0, 0 };
3195 /* Convert the maximum change from the input percentage to a fraction */
3196 change_limit = comm->dlb_scale_lim*0.01;
3200 bPBC = (dim < ddbox->npbcdim);
3202 cell_size = root->buf_ncd;
3204 /* Store the original boundaries */
3205 for(i=0; i<ncd+1; i++)
3207 root->old_cell_f[i] = root->cell_f[i];
3210 for(i=0; i<ncd; i++)
3212 cell_size[i] = 1.0/ncd;
3215 else if (dd_load_count(comm))
3217 load_aver = comm->load[d].sum_m/ncd;
3219 for(i=0; i<ncd; i++)
3221 /* Determine the relative imbalance of cell i */
3222 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3223 imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3224 /* Determine the change of the cell size using underrelaxation */
3225 change = -relax*imbalance;
3226 change_max = max(change_max,max(change,-change));
3228 /* Limit the amount of scaling.
3229 * We need to use the same rescaling for all cells in one row,
3230 * otherwise the load balancing might not converge.
3233 if (change_max > change_limit)
3235 sc *= change_limit/change_max;
3237 for(i=0; i<ncd; i++)
3239 /* Determine the relative imbalance of cell i */
3240 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3241 imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3242 /* Determine the change of the cell size using underrelaxation */
3243 change = -sc*imbalance;
3244 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3248 cellsize_limit_f = comm->cellsize_min[dim]/ddbox->box_size[dim];
3249 cellsize_limit_f *= DD_CELL_MARGIN;
3250 dist_min_f_hard = grid_jump_limit(comm,d)/ddbox->box_size[dim];
3251 dist_min_f = dist_min_f_hard * DD_CELL_MARGIN;
3252 if (ddbox->tric_dir[dim])
3254 cellsize_limit_f /= ddbox->skew_fac[dim];
3255 dist_min_f /= ddbox->skew_fac[dim];
3257 if (bDynamicBox && d > 0)
3259 dist_min_f *= DD_PRES_SCALE_MARGIN;
3261 if (d > 0 && !bUniform)
3263 /* Make sure that the grid is not shifted too much */
3264 for(i=1; i<ncd; i++) {
3265 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3267 gmx_incons("Inconsistent DD boundary staggering limits!");
3269 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3270 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3272 root->bound_min[i] += 0.5*space;
3274 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3275 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3277 root->bound_max[i] += 0.5*space;
3282 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3284 root->cell_f_max0[i-1] + dist_min_f,
3285 root->bound_min[i],root->cell_f[i],root->bound_max[i],
3286 root->cell_f_min1[i] - dist_min_f);
3291 root->cell_f[0] = 0;
3292 root->cell_f[ncd] = 1;
3293 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3296 /* After the checks above, the cells should obey the cut-off
3297 * restrictions, but it does not hurt to check.
3299 for(i=0; i<ncd; i++)
3303 fprintf(debug,"Relative bounds dim %d cell %d: %f %f\n",
3304 dim,i,root->cell_f[i],root->cell_f[i+1]);
3307 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3308 root->cell_f[i+1] - root->cell_f[i] <
3309 cellsize_limit_f/DD_CELL_MARGIN)
3313 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3314 gmx_step_str(step,buf),dim2char(dim),i,
3315 (root->cell_f[i+1] - root->cell_f[i])
3316 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3321 /* Store the cell boundaries of the lower dimensions at the end */
3322 for(d1=0; d1<d; d1++)
3324 root->cell_f[pos++] = comm->cell_f0[d1];
3325 root->cell_f[pos++] = comm->cell_f1[d1];
3328 if (d < comm->npmedecompdim)
3330 /* The master determines the maximum shift for
3331 * the coordinate communication between separate PME nodes.
3333 set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3335 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3338 root->cell_f[pos++] = comm->ddpme[1].maxshift;
3342 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3343 gmx_ddbox_t *ddbox,int dimind)
3345 gmx_domdec_comm_t *comm;
3350 /* Set the cell dimensions */
3351 dim = dd->dim[dimind];
3352 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3353 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3354 if (dim >= ddbox->nboundeddim)
3356 comm->cell_x0[dim] += ddbox->box0[dim];
3357 comm->cell_x1[dim] += ddbox->box0[dim];
3361 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3362 int d,int dim,real *cell_f_row,
3365 gmx_domdec_comm_t *comm;
3371 /* Each node would only need to know two fractions,
3372 * but it is probably cheaper to broadcast the whole array.
3374 MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3375 0,comm->mpi_comm_load[d]);
3377 /* Copy the fractions for this dimension from the buffer */
3378 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3379 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3380 /* The whole array was communicated, so set the buffer position */
3381 pos = dd->nc[dim] + 1;
3382 for(d1=0; d1<=d; d1++)
3386 /* Copy the cell fractions of the lower dimensions */
3387 comm->cell_f0[d1] = cell_f_row[pos++];
3388 comm->cell_f1[d1] = cell_f_row[pos++];
3390 relative_to_absolute_cell_bounds(dd,ddbox,d1);
3392 /* Convert the communicated shift from float to int */
3393 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3396 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3400 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3401 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3402 gmx_bool bUniform,gmx_large_int_t step)
3404 gmx_domdec_comm_t *comm;
3406 gmx_bool bRowMember,bRowRoot;
3411 for(d=0; d<dd->ndim; d++)
3416 for(d1=d; d1<dd->ndim; d1++)
3418 if (dd->ci[dd->dim[d1]] > 0)
3431 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3432 ddbox,bDynamicBox,bUniform,step);
3433 cell_f_row = comm->root[d]->cell_f;
3437 cell_f_row = comm->cell_f_row;
3439 distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3444 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3448 /* This function assumes the box is static and should therefore
3449 * not be called when the box has changed since the last
3450 * call to dd_partition_system.
3452 for(d=0; d<dd->ndim; d++)
3454 relative_to_absolute_cell_bounds(dd,ddbox,d);
3460 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3461 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3462 gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3463 gmx_wallcycle_t wcycle)
3465 gmx_domdec_comm_t *comm;
3472 wallcycle_start(wcycle,ewcDDCOMMBOUND);
3473 set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3474 wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3476 else if (bDynamicBox)
3478 set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3481 /* Set the dimensions for which no DD is used */
3482 for(dim=0; dim<DIM; dim++) {
3483 if (dd->nc[dim] == 1) {
3484 comm->cell_x0[dim] = 0;
3485 comm->cell_x1[dim] = ddbox->box_size[dim];
3486 if (dim >= ddbox->nboundeddim)
3488 comm->cell_x0[dim] += ddbox->box0[dim];
3489 comm->cell_x1[dim] += ddbox->box0[dim];
3495 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3498 gmx_domdec_comm_dim_t *cd;
3500 for(d=0; d<dd->ndim; d++)
3502 cd = &dd->comm->cd[d];
3503 np = npulse[dd->dim[d]];
3504 if (np > cd->np_nalloc)
3508 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3509 dim2char(dd->dim[d]),np);
3511 if (DDMASTER(dd) && cd->np_nalloc > 0)
3513 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3516 for(i=cd->np_nalloc; i<np; i++)
3518 cd->ind[i].index = NULL;
3519 cd->ind[i].nalloc = 0;
3528 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3529 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3530 gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3531 gmx_wallcycle_t wcycle)
3533 gmx_domdec_comm_t *comm;
3539 /* Copy the old cell boundaries for the cg displacement check */
3540 copy_rvec(comm->cell_x0,comm->old_cell_x0);
3541 copy_rvec(comm->cell_x1,comm->old_cell_x1);
3543 if (comm->bDynLoadBal)
3547 check_box_size(dd,ddbox);
3549 set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3553 set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3554 realloc_comm_ind(dd,npulse);
3559 for(d=0; d<DIM; d++)
3561 fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3562 d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3567 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3569 rvec cell_ns_x0,rvec cell_ns_x1,
3570 gmx_large_int_t step)
3572 gmx_domdec_comm_t *comm;
3577 for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3579 dim = dd->dim[dim_ind];
3581 /* Without PBC we don't have restrictions on the outer cells */
3582 if (!(dim >= ddbox->npbcdim &&
3583 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3584 comm->bDynLoadBal &&
3585 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3586 comm->cellsize_min[dim])
3589 gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3590 gmx_step_str(step,buf),dim2char(dim),
3591 comm->cell_x1[dim] - comm->cell_x0[dim],
3592 ddbox->skew_fac[dim],
3593 dd->comm->cellsize_min[dim],
3594 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3598 if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3600 /* Communicate the boundaries and update cell_ns_x0/1 */
3601 dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3602 if (dd->bGridJump && dd->ndim > 1)
3604 check_grid_jump(step,dd,ddbox);
3609 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3613 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3621 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3622 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3631 static void check_screw_box(matrix box)
3633 /* Mathematical limitation */
3634 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3636 gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3639 /* Limitation due to the asymmetry of the eighth shell method */
3640 if (box[ZZ][YY] != 0)
3642 gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3646 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3647 matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3650 gmx_domdec_master_t *ma;
3651 int **tmp_ind=NULL,*tmp_nalloc=NULL;
3652 int i,icg,j,k,k0,k1,d,npbcdim;
3654 rvec box_size,cg_cm;
3656 real nrcg,inv_ncg,pos_d;
3658 gmx_bool bUnbounded,bScrew;
3662 if (tmp_ind == NULL)
3664 snew(tmp_nalloc,dd->nnodes);
3665 snew(tmp_ind,dd->nnodes);
3666 for(i=0; i<dd->nnodes; i++)
3668 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3669 snew(tmp_ind[i],tmp_nalloc[i]);
3673 /* Clear the count */
3674 for(i=0; i<dd->nnodes; i++)
3680 make_tric_corr_matrix(dd->npbcdim,box,tcm);
3682 cgindex = cgs->index;
3684 /* Compute the center of geometry for all charge groups */
3685 for(icg=0; icg<cgs->nr; icg++)
3688 k1 = cgindex[icg+1];
3692 copy_rvec(pos[k0],cg_cm);
3699 for(k=k0; (k<k1); k++)
3701 rvec_inc(cg_cm,pos[k]);
3703 for(d=0; (d<DIM); d++)
3705 cg_cm[d] *= inv_ncg;
3708 /* Put the charge group in the box and determine the cell index */
3709 for(d=DIM-1; d>=0; d--) {
3711 if (d < dd->npbcdim)
3713 bScrew = (dd->bScrewPBC && d == XX);
3714 if (tric_dir[d] && dd->nc[d] > 1)
3716 /* Use triclinic coordintates for this dimension */
3717 for(j=d+1; j<DIM; j++)
3719 pos_d += cg_cm[j]*tcm[j][d];
3722 while(pos_d >= box[d][d])
3725 rvec_dec(cg_cm,box[d]);
3728 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3729 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3731 for(k=k0; (k<k1); k++)
3733 rvec_dec(pos[k],box[d]);
3736 pos[k][YY] = box[YY][YY] - pos[k][YY];
3737 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3744 rvec_inc(cg_cm,box[d]);
3747 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3748 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3750 for(k=k0; (k<k1); k++)
3752 rvec_inc(pos[k],box[d]);
3754 pos[k][YY] = box[YY][YY] - pos[k][YY];
3755 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3760 /* This could be done more efficiently */
3762 while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3767 i = dd_index(dd->nc,ind);
3768 if (ma->ncg[i] == tmp_nalloc[i])
3770 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3771 srenew(tmp_ind[i],tmp_nalloc[i]);
3773 tmp_ind[i][ma->ncg[i]] = icg;
3775 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3779 for(i=0; i<dd->nnodes; i++)
3782 for(k=0; k<ma->ncg[i]; k++)
3784 ma->cg[k1++] = tmp_ind[i][k];
3787 ma->index[dd->nnodes] = k1;
3789 for(i=0; i<dd->nnodes; i++)
3799 fprintf(fplog,"Charge group distribution at step %s:",
3800 gmx_step_str(step,buf));
3801 for(i=0; i<dd->nnodes; i++)
3803 fprintf(fplog," %d",ma->ncg[i]);
3805 fprintf(fplog,"\n");
3809 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3810 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3813 gmx_domdec_master_t *ma=NULL;
3816 int *ibuf,buf2[2] = { 0, 0 };
3824 check_screw_box(box);
3827 set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3829 distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3830 for(i=0; i<dd->nnodes; i++)
3832 ma->ibuf[2*i] = ma->ncg[i];
3833 ma->ibuf[2*i+1] = ma->nat[i];
3841 dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3843 dd->ncg_home = buf2[0];
3844 dd->nat_home = buf2[1];
3845 dd->ncg_tot = dd->ncg_home;
3846 dd->nat_tot = dd->nat_home;
3847 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3849 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3850 srenew(dd->index_gl,dd->cg_nalloc);
3851 srenew(dd->cgindex,dd->cg_nalloc+1);
3855 for(i=0; i<dd->nnodes; i++)
3857 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3858 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3863 DDMASTER(dd) ? ma->ibuf : NULL,
3864 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3865 DDMASTER(dd) ? ma->cg : NULL,
3866 dd->ncg_home*sizeof(int),dd->index_gl);
3868 /* Determine the home charge group sizes */
3870 for(i=0; i<dd->ncg_home; i++)
3872 cg_gl = dd->index_gl[i];
3874 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3879 fprintf(debug,"Home charge groups:\n");
3880 for(i=0; i<dd->ncg_home; i++)
3882 fprintf(debug," %d",dd->index_gl[i]);
3884 fprintf(debug,"\n");
3886 fprintf(debug,"\n");
3890 static int compact_and_copy_vec_at(int ncg,int *move,
3893 rvec *src,gmx_domdec_comm_t *comm,
3896 int m,icg,i,i0,i1,nrcg;
3902 for(m=0; m<DIM*2; m++)
3908 for(icg=0; icg<ncg; icg++)
3910 i1 = cgindex[icg+1];
3916 /* Compact the home array in place */
3917 for(i=i0; i<i1; i++)
3919 copy_rvec(src[i],src[home_pos++]);
3925 /* Copy to the communication buffer */
3927 pos_vec[m] += 1 + vec*nrcg;
3928 for(i=i0; i<i1; i++)
3930 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
3932 pos_vec[m] += (nvec - vec - 1)*nrcg;
3936 home_pos += i1 - i0;
3944 static int compact_and_copy_vec_cg(int ncg,int *move,
3946 int nvec,rvec *src,gmx_domdec_comm_t *comm,
3949 int m,icg,i0,i1,nrcg;
3955 for(m=0; m<DIM*2; m++)
3961 for(icg=0; icg<ncg; icg++)
3963 i1 = cgindex[icg+1];
3969 /* Compact the home array in place */
3970 copy_rvec(src[icg],src[home_pos++]);
3976 /* Copy to the communication buffer */
3977 copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
3978 pos_vec[m] += 1 + nrcg*nvec;
3990 static int compact_ind(int ncg,int *move,
3991 int *index_gl,int *cgindex,
3993 gmx_ga2la_t ga2la,char *bLocalCG,
3996 int cg,nat,a0,a1,a,a_gl;
4001 for(cg=0; cg<ncg; cg++)
4007 /* Compact the home arrays in place.
4008 * Anything that can be done here avoids access to global arrays.
4010 cgindex[home_pos] = nat;
4011 for(a=a0; a<a1; a++)
4014 gatindex[nat] = a_gl;
4015 /* The cell number stays 0, so we don't need to set it */
4016 ga2la_change_la(ga2la,a_gl,nat);
4019 index_gl[home_pos] = index_gl[cg];
4020 cginfo[home_pos] = cginfo[cg];
4021 /* The charge group remains local, so bLocalCG does not change */
4026 /* Clear the global indices */
4027 for(a=a0; a<a1; a++)
4029 ga2la_del(ga2la,gatindex[a]);
4033 bLocalCG[index_gl[cg]] = FALSE;
4037 cgindex[home_pos] = nat;
4042 static void clear_and_mark_ind(int ncg,int *move,
4043 int *index_gl,int *cgindex,int *gatindex,
4044 gmx_ga2la_t ga2la,char *bLocalCG,
4049 for(cg=0; cg<ncg; cg++)
4055 /* Clear the global indices */
4056 for(a=a0; a<a1; a++)
4058 ga2la_del(ga2la,gatindex[a]);
4062 bLocalCG[index_gl[cg]] = FALSE;
4064 /* Signal that this cg has moved using the ns cell index.
4065 * Here we set it to -1.
4066 * fill_grid will change it from -1 to 4*grid->ncells.
4068 cell_index[cg] = -1;
4073 static void print_cg_move(FILE *fplog,
4075 gmx_large_int_t step,int cg,int dim,int dir,
4076 gmx_bool bHaveLimitdAndCMOld,real limitd,
4077 rvec cm_old,rvec cm_new,real pos_d)
4079 gmx_domdec_comm_t *comm;
4084 fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4085 if (bHaveLimitdAndCMOld)
4087 fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition (%f) in direction %c\n",
4088 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4092 fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4093 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4095 fprintf(fplog,"distance out of cell %f\n",
4096 dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4097 if (bHaveLimitdAndCMOld)
4099 fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4100 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4102 fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4103 cm_new[XX],cm_new[YY],cm_new[ZZ]);
4104 fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4106 comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4107 fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4109 comm->cell_x0[dim],comm->cell_x1[dim]);
4112 static void cg_move_error(FILE *fplog,
4114 gmx_large_int_t step,int cg,int dim,int dir,
4115 gmx_bool bHaveLimitdAndCMOld,real limitd,
4116 rvec cm_old,rvec cm_new,real pos_d)
4120 print_cg_move(fplog, dd,step,cg,dim,dir,
4121 bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4123 print_cg_move(stderr,dd,step,cg,dim,dir,
4124 bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4126 "A charge group moved too far between two domain decomposition steps\n"
4127 "This usually means that your system is not well equilibrated");
4130 static void rotate_state_atom(t_state *state,int a)
4134 for(est=0; est<estNR; est++)
4136 if (EST_DISTR(est) && (state->flags & (1<<est))) {
4139 /* Rotate the complete state; for a rectangular box only */
4140 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4141 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4144 state->v[a][YY] = -state->v[a][YY];
4145 state->v[a][ZZ] = -state->v[a][ZZ];
4148 state->sd_X[a][YY] = -state->sd_X[a][YY];
4149 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4152 state->cg_p[a][YY] = -state->cg_p[a][YY];
4153 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4155 case estDISRE_INITF:
4156 case estDISRE_RM3TAV:
4157 case estORIRE_INITF:
4159 /* These are distances, so not affected by rotation */
4162 gmx_incons("Unknown state entry encountered in rotate_state_atom");
4168 static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4169 gmx_domdec_t *dd,ivec tric_dir,
4170 t_state *state,rvec **f,
4171 t_forcerec *fr,t_mdatoms *md,
4177 int ncg[DIM*2],nat[DIM*2];
4178 int c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4179 int mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4180 int sbuf[2],rbuf[2];
4181 int home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
4183 gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4188 rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4190 cginfo_mb_t *cginfo_mb;
4191 gmx_domdec_comm_t *comm;
4195 check_screw_box(state->box);
4201 for(i=0; i<estNR; i++)
4207 case estX: /* Always present */ break;
4208 case estV: bV = (state->flags & (1<<i)); break;
4209 case estSDX: bSDX = (state->flags & (1<<i)); break;
4210 case estCGP: bCGP = (state->flags & (1<<i)); break;
4213 case estDISRE_INITF:
4214 case estDISRE_RM3TAV:
4215 case estORIRE_INITF:
4217 /* No processing required */
4220 gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4225 if (dd->ncg_tot > comm->nalloc_int)
4227 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4228 srenew(comm->buf_int,comm->nalloc_int);
4230 move = comm->buf_int;
4232 /* Clear the count */
4233 for(c=0; c<dd->ndim*2; c++)
4239 npbcdim = dd->npbcdim;
4241 for(d=0; (d<DIM); d++)
4243 limitd[d] = dd->comm->cellsize_min[d];
4244 if (d >= npbcdim && dd->ci[d] == 0)
4246 cell_x0[d] = -GMX_FLOAT_MAX;
4250 cell_x0[d] = comm->cell_x0[d];
4252 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4254 cell_x1[d] = GMX_FLOAT_MAX;
4258 cell_x1[d] = comm->cell_x1[d];
4262 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4263 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4267 /* We check after communication if a charge group moved
4268 * more than one cell. Set the pre-comm check limit to float_max.
4270 limit0[d] = -GMX_FLOAT_MAX;
4271 limit1[d] = GMX_FLOAT_MAX;
4275 make_tric_corr_matrix(npbcdim,state->box,tcm);
4277 cgindex = dd->cgindex;
4279 /* Compute the center of geometry for all home charge groups
4280 * and put them in the box and determine where they should go.
4282 for(cg=0; cg<dd->ncg_home; cg++)
4289 copy_rvec(state->x[k0],cm_new);
4296 for(k=k0; (k<k1); k++)
4298 rvec_inc(cm_new,state->x[k]);
4300 for(d=0; (d<DIM); d++)
4302 cm_new[d] = inv_ncg*cm_new[d];
4307 /* Do pbc and check DD cell boundary crossings */
4308 for(d=DIM-1; d>=0; d--)
4312 bScrew = (dd->bScrewPBC && d == XX);
4313 /* Determine the location of this cg in lattice coordinates */
4317 for(d2=d+1; d2<DIM; d2++)
4319 pos_d += cm_new[d2]*tcm[d2][d];
4322 /* Put the charge group in the triclinic unit-cell */
4323 if (pos_d >= cell_x1[d])
4325 if (pos_d >= limit1[d])
4327 cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4328 cg_cm[cg],cm_new,pos_d);
4331 if (dd->ci[d] == dd->nc[d] - 1)
4333 rvec_dec(cm_new,state->box[d]);
4336 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4337 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4339 for(k=k0; (k<k1); k++)
4341 rvec_dec(state->x[k],state->box[d]);
4344 rotate_state_atom(state,k);
4349 else if (pos_d < cell_x0[d])
4351 if (pos_d < limit0[d])
4353 cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4354 cg_cm[cg],cm_new,pos_d);
4359 rvec_inc(cm_new,state->box[d]);
4362 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4363 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4365 for(k=k0; (k<k1); k++)
4367 rvec_inc(state->x[k],state->box[d]);
4370 rotate_state_atom(state,k);
4376 else if (d < npbcdim)
4378 /* Put the charge group in the rectangular unit-cell */
4379 while (cm_new[d] >= state->box[d][d])
4381 rvec_dec(cm_new,state->box[d]);
4382 for(k=k0; (k<k1); k++)
4384 rvec_dec(state->x[k],state->box[d]);
4387 while (cm_new[d] < 0)
4389 rvec_inc(cm_new,state->box[d]);
4390 for(k=k0; (k<k1); k++)
4392 rvec_inc(state->x[k],state->box[d]);
4398 copy_rvec(cm_new,cg_cm[cg]);
4400 /* Determine where this cg should go */
4403 for(d=0; d<dd->ndim; d++)
4408 flag |= DD_FLAG_FW(d);
4414 else if (dev[dim] == -1)
4416 flag |= DD_FLAG_BW(d);
4418 if (dd->nc[dim] > 2)
4432 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4434 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4435 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4437 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4438 /* We store the cg size in the lower 16 bits
4439 * and the place where the charge group should go
4440 * in the next 6 bits. This saves some communication volume.
4442 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4448 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4449 inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4465 /* Make sure the communication buffers are large enough */
4466 for(mc=0; mc<dd->ndim*2; mc++)
4468 nvr = ncg[mc] + nat[mc]*nvec;
4469 if (nvr > comm->cgcm_state_nalloc[mc])
4471 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4472 srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4476 /* Recalculating cg_cm might be cheaper than communicating,
4477 * but that could give rise to rounding issues.
4480 compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4481 nvec,cg_cm,comm,bCompact);
4485 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4486 nvec,vec++,state->x,comm,bCompact);
4489 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4490 nvec,vec++,state->v,comm,bCompact);
4494 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4495 nvec,vec++,state->sd_X,comm,bCompact);
4499 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4500 nvec,vec++,state->cg_p,comm,bCompact);
4505 compact_ind(dd->ncg_home,move,
4506 dd->index_gl,dd->cgindex,dd->gatindex,
4507 dd->ga2la,comm->bLocalCG,
4512 clear_and_mark_ind(dd->ncg_home,move,
4513 dd->index_gl,dd->cgindex,dd->gatindex,
4514 dd->ga2la,comm->bLocalCG,
4515 fr->ns.grid->cell_index);
4518 cginfo_mb = fr->cginfo_mb;
4520 ncg_stay_home = home_pos_cg;
4521 for(d=0; d<dd->ndim; d++)
4527 for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4530 /* Communicate the cg and atom counts */
4535 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4536 d,dir,sbuf[0],sbuf[1]);
4538 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4540 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4542 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4543 srenew(comm->buf_int,comm->nalloc_int);
4546 /* Communicate the charge group indices, sizes and flags */
4547 dd_sendrecv_int(dd, d, dir,
4548 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4549 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4551 nvs = ncg[cdd] + nat[cdd]*nvec;
4552 i = rbuf[0] + rbuf[1] *nvec;
4553 vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4555 /* Communicate cgcm and state */
4556 dd_sendrecv_rvec(dd, d, dir,
4557 comm->cgcm_state[cdd], nvs,
4558 comm->vbuf.v+nvr, i);
4559 ncg_recv += rbuf[0];
4560 nat_recv += rbuf[1];
4564 /* Process the received charge groups */
4566 for(cg=0; cg<ncg_recv; cg++)
4568 flag = comm->buf_int[cg*DD_CGIBS+1];
4570 if (dim >= npbcdim && dd->nc[dim] > 2)
4572 /* No pbc in this dim and more than one domain boundary.
4573 * We to a separate check if a charge did not move too far.
4575 if (((flag & DD_FLAG_FW(d)) &&
4576 comm->vbuf.v[buf_pos][d] > cell_x1[dim]) ||
4577 ((flag & DD_FLAG_BW(d)) &&
4578 comm->vbuf.v[buf_pos][d] < cell_x0[dim]))
4580 cg_move_error(fplog,dd,step,cg,d,
4581 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4583 comm->vbuf.v[buf_pos],
4584 comm->vbuf.v[buf_pos],
4585 comm->vbuf.v[buf_pos][d]);
4592 /* Check which direction this cg should go */
4593 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4597 /* The cell boundaries for dimension d2 are not equal
4598 * for each cell row of the lower dimension(s),
4599 * therefore we might need to redetermine where
4600 * this cg should go.
4603 /* If this cg crosses the box boundary in dimension d2
4604 * we can use the communicated flag, so we do not
4605 * have to worry about pbc.
4607 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4608 (flag & DD_FLAG_FW(d2))) ||
4609 (dd->ci[dim2] == 0 &&
4610 (flag & DD_FLAG_BW(d2)))))
4612 /* Clear the two flags for this dimension */
4613 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4614 /* Determine the location of this cg
4615 * in lattice coordinates
4617 pos_d = comm->vbuf.v[buf_pos][dim2];
4620 for(d3=dim2+1; d3<DIM; d3++)
4623 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4626 /* Check of we are not at the box edge.
4627 * pbc is only handled in the first step above,
4628 * but this check could move over pbc while
4629 * the first step did not due to different rounding.
4631 if (pos_d >= cell_x1[dim2] &&
4632 dd->ci[dim2] != dd->nc[dim2]-1)
4634 flag |= DD_FLAG_FW(d2);
4636 else if (pos_d < cell_x0[dim2] &&
4639 flag |= DD_FLAG_BW(d2);
4641 comm->buf_int[cg*DD_CGIBS+1] = flag;
4644 /* Set to which neighboring cell this cg should go */
4645 if (flag & DD_FLAG_FW(d2))
4649 else if (flag & DD_FLAG_BW(d2))
4651 if (dd->nc[dd->dim[d2]] > 2)
4663 nrcg = flag & DD_FLAG_NRCG;
4666 if (home_pos_cg+1 > dd->cg_nalloc)
4668 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4669 srenew(dd->index_gl,dd->cg_nalloc);
4670 srenew(dd->cgindex,dd->cg_nalloc+1);
4672 /* Set the global charge group index and size */
4673 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4674 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4675 /* Copy the state from the buffer */
4676 if (home_pos_cg >= fr->cg_nalloc)
4678 dd_realloc_fr_cg(fr,home_pos_cg+1);
4681 copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
4682 /* Set the cginfo */
4683 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4684 dd->index_gl[home_pos_cg]);
4687 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4690 if (home_pos_at+nrcg > state->nalloc)
4692 dd_realloc_state(state,f,home_pos_at+nrcg);
4694 for(i=0; i<nrcg; i++)
4696 copy_rvec(comm->vbuf.v[buf_pos++],
4697 state->x[home_pos_at+i]);
4701 for(i=0; i<nrcg; i++)
4703 copy_rvec(comm->vbuf.v[buf_pos++],
4704 state->v[home_pos_at+i]);
4709 for(i=0; i<nrcg; i++)
4711 copy_rvec(comm->vbuf.v[buf_pos++],
4712 state->sd_X[home_pos_at+i]);
4717 for(i=0; i<nrcg; i++)
4719 copy_rvec(comm->vbuf.v[buf_pos++],
4720 state->cg_p[home_pos_at+i]);
4724 home_pos_at += nrcg;
4728 /* Reallocate the buffers if necessary */
4729 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4731 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4732 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4734 nvr = ncg[mc] + nat[mc]*nvec;
4735 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4737 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4738 srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4740 /* Copy from the receive to the send buffers */
4741 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4742 comm->buf_int + cg*DD_CGIBS,
4743 DD_CGIBS*sizeof(int));
4744 memcpy(comm->cgcm_state[mc][nvr],
4745 comm->vbuf.v[buf_pos],
4746 (1+nrcg*nvec)*sizeof(rvec));
4747 buf_pos += 1 + nrcg*nvec;
4754 /* With sorting (!bCompact) the indices are now only partially up to date
4755 * and ncg_home and nat_home are not the real count, since there are
4756 * "holes" in the arrays for the charge groups that moved to neighbors.
4758 dd->ncg_home = home_pos_cg;
4759 dd->nat_home = home_pos_at;
4763 fprintf(debug,"Finished repartitioning\n");
4766 return ncg_stay_home;
4769 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4771 dd->comm->cycl[ddCycl] += cycles;
4772 dd->comm->cycl_n[ddCycl]++;
4773 if (cycles > dd->comm->cycl_max[ddCycl])
4775 dd->comm->cycl_max[ddCycl] = cycles;
4779 static double force_flop_count(t_nrnb *nrnb)
4786 for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
4788 /* To get closer to the real timings, we half the count
4789 * for the normal loops and again half it for water loops.
4792 if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4794 sum += nrnb->n[i]*0.25*cost_nrnb(i);
4798 sum += nrnb->n[i]*0.50*cost_nrnb(i);
4801 for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
4804 if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4805 sum += nrnb->n[i]*cost_nrnb(i);
4807 for(i=eNR_BONDS; i<=eNR_WALLS; i++)
4809 sum += nrnb->n[i]*cost_nrnb(i);
4815 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
4817 if (dd->comm->eFlop)
4819 dd->comm->flop -= force_flop_count(nrnb);
4822 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
4824 if (dd->comm->eFlop)
4826 dd->comm->flop += force_flop_count(nrnb);
4831 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4835 for(i=0; i<ddCyclNr; i++)
4837 dd->comm->cycl[i] = 0;
4838 dd->comm->cycl_n[i] = 0;
4839 dd->comm->cycl_max[i] = 0;
4842 dd->comm->flop_n = 0;
4845 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
4847 gmx_domdec_comm_t *comm;
4848 gmx_domdec_load_t *load;
4849 gmx_domdec_root_t *root=NULL;
4850 int d,dim,cid,i,pos;
4851 float cell_frac=0,sbuf[DD_NLOAD_MAX];
4856 fprintf(debug,"get_load_distribution start\n");
4859 wallcycle_start(wcycle,ewcDDCOMMLOAD);
4863 bSepPME = (dd->pme_nodeid >= 0);
4865 for(d=dd->ndim-1; d>=0; d--)
4868 /* Check if we participate in the communication in this dimension */
4869 if (d == dd->ndim-1 ||
4870 (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
4872 load = &comm->load[d];
4875 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4878 if (d == dd->ndim-1)
4880 sbuf[pos++] = dd_force_load(comm);
4881 sbuf[pos++] = sbuf[0];
4884 sbuf[pos++] = sbuf[0];
4885 sbuf[pos++] = cell_frac;
4888 sbuf[pos++] = comm->cell_f_max0[d];
4889 sbuf[pos++] = comm->cell_f_min1[d];
4894 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4895 sbuf[pos++] = comm->cycl[ddCyclPME];
4900 sbuf[pos++] = comm->load[d+1].sum;
4901 sbuf[pos++] = comm->load[d+1].max;
4904 sbuf[pos++] = comm->load[d+1].sum_m;
4905 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4906 sbuf[pos++] = comm->load[d+1].flags;
4909 sbuf[pos++] = comm->cell_f_max0[d];
4910 sbuf[pos++] = comm->cell_f_min1[d];
4915 sbuf[pos++] = comm->load[d+1].mdf;
4916 sbuf[pos++] = comm->load[d+1].pme;
4920 /* Communicate a row in DD direction d.
4921 * The communicators are setup such that the root always has rank 0.
4924 MPI_Gather(sbuf ,load->nload*sizeof(float),MPI_BYTE,
4925 load->load,load->nload*sizeof(float),MPI_BYTE,
4926 0,comm->mpi_comm_load[d]);
4928 if (dd->ci[dim] == dd->master_ci[dim])
4930 /* We are the root, process this row */
4931 if (comm->bDynLoadBal)
4933 root = comm->root[d];
4943 for(i=0; i<dd->nc[dim]; i++)
4945 load->sum += load->load[pos++];
4946 load->max = max(load->max,load->load[pos]);
4952 /* This direction could not be load balanced properly,
4953 * therefore we need to use the maximum iso the average load.
4955 load->sum_m = max(load->sum_m,load->load[pos]);
4959 load->sum_m += load->load[pos];
4962 load->cvol_min = min(load->cvol_min,load->load[pos]);
4966 load->flags = (int)(load->load[pos++] + 0.5);
4970 root->cell_f_max0[i] = load->load[pos++];
4971 root->cell_f_min1[i] = load->load[pos++];
4976 load->mdf = max(load->mdf,load->load[pos]);
4978 load->pme = max(load->pme,load->load[pos]);
4982 if (comm->bDynLoadBal && root->bLimited)
4984 load->sum_m *= dd->nc[dim];
4985 load->flags |= (1<<d);
4993 comm->nload += dd_load_count(comm);
4994 comm->load_step += comm->cycl[ddCyclStep];
4995 comm->load_sum += comm->load[0].sum;
4996 comm->load_max += comm->load[0].max;
4997 if (comm->bDynLoadBal)
4999 for(d=0; d<dd->ndim; d++)
5001 if (comm->load[0].flags & (1<<d))
5003 comm->load_lim[d]++;
5009 comm->load_mdf += comm->load[0].mdf;
5010 comm->load_pme += comm->load[0].pme;
5014 wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5018 fprintf(debug,"get_load_distribution finished\n");
5022 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5024 /* Return the relative performance loss on the total run time
5025 * due to the force calculation load imbalance.
5027 if (dd->comm->nload > 0)
5030 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5031 (dd->comm->load_step*dd->nnodes);
5039 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5042 int npp,npme,nnodes,d,limp;
5043 float imbal,pme_f_ratio,lossf,lossp=0;
5045 gmx_domdec_comm_t *comm;
5048 if (DDMASTER(dd) && comm->nload > 0)
5051 npme = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5052 nnodes = npp + npme;
5053 imbal = comm->load_max*npp/comm->load_sum - 1;
5054 lossf = dd_force_imb_perf_loss(dd);
5055 sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5056 fprintf(fplog,"%s",buf);
5057 fprintf(stderr,"\n");
5058 fprintf(stderr,"%s",buf);
5059 sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5060 fprintf(fplog,"%s",buf);
5061 fprintf(stderr,"%s",buf);
5063 if (comm->bDynLoadBal)
5065 sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5066 for(d=0; d<dd->ndim; d++)
5068 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5069 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5075 sprintf(buf+strlen(buf),"\n");
5076 fprintf(fplog,"%s",buf);
5077 fprintf(stderr,"%s",buf);
5081 pme_f_ratio = comm->load_pme/comm->load_mdf;
5082 lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5085 lossp *= (float)npme/(float)nnodes;
5089 lossp *= (float)npp/(float)nnodes;
5091 sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5092 fprintf(fplog,"%s",buf);
5093 fprintf(stderr,"%s",buf);
5094 sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5095 fprintf(fplog,"%s",buf);
5096 fprintf(stderr,"%s",buf);
5098 fprintf(fplog,"\n");
5099 fprintf(stderr,"\n");
5101 if (lossf >= DD_PERF_LOSS)
5104 "NOTE: %.1f %% performance was lost due to load imbalance\n"
5105 " in the domain decomposition.\n",lossf*100);
5106 if (!comm->bDynLoadBal)
5108 sprintf(buf+strlen(buf)," You might want to use dynamic load balancing (option -dlb.)\n");
5112 sprintf(buf+strlen(buf)," You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5114 fprintf(fplog,"%s\n",buf);
5115 fprintf(stderr,"%s\n",buf);
5117 if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5120 "NOTE: %.1f %% performance was lost because the PME nodes\n"
5121 " had %s work to do than the PP nodes.\n"
5122 " You might want to %s the number of PME nodes\n"
5123 " or %s the cut-off and the grid spacing.\n",
5125 (lossp < 0) ? "less" : "more",
5126 (lossp < 0) ? "decrease" : "increase",
5127 (lossp < 0) ? "decrease" : "increase");
5128 fprintf(fplog,"%s\n",buf);
5129 fprintf(stderr,"%s\n",buf);
5134 static float dd_vol_min(gmx_domdec_t *dd)
5136 return dd->comm->load[0].cvol_min*dd->nnodes;
5139 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5141 return dd->comm->load[0].flags;
5144 static float dd_f_imbal(gmx_domdec_t *dd)
5146 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5149 static float dd_pme_f_ratio(gmx_domdec_t *dd)
5151 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5154 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5159 flags = dd_load_flags(dd);
5163 "DD load balancing is limited by minimum cell size in dimension");
5164 for(d=0; d<dd->ndim; d++)
5168 fprintf(fplog," %c",dim2char(dd->dim[d]));
5171 fprintf(fplog,"\n");
5173 fprintf(fplog,"DD step %s",gmx_step_str(step,buf));
5174 if (dd->comm->bDynLoadBal)
5176 fprintf(fplog," vol min/aver %5.3f%c",
5177 dd_vol_min(dd),flags ? '!' : ' ');
5179 fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5180 if (dd->comm->cycl_n[ddCyclPME])
5182 fprintf(fplog," pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5184 fprintf(fplog,"\n\n");
5187 static void dd_print_load_verbose(gmx_domdec_t *dd)
5189 if (dd->comm->bDynLoadBal)
5191 fprintf(stderr,"vol %4.2f%c ",
5192 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5194 fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5195 if (dd->comm->cycl_n[ddCyclPME])
5197 fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5202 static void make_load_communicator(gmx_domdec_t *dd,MPI_Group g_all,
5203 int dim_ind,ivec loc)
5205 MPI_Group g_row = MPI_GROUP_EMPTY;
5209 gmx_domdec_root_t *root;
5210 gmx_bool bPartOfGroup = FALSE;
5212 dim = dd->dim[dim_ind];
5213 copy_ivec(loc,loc_c);
5214 snew(rank,dd->nc[dim]);
5215 for(i=0; i<dd->nc[dim]; i++)
5218 rank[i] = dd_index(dd->nc,loc_c);
5219 if (rank[i] == dd->rank)
5221 /* This process is part of the group */
5222 bPartOfGroup = TRUE;
5227 MPI_Group_incl(g_all,dd->nc[dim],rank,&g_row);
5229 MPI_Comm_create(dd->mpi_comm_all,g_row,&c_row);
5232 dd->comm->mpi_comm_load[dim_ind] = c_row;
5233 if (dd->comm->eDLB != edlbNO)
5235 if (dd->ci[dim] == dd->master_ci[dim])
5237 /* This is the root process of this row */
5238 snew(dd->comm->root[dim_ind],1);
5239 root = dd->comm->root[dim_ind];
5240 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5241 snew(root->old_cell_f,dd->nc[dim]+1);
5242 snew(root->bCellMin,dd->nc[dim]);
5245 snew(root->cell_f_max0,dd->nc[dim]);
5246 snew(root->cell_f_min1,dd->nc[dim]);
5247 snew(root->bound_min,dd->nc[dim]);
5248 snew(root->bound_max,dd->nc[dim]);
5250 snew(root->buf_ncd,dd->nc[dim]);
5254 /* This is not a root process, we only need to receive cell_f */
5255 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5258 if (dd->ci[dim] == dd->master_ci[dim])
5260 snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5267 static void make_load_communicators(gmx_domdec_t *dd)
5275 fprintf(debug,"Making load communicators\n");
5277 MPI_Comm_group(dd->mpi_comm_all,&g_all);
5279 snew(dd->comm->load,dd->ndim);
5280 snew(dd->comm->mpi_comm_load,dd->ndim);
5283 make_load_communicator(dd,g_all,0,loc);
5286 for(i=0; i<dd->nc[dim0]; i++) {
5288 make_load_communicator(dd,g_all,1,loc);
5293 for(i=0; i<dd->nc[dim0]; i++) {
5296 for(j=0; j<dd->nc[dim1]; j++) {
5298 make_load_communicator(dd,g_all,2,loc);
5303 MPI_Group_free(&g_all);
5306 fprintf(debug,"Finished making load communicators\n");
5310 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5316 ivec dd_zp[DD_MAXIZONE];
5317 gmx_domdec_zones_t *zones;
5318 gmx_domdec_ns_ranges_t *izone;
5320 for(d=0; d<dd->ndim; d++)
5323 copy_ivec(dd->ci,tmp);
5324 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5325 dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5326 copy_ivec(dd->ci,tmp);
5327 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5328 dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5331 fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5334 dd->neighbor[d][1]);
5340 fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5341 dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5345 fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5347 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5348 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5355 for(i=0; i<nzonep; i++)
5357 copy_ivec(dd_zp3[i],dd_zp[i]);
5363 for(i=0; i<nzonep; i++)
5365 copy_ivec(dd_zp2[i],dd_zp[i]);
5371 for(i=0; i<nzonep; i++)
5373 copy_ivec(dd_zp1[i],dd_zp[i]);
5377 gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5382 zones = &dd->comm->zones;
5384 for(i=0; i<nzone; i++)
5387 clear_ivec(zones->shift[i]);
5388 for(d=0; d<dd->ndim; d++)
5390 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5395 for(i=0; i<nzone; i++)
5397 for(d=0; d<DIM; d++)
5399 s[d] = dd->ci[d] - zones->shift[i][d];
5404 else if (s[d] >= dd->nc[d])
5410 zones->nizone = nzonep;
5411 for(i=0; i<zones->nizone; i++)
5413 if (dd_zp[i][0] != i)
5415 gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5417 izone = &zones->izone[i];
5418 izone->j0 = dd_zp[i][1];
5419 izone->j1 = dd_zp[i][2];
5420 for(dim=0; dim<DIM; dim++)
5422 if (dd->nc[dim] == 1)
5424 /* All shifts should be allowed */
5425 izone->shift0[dim] = -1;
5426 izone->shift1[dim] = 1;
5431 izone->shift0[d] = 0;
5432 izone->shift1[d] = 0;
5433 for(j=izone->j0; j<izone->j1; j++) {
5434 if (dd->shift[j][d] > dd->shift[i][d])
5435 izone->shift0[d] = -1;
5436 if (dd->shift[j][d] < dd->shift[i][d])
5437 izone->shift1[d] = 1;
5443 /* Assume the shift are not more than 1 cell */
5444 izone->shift0[dim] = 1;
5445 izone->shift1[dim] = -1;
5446 for(j=izone->j0; j<izone->j1; j++)
5448 shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5449 if (shift_diff < izone->shift0[dim])
5451 izone->shift0[dim] = shift_diff;
5453 if (shift_diff > izone->shift1[dim])
5455 izone->shift1[dim] = shift_diff;
5462 if (dd->comm->eDLB != edlbNO)
5464 snew(dd->comm->root,dd->ndim);
5467 if (dd->comm->bRecordLoad)
5469 make_load_communicators(dd);
5473 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5476 gmx_domdec_comm_t *comm;
5487 if (comm->bCartesianPP)
5489 /* Set up cartesian communication for the particle-particle part */
5492 fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5493 dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5496 for(i=0; i<DIM; i++)
5500 MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5502 /* We overwrite the old communicator with the new cartesian one */
5503 cr->mpi_comm_mygroup = comm_cart;
5506 dd->mpi_comm_all = cr->mpi_comm_mygroup;
5507 MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5509 if (comm->bCartesianPP_PME)
5511 /* Since we want to use the original cartesian setup for sim,
5512 * and not the one after split, we need to make an index.
5514 snew(comm->ddindex2ddnodeid,dd->nnodes);
5515 comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5516 gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5517 /* Get the rank of the DD master,
5518 * above we made sure that the master node is a PP node.
5528 MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5530 else if (comm->bCartesianPP)
5532 if (cr->npmenodes == 0)
5534 /* The PP communicator is also
5535 * the communicator for this simulation
5537 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5539 cr->nodeid = dd->rank;
5541 MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5543 /* We need to make an index to go from the coordinates
5544 * to the nodeid of this simulation.
5546 snew(comm->ddindex2simnodeid,dd->nnodes);
5547 snew(buf,dd->nnodes);
5548 if (cr->duty & DUTY_PP)
5550 buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5552 /* Communicate the ddindex to simulation nodeid index */
5553 MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5554 cr->mpi_comm_mysim);
5557 /* Determine the master coordinates and rank.
5558 * The DD master should be the same node as the master of this sim.
5560 for(i=0; i<dd->nnodes; i++)
5562 if (comm->ddindex2simnodeid[i] == 0)
5564 ddindex2xyz(dd->nc,i,dd->master_ci);
5565 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5570 fprintf(debug,"The master rank is %d\n",dd->masterrank);
5575 /* No Cartesian communicators */
5576 /* We use the rank in dd->comm->all as DD index */
5577 ddindex2xyz(dd->nc,dd->rank,dd->ci);
5578 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5580 clear_ivec(dd->master_ci);
5587 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5588 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5593 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5594 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5598 static void receive_ddindex2simnodeid(t_commrec *cr)
5602 gmx_domdec_comm_t *comm;
5609 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5611 snew(comm->ddindex2simnodeid,dd->nnodes);
5612 snew(buf,dd->nnodes);
5613 if (cr->duty & DUTY_PP)
5615 buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5618 /* Communicate the ddindex to simulation nodeid index */
5619 MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5620 cr->mpi_comm_mysim);
5627 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5630 gmx_domdec_master_t *ma;
5635 snew(ma->ncg,dd->nnodes);
5636 snew(ma->index,dd->nnodes+1);
5638 snew(ma->nat,dd->nnodes);
5639 snew(ma->ibuf,dd->nnodes*2);
5640 snew(ma->cell_x,DIM);
5641 for(i=0; i<DIM; i++)
5643 snew(ma->cell_x[i],dd->nc[i]+1);
5646 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5652 snew(ma->vbuf,natoms);
5658 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5662 gmx_domdec_comm_t *comm;
5673 if (comm->bCartesianPP)
5675 for(i=1; i<DIM; i++)
5677 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5679 if (bDiv[YY] || bDiv[ZZ])
5681 comm->bCartesianPP_PME = TRUE;
5682 /* If we have 2D PME decomposition, which is always in x+y,
5683 * we stack the PME only nodes in z.
5684 * Otherwise we choose the direction that provides the thinnest slab
5685 * of PME only nodes as this will have the least effect
5686 * on the PP communication.
5687 * But for the PME communication the opposite might be better.
5689 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5691 dd->nc[YY] > dd->nc[ZZ]))
5693 comm->cartpmedim = ZZ;
5697 comm->cartpmedim = YY;
5699 comm->ntot[comm->cartpmedim]
5700 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5704 fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5706 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5711 if (comm->bCartesianPP_PME)
5715 fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5718 for(i=0; i<DIM; i++)
5722 MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5725 MPI_Comm_rank(comm_cart,&rank);
5726 if (MASTERNODE(cr) && rank != 0)
5728 gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5731 /* With this assigment we loose the link to the original communicator
5732 * which will usually be MPI_COMM_WORLD, unless have multisim.
5734 cr->mpi_comm_mysim = comm_cart;
5735 cr->sim_nodeid = rank;
5737 MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5741 fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5742 cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5745 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5749 if (cr->npmenodes == 0 ||
5750 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5752 cr->duty = DUTY_PME;
5755 /* Split the sim communicator into PP and PME only nodes */
5756 MPI_Comm_split(cr->mpi_comm_mysim,
5758 dd_index(comm->ntot,dd->ci),
5759 &cr->mpi_comm_mygroup);
5763 switch (dd_node_order)
5768 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5771 case ddnoINTERLEAVE:
5772 /* Interleave the PP-only and PME-only nodes,
5773 * as on clusters with dual-core machines this will double
5774 * the communication bandwidth of the PME processes
5775 * and thus speed up the PP <-> PME and inter PME communication.
5779 fprintf(fplog,"Interleaving PP and PME nodes\n");
5781 comm->pmenodes = dd_pmenodes(cr);
5786 gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
5789 if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
5791 cr->duty = DUTY_PME;
5798 /* Split the sim communicator into PP and PME only nodes */
5799 MPI_Comm_split(cr->mpi_comm_mysim,
5802 &cr->mpi_comm_mygroup);
5803 MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
5809 fprintf(fplog,"This is a %s only node\n\n",
5810 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5814 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
5817 gmx_domdec_comm_t *comm;
5823 copy_ivec(dd->nc,comm->ntot);
5825 comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
5826 comm->bCartesianPP_PME = FALSE;
5828 /* Reorder the nodes by default. This might change the MPI ranks.
5829 * Real reordering is only supported on very few architectures,
5830 * Blue Gene is one of them.
5832 CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
5834 if (cr->npmenodes > 0)
5836 /* Split the communicator into a PP and PME part */
5837 split_communicator(fplog,cr,dd_node_order,CartReorder);
5838 if (comm->bCartesianPP_PME)
5840 /* We (possibly) reordered the nodes in split_communicator,
5841 * so it is no longer required in make_pp_communicator.
5843 CartReorder = FALSE;
5848 /* All nodes do PP and PME */
5850 /* We do not require separate communicators */
5851 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5855 if (cr->duty & DUTY_PP)
5857 /* Copy or make a new PP communicator */
5858 make_pp_communicator(fplog,cr,CartReorder);
5862 receive_ddindex2simnodeid(cr);
5865 if (!(cr->duty & DUTY_PME))
5867 /* Set up the commnuication to our PME node */
5868 dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
5869 dd->pme_receive_vir_ener = receive_vir_ener(cr);
5872 fprintf(debug,"My pme_nodeid %d receive ener %d\n",
5873 dd->pme_nodeid,dd->pme_receive_vir_ener);
5878 dd->pme_nodeid = -1;
5883 dd->ma = init_gmx_domdec_master_t(dd,
5885 comm->cgs_gl.index[comm->cgs_gl.nr]);
5889 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
5896 if (nc > 1 && size_string != NULL)
5900 fprintf(fplog,"Using static load balancing for the %s direction\n",
5905 for (i=0; i<nc; i++)
5908 sscanf(size_string,"%lf%n",&dbl,&n);
5911 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
5920 fprintf(fplog,"Relative cell sizes:");
5922 for (i=0; i<nc; i++)
5927 fprintf(fplog," %5.3f",slb_frac[i]);
5932 fprintf(fplog,"\n");
5939 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
5942 gmx_mtop_ilistloop_t iloop;
5946 iloop = gmx_mtop_ilistloop_init(mtop);
5947 while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
5949 for(ftype=0; ftype<F_NRE; ftype++)
5951 if ((interaction_function[ftype].flags & IF_BOND) &&
5954 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
5962 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
5968 val = getenv(env_var);
5971 if (sscanf(val,"%d",&nst) <= 0)
5977 fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
5985 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
5989 fprintf(stderr,"\n%s\n",warn_string);
5993 fprintf(fplog,"\n%s\n",warn_string);
5997 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
5998 t_inputrec *ir,FILE *fplog)
6000 if (ir->ePBC == epbcSCREW &&
6001 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6003 gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6006 if (ir->ns_type == ensSIMPLE)
6008 gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6011 if (ir->nstlist == 0)
6013 gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6016 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6018 dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6022 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6027 r = ddbox->box_size[XX];
6028 for(di=0; di<dd->ndim; di++)
6031 /* Check using the initial average cell size */
6032 r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6038 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6039 const char *dlb_opt,gmx_bool bRecordLoad,
6040 unsigned long Flags,t_inputrec *ir)
6048 case 'a': eDLB = edlbAUTO; break;
6049 case 'n': eDLB = edlbNO; break;
6050 case 'y': eDLB = edlbYES; break;
6051 default: gmx_incons("Unknown dlb_opt");
6054 if (Flags & MD_RERUN)
6059 if (!EI_DYNAMICS(ir->eI))
6061 if (eDLB == edlbYES)
6063 sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6064 dd_warning(cr,fplog,buf);
6072 dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6077 if (Flags & MD_REPRODUCIBLE)
6084 dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6088 dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6091 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6099 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6104 if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6106 /* Decomposition order z,y,x */
6109 fprintf(fplog,"Using domain decomposition order z, y, x\n");
6111 for(dim=DIM-1; dim>=0; dim--)
6113 if (dd->nc[dim] > 1)
6115 dd->dim[dd->ndim++] = dim;
6121 /* Decomposition order x,y,z */
6122 for(dim=0; dim<DIM; dim++)
6124 if (dd->nc[dim] > 1)
6126 dd->dim[dd->ndim++] = dim;
6132 static gmx_domdec_comm_t *init_dd_comm()
6134 gmx_domdec_comm_t *comm;
6138 snew(comm->cggl_flag,DIM*2);
6139 snew(comm->cgcm_state,DIM*2);
6140 for(i=0; i<DIM*2; i++)
6142 comm->cggl_flag_nalloc[i] = 0;
6143 comm->cgcm_state_nalloc[i] = 0;
6146 comm->nalloc_int = 0;
6147 comm->buf_int = NULL;
6149 vec_rvec_init(&comm->vbuf);
6151 comm->n_load_have = 0;
6152 comm->n_load_collect = 0;
6154 for(i=0; i<ddnatNR-ddnatZONE; i++)
6156 comm->sum_nat[i] = 0;
6160 comm->load_step = 0;
6163 clear_ivec(comm->load_lim);
6170 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6171 unsigned long Flags,
6173 real comm_distance_min,real rconstr,
6174 const char *dlb_opt,real dlb_scale,
6175 const char *sizex,const char *sizey,const char *sizez,
6176 gmx_mtop_t *mtop,t_inputrec *ir,
6179 int *npme_x,int *npme_y)
6182 gmx_domdec_comm_t *comm;
6185 real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6192 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6197 dd->comm = init_dd_comm();
6199 snew(comm->cggl_flag,DIM*2);
6200 snew(comm->cgcm_state,DIM*2);
6202 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6203 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6205 dd->bSendRecv2 = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6206 comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6207 comm->eFlop = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6208 recload = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6209 comm->nstSortCG = dd_nst_env(fplog,"GMX_DD_SORT",1);
6210 comm->nstDDDump = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6211 comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6212 comm->DD_debug = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6214 dd->pme_recv_f_alloc = 0;
6215 dd->pme_recv_f_buf = NULL;
6217 if (dd->bSendRecv2 && fplog)
6219 fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6225 fprintf(fplog,"Will load balance based on FLOP count\n");
6227 if (comm->eFlop > 1)
6229 srand(1+cr->nodeid);
6231 comm->bRecordLoad = TRUE;
6235 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6239 comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6241 comm->bDynLoadBal = (comm->eDLB == edlbYES);
6244 fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6246 dd->bGridJump = comm->bDynLoadBal;
6248 if (comm->nstSortCG)
6252 if (comm->nstSortCG == 1)
6254 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6258 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6268 fprintf(fplog,"Will not sort the charge groups\n");
6272 comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6273 if (comm->bInterCGBondeds)
6275 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6279 comm->bInterCGMultiBody = FALSE;
6282 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6284 if (ir->rlistlong == 0)
6286 /* Set the cut-off to some very large value,
6287 * so we don't need if statements everywhere in the code.
6288 * We use sqrt, since the cut-off is squared in some places.
6290 comm->cutoff = GMX_CUTOFF_INF;
6294 comm->cutoff = ir->rlistlong;
6296 comm->cutoff_mbody = 0;
6298 comm->cellsize_limit = 0;
6299 comm->bBondComm = FALSE;
6301 if (comm->bInterCGBondeds)
6303 if (comm_distance_min > 0)
6305 comm->cutoff_mbody = comm_distance_min;
6306 if (Flags & MD_DDBONDCOMM)
6308 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6312 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6314 r_bonded_limit = comm->cutoff_mbody;
6316 else if (ir->bPeriodicMols)
6318 /* Can not easily determine the required cut-off */
6319 dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6320 comm->cutoff_mbody = comm->cutoff/2;
6321 r_bonded_limit = comm->cutoff_mbody;
6327 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6328 Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6330 gmx_bcast(sizeof(r_2b),&r_2b,cr);
6331 gmx_bcast(sizeof(r_mb),&r_mb,cr);
6333 /* We use an initial margin of 10% for the minimum cell size,
6334 * except when we are just below the non-bonded cut-off.
6336 if (Flags & MD_DDBONDCOMM)
6338 if (max(r_2b,r_mb) > comm->cutoff)
6340 r_bonded = max(r_2b,r_mb);
6341 r_bonded_limit = 1.1*r_bonded;
6342 comm->bBondComm = TRUE;
6347 r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6349 /* We determine cutoff_mbody later */
6353 /* No special bonded communication,
6354 * simply increase the DD cut-off.
6356 r_bonded_limit = 1.1*max(r_2b,r_mb);
6357 comm->cutoff_mbody = r_bonded_limit;
6358 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6361 comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6365 "Minimum cell size due to bonded interactions: %.3f nm\n",
6366 comm->cellsize_limit);
6370 if (dd->bInterCGcons && rconstr <= 0)
6372 /* There is a cell size limit due to the constraints (P-LINCS) */
6373 rconstr = constr_r_max(fplog,mtop,ir);
6377 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6379 if (rconstr > comm->cellsize_limit)
6381 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6385 else if (rconstr > 0 && fplog)
6387 /* Here we do not check for dd->bInterCGcons,
6388 * because one can also set a cell size limit for virtual sites only
6389 * and at this point we don't know yet if there are intercg v-sites.
6392 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6395 comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6397 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6401 copy_ivec(nc,dd->nc);
6402 set_dd_dim(fplog,dd);
6403 set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6405 if (cr->npmenodes == -1)
6409 acs = average_cellsize_min(dd,ddbox);
6410 if (acs < comm->cellsize_limit)
6414 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6416 gmx_fatal_collective(FARGS,cr,NULL,
6417 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6418 acs,comm->cellsize_limit);
6423 set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6425 /* We need to choose the optimal DD grid and possibly PME nodes */
6426 limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6427 comm->eDLB!=edlbNO,dlb_scale,
6428 comm->cellsize_limit,comm->cutoff,
6429 comm->bInterCGBondeds,comm->bInterCGMultiBody);
6431 if (dd->nc[XX] == 0)
6433 bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6434 sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6435 !bC ? "-rdd" : "-rcon",
6436 comm->eDLB!=edlbNO ? " or -dds" : "",
6437 bC ? " or your LINCS settings" : "");
6439 gmx_fatal_collective(FARGS,cr,NULL,
6440 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6442 "Look in the log file for details on the domain decomposition",
6443 cr->nnodes-cr->npmenodes,limit,buf);
6445 set_dd_dim(fplog,dd);
6451 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6452 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6455 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6456 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6458 gmx_fatal_collective(FARGS,cr,NULL,
6459 "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6460 dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6462 if (cr->npmenodes > dd->nnodes)
6464 gmx_fatal_collective(FARGS,cr,NULL,
6465 "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6467 if (cr->npmenodes > 0)
6469 comm->npmenodes = cr->npmenodes;
6473 comm->npmenodes = dd->nnodes;
6476 if (EEL_PME(ir->coulombtype))
6478 /* The following choices should match those
6479 * in comm_cost_est in domdec_setup.c.
6480 * Note that here the checks have to take into account
6481 * that the decomposition might occur in a different order than xyz
6482 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6483 * in which case they will not match those in comm_cost_est,
6484 * but since that is mainly for testing purposes that's fine.
6486 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6487 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6488 getenv("GMX_PMEONEDD") == NULL)
6490 comm->npmedecompdim = 2;
6491 comm->npmenodes_x = dd->nc[XX];
6492 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6496 /* In case nc is 1 in both x and y we could still choose to
6497 * decompose pme in y instead of x, but we use x for simplicity.
6499 comm->npmedecompdim = 1;
6500 if (dd->dim[0] == YY)
6502 comm->npmenodes_x = 1;
6503 comm->npmenodes_y = comm->npmenodes;
6507 comm->npmenodes_x = comm->npmenodes;
6508 comm->npmenodes_y = 1;
6513 fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6514 comm->npmenodes_x,comm->npmenodes_y,1);
6519 comm->npmedecompdim = 0;
6520 comm->npmenodes_x = 0;
6521 comm->npmenodes_y = 0;
6524 /* Technically we don't need both of these,
6525 * but it simplifies code not having to recalculate it.
6527 *npme_x = comm->npmenodes_x;
6528 *npme_y = comm->npmenodes_y;
6530 snew(comm->slb_frac,DIM);
6531 if (comm->eDLB == edlbNO)
6533 comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6534 comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6535 comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6538 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6540 if (comm->bBondComm || comm->eDLB != edlbNO)
6542 /* Set the bonded communication distance to halfway
6543 * the minimum and the maximum,
6544 * since the extra communication cost is nearly zero.
6546 acs = average_cellsize_min(dd,ddbox);
6547 comm->cutoff_mbody = 0.5*(r_bonded + acs);
6548 if (comm->eDLB != edlbNO)
6550 /* Check if this does not limit the scaling */
6551 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6553 if (!comm->bBondComm)
6555 /* Without bBondComm do not go beyond the n.b. cut-off */
6556 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6557 if (comm->cellsize_limit >= comm->cutoff)
6559 /* We don't loose a lot of efficieny
6560 * when increasing it to the n.b. cut-off.
6561 * It can even be slightly faster, because we need
6562 * less checks for the communication setup.
6564 comm->cutoff_mbody = comm->cutoff;
6567 /* Check if we did not end up below our original limit */
6568 comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6570 if (comm->cutoff_mbody > comm->cellsize_limit)
6572 comm->cellsize_limit = comm->cutoff_mbody;
6575 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6580 fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6581 "cellsize limit %f\n",
6582 comm->bBondComm,comm->cellsize_limit);
6587 check_dd_restrictions(cr,dd,ir,fplog);
6590 comm->globalcomm_step = INT_MIN;
6593 clear_dd_cycle_counts(dd);
6598 static void set_dlb_limits(gmx_domdec_t *dd)
6603 for(d=0; d<dd->ndim; d++)
6605 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6606 dd->comm->cellsize_min[dd->dim[d]] =
6607 dd->comm->cellsize_min_dlb[dd->dim[d]];
6612 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6615 gmx_domdec_comm_t *comm;
6625 fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6628 cellsize_min = comm->cellsize_min[dd->dim[0]];
6629 for(d=1; d<dd->ndim; d++)
6631 cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6634 if (cellsize_min < comm->cellsize_limit*1.05)
6636 dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6638 /* Change DLB from "auto" to "no". */
6639 comm->eDLB = edlbNO;
6644 dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6645 comm->bDynLoadBal = TRUE;
6646 dd->bGridJump = TRUE;
6650 /* We can set the required cell size info here,
6651 * so we do not need to communicate this.
6652 * The grid is completely uniform.
6654 for(d=0; d<dd->ndim; d++)
6658 comm->load[d].sum_m = comm->load[d].sum;
6660 nc = dd->nc[dd->dim[d]];
6663 comm->root[d]->cell_f[i] = i/(real)nc;
6666 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6667 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6670 comm->root[d]->cell_f[nc] = 1.0;
6675 static char *init_bLocalCG(gmx_mtop_t *mtop)
6680 ncg = ncg_mtop(mtop);
6682 for(cg=0; cg<ncg; cg++)
6684 bLocalCG[cg] = FALSE;
6690 void dd_init_bondeds(FILE *fplog,
6691 gmx_domdec_t *dd,gmx_mtop_t *mtop,
6692 gmx_vsite_t *vsite,gmx_constr_t constr,
6693 t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6695 gmx_domdec_comm_t *comm;
6699 dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6703 if (comm->bBondComm)
6705 /* Communicate atoms beyond the cut-off for bonded interactions */
6708 comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6710 comm->bLocalCG = init_bLocalCG(mtop);
6714 /* Only communicate atoms based on cut-off */
6715 comm->cglink = NULL;
6716 comm->bLocalCG = NULL;
6720 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6722 gmx_bool bDynLoadBal,real dlb_scale,
6725 gmx_domdec_comm_t *comm;
6740 fprintf(fplog,"The maximum number of communication pulses is:");
6741 for(d=0; d<dd->ndim; d++)
6743 fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6745 fprintf(fplog,"\n");
6746 fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6747 fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6748 fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6749 for(d=0; d<DIM; d++)
6753 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6760 comm->cellsize_min_dlb[d]/
6761 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6763 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6766 fprintf(fplog,"\n");
6770 set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6771 fprintf(fplog,"The initial number of communication pulses is:");
6772 for(d=0; d<dd->ndim; d++)
6774 fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
6776 fprintf(fplog,"\n");
6777 fprintf(fplog,"The initial domain decomposition cell size is:");
6778 for(d=0; d<DIM; d++) {
6781 fprintf(fplog," %c %.2f nm",
6782 dim2char(d),dd->comm->cellsize_min[d]);
6785 fprintf(fplog,"\n\n");
6788 if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
6790 fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
6791 fprintf(fplog,"%40s %-7s %6.3f nm\n",
6792 "non-bonded interactions","",comm->cutoff);
6796 limit = dd->comm->cellsize_limit;
6800 if (dynamic_dd_box(ddbox,ir))
6802 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
6804 limit = dd->comm->cellsize_min[XX];
6805 for(d=1; d<DIM; d++)
6807 limit = min(limit,dd->comm->cellsize_min[d]);
6811 if (comm->bInterCGBondeds)
6813 fprintf(fplog,"%40s %-7s %6.3f nm\n",
6814 "two-body bonded interactions","(-rdd)",
6815 max(comm->cutoff,comm->cutoff_mbody));
6816 fprintf(fplog,"%40s %-7s %6.3f nm\n",
6817 "multi-body bonded interactions","(-rdd)",
6818 (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
6822 fprintf(fplog,"%40s %-7s %6.3f nm\n",
6823 "virtual site constructions","(-rcon)",limit);
6825 if (dd->constraint_comm)
6827 sprintf(buf,"atoms separated by up to %d constraints",
6829 fprintf(fplog,"%40s %-7s %6.3f nm\n",
6830 buf,"(-rcon)",limit);
6832 fprintf(fplog,"\n");
6838 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
6839 t_inputrec *ir,t_forcerec *fr,
6842 gmx_domdec_comm_t *comm;
6843 int d,dim,npulse,npulse_d_max,npulse_d;
6850 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6852 if (EEL_PME(ir->coulombtype))
6854 init_ddpme(dd,&comm->ddpme[0],0);
6855 if (comm->npmedecompdim >= 2)
6857 init_ddpme(dd,&comm->ddpme[1],1);
6862 comm->npmenodes = 0;
6863 if (dd->pme_nodeid >= 0)
6865 gmx_fatal_collective(FARGS,NULL,dd,
6866 "Can not have separate PME nodes without PME electrostatics");
6870 /* If each molecule is a single charge group
6871 * or we use domain decomposition for each periodic dimension,
6872 * we do not need to take pbc into account for the bonded interactions.
6874 if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
6875 (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY)))
6877 fr->bMolPBC = FALSE;
6886 fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
6888 if (comm->eDLB != edlbNO)
6890 /* Determine the maximum number of comm. pulses in one dimension */
6892 comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6894 /* Determine the maximum required number of grid pulses */
6895 if (comm->cellsize_limit >= comm->cutoff)
6897 /* Only a single pulse is required */
6900 else if (!bNoCutOff && comm->cellsize_limit > 0)
6902 /* We round down slightly here to avoid overhead due to the latency
6903 * of extra communication calls when the cut-off
6904 * would be only slightly longer than the cell size.
6905 * Later cellsize_limit is redetermined,
6906 * so we can not miss interactions due to this rounding.
6908 npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
6912 /* There is no cell size limit */
6913 npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
6916 if (!bNoCutOff && npulse > 1)
6918 /* See if we can do with less pulses, based on dlb_scale */
6920 for(d=0; d<dd->ndim; d++)
6923 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6924 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6925 npulse_d_max = max(npulse_d_max,npulse_d);
6927 npulse = min(npulse,npulse_d_max);
6930 /* This env var can override npulse */
6931 d = dd_nst_env(fplog,"GMX_DD_NPULSE",0);
6938 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
6939 for(d=0; d<dd->ndim; d++)
6941 comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
6942 comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6943 snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
6944 comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
6945 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6947 comm->bVacDLBNoLimit = FALSE;
6951 /* cellsize_limit is set for LINCS in init_domain_decomposition */
6952 if (!comm->bVacDLBNoLimit)
6954 comm->cellsize_limit = max(comm->cellsize_limit,
6955 comm->cutoff/comm->maxpulse);
6957 comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6958 /* Set the minimum cell size for each DD dimension */
6959 for(d=0; d<dd->ndim; d++)
6961 if (comm->bVacDLBNoLimit ||
6962 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
6964 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
6968 comm->cellsize_min_dlb[dd->dim[d]] =
6969 comm->cutoff/comm->cd[d].np_dlb;
6972 if (comm->cutoff_mbody <= 0)
6974 comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
6976 if (comm->bDynLoadBal)
6982 print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
6983 if (comm->eDLB == edlbAUTO)
6987 fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
6989 print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
6992 if (ir->ePBC == epbcNONE)
6994 vol_frac = 1 - 1/(double)dd->nnodes;
6999 (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7003 fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7005 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7007 dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7010 static void merge_cg_buffers(int ncell,
7011 gmx_domdec_comm_dim_t *cd, int pulse,
7013 int *index_gl, int *recv_i,
7014 rvec *cg_cm, rvec *recv_vr,
7016 cginfo_mb_t *cginfo_mb,int *cginfo)
7018 gmx_domdec_ind_t *ind,*ind_p;
7019 int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7022 ind = &cd->ind[pulse];
7024 /* First correct the already stored data */
7025 shift = ind->nrecv[ncell];
7026 for(cell=ncell-1; cell>=0; cell--)
7028 shift -= ind->nrecv[cell];
7031 /* Move the cg's present from previous grid pulses */
7032 cg0 = ncg_cell[ncell+cell];
7033 cg1 = ncg_cell[ncell+cell+1];
7034 cgindex[cg1+shift] = cgindex[cg1];
7035 for(cg=cg1-1; cg>=cg0; cg--)
7037 index_gl[cg+shift] = index_gl[cg];
7038 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7039 cgindex[cg+shift] = cgindex[cg];
7040 cginfo[cg+shift] = cginfo[cg];
7042 /* Correct the already stored send indices for the shift */
7043 for(p=1; p<=pulse; p++)
7045 ind_p = &cd->ind[p];
7047 for(c=0; c<cell; c++)
7049 cg0 += ind_p->nsend[c];
7051 cg1 = cg0 + ind_p->nsend[cell];
7052 for(cg=cg0; cg<cg1; cg++)
7054 ind_p->index[cg] += shift;
7060 /* Merge in the communicated buffers */
7064 for(cell=0; cell<ncell; cell++)
7066 cg1 = ncg_cell[ncell+cell+1] + shift;
7069 /* Correct the old cg indices */
7070 for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7072 cgindex[cg+1] += shift_at;
7075 for(cg=0; cg<ind->nrecv[cell]; cg++)
7077 /* Copy this charge group from the buffer */
7078 index_gl[cg1] = recv_i[cg0];
7079 copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7080 /* Add it to the cgindex */
7081 cg_gl = index_gl[cg1];
7082 cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7083 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7084 cgindex[cg1+1] = cgindex[cg1] + nat;
7089 shift += ind->nrecv[cell];
7090 ncg_cell[ncell+cell+1] = cg1;
7094 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7095 int nzone,int cg0,const int *cgindex)
7099 /* Store the atom block boundaries for easy copying of communication buffers
7102 for(zone=0; zone<nzone; zone++)
7104 for(p=0; p<cd->np; p++) {
7105 cd->ind[p].cell2at0[zone] = cgindex[cg];
7106 cg += cd->ind[p].nrecv[zone];
7107 cd->ind[p].cell2at1[zone] = cgindex[cg];
7112 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7118 for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7120 if (!bLocalCG[link->a[i]])
7129 static void setup_dd_communication(gmx_domdec_t *dd,
7130 matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr)
7132 int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot;
7133 int nzone,nzone_send,zone,zonei,cg0,cg1;
7134 int c,i,j,cg,cg_gl,nrcg;
7135 int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7136 gmx_domdec_comm_t *comm;
7137 gmx_domdec_zones_t *zones;
7138 gmx_domdec_comm_dim_t *cd;
7139 gmx_domdec_ind_t *ind;
7140 cginfo_mb_t *cginfo_mb;
7141 gmx_bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew;
7142 real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh;
7144 real corner[DIM][4],corner_round_0=0,corner_round_1[4];
7145 real bcorner[DIM],bcorner_round_1=0;
7147 rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7148 real skew_fac2_d,skew_fac_01;
7154 fprintf(debug,"Setting up DD communication\n");
7160 for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7162 dim = dd->dim[dim_ind];
7164 /* Check if we need to use triclinic distances */
7165 tric_dist[dim_ind] = 0;
7166 for(i=0; i<=dim_ind; i++)
7168 if (ddbox->tric_dir[dd->dim[i]])
7170 tric_dist[dim_ind] = 1;
7175 bBondComm = comm->bBondComm;
7177 /* Do we need to determine extra distances for multi-body bondeds? */
7178 bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7180 /* Do we need to determine extra distances for only two-body bondeds? */
7181 bDist2B = (bBondComm && !bDistMB);
7183 r_comm2 = sqr(comm->cutoff);
7184 r_bcomm2 = sqr(comm->cutoff_mbody);
7188 fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7191 zones = &comm->zones;
7194 /* The first dimension is equal for all cells */
7195 corner[0][0] = comm->cell_x0[dim0];
7198 bcorner[0] = corner[0][0];
7203 /* This cell row is only seen from the first row */
7204 corner[1][0] = comm->cell_x0[dim1];
7205 /* All rows can see this row */
7206 corner[1][1] = comm->cell_x0[dim1];
7209 corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7212 /* For the multi-body distance we need the maximum */
7213 bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7216 /* Set the upper-right corner for rounding */
7217 corner_round_0 = comm->cell_x1[dim0];
7224 corner[2][j] = comm->cell_x0[dim2];
7228 /* Use the maximum of the i-cells that see a j-cell */
7229 for(i=0; i<zones->nizone; i++)
7231 for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7237 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7243 /* For the multi-body distance we need the maximum */
7244 bcorner[2] = comm->cell_x0[dim2];
7249 bcorner[2] = max(bcorner[2],
7250 comm->zone_d2[i][j].p1_0);
7256 /* Set the upper-right corner for rounding */
7257 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7258 * Only cell (0,0,0) can see cell 7 (1,1,1)
7260 corner_round_1[0] = comm->cell_x1[dim1];
7261 corner_round_1[3] = comm->cell_x1[dim1];
7264 corner_round_1[0] = max(comm->cell_x1[dim1],
7265 comm->zone_d1[1].mch1);
7268 /* For the multi-body distance we need the maximum */
7269 bcorner_round_1 = max(comm->cell_x1[dim1],
7270 comm->zone_d1[1].p1_1);
7276 /* Triclinic stuff */
7277 normal = ddbox->normal;
7281 v_0 = ddbox->v[dim0];
7282 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7284 /* Determine the coupling coefficient for the distances
7285 * to the cell planes along dim0 and dim1 through dim2.
7286 * This is required for correct rounding.
7289 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7292 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7298 v_1 = ddbox->v[dim1];
7301 zone_cg_range = zones->cg_range;
7302 index_gl = dd->index_gl;
7303 cgindex = dd->cgindex;
7304 cginfo_mb = fr->cginfo_mb;
7306 zone_cg_range[0] = 0;
7307 zone_cg_range[1] = dd->ncg_home;
7308 comm->zone_ncg1[0] = dd->ncg_home;
7309 pos_cg = dd->ncg_home;
7311 nat_tot = dd->nat_home;
7313 for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7315 dim = dd->dim[dim_ind];
7316 cd = &comm->cd[dim_ind];
7318 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7320 /* No pbc in this dimension, the first node should not comm. */
7328 bScrew = (dd->bScrewPBC && dim == XX);
7330 v_d = ddbox->v[dim];
7331 skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7333 cd->bInPlace = TRUE;
7334 for(p=0; p<cd->np; p++)
7336 /* Only atoms communicated in the first pulse are used
7337 * for multi-body bonded interactions or for bBondComm.
7339 bDistBonded = ((bDistMB || bDist2B) && p == 0);
7340 bDistMB_pulse = (bDistMB && bDistBonded);
7345 for(zone=0; zone<nzone_send; zone++)
7347 if (tric_dist[dim_ind] && dim_ind > 0)
7349 /* Determine slightly more optimized skew_fac's
7351 * This reduces the number of communicated atoms
7352 * by about 10% for 3D DD of rhombic dodecahedra.
7354 for(dimd=0; dimd<dim; dimd++)
7356 sf2_round[dimd] = 1;
7357 if (ddbox->tric_dir[dimd])
7359 for(i=dd->dim[dimd]+1; i<DIM; i++)
7361 /* If we are shifted in dimension i
7362 * and the cell plane is tilted forward
7363 * in dimension i, skip this coupling.
7365 if (!(zones->shift[nzone+zone][i] &&
7366 ddbox->v[dimd][i][dimd] >= 0))
7369 sqr(ddbox->v[dimd][i][dimd]);
7372 sf2_round[dimd] = 1/sf2_round[dimd];
7377 zonei = zone_perm[dim_ind][zone];
7380 /* Here we permutate the zones to obtain a convenient order
7381 * for neighbor searching
7383 cg0 = zone_cg_range[zonei];
7384 cg1 = zone_cg_range[zonei+1];
7388 /* Look only at the cg's received in the previous grid pulse
7390 cg1 = zone_cg_range[nzone+zone+1];
7391 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
7393 ind->nsend[zone] = 0;
7394 for(cg=cg0; cg<cg1; cg++)
7398 if (tric_dist[dim_ind] == 0)
7400 /* Rectangular direction, easy */
7401 r = cg_cm[cg][dim] - corner[dim_ind][zone];
7408 r = cg_cm[cg][dim] - bcorner[dim_ind];
7414 /* Rounding gives at most a 16% reduction
7415 * in communicated atoms
7417 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7419 r = cg_cm[cg][dim0] - corner_round_0;
7420 /* This is the first dimension, so always r >= 0 */
7427 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7429 r = cg_cm[cg][dim1] - corner_round_1[zone];
7436 r = cg_cm[cg][dim1] - bcorner_round_1;
7446 /* Triclinic direction, more complicated */
7449 /* Rounding, conservative as the skew_fac multiplication
7450 * will slightly underestimate the distance.
7452 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7454 rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
7455 for(i=dim0+1; i<DIM; i++)
7457 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7459 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7462 rb[dim0] = rn[dim0];
7465 /* Take care that the cell planes along dim0 might not
7466 * be orthogonal to those along dim1 and dim2.
7468 for(i=1; i<=dim_ind; i++)
7471 if (normal[dim0][dimd] > 0)
7473 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7476 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7481 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7483 rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
7485 for(i=dim1+1; i<DIM; i++)
7487 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7489 rn[dim1] += tric_sh;
7492 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7493 /* Take care of coupling of the distances
7494 * to the planes along dim0 and dim1 through dim2.
7496 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7497 /* Take care that the cell planes along dim1
7498 * might not be orthogonal to that along dim2.
7500 if (normal[dim1][dim2] > 0)
7502 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7508 cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
7511 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7512 /* Take care of coupling of the distances
7513 * to the planes along dim0 and dim1 through dim2.
7515 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7516 /* Take care that the cell planes along dim1
7517 * might not be orthogonal to that along dim2.
7519 if (normal[dim1][dim2] > 0)
7521 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7526 /* The distance along the communication direction */
7527 rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
7529 for(i=dim+1; i<DIM; i++)
7531 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7536 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7537 /* Take care of coupling of the distances
7538 * to the planes along dim0 and dim1 through dim2.
7540 if (dim_ind == 1 && zonei == 1)
7542 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7548 rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
7551 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7552 /* Take care of coupling of the distances
7553 * to the planes along dim0 and dim1 through dim2.
7555 if (dim_ind == 1 && zonei == 1)
7557 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7565 ((bDistMB && rb2 < r_bcomm2) ||
7566 (bDist2B && r2 < r_bcomm2)) &&
7568 (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
7569 missing_link(comm->cglink,index_gl[cg],
7572 /* Make an index to the local charge groups */
7573 if (nsend+1 > ind->nalloc)
7575 ind->nalloc = over_alloc_large(nsend+1);
7576 srenew(ind->index,ind->nalloc);
7578 if (nsend+1 > comm->nalloc_int)
7580 comm->nalloc_int = over_alloc_large(nsend+1);
7581 srenew(comm->buf_int,comm->nalloc_int);
7583 ind->index[nsend] = cg;
7584 comm->buf_int[nsend] = index_gl[cg];
7586 vec_rvec_check_alloc(&comm->vbuf,nsend+1);
7588 if (dd->ci[dim] == 0)
7590 /* Correct cg_cm for pbc */
7591 rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]);
7594 comm->vbuf.v[nsend][YY] =
7595 box[YY][YY]-comm->vbuf.v[nsend][YY];
7596 comm->vbuf.v[nsend][ZZ] =
7597 box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
7602 copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]);
7605 nat += cgindex[cg+1] - cgindex[cg];
7609 /* Clear the counts in case we do not have pbc */
7610 for(zone=nzone_send; zone<nzone; zone++)
7612 ind->nsend[zone] = 0;
7614 ind->nsend[nzone] = nsend;
7615 ind->nsend[nzone+1] = nat;
7616 /* Communicate the number of cg's and atoms to receive */
7617 dd_sendrecv_int(dd, dim_ind, dddirBackward,
7618 ind->nsend, nzone+2,
7619 ind->nrecv, nzone+2);
7621 /* The rvec buffer is also required for atom buffers of size nsend
7622 * in dd_move_x and dd_move_f.
7624 vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
7628 /* We can receive in place if only the last zone is not empty */
7629 for(zone=0; zone<nzone-1; zone++)
7631 if (ind->nrecv[zone] > 0)
7633 cd->bInPlace = FALSE;
7638 /* The int buffer is only required here for the cg indices */
7639 if (ind->nrecv[nzone] > comm->nalloc_int2)
7641 comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
7642 srenew(comm->buf_int2,comm->nalloc_int2);
7644 /* The rvec buffer is also required for atom buffers
7645 * of size nrecv in dd_move_x and dd_move_f.
7647 i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
7648 vec_rvec_check_alloc(&comm->vbuf2,i);
7652 /* Make space for the global cg indices */
7653 if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
7654 || dd->cg_nalloc == 0)
7656 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
7657 srenew(index_gl,dd->cg_nalloc);
7658 srenew(cgindex,dd->cg_nalloc+1);
7660 /* Communicate the global cg indices */
7663 recv_i = index_gl + pos_cg;
7667 recv_i = comm->buf_int2;
7669 dd_sendrecv_int(dd, dim_ind, dddirBackward,
7670 comm->buf_int, nsend,
7671 recv_i, ind->nrecv[nzone]);
7673 /* Make space for cg_cm */
7674 if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc)
7676 dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]);
7679 /* Communicate cg_cm */
7682 recv_vr = cg_cm + pos_cg;
7686 recv_vr = comm->vbuf2.v;
7688 dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
7689 comm->vbuf.v, nsend,
7690 recv_vr, ind->nrecv[nzone]);
7692 /* Make the charge group index */
7695 zone = (p == 0 ? 0 : nzone - 1);
7696 while (zone < nzone)
7698 for(cg=0; cg<ind->nrecv[zone]; cg++)
7700 cg_gl = index_gl[pos_cg];
7701 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
7702 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
7703 cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
7706 /* Update the charge group presence,
7707 * so we can use it in the next pass of the loop.
7709 comm->bLocalCG[cg_gl] = TRUE;
7715 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
7718 zone_cg_range[nzone+zone] = pos_cg;
7723 /* This part of the code is never executed with bBondComm. */
7724 merge_cg_buffers(nzone,cd,p,zone_cg_range,
7725 index_gl,recv_i,cg_cm,recv_vr,
7726 cgindex,fr->cginfo_mb,fr->cginfo);
7727 pos_cg += ind->nrecv[nzone];
7729 nat_tot += ind->nrecv[nzone+1];
7733 /* Store the atom block for easy copying of communication buffers */
7734 make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
7738 dd->index_gl = index_gl;
7739 dd->cgindex = cgindex;
7741 dd->ncg_tot = zone_cg_range[zones->n];
7742 dd->nat_tot = nat_tot;
7743 comm->nat[ddnatHOME] = dd->nat_home;
7744 for(i=ddnatZONE; i<ddnatNR; i++)
7746 comm->nat[i] = dd->nat_tot;
7751 /* We don't need to update cginfo, since that was alrady done above.
7752 * So we pass NULL for the forcerec.
7754 dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
7755 NULL,comm->bLocalCG);
7760 fprintf(debug,"Finished setting up DD communication, zones:");
7761 for(c=0; c<zones->n; c++)
7763 fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
7765 fprintf(debug,"\n");
7769 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
7773 for(c=0; c<zones->nizone; c++)
7775 zones->izone[c].cg1 = zones->cg_range[c+1];
7776 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
7777 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
7781 static int comp_cgsort(const void *a,const void *b)
7785 gmx_cgsort_t *cga,*cgb;
7786 cga = (gmx_cgsort_t *)a;
7787 cgb = (gmx_cgsort_t *)b;
7789 comp = cga->nsc - cgb->nsc;
7792 comp = cga->ind_gl - cgb->ind_gl;
7798 static void order_int_cg(int n,gmx_cgsort_t *sort,
7803 /* Order the data */
7806 buf[i] = a[sort[i].ind];
7809 /* Copy back to the original array */
7816 static void order_vec_cg(int n,gmx_cgsort_t *sort,
7821 /* Order the data */
7824 copy_rvec(v[sort[i].ind],buf[i]);
7827 /* Copy back to the original array */
7830 copy_rvec(buf[i],v[i]);
7834 static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort,
7837 int a,atot,cg,cg0,cg1,i;
7839 /* Order the data */
7841 for(cg=0; cg<ncg; cg++)
7843 cg0 = cgindex[sort[cg].ind];
7844 cg1 = cgindex[sort[cg].ind+1];
7845 for(i=cg0; i<cg1; i++)
7847 copy_rvec(v[i],buf[a]);
7853 /* Copy back to the original array */
7854 for(a=0; a<atot; a++)
7856 copy_rvec(buf[a],v[a]);
7860 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
7861 int nsort_new,gmx_cgsort_t *sort_new,
7862 gmx_cgsort_t *sort1)
7866 /* The new indices are not very ordered, so we qsort them */
7867 qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
7869 /* sort2 is already ordered, so now we can merge the two arrays */
7873 while(i2 < nsort2 || i_new < nsort_new)
7877 sort1[i1++] = sort_new[i_new++];
7879 else if (i_new == nsort_new)
7881 sort1[i1++] = sort2[i2++];
7883 else if (sort2[i2].nsc < sort_new[i_new].nsc ||
7884 (sort2[i2].nsc == sort_new[i_new].nsc &&
7885 sort2[i2].ind_gl < sort_new[i_new].ind_gl))
7887 sort1[i1++] = sort2[i2++];
7891 sort1[i1++] = sort_new[i_new++];
7896 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
7897 rvec *cgcm,t_forcerec *fr,t_state *state,
7900 gmx_domdec_sort_t *sort;
7901 gmx_cgsort_t *cgsort,*sort_i;
7902 int ncg_new,nsort2,nsort_new,i,cell_index,*ibuf,cgsize;
7905 sort = dd->comm->sort;
7907 if (dd->ncg_home > sort->sort_nalloc)
7909 sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
7910 srenew(sort->sort1,sort->sort_nalloc);
7911 srenew(sort->sort2,sort->sort_nalloc);
7914 if (ncg_home_old >= 0)
7916 /* The charge groups that remained in the same ns grid cell
7917 * are completely ordered. So we can sort efficiently by sorting
7918 * the charge groups that did move into the stationary list.
7923 for(i=0; i<dd->ncg_home; i++)
7925 /* Check if this cg did not move to another node */
7926 cell_index = fr->ns.grid->cell_index[i];
7927 if (cell_index != 4*fr->ns.grid->ncells)
7929 if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
7931 /* This cg is new on this node or moved ns grid cell */
7932 if (nsort_new >= sort->sort_new_nalloc)
7934 sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
7935 srenew(sort->sort_new,sort->sort_new_nalloc);
7937 sort_i = &(sort->sort_new[nsort_new++]);
7941 /* This cg did not move */
7942 sort_i = &(sort->sort2[nsort2++]);
7944 /* Sort on the ns grid cell indices
7945 * and the global topology index
7947 sort_i->nsc = cell_index;
7948 sort_i->ind_gl = dd->index_gl[i];
7955 fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
7958 /* Sort efficiently */
7959 ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1);
7963 cgsort = sort->sort1;
7965 for(i=0; i<dd->ncg_home; i++)
7967 /* Sort on the ns grid cell indices
7968 * and the global topology index
7970 cgsort[i].nsc = fr->ns.grid->cell_index[i];
7971 cgsort[i].ind_gl = dd->index_gl[i];
7973 if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
7980 fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
7982 /* Determine the order of the charge groups using qsort */
7983 qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
7985 cgsort = sort->sort1;
7987 /* We alloc with the old size, since cgindex is still old */
7988 vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
7989 vbuf = dd->comm->vbuf.v;
7991 /* Remove the charge groups which are no longer at home here */
7992 dd->ncg_home = ncg_new;
7994 /* Reorder the state */
7995 for(i=0; i<estNR; i++)
7997 if (EST_DISTR(i) && (state->flags & (1<<i)))
8002 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->x,vbuf);
8005 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf);
8008 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf);
8011 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf);
8015 case estDISRE_INITF:
8016 case estDISRE_RM3TAV:
8017 case estORIRE_INITF:
8019 /* No ordering required */
8022 gmx_incons("Unknown state entry encountered in dd_sort_state");
8028 order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8030 if (dd->ncg_home+1 > sort->ibuf_nalloc)
8032 sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8033 srenew(sort->ibuf,sort->ibuf_nalloc);
8036 /* Reorder the global cg index */
8037 order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8038 /* Reorder the cginfo */
8039 order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8040 /* Rebuild the local cg index */
8042 for(i=0; i<dd->ncg_home; i++)
8044 cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8045 ibuf[i+1] = ibuf[i] + cgsize;
8047 for(i=0; i<dd->ncg_home+1; i++)
8049 dd->cgindex[i] = ibuf[i];
8051 /* Set the home atom number */
8052 dd->nat_home = dd->cgindex[dd->ncg_home];
8054 /* Copy the sorted ns cell indices back to the ns grid struct */
8055 for(i=0; i<dd->ncg_home; i++)
8057 fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8059 fr->ns.grid->nr = dd->ncg_home;
8062 static void add_dd_statistics(gmx_domdec_t *dd)
8064 gmx_domdec_comm_t *comm;
8069 for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8071 comm->sum_nat[ddnat-ddnatZONE] +=
8072 comm->nat[ddnat] - comm->nat[ddnat-1];
8077 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8079 gmx_domdec_comm_t *comm;
8084 /* Reset all the statistics and counters for total run counting */
8085 for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8087 comm->sum_nat[ddnat-ddnatZONE] = 0;
8091 comm->load_step = 0;
8094 clear_ivec(comm->load_lim);
8099 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8101 gmx_domdec_comm_t *comm;
8105 comm = cr->dd->comm;
8107 gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8114 fprintf(fplog,"\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
8116 for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8118 av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8123 " av. #atoms communicated per step for force: %d x %.1f\n",
8127 if (cr->dd->vsite_comm)
8130 " av. #atoms communicated per step for vsites: %d x %.1f\n",
8131 (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8136 if (cr->dd->constraint_comm)
8139 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
8140 1 + ir->nLincsIter,av);
8144 gmx_incons(" Unknown type for DD statistics");
8147 fprintf(fplog,"\n");
8149 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8151 print_dd_load_av(fplog,cr->dd);
8155 void dd_partition_system(FILE *fplog,
8156 gmx_large_int_t step,
8158 gmx_bool bMasterState,
8160 t_state *state_global,
8161 gmx_mtop_t *top_global,
8163 t_state *state_local,
8166 gmx_localtop_t *top_local,
8169 gmx_shellfc_t shellfc,
8170 gmx_constr_t constr,
8172 gmx_wallcycle_t wcycle,
8176 gmx_domdec_comm_t *comm;
8177 gmx_ddbox_t ddbox={0};
8179 gmx_large_int_t step_pcoupl;
8180 rvec cell_ns_x0,cell_ns_x1;
8181 int i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
8182 gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8183 gmx_bool bRedist,bSortCG,bResortAll;
8191 bBoxChanged = (bMasterState || DEFORM(*ir));
8192 if (ir->epc != epcNO)
8194 /* With nstpcouple > 1 pressure coupling happens.
8195 * one step after calculating the pressure.
8196 * Box scaling happens at the end of the MD step,
8197 * after the DD partitioning.
8198 * We therefore have to do DLB in the first partitioning
8199 * after an MD step where P-coupling occured.
8200 * We need to determine the last step in which p-coupling occurred.
8201 * MRS -- need to validate this for vv?
8206 step_pcoupl = step - 1;
8210 step_pcoupl = ((step - 1)/n)*n + 1;
8212 if (step_pcoupl >= comm->globalcomm_step)
8218 bNStGlobalComm = (step >= comm->globalcomm_step + nstglobalcomm);
8220 if (!comm->bDynLoadBal)
8226 /* Should we do dynamic load balacing this step?
8227 * Since it requires (possibly expensive) global communication,
8228 * we might want to do DLB less frequently.
8230 if (bBoxChanged || ir->epc != epcNO)
8232 bDoDLB = bBoxChanged;
8236 bDoDLB = bNStGlobalComm;
8240 /* Check if we have recorded loads on the nodes */
8241 if (comm->bRecordLoad && dd_load_count(comm))
8243 if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
8245 /* Check if we should use DLB at the second partitioning
8246 * and every 100 partitionings,
8247 * so the extra communication cost is negligible.
8249 n = max(100,nstglobalcomm);
8250 bCheckDLB = (comm->n_load_collect == 0 ||
8251 comm->n_load_have % n == n-1);
8258 /* Print load every nstlog, first and last step to the log file */
8259 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
8260 comm->n_load_collect == 0 ||
8262 (step + ir->nstlist > ir->init_step + ir->nsteps)));
8264 /* Avoid extra communication due to verbose screen output
8265 * when nstglobalcomm is set.
8267 if (bDoDLB || bLogLoad || bCheckDLB ||
8268 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
8270 get_load_distribution(dd,wcycle);
8275 dd_print_load(fplog,dd,step-1);
8279 dd_print_load_verbose(dd);
8282 comm->n_load_collect++;
8285 /* Since the timings are node dependent, the master decides */
8289 (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
8292 fprintf(debug,"step %s, imb loss %f\n",
8293 gmx_step_str(step,sbuf),
8294 dd_force_imb_perf_loss(dd));
8297 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
8300 turn_on_dlb(fplog,cr,step);
8305 comm->n_load_have++;
8308 cgs_gl = &comm->cgs_gl;
8313 /* Clear the old state */
8314 clear_dd_indices(dd,0,0);
8316 set_ddbox(dd,bMasterState,cr,ir,state_global->box,
8317 TRUE,cgs_gl,state_global->x,&ddbox);
8319 get_cg_distribution(fplog,step,dd,cgs_gl,
8320 state_global->box,&ddbox,state_global->x);
8322 dd_distribute_state(dd,cgs_gl,
8323 state_global,state_local,f);
8325 dd_make_local_cgs(dd,&top_local->cgs);
8327 if (dd->ncg_home > fr->cg_nalloc)
8329 dd_realloc_fr_cg(fr,dd->ncg_home);
8331 calc_cgcm(fplog,0,dd->ncg_home,
8332 &top_local->cgs,state_local->x,fr->cg_cm);
8334 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8336 dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8340 else if (state_local->ddp_count != dd->ddp_count)
8342 if (state_local->ddp_count > dd->ddp_count)
8344 gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
8347 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
8349 gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
8352 /* Clear the old state */
8353 clear_dd_indices(dd,0,0);
8355 /* Build the new indices */
8356 rebuild_cgindex(dd,cgs_gl->index,state_local);
8357 make_dd_indices(dd,cgs_gl->index,0);
8359 /* Redetermine the cg COMs */
8360 calc_cgcm(fplog,0,dd->ncg_home,
8361 &top_local->cgs,state_local->x,fr->cg_cm);
8363 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8365 dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8367 set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8368 TRUE,&top_local->cgs,state_local->x,&ddbox);
8370 bRedist = comm->bDynLoadBal;
8374 /* We have the full state, only redistribute the cgs */
8376 /* Clear the non-home indices */
8377 clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
8379 /* Avoid global communication for dim's without pbc and -gcom */
8380 if (!bNStGlobalComm)
8382 copy_rvec(comm->box0 ,ddbox.box0 );
8383 copy_rvec(comm->box_size,ddbox.box_size);
8385 set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8386 bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
8391 /* For dim's without pbc and -gcom */
8392 copy_rvec(ddbox.box0 ,comm->box0 );
8393 copy_rvec(ddbox.box_size,comm->box_size);
8395 set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
8398 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
8400 write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
8403 /* Check if we should sort the charge groups */
8404 if (comm->nstSortCG > 0)
8406 bSortCG = (bMasterState ||
8407 (bRedist && (step % comm->nstSortCG == 0)));
8414 ncg_home_old = dd->ncg_home;
8418 cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
8419 state_local,f,fr,mdatoms,
8423 get_nsgrid_boundaries(fr->ns.grid,dd,
8424 state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
8425 dd->ncg_home,fr->cg_cm,
8426 cell_ns_x0,cell_ns_x1,&grid_density);
8430 comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
8433 copy_ivec(fr->ns.grid->n,ncells_old);
8434 grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
8435 state_local->box,cell_ns_x0,cell_ns_x1,
8436 fr->rlistlong,grid_density);
8437 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
8438 copy_ivec(ddbox.tric_dir,comm->tric_dir);
8442 /* Sort the state on charge group position.
8443 * This enables exact restarts from this step.
8444 * It also improves performance by about 15% with larger numbers
8445 * of atoms per node.
8448 /* Fill the ns grid with the home cell,
8449 * so we can sort with the indices.
8451 set_zones_ncg_home(dd);
8452 fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
8453 0,dd->ncg_home,fr->cg_cm);
8455 /* Check if we can user the old order and ns grid cell indices
8456 * of the charge groups to sort the charge groups efficiently.
8458 bResortAll = (bMasterState ||
8459 fr->ns.grid->n[XX] != ncells_old[XX] ||
8460 fr->ns.grid->n[YY] != ncells_old[YY] ||
8461 fr->ns.grid->n[ZZ] != ncells_old[ZZ]);
8465 fprintf(debug,"Step %s, sorting the %d home charge groups\n",
8466 gmx_step_str(step,sbuf),dd->ncg_home);
8468 dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
8469 bResortAll ? -1 : ncg_home_old);
8470 /* Rebuild all the indices */
8472 ga2la_clear(dd->ga2la);
8475 /* Setup up the communication and communicate the coordinates */
8476 setup_dd_communication(dd,state_local->box,&ddbox,fr);
8478 /* Set the indices */
8479 make_dd_indices(dd,cgs_gl->index,cg0);
8481 /* Set the charge group boundaries for neighbor searching */
8482 set_cg_boundaries(&comm->zones);
8485 write_dd_pdb("dd_home",step,"dump",top_global,cr,
8486 -1,state_local->x,state_local->box);
8489 /* Extract a local topology from the global topology */
8490 for(i=0; i<dd->ndim; i++)
8492 np[dd->dim[i]] = comm->cd[i].np;
8494 dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
8495 comm->cellsize_min,np,
8496 fr,vsite,top_global,top_local);
8498 /* Set up the special atom communication */
8499 n = comm->nat[ddnatZONE];
8500 for(i=ddnatZONE+1; i<ddnatNR; i++)
8505 if (vsite && vsite->n_intercg_vsite)
8507 n = dd_make_local_vsites(dd,n,top_local->idef.il);
8511 if (dd->bInterCGcons)
8513 /* Only for inter-cg constraints we need special code */
8514 n = dd_make_local_constraints(dd,n,top_global,
8515 constr,ir->nProjOrder,
8516 &top_local->idef.il[F_CONSTR]);
8520 gmx_incons("Unknown special atom type setup");
8525 /* Make space for the extra coordinates for virtual site
8526 * or constraint communication.
8528 state_local->natoms = comm->nat[ddnatNR-1];
8529 if (state_local->natoms > state_local->nalloc)
8531 dd_realloc_state(state_local,f,state_local->natoms);
8534 if (fr->bF_NoVirSum)
8536 if (vsite && vsite->n_intercg_vsite)
8538 nat_f_novirsum = comm->nat[ddnatVSITE];
8542 if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
8544 nat_f_novirsum = dd->nat_tot;
8548 nat_f_novirsum = dd->nat_home;
8557 /* Set the number of atoms required for the force calculation.
8558 * Forces need to be constrained when using a twin-range setup
8559 * or with energy minimization. For simple simulations we could
8560 * avoid some allocation, zeroing and copying, but this is
8561 * probably not worth the complications ande checking.
8563 forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
8564 dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
8566 /* We make the all mdatoms up to nat_tot_con.
8567 * We could save some work by only setting invmass
8568 * between nat_tot and nat_tot_con.
8570 /* This call also sets the new number of home particles to dd->nat_home */
8571 atoms2md(top_global,ir,
8572 comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
8574 /* Now we have the charges we can sort the FE interactions */
8575 dd_sort_local_top(dd,mdatoms,top_local);
8579 /* Make the local shell stuff, currently no communication is done */
8580 make_local_shells(cr,mdatoms,shellfc);
8583 if (ir->implicit_solvent)
8585 make_local_gb(cr,fr->born,ir->gb_algorithm);
8588 if (!(cr->duty & DUTY_PME))
8590 /* Send the charges to our PME only node */
8591 gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
8592 mdatoms->chargeA,mdatoms->chargeB,
8593 dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
8598 set_constraints(constr,top_local,ir,mdatoms,cr);
8601 if (ir->ePull != epullNO)
8603 /* Update the local pull groups */
8604 dd_make_local_pull_groups(dd,ir->pull,mdatoms);
8609 /* Update the local rotation groups */
8610 dd_make_local_rotation_groups(dd,ir->rot);
8614 add_dd_statistics(dd);
8616 /* Make sure we only count the cycles for this DD partitioning */
8617 clear_dd_cycle_counts(dd);
8619 /* Because the order of the atoms might have changed since
8620 * the last vsite construction, we need to communicate the constructing
8621 * atom coordinates again (for spreading the forces this MD step).
8623 dd_move_x_vsites(dd,state_local->box,state_local->x);
8625 if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
8627 dd_move_x(dd,state_local->box,state_local->x);
8628 write_dd_pdb("dd_dump",step,"dump",top_global,cr,
8629 -1,state_local->x,state_local->box);
8634 /* Store the global communication step */
8635 comm->globalcomm_step = step;
8638 /* Increase the DD partitioning counter */
8640 /* The state currently matches this DD partitioning count, store it */
8641 state_local->ddp_count = dd->ddp_count;
8644 /* The DD master node knows the complete cg distribution,
8645 * store the count so we can possibly skip the cg info communication.
8647 comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
8650 if (comm->DD_debug > 0)
8652 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
8653 check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
8654 "after partitioning");