1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
4 * This file is part of Gromacs Copyright (c) 1991-2008
5 * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2
10 * of the License, or (at your option) any later version.
12 * To help us fund GROMACS development, we humbly ask that you cite
13 * the research papers on the package. Check out http://www.gromacs.org
16 * Gnomes, ROck Monsters And Chili Sauce
30 #include "gmx_fatal.h"
31 #include "gmx_fatal_collective.h"
34 #include "domdec_network.h"
37 #include "chargegroup.h"
46 #include "pull_rotation.h"
47 #include "gmx_wallcycle.h"
51 #include "mtop_util.h"
53 #include "gmx_ga2la.h"
56 #include "nbnxn_search.h"
58 #include "gmx_omp_nthreads.h"
67 #define DDRANK(dd,rank) (rank)
68 #define DDMASTERRANK(dd) (dd->masterrank)
70 typedef struct gmx_domdec_master
72 /* The cell boundaries */
74 /* The global charge group division */
75 int *ncg; /* Number of home charge groups for each node */
76 int *index; /* Index of nnodes+1 into cg */
77 int *cg; /* Global charge group index */
78 int *nat; /* Number of home atoms for each node. */
79 int *ibuf; /* Buffer for communication */
80 rvec *vbuf; /* Buffer for state scattering and gathering */
81 } gmx_domdec_master_t;
85 /* The numbers of charge groups to send and receive for each cell
86 * that requires communication, the last entry contains the total
87 * number of atoms that needs to be communicated.
89 int nsend[DD_MAXIZONE+2];
90 int nrecv[DD_MAXIZONE+2];
91 /* The charge groups to send */
94 /* The atom range for non-in-place communication */
95 int cell2at0[DD_MAXIZONE];
96 int cell2at1[DD_MAXIZONE];
101 int np; /* Number of grid pulses in this dimension */
102 int np_dlb; /* For dlb, for use with edlbAUTO */
103 gmx_domdec_ind_t *ind; /* The indices to communicate, size np */
105 gmx_bool bInPlace; /* Can we communicate in place? */
106 } gmx_domdec_comm_dim_t;
110 gmx_bool *bCellMin; /* Temp. var.: is this cell size at the limit */
111 real *cell_f; /* State var.: cell boundaries, box relative */
112 real *old_cell_f; /* Temp. var.: old cell size */
113 real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
114 real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
115 real *bound_min; /* Temp. var.: lower limit for cell boundary */
116 real *bound_max; /* Temp. var.: upper limit for cell boundary */
117 gmx_bool bLimited; /* State var.: is DLB limited in this dim and row */
118 real *buf_ncd; /* Temp. var. */
121 #define DD_NLOAD_MAX 9
123 /* Here floats are accurate enough, since these variables
124 * only influence the load balancing, not the actual MD results.
151 gmx_cgsort_t *sort_new;
163 /* This enum determines the order of the coordinates.
164 * ddnatHOME and ddnatZONE should be first and second,
165 * the others can be ordered as wanted.
167 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
169 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
170 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
174 int dim; /* The dimension */
175 gmx_bool dim_match;/* Tells if DD and PME dims match */
176 int nslab; /* The number of PME slabs in this dimension */
177 real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB */
178 int *pp_min; /* The minimum pp node location, size nslab */
179 int *pp_max; /* The maximum pp node location,size nslab */
180 int maxshift; /* The maximum shift for coordinate redistribution in PME */
185 real min0; /* The minimum bottom of this zone */
186 real max1; /* The maximum top of this zone */
187 real min1; /* The minimum top of this zone */
188 real mch0; /* The maximum bottom communicaton height for this zone */
189 real mch1; /* The maximum top communicaton height for this zone */
190 real p1_0; /* The bottom value of the first cell in this zone */
191 real p1_1; /* The top value of the first cell in this zone */
196 gmx_domdec_ind_t ind;
203 } dd_comm_setup_work_t;
205 typedef struct gmx_domdec_comm
207 /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
208 * unless stated otherwise.
211 /* The number of decomposition dimensions for PME, 0: no PME */
213 /* The number of nodes doing PME (PP/PME or only PME) */
217 /* The communication setup including the PME only nodes */
218 gmx_bool bCartesianPP_PME;
221 int *pmenodes; /* size npmenodes */
222 int *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
223 * but with bCartesianPP_PME */
224 gmx_ddpme_t ddpme[2];
226 /* The DD particle-particle nodes only */
227 gmx_bool bCartesianPP;
228 int *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
230 /* The global charge groups */
233 /* Should we sort the cgs */
235 gmx_domdec_sort_t *sort;
237 /* Are there charge groups? */
240 /* Are there bonded and multi-body interactions between charge groups? */
241 gmx_bool bInterCGBondeds;
242 gmx_bool bInterCGMultiBody;
244 /* Data for the optional bonded interaction atom communication range */
251 /* Are we actually using DLB? */
252 gmx_bool bDynLoadBal;
254 /* Cell sizes for static load balancing, first index cartesian */
257 /* The width of the communicated boundaries */
260 /* The minimum cell size (including triclinic correction) */
262 /* For dlb, for use with edlbAUTO */
263 rvec cellsize_min_dlb;
264 /* The lower limit for the DD cell size with DLB */
266 /* Effectively no NB cut-off limit with DLB for systems without PBC? */
267 gmx_bool bVacDLBNoLimit;
269 /* tric_dir is only stored here because dd_get_ns_ranges needs it */
271 /* box0 and box_size are required with dim's without pbc and -gcom */
275 /* The cell boundaries */
279 /* The old location of the cell boundaries, to check cg displacements */
283 /* The communication setup and charge group boundaries for the zones */
284 gmx_domdec_zones_t zones;
286 /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
287 * cell boundaries of neighboring cells for dynamic load balancing.
289 gmx_ddzone_t zone_d1[2];
290 gmx_ddzone_t zone_d2[2][2];
292 /* The coordinate/force communication setup and indices */
293 gmx_domdec_comm_dim_t cd[DIM];
294 /* The maximum number of cells to communicate with in one dimension */
297 /* Which cg distribution is stored on the master node */
298 int master_cg_ddp_count;
300 /* The number of cg's received from the direct neighbors */
301 int zone_ncg1[DD_MAXZONE];
303 /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
306 /* Array for signalling if atoms have moved to another domain */
310 /* Communication buffer for general use */
314 /* Communication buffer for general use */
317 /* Temporary storage for thread parallel communication setup */
319 dd_comm_setup_work_t *dth;
321 /* Communication buffers only used with multiple grid pulses */
326 /* Communication buffers for local redistribution */
328 int cggl_flag_nalloc[DIM*2];
330 int cgcm_state_nalloc[DIM*2];
332 /* Cell sizes for dynamic load balancing */
333 gmx_domdec_root_t **root;
337 real cell_f_max0[DIM];
338 real cell_f_min1[DIM];
340 /* Stuff for load communication */
341 gmx_bool bRecordLoad;
342 gmx_domdec_load_t *load;
344 MPI_Comm *mpi_comm_load;
347 /* Maximum DLB scaling per load balancing step in percent */
351 float cycl[ddCyclNr];
352 int cycl_n[ddCyclNr];
353 float cycl_max[ddCyclNr];
354 /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
358 /* Have often have did we have load measurements */
360 /* Have often have we collected the load measurements */
364 double sum_nat[ddnatNR-ddnatZONE];
374 /* The last partition step */
375 gmx_large_int_t partition_step;
383 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
386 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
387 #define DD_FLAG_NRCG 65535
388 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
389 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
391 /* Zone permutation required to obtain consecutive charge groups
392 * for neighbor searching.
394 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
396 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
397 * components see only j zones with that component 0.
400 /* The DD zone order */
401 static const ivec dd_zo[DD_MAXZONE] =
402 {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
407 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
412 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
417 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
419 /* Factors used to avoid problems due to rounding issues */
420 #define DD_CELL_MARGIN 1.0001
421 #define DD_CELL_MARGIN2 1.00005
422 /* Factor to account for pressure scaling during nstlist steps */
423 #define DD_PRES_SCALE_MARGIN 1.02
425 /* Allowed performance loss before we DLB or warn */
426 #define DD_PERF_LOSS 0.05
428 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
430 /* Use separate MPI send and receive commands
431 * when nnodes <= GMX_DD_NNODES_SENDRECV.
432 * This saves memory (and some copying for small nnodes).
433 * For high parallelization scatter and gather calls are used.
435 #define GMX_DD_NNODES_SENDRECV 4
439 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
441 static void index2xyz(ivec nc,int ind,ivec xyz)
443 xyz[XX] = ind % nc[XX];
444 xyz[YY] = (ind / nc[XX]) % nc[YY];
445 xyz[ZZ] = ind / (nc[YY]*nc[XX]);
449 /* This order is required to minimize the coordinate communication in PME
450 * which uses decomposition in the x direction.
452 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
454 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
456 xyz[XX] = ind / (nc[YY]*nc[ZZ]);
457 xyz[YY] = (ind / nc[ZZ]) % nc[YY];
458 xyz[ZZ] = ind % nc[ZZ];
461 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
466 ddindex = dd_index(dd->nc,c);
467 if (dd->comm->bCartesianPP_PME)
469 ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
471 else if (dd->comm->bCartesianPP)
474 MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
485 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
487 return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
490 int ddglatnr(gmx_domdec_t *dd,int i)
500 if (i >= dd->comm->nat[ddnatNR-1])
502 gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
504 atnr = dd->gatindex[i] + 1;
510 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
512 return &dd->comm->cgs_gl;
515 static void vec_rvec_init(vec_rvec_t *v)
521 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
525 v->nalloc = over_alloc_dd(n);
526 srenew(v->v,v->nalloc);
530 void dd_store_state(gmx_domdec_t *dd,t_state *state)
534 if (state->ddp_count != dd->ddp_count)
536 gmx_incons("The state does not the domain decomposition state");
539 state->ncg_gl = dd->ncg_home;
540 if (state->ncg_gl > state->cg_gl_nalloc)
542 state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
543 srenew(state->cg_gl,state->cg_gl_nalloc);
545 for(i=0; i<state->ncg_gl; i++)
547 state->cg_gl[i] = dd->index_gl[i];
550 state->ddp_count_cg_gl = dd->ddp_count;
553 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
555 return &dd->comm->zones;
558 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
559 int *jcg0,int *jcg1,ivec shift0,ivec shift1)
561 gmx_domdec_zones_t *zones;
564 zones = &dd->comm->zones;
567 while (icg >= zones->izone[izone].cg1)
576 else if (izone < zones->nizone)
578 *jcg0 = zones->izone[izone].jcg0;
582 gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
583 icg,izone,zones->nizone);
586 *jcg1 = zones->izone[izone].jcg1;
588 for(d=0; d<dd->ndim; d++)
591 shift0[dim] = zones->izone[izone].shift0[dim];
592 shift1[dim] = zones->izone[izone].shift1[dim];
593 if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
595 /* A conservative approach, this can be optimized */
602 int dd_natoms_vsite(gmx_domdec_t *dd)
604 return dd->comm->nat[ddnatVSITE];
607 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
609 *at_start = dd->comm->nat[ddnatCON-1];
610 *at_end = dd->comm->nat[ddnatCON];
613 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
615 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
617 gmx_domdec_comm_t *comm;
618 gmx_domdec_comm_dim_t *cd;
619 gmx_domdec_ind_t *ind;
620 rvec shift={0,0,0},*buf,*rbuf;
621 gmx_bool bPBC,bScrew;
625 cgindex = dd->cgindex;
630 nat_tot = dd->nat_home;
631 for(d=0; d<dd->ndim; d++)
633 bPBC = (dd->ci[dd->dim[d]] == 0);
634 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
637 copy_rvec(box[dd->dim[d]],shift);
640 for(p=0; p<cd->np; p++)
647 for(i=0; i<ind->nsend[nzone]; i++)
649 at0 = cgindex[index[i]];
650 at1 = cgindex[index[i]+1];
651 for(j=at0; j<at1; j++)
653 copy_rvec(x[j],buf[n]);
660 for(i=0; i<ind->nsend[nzone]; i++)
662 at0 = cgindex[index[i]];
663 at1 = cgindex[index[i]+1];
664 for(j=at0; j<at1; j++)
666 /* We need to shift the coordinates */
667 rvec_add(x[j],shift,buf[n]);
674 for(i=0; i<ind->nsend[nzone]; i++)
676 at0 = cgindex[index[i]];
677 at1 = cgindex[index[i]+1];
678 for(j=at0; j<at1; j++)
681 buf[n][XX] = x[j][XX] + shift[XX];
683 * This operation requires a special shift force
684 * treatment, which is performed in calc_vir.
686 buf[n][YY] = box[YY][YY] - x[j][YY];
687 buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
699 rbuf = comm->vbuf2.v;
701 /* Send and receive the coordinates */
702 dd_sendrecv_rvec(dd, d, dddirBackward,
703 buf, ind->nsend[nzone+1],
704 rbuf, ind->nrecv[nzone+1]);
708 for(zone=0; zone<nzone; zone++)
710 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
712 copy_rvec(rbuf[j],x[i]);
717 nat_tot += ind->nrecv[nzone+1];
723 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
725 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
727 gmx_domdec_comm_t *comm;
728 gmx_domdec_comm_dim_t *cd;
729 gmx_domdec_ind_t *ind;
733 gmx_bool bPBC,bScrew;
737 cgindex = dd->cgindex;
742 nzone = comm->zones.n/2;
743 nat_tot = dd->nat_tot;
744 for(d=dd->ndim-1; d>=0; d--)
746 bPBC = (dd->ci[dd->dim[d]] == 0);
747 bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
748 if (fshift == NULL && !bScrew)
752 /* Determine which shift vector we need */
758 for(p=cd->np-1; p>=0; p--) {
760 nat_tot -= ind->nrecv[nzone+1];
767 sbuf = comm->vbuf2.v;
769 for(zone=0; zone<nzone; zone++)
771 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
773 copy_rvec(f[i],sbuf[j]);
778 /* Communicate the forces */
779 dd_sendrecv_rvec(dd, d, dddirForward,
780 sbuf, ind->nrecv[nzone+1],
781 buf, ind->nsend[nzone+1]);
783 /* Add the received forces */
787 for(i=0; i<ind->nsend[nzone]; i++)
789 at0 = cgindex[index[i]];
790 at1 = cgindex[index[i]+1];
791 for(j=at0; j<at1; j++)
793 rvec_inc(f[j],buf[n]);
800 for(i=0; i<ind->nsend[nzone]; i++)
802 at0 = cgindex[index[i]];
803 at1 = cgindex[index[i]+1];
804 for(j=at0; j<at1; j++)
806 rvec_inc(f[j],buf[n]);
807 /* Add this force to the shift force */
808 rvec_inc(fshift[is],buf[n]);
815 for(i=0; i<ind->nsend[nzone]; i++)
817 at0 = cgindex[index[i]];
818 at1 = cgindex[index[i]+1];
819 for(j=at0; j<at1; j++)
821 /* Rotate the force */
822 f[j][XX] += buf[n][XX];
823 f[j][YY] -= buf[n][YY];
824 f[j][ZZ] -= buf[n][ZZ];
827 /* Add this force to the shift force */
828 rvec_inc(fshift[is],buf[n]);
839 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
841 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
843 gmx_domdec_comm_t *comm;
844 gmx_domdec_comm_dim_t *cd;
845 gmx_domdec_ind_t *ind;
850 cgindex = dd->cgindex;
852 buf = &comm->vbuf.v[0][0];
855 nat_tot = dd->nat_home;
856 for(d=0; d<dd->ndim; d++)
859 for(p=0; p<cd->np; p++)
864 for(i=0; i<ind->nsend[nzone]; i++)
866 at0 = cgindex[index[i]];
867 at1 = cgindex[index[i]+1];
868 for(j=at0; j<at1; j++)
881 rbuf = &comm->vbuf2.v[0][0];
883 /* Send and receive the coordinates */
884 dd_sendrecv_real(dd, d, dddirBackward,
885 buf, ind->nsend[nzone+1],
886 rbuf, ind->nrecv[nzone+1]);
890 for(zone=0; zone<nzone; zone++)
892 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
899 nat_tot += ind->nrecv[nzone+1];
905 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
907 int nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
909 gmx_domdec_comm_t *comm;
910 gmx_domdec_comm_dim_t *cd;
911 gmx_domdec_ind_t *ind;
916 cgindex = dd->cgindex;
918 buf = &comm->vbuf.v[0][0];
921 nzone = comm->zones.n/2;
922 nat_tot = dd->nat_tot;
923 for(d=dd->ndim-1; d>=0; d--)
926 for(p=cd->np-1; p>=0; p--) {
928 nat_tot -= ind->nrecv[nzone+1];
935 sbuf = &comm->vbuf2.v[0][0];
937 for(zone=0; zone<nzone; zone++)
939 for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
946 /* Communicate the forces */
947 dd_sendrecv_real(dd, d, dddirForward,
948 sbuf, ind->nrecv[nzone+1],
949 buf, ind->nsend[nzone+1]);
951 /* Add the received forces */
953 for(i=0; i<ind->nsend[nzone]; i++)
955 at0 = cgindex[index[i]];
956 at1 = cgindex[index[i]+1];
957 for(j=at0; j<at1; j++)
968 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
970 fprintf(fp,"zone d0 %d d1 %d d2 %d min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
972 zone->min0,zone->max1,
973 zone->mch0,zone->mch0,
974 zone->p1_0,zone->p1_1);
978 #define DDZONECOMM_MAXZONE 5
979 #define DDZONECOMM_BUFSIZE 3
981 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
982 int ddimind,int direction,
983 gmx_ddzone_t *buf_s,int n_s,
984 gmx_ddzone_t *buf_r,int n_r)
986 #define ZBS DDZONECOMM_BUFSIZE
987 rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
988 rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
993 vbuf_s[i*ZBS ][0] = buf_s[i].min0;
994 vbuf_s[i*ZBS ][1] = buf_s[i].max1;
995 vbuf_s[i*ZBS ][2] = buf_s[i].min1;
996 vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
997 vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
998 vbuf_s[i*ZBS+1][2] = 0;
999 vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1000 vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1001 vbuf_s[i*ZBS+2][2] = 0;
1004 dd_sendrecv_rvec(dd, ddimind, direction,
1008 for(i=0; i<n_r; i++)
1010 buf_r[i].min0 = vbuf_r[i*ZBS ][0];
1011 buf_r[i].max1 = vbuf_r[i*ZBS ][1];
1012 buf_r[i].min1 = vbuf_r[i*ZBS ][2];
1013 buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1014 buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1015 buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1016 buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1022 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
1023 rvec cell_ns_x0,rvec cell_ns_x1)
1025 int d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
1027 gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
1028 gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
1029 gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
1030 rvec extr_s[2],extr_r[2];
1032 real dist_d,c=0,det;
1033 gmx_domdec_comm_t *comm;
1038 for(d=1; d<dd->ndim; d++)
1041 zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1042 zp->min0 = cell_ns_x0[dim];
1043 zp->max1 = cell_ns_x1[dim];
1044 zp->min1 = cell_ns_x1[dim];
1045 zp->mch0 = cell_ns_x0[dim];
1046 zp->mch1 = cell_ns_x1[dim];
1047 zp->p1_0 = cell_ns_x0[dim];
1048 zp->p1_1 = cell_ns_x1[dim];
1051 for(d=dd->ndim-2; d>=0; d--)
1054 bPBC = (dim < ddbox->npbcdim);
1056 /* Use an rvec to store two reals */
1057 extr_s[d][0] = comm->cell_f0[d+1];
1058 extr_s[d][1] = comm->cell_f1[d+1];
1059 extr_s[d][2] = comm->cell_f1[d+1];
1062 /* Store the extremes in the backward sending buffer,
1063 * so the get updated separately from the forward communication.
1065 for(d1=d; d1<dd->ndim-1; d1++)
1067 /* We invert the order to be able to use the same loop for buf_e */
1068 buf_s[pos].min0 = extr_s[d1][1];
1069 buf_s[pos].max1 = extr_s[d1][0];
1070 buf_s[pos].min1 = extr_s[d1][2];
1071 buf_s[pos].mch0 = 0;
1072 buf_s[pos].mch1 = 0;
1073 /* Store the cell corner of the dimension we communicate along */
1074 buf_s[pos].p1_0 = comm->cell_x0[dim];
1075 buf_s[pos].p1_1 = 0;
1079 buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1082 if (dd->ndim == 3 && d == 0)
1084 buf_s[pos] = comm->zone_d2[0][1];
1086 buf_s[pos] = comm->zone_d1[0];
1090 /* We only need to communicate the extremes
1091 * in the forward direction
1093 npulse = comm->cd[d].np;
1096 /* Take the minimum to avoid double communication */
1097 npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1101 /* Without PBC we should really not communicate over
1102 * the boundaries, but implementing that complicates
1103 * the communication setup and therefore we simply
1104 * do all communication, but ignore some data.
1106 npulse_min = npulse;
1108 for(p=0; p<npulse_min; p++)
1110 /* Communicate the extremes forward */
1111 bUse = (bPBC || dd->ci[dim] > 0);
1113 dd_sendrecv_rvec(dd, d, dddirForward,
1114 extr_s+d, dd->ndim-d-1,
1115 extr_r+d, dd->ndim-d-1);
1119 for(d1=d; d1<dd->ndim-1; d1++)
1121 extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1122 extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1123 extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]);
1129 for(p=0; p<npulse; p++)
1131 /* Communicate all the zone information backward */
1132 bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1134 dd_sendrecv_ddzone(dd, d, dddirBackward,
1141 for(d1=d+1; d1<dd->ndim; d1++)
1143 /* Determine the decrease of maximum required
1144 * communication height along d1 due to the distance along d,
1145 * this avoids a lot of useless atom communication.
1147 dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1149 if (ddbox->tric_dir[dim])
1151 /* c is the off-diagonal coupling between the cell planes
1152 * along directions d and d1.
1154 c = ddbox->v[dim][dd->dim[d1]][dim];
1160 det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1163 dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1167 /* A negative value signals out of range */
1173 /* Accumulate the extremes over all pulses */
1174 for(i=0; i<buf_size; i++)
1178 buf_e[i] = buf_r[i];
1184 buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1185 buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1186 buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1);
1189 if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1197 if (bUse && dh[d1] >= 0)
1199 buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1200 buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1203 /* Copy the received buffer to the send buffer,
1204 * to pass the data through with the next pulse.
1206 buf_s[i] = buf_r[i];
1208 if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1209 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1211 /* Store the extremes */
1214 for(d1=d; d1<dd->ndim-1; d1++)
1216 extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1217 extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1218 extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1);
1222 if (d == 1 || (d == 0 && dd->ndim == 3))
1226 comm->zone_d2[1-d][i] = buf_e[pos];
1232 comm->zone_d1[1] = buf_e[pos];
1246 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1248 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1249 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1261 print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1263 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1264 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1268 for(d=1; d<dd->ndim; d++)
1270 comm->cell_f_max0[d] = extr_s[d-1][0];
1271 comm->cell_f_min1[d] = extr_s[d-1][1];
1274 fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1275 d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1280 static void dd_collect_cg(gmx_domdec_t *dd,
1281 t_state *state_local)
1283 gmx_domdec_master_t *ma=NULL;
1284 int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1287 if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1289 /* The master has the correct distribution */
1293 if (state_local->ddp_count == dd->ddp_count)
1295 ncg_home = dd->ncg_home;
1297 nat_home = dd->nat_home;
1299 else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1301 cgs_gl = &dd->comm->cgs_gl;
1303 ncg_home = state_local->ncg_gl;
1304 cg = state_local->cg_gl;
1306 for(i=0; i<ncg_home; i++)
1308 nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1313 gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1316 buf2[0] = dd->ncg_home;
1317 buf2[1] = dd->nat_home;
1327 /* Collect the charge group and atom counts on the master */
1328 dd_gather(dd,2*sizeof(int),buf2,ibuf);
1333 for(i=0; i<dd->nnodes; i++)
1335 ma->ncg[i] = ma->ibuf[2*i];
1336 ma->nat[i] = ma->ibuf[2*i+1];
1337 ma->index[i+1] = ma->index[i] + ma->ncg[i];
1340 /* Make byte counts and indices */
1341 for(i=0; i<dd->nnodes; i++)
1343 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1344 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1348 fprintf(debug,"Initial charge group distribution: ");
1349 for(i=0; i<dd->nnodes; i++)
1350 fprintf(debug," %d",ma->ncg[i]);
1351 fprintf(debug,"\n");
1355 /* Collect the charge group indices on the master */
1357 dd->ncg_home*sizeof(int),dd->index_gl,
1358 DDMASTER(dd) ? ma->ibuf : NULL,
1359 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1360 DDMASTER(dd) ? ma->cg : NULL);
1362 dd->comm->master_cg_ddp_count = state_local->ddp_count;
1365 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1368 gmx_domdec_master_t *ma;
1369 int n,i,c,a,nalloc=0;
1378 MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1379 dd->rank,dd->mpi_comm_all);
1382 /* Copy the master coordinates to the global array */
1383 cgs_gl = &dd->comm->cgs_gl;
1385 n = DDMASTERRANK(dd);
1387 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1389 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1391 copy_rvec(lv[a++],v[c]);
1395 for(n=0; n<dd->nnodes; n++)
1399 if (ma->nat[n] > nalloc)
1401 nalloc = over_alloc_dd(ma->nat[n]);
1405 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1406 n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1409 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1411 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1413 copy_rvec(buf[a++],v[c]);
1422 static void get_commbuffer_counts(gmx_domdec_t *dd,
1423 int **counts,int **disps)
1425 gmx_domdec_master_t *ma;
1430 /* Make the rvec count and displacment arrays */
1432 *disps = ma->ibuf + dd->nnodes;
1433 for(n=0; n<dd->nnodes; n++)
1435 (*counts)[n] = ma->nat[n]*sizeof(rvec);
1436 (*disps)[n] = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1440 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1443 gmx_domdec_master_t *ma;
1444 int *rcounts=NULL,*disps=NULL;
1453 get_commbuffer_counts(dd,&rcounts,&disps);
1458 dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1462 cgs_gl = &dd->comm->cgs_gl;
1465 for(n=0; n<dd->nnodes; n++)
1467 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1469 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1471 copy_rvec(buf[a++],v[c]);
1478 void dd_collect_vec(gmx_domdec_t *dd,
1479 t_state *state_local,rvec *lv,rvec *v)
1481 gmx_domdec_master_t *ma;
1482 int n,i,c,a,nalloc=0;
1485 dd_collect_cg(dd,state_local);
1487 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1489 dd_collect_vec_sendrecv(dd,lv,v);
1493 dd_collect_vec_gatherv(dd,lv,v);
1498 void dd_collect_state(gmx_domdec_t *dd,
1499 t_state *state_local,t_state *state)
1503 nh = state->nhchainlength;
1507 for (i=0;i<efptNR;i++) {
1508 state->lambda[i] = state_local->lambda[i];
1510 state->fep_state = state_local->fep_state;
1511 state->veta = state_local->veta;
1512 state->vol0 = state_local->vol0;
1513 copy_mat(state_local->box,state->box);
1514 copy_mat(state_local->boxv,state->boxv);
1515 copy_mat(state_local->svir_prev,state->svir_prev);
1516 copy_mat(state_local->fvir_prev,state->fvir_prev);
1517 copy_mat(state_local->pres_prev,state->pres_prev);
1520 for(i=0; i<state_local->ngtc; i++)
1522 for(j=0; j<nh; j++) {
1523 state->nosehoover_xi[i*nh+j] = state_local->nosehoover_xi[i*nh+j];
1524 state->nosehoover_vxi[i*nh+j] = state_local->nosehoover_vxi[i*nh+j];
1526 state->therm_integral[i] = state_local->therm_integral[i];
1528 for(i=0; i<state_local->nnhpres; i++)
1530 for(j=0; j<nh; j++) {
1531 state->nhpres_xi[i*nh+j] = state_local->nhpres_xi[i*nh+j];
1532 state->nhpres_vxi[i*nh+j] = state_local->nhpres_vxi[i*nh+j];
1536 for(est=0; est<estNR; est++)
1538 if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1542 dd_collect_vec(dd,state_local,state_local->x,state->x);
1545 dd_collect_vec(dd,state_local,state_local->v,state->v);
1548 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1551 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1554 if (state->nrngi == 1)
1558 for(i=0; i<state_local->nrng; i++)
1560 state->ld_rng[i] = state_local->ld_rng[i];
1566 dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1567 state_local->ld_rng,state->ld_rng);
1571 if (state->nrngi == 1)
1575 state->ld_rngi[0] = state_local->ld_rngi[0];
1580 dd_gather(dd,sizeof(state->ld_rngi[0]),
1581 state_local->ld_rngi,state->ld_rngi);
1584 case estDISRE_INITF:
1585 case estDISRE_RM3TAV:
1586 case estORIRE_INITF:
1590 gmx_incons("Unknown state entry encountered in dd_collect_state");
1596 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1602 fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1605 state->nalloc = over_alloc_dd(nalloc);
1607 for(est=0; est<estNR; est++)
1609 if (EST_DISTR(est) && (state->flags & (1<<est)))
1613 srenew(state->x,state->nalloc);
1616 srenew(state->v,state->nalloc);
1619 srenew(state->sd_X,state->nalloc);
1622 srenew(state->cg_p,state->nalloc);
1626 case estDISRE_INITF:
1627 case estDISRE_RM3TAV:
1628 case estORIRE_INITF:
1630 /* No reallocation required */
1633 gmx_incons("Unknown state entry encountered in dd_realloc_state");
1640 srenew(*f,state->nalloc);
1644 static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
1647 if (nalloc > fr->cg_nalloc)
1651 fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1653 fr->cg_nalloc = over_alloc_dd(nalloc);
1654 srenew(fr->cginfo,fr->cg_nalloc);
1655 if (fr->cutoff_scheme == ecutsGROUP)
1657 srenew(fr->cg_cm,fr->cg_nalloc);
1660 if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1662 /* We don't use charge groups, we use x in state to set up
1663 * the atom communication.
1665 dd_realloc_state(state,f,nalloc);
1669 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1672 gmx_domdec_master_t *ma;
1673 int n,i,c,a,nalloc=0;
1680 for(n=0; n<dd->nnodes; n++)
1684 if (ma->nat[n] > nalloc)
1686 nalloc = over_alloc_dd(ma->nat[n]);
1689 /* Use lv as a temporary buffer */
1691 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1693 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1695 copy_rvec(v[c],buf[a++]);
1698 if (a != ma->nat[n])
1700 gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1705 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1706 DDRANK(dd,n),n,dd->mpi_comm_all);
1711 n = DDMASTERRANK(dd);
1713 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1715 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1717 copy_rvec(v[c],lv[a++]);
1724 MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1725 MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1730 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1733 gmx_domdec_master_t *ma;
1734 int *scounts=NULL,*disps=NULL;
1735 int n,i,c,a,nalloc=0;
1742 get_commbuffer_counts(dd,&scounts,&disps);
1746 for(n=0; n<dd->nnodes; n++)
1748 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1750 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1752 copy_rvec(v[c],buf[a++]);
1758 dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1761 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1763 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1765 dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1769 dd_distribute_vec_scatterv(dd,cgs,v,lv);
1773 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1774 t_state *state,t_state *state_local,
1779 nh = state->nhchainlength;
1783 for(i=0;i<efptNR;i++)
1785 state_local->lambda[i] = state->lambda[i];
1787 state_local->fep_state = state->fep_state;
1788 state_local->veta = state->veta;
1789 state_local->vol0 = state->vol0;
1790 copy_mat(state->box,state_local->box);
1791 copy_mat(state->box_rel,state_local->box_rel);
1792 copy_mat(state->boxv,state_local->boxv);
1793 copy_mat(state->svir_prev,state_local->svir_prev);
1794 copy_mat(state->fvir_prev,state_local->fvir_prev);
1795 for(i=0; i<state_local->ngtc; i++)
1797 for(j=0; j<nh; j++) {
1798 state_local->nosehoover_xi[i*nh+j] = state->nosehoover_xi[i*nh+j];
1799 state_local->nosehoover_vxi[i*nh+j] = state->nosehoover_vxi[i*nh+j];
1801 state_local->therm_integral[i] = state->therm_integral[i];
1803 for(i=0; i<state_local->nnhpres; i++)
1805 for(j=0; j<nh; j++) {
1806 state_local->nhpres_xi[i*nh+j] = state->nhpres_xi[i*nh+j];
1807 state_local->nhpres_vxi[i*nh+j] = state->nhpres_vxi[i*nh+j];
1811 dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
1812 dd_bcast(dd,sizeof(int),&state_local->fep_state);
1813 dd_bcast(dd,sizeof(real),&state_local->veta);
1814 dd_bcast(dd,sizeof(real),&state_local->vol0);
1815 dd_bcast(dd,sizeof(state_local->box),state_local->box);
1816 dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1817 dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1818 dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1819 dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1820 dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1821 dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1822 dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1823 dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1824 dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1826 if (dd->nat_home > state_local->nalloc)
1828 dd_realloc_state(state_local,f,dd->nat_home);
1830 for(i=0; i<estNR; i++)
1832 if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1836 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1839 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1842 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1845 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1848 if (state->nrngi == 1)
1851 state_local->nrng*sizeof(state_local->ld_rng[0]),
1852 state->ld_rng,state_local->ld_rng);
1857 state_local->nrng*sizeof(state_local->ld_rng[0]),
1858 state->ld_rng,state_local->ld_rng);
1862 if (state->nrngi == 1)
1864 dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1865 state->ld_rngi,state_local->ld_rngi);
1869 dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1870 state->ld_rngi,state_local->ld_rngi);
1873 case estDISRE_INITF:
1874 case estDISRE_RM3TAV:
1875 case estORIRE_INITF:
1877 /* Not implemented yet */
1880 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1886 static char dim2char(int dim)
1892 case XX: c = 'X'; break;
1893 case YY: c = 'Y'; break;
1894 case ZZ: c = 'Z'; break;
1895 default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1901 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1902 gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1904 rvec grid_s[2],*grid_r=NULL,cx,r;
1905 char fname[STRLEN],format[STRLEN],buf[22];
1911 copy_rvec(dd->comm->cell_x0,grid_s[0]);
1912 copy_rvec(dd->comm->cell_x1,grid_s[1]);
1916 snew(grid_r,2*dd->nnodes);
1919 dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1923 for(d=0; d<DIM; d++)
1925 for(i=0; i<DIM; i++)
1933 if (d < ddbox->npbcdim && dd->nc[d] > 1)
1935 tric[d][i] = box[i][d]/box[i][i];
1944 sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1945 sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
1946 out = gmx_fio_fopen(fname,"w");
1947 gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1949 for(i=0; i<dd->nnodes; i++)
1951 vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1952 for(d=0; d<DIM; d++)
1954 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1962 cx[XX] = grid_r[i*2+x][XX];
1963 cx[YY] = grid_r[i*2+y][YY];
1964 cx[ZZ] = grid_r[i*2+z][ZZ];
1966 fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1967 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1971 for(d=0; d<DIM; d++)
1977 case 0: y = 1 + i*8 + 2*x; break;
1978 case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1979 case 2: y = 1 + i*8 + x; break;
1981 fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1985 gmx_fio_fclose(out);
1990 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1991 gmx_mtop_t *mtop,t_commrec *cr,
1992 int natoms,rvec x[],matrix box)
1994 char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1997 char *atomname,*resname;
2004 natoms = dd->comm->nat[ddnatVSITE];
2007 sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
2009 sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
2010 sprintf(format4,"%s%s\n",get_pdbformat4(),"%6.2f%6.2f");
2012 out = gmx_fio_fopen(fname,"w");
2014 fprintf(out,"TITLE %s\n",title);
2015 gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
2016 for(i=0; i<natoms; i++)
2018 ii = dd->gatindex[i];
2019 gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
2020 if (i < dd->comm->nat[ddnatZONE])
2023 while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2029 else if (i < dd->comm->nat[ddnatVSITE])
2031 b = dd->comm->zones.n;
2035 b = dd->comm->zones.n + 1;
2037 fprintf(out,strlen(atomname)<4 ? format : format4,
2038 "ATOM",(ii+1)%100000,
2039 atomname,resname,' ',resnr%10000,' ',
2040 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
2042 fprintf(out,"TER\n");
2044 gmx_fio_fclose(out);
2047 real dd_cutoff_mbody(gmx_domdec_t *dd)
2049 gmx_domdec_comm_t *comm;
2056 if (comm->bInterCGBondeds)
2058 if (comm->cutoff_mbody > 0)
2060 r = comm->cutoff_mbody;
2064 /* cutoff_mbody=0 means we do not have DLB */
2065 r = comm->cellsize_min[dd->dim[0]];
2066 for(di=1; di<dd->ndim; di++)
2068 r = min(r,comm->cellsize_min[dd->dim[di]]);
2070 if (comm->bBondComm)
2072 r = max(r,comm->cutoff_mbody);
2076 r = min(r,comm->cutoff);
2084 real dd_cutoff_twobody(gmx_domdec_t *dd)
2088 r_mb = dd_cutoff_mbody(dd);
2090 return max(dd->comm->cutoff,r_mb);
2094 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2098 nc = dd->nc[dd->comm->cartpmedim];
2099 ntot = dd->comm->ntot[dd->comm->cartpmedim];
2100 copy_ivec(coord,coord_pme);
2101 coord_pme[dd->comm->cartpmedim] =
2102 nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2105 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2107 /* Here we assign a PME node to communicate with this DD node
2108 * by assuming that the major index of both is x.
2109 * We add cr->npmenodes/2 to obtain an even distribution.
2111 return (ddindex*npme + npme/2)/ndd;
2114 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2116 return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2119 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2121 return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2124 static int *dd_pmenodes(t_commrec *cr)
2129 snew(pmenodes,cr->npmenodes);
2131 for(i=0; i<cr->dd->nnodes; i++) {
2132 p0 = cr_ddindex2pmeindex(cr,i);
2133 p1 = cr_ddindex2pmeindex(cr,i+1);
2134 if (i+1 == cr->dd->nnodes || p1 > p0) {
2136 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2137 pmenodes[n] = i + 1 + n;
2145 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2148 ivec coords,coords_pme,nc;
2153 if (dd->comm->bCartesian) {
2154 gmx_ddindex2xyz(dd->nc,ddindex,coords);
2155 dd_coords2pmecoords(dd,coords,coords_pme);
2156 copy_ivec(dd->ntot,nc);
2157 nc[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2158 coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2160 slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2162 slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2168 slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2173 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2175 gmx_domdec_comm_t *comm;
2177 int ddindex,nodeid=-1;
2179 comm = cr->dd->comm;
2184 if (comm->bCartesianPP_PME)
2187 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2192 ddindex = dd_index(cr->dd->nc,coords);
2193 if (comm->bCartesianPP)
2195 nodeid = comm->ddindex2simnodeid[ddindex];
2201 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2213 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2216 gmx_domdec_comm_t *comm;
2217 ivec coord,coord_pme;
2224 /* This assumes a uniform x domain decomposition grid cell size */
2225 if (comm->bCartesianPP_PME)
2228 MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2229 if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2231 /* This is a PP node */
2232 dd_cart_coord2pmecoord(dd,coord,coord_pme);
2233 MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2237 else if (comm->bCartesianPP)
2239 if (sim_nodeid < dd->nnodes)
2241 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2246 /* This assumes DD cells with identical x coordinates
2247 * are numbered sequentially.
2249 if (dd->comm->pmenodes == NULL)
2251 if (sim_nodeid < dd->nnodes)
2253 /* The DD index equals the nodeid */
2254 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2260 while (sim_nodeid > dd->comm->pmenodes[i])
2264 if (sim_nodeid < dd->comm->pmenodes[i])
2266 pmenode = dd->comm->pmenodes[i];
2274 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2276 gmx_bool bPMEOnlyNode;
2278 if (DOMAINDECOMP(cr))
2280 bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2284 bPMEOnlyNode = FALSE;
2287 return bPMEOnlyNode;
2290 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2291 int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2295 ivec coord,coord_pme;
2299 snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2302 for(x=0; x<dd->nc[XX]; x++)
2304 for(y=0; y<dd->nc[YY]; y++)
2306 for(z=0; z<dd->nc[ZZ]; z++)
2308 if (dd->comm->bCartesianPP_PME)
2313 dd_cart_coord2pmecoord(dd,coord,coord_pme);
2314 if (dd->ci[XX] == coord_pme[XX] &&
2315 dd->ci[YY] == coord_pme[YY] &&
2316 dd->ci[ZZ] == coord_pme[ZZ])
2317 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2321 /* The slab corresponds to the nodeid in the PME group */
2322 if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2324 (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2331 /* The last PP-only node is the peer node */
2332 *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2336 fprintf(debug,"Receive coordinates from PP nodes:");
2337 for(x=0; x<*nmy_ddnodes; x++)
2339 fprintf(debug," %d",(*my_ddnodes)[x]);
2341 fprintf(debug,"\n");
2345 static gmx_bool receive_vir_ener(t_commrec *cr)
2347 gmx_domdec_comm_t *comm;
2348 int pmenode,coords[DIM],rank;
2352 if (cr->npmenodes < cr->dd->nnodes)
2354 comm = cr->dd->comm;
2355 if (comm->bCartesianPP_PME)
2357 pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2359 MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2360 coords[comm->cartpmedim]++;
2361 if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2363 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2364 if (dd_simnode2pmenode(cr,rank) == pmenode)
2366 /* This is not the last PP node for pmenode */
2374 pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2375 if (cr->sim_nodeid+1 < cr->nnodes &&
2376 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2378 /* This is not the last PP node for pmenode */
2387 static void set_zones_ncg_home(gmx_domdec_t *dd)
2389 gmx_domdec_zones_t *zones;
2392 zones = &dd->comm->zones;
2394 zones->cg_range[0] = 0;
2395 for(i=1; i<zones->n+1; i++)
2397 zones->cg_range[i] = dd->ncg_home;
2401 static void rebuild_cgindex(gmx_domdec_t *dd,
2402 const int *gcgs_index,t_state *state)
2404 int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2407 dd_cg_gl = dd->index_gl;
2408 cgindex = dd->cgindex;
2411 for(i=0; i<state->ncg_gl; i++)
2415 dd_cg_gl[i] = cg_gl;
2416 nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2420 dd->ncg_home = state->ncg_gl;
2423 set_zones_ncg_home(dd);
2426 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2428 while (cg >= cginfo_mb->cg_end)
2433 return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2436 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2437 t_forcerec *fr,char *bLocalCG)
2439 cginfo_mb_t *cginfo_mb;
2445 cginfo_mb = fr->cginfo_mb;
2446 cginfo = fr->cginfo;
2448 for(cg=cg0; cg<cg1; cg++)
2450 cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2454 if (bLocalCG != NULL)
2456 for(cg=cg0; cg<cg1; cg++)
2458 bLocalCG[index_gl[cg]] = TRUE;
2463 static void make_dd_indices(gmx_domdec_t *dd,
2464 const int *gcgs_index,int cg_start)
2466 int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl;
2467 int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2472 bLocalCG = dd->comm->bLocalCG;
2474 if (dd->nat_tot > dd->gatindex_nalloc)
2476 dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2477 srenew(dd->gatindex,dd->gatindex_nalloc);
2480 nzone = dd->comm->zones.n;
2481 zone2cg = dd->comm->zones.cg_range;
2482 zone_ncg1 = dd->comm->zone_ncg1;
2483 index_gl = dd->index_gl;
2484 gatindex = dd->gatindex;
2485 bCGs = dd->comm->bCGs;
2487 if (zone2cg[1] != dd->ncg_home)
2489 gmx_incons("dd->ncg_zone is not up to date");
2492 /* Make the local to global and global to local atom index */
2493 a = dd->cgindex[cg_start];
2494 for(zone=0; zone<nzone; zone++)
2502 cg0 = zone2cg[zone];
2504 cg1 = zone2cg[zone+1];
2505 cg1_p1 = cg0 + zone_ncg1[zone];
2507 for(cg=cg0; cg<cg1; cg++)
2512 /* Signal that this cg is from more than one pulse away */
2515 cg_gl = index_gl[cg];
2518 for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2521 ga2la_set(dd->ga2la,a_gl,a,zone1);
2527 gatindex[a] = cg_gl;
2528 ga2la_set(dd->ga2la,cg_gl,a,zone1);
2535 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2541 if (bLocalCG == NULL)
2545 for(i=0; i<dd->ncg_tot; i++)
2547 if (!bLocalCG[dd->index_gl[i]])
2550 "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2555 for(i=0; i<ncg_sys; i++)
2562 if (ngl != dd->ncg_tot)
2564 fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2571 static void check_index_consistency(gmx_domdec_t *dd,
2572 int natoms_sys,int ncg_sys,
2575 int nerr,ngl,i,a,cell;
2580 if (dd->comm->DD_debug > 1)
2582 snew(have,natoms_sys);
2583 for(a=0; a<dd->nat_tot; a++)
2585 if (have[dd->gatindex[a]] > 0)
2587 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2591 have[dd->gatindex[a]] = a + 1;
2597 snew(have,dd->nat_tot);
2600 for(i=0; i<natoms_sys; i++)
2602 if (ga2la_get(dd->ga2la,i,&a,&cell))
2604 if (a >= dd->nat_tot)
2606 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2612 if (dd->gatindex[a] != i)
2614 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2621 if (ngl != dd->nat_tot)
2624 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2625 dd->rank,where,ngl,dd->nat_tot);
2627 for(a=0; a<dd->nat_tot; a++)
2632 "DD node %d, %s: local atom %d, global %d has no global index\n",
2633 dd->rank,where,a+1,dd->gatindex[a]+1);
2638 nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2641 gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2642 dd->rank,where,nerr);
2646 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2653 /* Clear the whole list without searching */
2654 ga2la_clear(dd->ga2la);
2658 for(i=a_start; i<dd->nat_tot; i++)
2660 ga2la_del(dd->ga2la,dd->gatindex[i]);
2664 bLocalCG = dd->comm->bLocalCG;
2667 for(i=cg_start; i<dd->ncg_tot; i++)
2669 bLocalCG[dd->index_gl[i]] = FALSE;
2673 dd_clear_local_vsite_indices(dd);
2675 if (dd->constraints)
2677 dd_clear_local_constraint_indices(dd);
2681 static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
2684 real grid_jump_limit;
2686 /* The distance between the boundaries of cells at distance
2687 * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2688 * and by the fact that cells should not be shifted by more than
2689 * half their size, such that cg's only shift by one cell
2690 * at redecomposition.
2692 grid_jump_limit = comm->cellsize_limit;
2693 if (!comm->bVacDLBNoLimit)
2695 grid_jump_limit = max(grid_jump_limit,
2696 cutoff/comm->cd[dim_ind].np);
2699 return grid_jump_limit;
2702 static gmx_bool check_grid_jump(gmx_large_int_t step,
2708 gmx_domdec_comm_t *comm;
2717 for(d=1; d<dd->ndim; d++)
2720 limit = grid_jump_limit(comm,cutoff,d);
2721 bfac = ddbox->box_size[dim];
2722 if (ddbox->tric_dir[dim])
2724 bfac *= ddbox->skew_fac[dim];
2726 if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac < limit ||
2727 (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2735 /* This error should never be triggered under normal
2736 * circumstances, but you never know ...
2738 gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2739 gmx_step_str(step,buf),
2740 dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2748 static int dd_load_count(gmx_domdec_comm_t *comm)
2750 return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2753 static float dd_force_load(gmx_domdec_comm_t *comm)
2760 if (comm->eFlop > 1)
2762 load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2767 load = comm->cycl[ddCyclF];
2768 if (comm->cycl_n[ddCyclF] > 1)
2770 /* Subtract the maximum of the last n cycle counts
2771 * to get rid of possible high counts due to other soures,
2772 * for instance system activity, that would otherwise
2773 * affect the dynamic load balancing.
2775 load -= comm->cycl_max[ddCyclF];
2782 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2784 gmx_domdec_comm_t *comm;
2789 snew(*dim_f,dd->nc[dim]+1);
2791 for(i=1; i<dd->nc[dim]; i++)
2793 if (comm->slb_frac[dim])
2795 (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2799 (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2802 (*dim_f)[dd->nc[dim]] = 1;
2805 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2807 int pmeindex,slab,nso,i;
2810 if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2816 ddpme->dim = dimind;
2818 ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2820 ddpme->nslab = (ddpme->dim == 0 ?
2821 dd->comm->npmenodes_x :
2822 dd->comm->npmenodes_y);
2824 if (ddpme->nslab <= 1)
2829 nso = dd->comm->npmenodes/ddpme->nslab;
2830 /* Determine for each PME slab the PP location range for dimension dim */
2831 snew(ddpme->pp_min,ddpme->nslab);
2832 snew(ddpme->pp_max,ddpme->nslab);
2833 for(slab=0; slab<ddpme->nslab; slab++) {
2834 ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2835 ddpme->pp_max[slab] = 0;
2837 for(i=0; i<dd->nnodes; i++) {
2838 ddindex2xyz(dd->nc,i,xyz);
2839 /* For y only use our y/z slab.
2840 * This assumes that the PME x grid size matches the DD grid size.
2842 if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2843 pmeindex = ddindex2pmeindex(dd,i);
2845 slab = pmeindex/nso;
2847 slab = pmeindex % ddpme->nslab;
2849 ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2850 ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2854 set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2857 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2859 if (dd->comm->ddpme[0].dim == XX)
2861 return dd->comm->ddpme[0].maxshift;
2869 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2871 if (dd->comm->ddpme[0].dim == YY)
2873 return dd->comm->ddpme[0].maxshift;
2875 else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2877 return dd->comm->ddpme[1].maxshift;
2885 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2886 gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2888 gmx_domdec_comm_t *comm;
2891 real range,pme_boundary;
2895 nc = dd->nc[ddpme->dim];
2898 if (!ddpme->dim_match)
2900 /* PP decomposition is not along dim: the worst situation */
2903 else if (ns <= 3 || (bUniform && ns == nc))
2905 /* The optimal situation */
2910 /* We need to check for all pme nodes which nodes they
2911 * could possibly need to communicate with.
2913 xmin = ddpme->pp_min;
2914 xmax = ddpme->pp_max;
2915 /* Allow for atoms to be maximally 2/3 times the cut-off
2916 * out of their DD cell. This is a reasonable balance between
2917 * between performance and support for most charge-group/cut-off
2920 range = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2921 /* Avoid extra communication when we are exactly at a boundary */
2927 /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2928 pme_boundary = (real)s/ns;
2931 cell_f[xmax[s-(sh+1) ]+1] + range > pme_boundary) ||
2933 cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2937 pme_boundary = (real)(s+1)/ns;
2940 cell_f[xmin[s+(sh+1) ] ] - range < pme_boundary) ||
2942 cell_f[xmin[s+(sh+1)-ns] ] + 1 - range < pme_boundary)))
2949 ddpme->maxshift = sh;
2953 fprintf(debug,"PME slab communication range for dim %d is %d\n",
2954 ddpme->dim,ddpme->maxshift);
2958 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2962 for(d=0; d<dd->ndim; d++)
2965 if (dim < ddbox->nboundeddim &&
2966 ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2967 dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2969 gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2970 dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2971 dd->nc[dim],dd->comm->cellsize_limit);
2976 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2977 gmx_bool bMaster,ivec npulse)
2979 gmx_domdec_comm_t *comm;
2982 real *cell_x,cell_dx,cellsize;
2986 for(d=0; d<DIM; d++)
2988 cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2990 if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2993 cell_dx = ddbox->box_size[d]/dd->nc[d];
2996 for(j=0; j<dd->nc[d]+1; j++)
2998 dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3003 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d] )*cell_dx;
3004 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3006 cellsize = cell_dx*ddbox->skew_fac[d];
3007 while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3011 cellsize_min[d] = cellsize;
3015 /* Statically load balanced grid */
3016 /* Also when we are not doing a master distribution we determine
3017 * all cell borders in a loop to obtain identical values
3018 * to the master distribution case and to determine npulse.
3022 cell_x = dd->ma->cell_x[d];
3026 snew(cell_x,dd->nc[d]+1);
3028 cell_x[0] = ddbox->box0[d];
3029 for(j=0; j<dd->nc[d]; j++)
3031 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
3032 cell_x[j+1] = cell_x[j] + cell_dx;
3033 cellsize = cell_dx*ddbox->skew_fac[d];
3034 while (cellsize*npulse[d] < comm->cutoff &&
3035 npulse[d] < dd->nc[d]-1)
3039 cellsize_min[d] = min(cellsize_min[d],cellsize);
3043 comm->cell_x0[d] = cell_x[dd->ci[d]];
3044 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3048 /* The following limitation is to avoid that a cell would receive
3049 * some of its own home charge groups back over the periodic boundary.
3050 * Double charge groups cause trouble with the global indices.
3052 if (d < ddbox->npbcdim &&
3053 dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3055 gmx_fatal_collective(FARGS,NULL,dd,
3056 "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3057 dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
3059 dd->nc[d],dd->nc[d],
3060 dd->nnodes > dd->nc[d] ? "cells" : "processors");
3064 if (!comm->bDynLoadBal)
3066 copy_rvec(cellsize_min,comm->cellsize_min);
3069 for(d=0; d<comm->npmedecompdim; d++)
3071 set_pme_maxshift(dd,&comm->ddpme[d],
3072 comm->slb_frac[dd->dim[d]]==NULL,ddbox,
3073 comm->ddpme[d].slb_dim_f);
3078 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3079 int d,int dim,gmx_domdec_root_t *root,
3081 gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
3083 gmx_domdec_comm_t *comm;
3084 int ncd,i,j,nmin,nmin_old;
3085 gmx_bool bLimLo,bLimHi;
3087 real fac,halfway,cellsize_limit_f_i,region_size;
3088 gmx_bool bPBC,bLastHi=FALSE;
3089 int nrange[]={range[0],range[1]};
3091 region_size= root->cell_f[range[1]]-root->cell_f[range[0]];
3097 bPBC = (dim < ddbox->npbcdim);
3099 cell_size = root->buf_ncd;
3103 fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
3106 /* First we need to check if the scaling does not make cells
3107 * smaller than the smallest allowed size.
3108 * We need to do this iteratively, since if a cell is too small,
3109 * it needs to be enlarged, which makes all the other cells smaller,
3110 * which could in turn make another cell smaller than allowed.
3112 for(i=range[0]; i<range[1]; i++)
3114 root->bCellMin[i] = FALSE;
3120 /* We need the total for normalization */
3122 for(i=range[0]; i<range[1]; i++)
3124 if (root->bCellMin[i] == FALSE)
3126 fac += cell_size[i];
3129 fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3130 /* Determine the cell boundaries */
3131 for(i=range[0]; i<range[1]; i++)
3133 if (root->bCellMin[i] == FALSE)
3135 cell_size[i] *= fac;
3136 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3138 cellsize_limit_f_i = 0;
3142 cellsize_limit_f_i = cellsize_limit_f;
3144 if (cell_size[i] < cellsize_limit_f_i)
3146 root->bCellMin[i] = TRUE;
3147 cell_size[i] = cellsize_limit_f_i;
3151 root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3154 while (nmin > nmin_old);
3157 cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3158 /* For this check we should not use DD_CELL_MARGIN,
3159 * but a slightly smaller factor,
3160 * since rounding could get use below the limit.
3162 if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3165 gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3166 gmx_step_str(step,buf),
3167 dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3168 ncd,comm->cellsize_min[dim]);
3171 root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3175 /* Check if the boundary did not displace more than halfway
3176 * each of the cells it bounds, as this could cause problems,
3177 * especially when the differences between cell sizes are large.
3178 * If changes are applied, they will not make cells smaller
3179 * than the cut-off, as we check all the boundaries which
3180 * might be affected by a change and if the old state was ok,
3181 * the cells will at most be shrunk back to their old size.
3183 for(i=range[0]+1; i<range[1]; i++)
3185 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3186 if (root->cell_f[i] < halfway)
3188 root->cell_f[i] = halfway;
3189 /* Check if the change also causes shifts of the next boundaries */
3190 for(j=i+1; j<range[1]; j++)
3192 if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3193 root->cell_f[j] = root->cell_f[j-1] + cellsize_limit_f;
3196 halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3197 if (root->cell_f[i] > halfway)
3199 root->cell_f[i] = halfway;
3200 /* Check if the change also causes shifts of the next boundaries */
3201 for(j=i-1; j>=range[0]+1; j--)
3203 if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3204 root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3210 /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3211 /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3212 * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3213 * for a and b nrange is used */
3216 /* Take care of the staggering of the cell boundaries */
3219 for(i=range[0]; i<range[1]; i++)
3221 root->cell_f_max0[i] = root->cell_f[i];
3222 root->cell_f_min1[i] = root->cell_f[i+1];
3227 for(i=range[0]+1; i<range[1]; i++)
3229 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3230 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3231 if (bLimLo && bLimHi)
3233 /* Both limits violated, try the best we can */
3234 /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3235 root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3238 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3242 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3248 /* root->cell_f[i] = root->bound_min[i]; */
3249 nrange[1]=i; /* only store violation location. There could be a LimLo violation following with an higher index */
3252 else if (bLimHi && !bLastHi)
3255 if (nrange[1] < range[1]) /* found a LimLo before */
3257 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3258 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3259 nrange[0]=nrange[1];
3261 root->cell_f[i] = root->bound_max[i];
3263 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3268 if (nrange[1] < range[1]) /* found last a LimLo */
3270 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3271 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3272 nrange[0]=nrange[1];
3274 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3276 else if (nrange[0] > range[0]) /* found at least one LimHi */
3278 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3285 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3286 int d,int dim,gmx_domdec_root_t *root,
3287 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3288 gmx_bool bUniform,gmx_large_int_t step)
3290 gmx_domdec_comm_t *comm;
3293 real load_aver,load_i,imbalance,change,change_max,sc;
3294 real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3298 int range[] = { 0, 0 };
3302 /* Convert the maximum change from the input percentage to a fraction */
3303 change_limit = comm->dlb_scale_lim*0.01;
3307 bPBC = (dim < ddbox->npbcdim);
3309 cell_size = root->buf_ncd;
3311 /* Store the original boundaries */
3312 for(i=0; i<ncd+1; i++)
3314 root->old_cell_f[i] = root->cell_f[i];
3317 for(i=0; i<ncd; i++)
3319 cell_size[i] = 1.0/ncd;
3322 else if (dd_load_count(comm))
3324 load_aver = comm->load[d].sum_m/ncd;
3326 for(i=0; i<ncd; i++)
3328 /* Determine the relative imbalance of cell i */
3329 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3330 imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3331 /* Determine the change of the cell size using underrelaxation */
3332 change = -relax*imbalance;
3333 change_max = max(change_max,max(change,-change));
3335 /* Limit the amount of scaling.
3336 * We need to use the same rescaling for all cells in one row,
3337 * otherwise the load balancing might not converge.
3340 if (change_max > change_limit)
3342 sc *= change_limit/change_max;
3344 for(i=0; i<ncd; i++)
3346 /* Determine the relative imbalance of cell i */
3347 load_i = comm->load[d].load[i*comm->load[d].nload+2];
3348 imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3349 /* Determine the change of the cell size using underrelaxation */
3350 change = -sc*imbalance;
3351 cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3355 cellsize_limit_f = comm->cellsize_min[dim]/ddbox->box_size[dim];
3356 cellsize_limit_f *= DD_CELL_MARGIN;
3357 dist_min_f_hard = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim];
3358 dist_min_f = dist_min_f_hard * DD_CELL_MARGIN;
3359 if (ddbox->tric_dir[dim])
3361 cellsize_limit_f /= ddbox->skew_fac[dim];
3362 dist_min_f /= ddbox->skew_fac[dim];
3364 if (bDynamicBox && d > 0)
3366 dist_min_f *= DD_PRES_SCALE_MARGIN;
3368 if (d > 0 && !bUniform)
3370 /* Make sure that the grid is not shifted too much */
3371 for(i=1; i<ncd; i++) {
3372 if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3374 gmx_incons("Inconsistent DD boundary staggering limits!");
3376 root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3377 space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3379 root->bound_min[i] += 0.5*space;
3381 root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3382 space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3384 root->bound_max[i] += 0.5*space;
3389 "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3391 root->cell_f_max0[i-1] + dist_min_f,
3392 root->bound_min[i],root->cell_f[i],root->bound_max[i],
3393 root->cell_f_min1[i] - dist_min_f);
3398 root->cell_f[0] = 0;
3399 root->cell_f[ncd] = 1;
3400 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3403 /* After the checks above, the cells should obey the cut-off
3404 * restrictions, but it does not hurt to check.
3406 for(i=0; i<ncd; i++)
3410 fprintf(debug,"Relative bounds dim %d cell %d: %f %f\n",
3411 dim,i,root->cell_f[i],root->cell_f[i+1]);
3414 if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3415 root->cell_f[i+1] - root->cell_f[i] <
3416 cellsize_limit_f/DD_CELL_MARGIN)
3420 "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3421 gmx_step_str(step,buf),dim2char(dim),i,
3422 (root->cell_f[i+1] - root->cell_f[i])
3423 *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3428 /* Store the cell boundaries of the lower dimensions at the end */
3429 for(d1=0; d1<d; d1++)
3431 root->cell_f[pos++] = comm->cell_f0[d1];
3432 root->cell_f[pos++] = comm->cell_f1[d1];
3435 if (d < comm->npmedecompdim)
3437 /* The master determines the maximum shift for
3438 * the coordinate communication between separate PME nodes.
3440 set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3442 root->cell_f[pos++] = comm->ddpme[0].maxshift;
3445 root->cell_f[pos++] = comm->ddpme[1].maxshift;
3449 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3450 gmx_ddbox_t *ddbox,int dimind)
3452 gmx_domdec_comm_t *comm;
3457 /* Set the cell dimensions */
3458 dim = dd->dim[dimind];
3459 comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3460 comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3461 if (dim >= ddbox->nboundeddim)
3463 comm->cell_x0[dim] += ddbox->box0[dim];
3464 comm->cell_x1[dim] += ddbox->box0[dim];
3468 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3469 int d,int dim,real *cell_f_row,
3472 gmx_domdec_comm_t *comm;
3478 /* Each node would only need to know two fractions,
3479 * but it is probably cheaper to broadcast the whole array.
3481 MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3482 0,comm->mpi_comm_load[d]);
3484 /* Copy the fractions for this dimension from the buffer */
3485 comm->cell_f0[d] = cell_f_row[dd->ci[dim] ];
3486 comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3487 /* The whole array was communicated, so set the buffer position */
3488 pos = dd->nc[dim] + 1;
3489 for(d1=0; d1<=d; d1++)
3493 /* Copy the cell fractions of the lower dimensions */
3494 comm->cell_f0[d1] = cell_f_row[pos++];
3495 comm->cell_f1[d1] = cell_f_row[pos++];
3497 relative_to_absolute_cell_bounds(dd,ddbox,d1);
3499 /* Convert the communicated shift from float to int */
3500 comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3503 comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3507 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3508 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3509 gmx_bool bUniform,gmx_large_int_t step)
3511 gmx_domdec_comm_t *comm;
3513 gmx_bool bRowMember,bRowRoot;
3518 for(d=0; d<dd->ndim; d++)
3523 for(d1=d; d1<dd->ndim; d1++)
3525 if (dd->ci[dd->dim[d1]] > 0)
3538 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3539 ddbox,bDynamicBox,bUniform,step);
3540 cell_f_row = comm->root[d]->cell_f;
3544 cell_f_row = comm->cell_f_row;
3546 distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3551 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3555 /* This function assumes the box is static and should therefore
3556 * not be called when the box has changed since the last
3557 * call to dd_partition_system.
3559 for(d=0; d<dd->ndim; d++)
3561 relative_to_absolute_cell_bounds(dd,ddbox,d);
3567 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3568 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3569 gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3570 gmx_wallcycle_t wcycle)
3572 gmx_domdec_comm_t *comm;
3579 wallcycle_start(wcycle,ewcDDCOMMBOUND);
3580 set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3581 wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3583 else if (bDynamicBox)
3585 set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3588 /* Set the dimensions for which no DD is used */
3589 for(dim=0; dim<DIM; dim++) {
3590 if (dd->nc[dim] == 1) {
3591 comm->cell_x0[dim] = 0;
3592 comm->cell_x1[dim] = ddbox->box_size[dim];
3593 if (dim >= ddbox->nboundeddim)
3595 comm->cell_x0[dim] += ddbox->box0[dim];
3596 comm->cell_x1[dim] += ddbox->box0[dim];
3602 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3605 gmx_domdec_comm_dim_t *cd;
3607 for(d=0; d<dd->ndim; d++)
3609 cd = &dd->comm->cd[d];
3610 np = npulse[dd->dim[d]];
3611 if (np > cd->np_nalloc)
3615 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3616 dim2char(dd->dim[d]),np);
3618 if (DDMASTER(dd) && cd->np_nalloc > 0)
3620 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3623 for(i=cd->np_nalloc; i<np; i++)
3625 cd->ind[i].index = NULL;
3626 cd->ind[i].nalloc = 0;
3635 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3636 gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3637 gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3638 gmx_wallcycle_t wcycle)
3640 gmx_domdec_comm_t *comm;
3646 /* Copy the old cell boundaries for the cg displacement check */
3647 copy_rvec(comm->cell_x0,comm->old_cell_x0);
3648 copy_rvec(comm->cell_x1,comm->old_cell_x1);
3650 if (comm->bDynLoadBal)
3654 check_box_size(dd,ddbox);
3656 set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3660 set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3661 realloc_comm_ind(dd,npulse);
3666 for(d=0; d<DIM; d++)
3668 fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3669 d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3674 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3676 rvec cell_ns_x0,rvec cell_ns_x1,
3677 gmx_large_int_t step)
3679 gmx_domdec_comm_t *comm;
3684 for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3686 dim = dd->dim[dim_ind];
3688 /* Without PBC we don't have restrictions on the outer cells */
3689 if (!(dim >= ddbox->npbcdim &&
3690 (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3691 comm->bDynLoadBal &&
3692 (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3693 comm->cellsize_min[dim])
3696 gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3697 gmx_step_str(step,buf),dim2char(dim),
3698 comm->cell_x1[dim] - comm->cell_x0[dim],
3699 ddbox->skew_fac[dim],
3700 dd->comm->cellsize_min[dim],
3701 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3705 if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3707 /* Communicate the boundaries and update cell_ns_x0/1 */
3708 dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3709 if (dd->bGridJump && dd->ndim > 1)
3711 check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE);
3716 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3720 tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3728 tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3729 tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3738 static void check_screw_box(matrix box)
3740 /* Mathematical limitation */
3741 if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3743 gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3746 /* Limitation due to the asymmetry of the eighth shell method */
3747 if (box[ZZ][YY] != 0)
3749 gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3753 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3754 matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3757 gmx_domdec_master_t *ma;
3758 int **tmp_ind=NULL,*tmp_nalloc=NULL;
3759 int i,icg,j,k,k0,k1,d,npbcdim;
3761 rvec box_size,cg_cm;
3763 real nrcg,inv_ncg,pos_d;
3765 gmx_bool bUnbounded,bScrew;
3769 if (tmp_ind == NULL)
3771 snew(tmp_nalloc,dd->nnodes);
3772 snew(tmp_ind,dd->nnodes);
3773 for(i=0; i<dd->nnodes; i++)
3775 tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3776 snew(tmp_ind[i],tmp_nalloc[i]);
3780 /* Clear the count */
3781 for(i=0; i<dd->nnodes; i++)
3787 make_tric_corr_matrix(dd->npbcdim,box,tcm);
3789 cgindex = cgs->index;
3791 /* Compute the center of geometry for all charge groups */
3792 for(icg=0; icg<cgs->nr; icg++)
3795 k1 = cgindex[icg+1];
3799 copy_rvec(pos[k0],cg_cm);
3806 for(k=k0; (k<k1); k++)
3808 rvec_inc(cg_cm,pos[k]);
3810 for(d=0; (d<DIM); d++)
3812 cg_cm[d] *= inv_ncg;
3815 /* Put the charge group in the box and determine the cell index */
3816 for(d=DIM-1; d>=0; d--) {
3818 if (d < dd->npbcdim)
3820 bScrew = (dd->bScrewPBC && d == XX);
3821 if (tric_dir[d] && dd->nc[d] > 1)
3823 /* Use triclinic coordintates for this dimension */
3824 for(j=d+1; j<DIM; j++)
3826 pos_d += cg_cm[j]*tcm[j][d];
3829 while(pos_d >= box[d][d])
3832 rvec_dec(cg_cm,box[d]);
3835 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3836 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3838 for(k=k0; (k<k1); k++)
3840 rvec_dec(pos[k],box[d]);
3843 pos[k][YY] = box[YY][YY] - pos[k][YY];
3844 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3851 rvec_inc(cg_cm,box[d]);
3854 cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3855 cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3857 for(k=k0; (k<k1); k++)
3859 rvec_inc(pos[k],box[d]);
3861 pos[k][YY] = box[YY][YY] - pos[k][YY];
3862 pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3867 /* This could be done more efficiently */
3869 while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3874 i = dd_index(dd->nc,ind);
3875 if (ma->ncg[i] == tmp_nalloc[i])
3877 tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3878 srenew(tmp_ind[i],tmp_nalloc[i]);
3880 tmp_ind[i][ma->ncg[i]] = icg;
3882 ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3886 for(i=0; i<dd->nnodes; i++)
3889 for(k=0; k<ma->ncg[i]; k++)
3891 ma->cg[k1++] = tmp_ind[i][k];
3894 ma->index[dd->nnodes] = k1;
3896 for(i=0; i<dd->nnodes; i++)
3906 fprintf(fplog,"Charge group distribution at step %s:",
3907 gmx_step_str(step,buf));
3908 for(i=0; i<dd->nnodes; i++)
3910 fprintf(fplog," %d",ma->ncg[i]);
3912 fprintf(fplog,"\n");
3916 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3917 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3920 gmx_domdec_master_t *ma=NULL;
3923 int *ibuf,buf2[2] = { 0, 0 };
3924 gmx_bool bMaster = DDMASTER(dd);
3931 check_screw_box(box);
3934 set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3936 distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3937 for(i=0; i<dd->nnodes; i++)
3939 ma->ibuf[2*i] = ma->ncg[i];
3940 ma->ibuf[2*i+1] = ma->nat[i];
3948 dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3950 dd->ncg_home = buf2[0];
3951 dd->nat_home = buf2[1];
3952 dd->ncg_tot = dd->ncg_home;
3953 dd->nat_tot = dd->nat_home;
3954 if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3956 dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3957 srenew(dd->index_gl,dd->cg_nalloc);
3958 srenew(dd->cgindex,dd->cg_nalloc+1);
3962 for(i=0; i<dd->nnodes; i++)
3964 ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3965 ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3970 DDMASTER(dd) ? ma->ibuf : NULL,
3971 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3972 DDMASTER(dd) ? ma->cg : NULL,
3973 dd->ncg_home*sizeof(int),dd->index_gl);
3975 /* Determine the home charge group sizes */
3977 for(i=0; i<dd->ncg_home; i++)
3979 cg_gl = dd->index_gl[i];
3981 dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3986 fprintf(debug,"Home charge groups:\n");
3987 for(i=0; i<dd->ncg_home; i++)
3989 fprintf(debug," %d",dd->index_gl[i]);
3991 fprintf(debug,"\n");
3993 fprintf(debug,"\n");
3997 static int compact_and_copy_vec_at(int ncg,int *move,
4000 rvec *src,gmx_domdec_comm_t *comm,
4003 int m,icg,i,i0,i1,nrcg;
4009 for(m=0; m<DIM*2; m++)
4015 for(icg=0; icg<ncg; icg++)
4017 i1 = cgindex[icg+1];
4023 /* Compact the home array in place */
4024 for(i=i0; i<i1; i++)
4026 copy_rvec(src[i],src[home_pos++]);
4032 /* Copy to the communication buffer */
4034 pos_vec[m] += 1 + vec*nrcg;
4035 for(i=i0; i<i1; i++)
4037 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
4039 pos_vec[m] += (nvec - vec - 1)*nrcg;
4043 home_pos += i1 - i0;
4051 static int compact_and_copy_vec_cg(int ncg,int *move,
4053 int nvec,rvec *src,gmx_domdec_comm_t *comm,
4056 int m,icg,i0,i1,nrcg;
4062 for(m=0; m<DIM*2; m++)
4068 for(icg=0; icg<ncg; icg++)
4070 i1 = cgindex[icg+1];
4076 /* Compact the home array in place */
4077 copy_rvec(src[icg],src[home_pos++]);
4083 /* Copy to the communication buffer */
4084 copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
4085 pos_vec[m] += 1 + nrcg*nvec;
4097 static int compact_ind(int ncg,int *move,
4098 int *index_gl,int *cgindex,
4100 gmx_ga2la_t ga2la,char *bLocalCG,
4103 int cg,nat,a0,a1,a,a_gl;
4108 for(cg=0; cg<ncg; cg++)
4114 /* Compact the home arrays in place.
4115 * Anything that can be done here avoids access to global arrays.
4117 cgindex[home_pos] = nat;
4118 for(a=a0; a<a1; a++)
4121 gatindex[nat] = a_gl;
4122 /* The cell number stays 0, so we don't need to set it */
4123 ga2la_change_la(ga2la,a_gl,nat);
4126 index_gl[home_pos] = index_gl[cg];
4127 cginfo[home_pos] = cginfo[cg];
4128 /* The charge group remains local, so bLocalCG does not change */
4133 /* Clear the global indices */
4134 for(a=a0; a<a1; a++)
4136 ga2la_del(ga2la,gatindex[a]);
4140 bLocalCG[index_gl[cg]] = FALSE;
4144 cgindex[home_pos] = nat;
4149 static void clear_and_mark_ind(int ncg,int *move,
4150 int *index_gl,int *cgindex,int *gatindex,
4151 gmx_ga2la_t ga2la,char *bLocalCG,
4156 for(cg=0; cg<ncg; cg++)
4162 /* Clear the global indices */
4163 for(a=a0; a<a1; a++)
4165 ga2la_del(ga2la,gatindex[a]);
4169 bLocalCG[index_gl[cg]] = FALSE;
4171 /* Signal that this cg has moved using the ns cell index.
4172 * Here we set it to -1. fill_grid will change it
4173 * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4175 cell_index[cg] = -1;
4180 static void print_cg_move(FILE *fplog,
4182 gmx_large_int_t step,int cg,int dim,int dir,
4183 gmx_bool bHaveLimitdAndCMOld,real limitd,
4184 rvec cm_old,rvec cm_new,real pos_d)
4186 gmx_domdec_comm_t *comm;
4191 fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4192 if (bHaveLimitdAndCMOld)
4194 fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4195 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4199 fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4200 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4202 fprintf(fplog,"distance out of cell %f\n",
4203 dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4204 if (bHaveLimitdAndCMOld)
4206 fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4207 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4209 fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4210 cm_new[XX],cm_new[YY],cm_new[ZZ]);
4211 fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4213 comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4214 fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4216 comm->cell_x0[dim],comm->cell_x1[dim]);
4219 static void cg_move_error(FILE *fplog,
4221 gmx_large_int_t step,int cg,int dim,int dir,
4222 gmx_bool bHaveLimitdAndCMOld,real limitd,
4223 rvec cm_old,rvec cm_new,real pos_d)
4227 print_cg_move(fplog, dd,step,cg,dim,dir,
4228 bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4230 print_cg_move(stderr,dd,step,cg,dim,dir,
4231 bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4233 "A charge group moved too far between two domain decomposition steps\n"
4234 "This usually means that your system is not well equilibrated");
4237 static void rotate_state_atom(t_state *state,int a)
4241 for(est=0; est<estNR; est++)
4243 if (EST_DISTR(est) && (state->flags & (1<<est))) {
4246 /* Rotate the complete state; for a rectangular box only */
4247 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4248 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4251 state->v[a][YY] = -state->v[a][YY];
4252 state->v[a][ZZ] = -state->v[a][ZZ];
4255 state->sd_X[a][YY] = -state->sd_X[a][YY];
4256 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4259 state->cg_p[a][YY] = -state->cg_p[a][YY];
4260 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4262 case estDISRE_INITF:
4263 case estDISRE_RM3TAV:
4264 case estORIRE_INITF:
4266 /* These are distances, so not affected by rotation */
4269 gmx_incons("Unknown state entry encountered in rotate_state_atom");
4275 static int *get_moved(gmx_domdec_comm_t *comm,int natoms)
4277 if (natoms > comm->moved_nalloc)
4279 /* Contents should be preserved here */
4280 comm->moved_nalloc = over_alloc_dd(natoms);
4281 srenew(comm->moved,comm->moved_nalloc);
4287 static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
4290 ivec tric_dir,matrix tcm,
4291 rvec cell_x0,rvec cell_x1,
4292 rvec limitd,rvec limit0,rvec limit1,
4294 int cg_start,int cg_end,
4299 int c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4300 int mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4307 npbcdim = dd->npbcdim;
4309 for(cg=cg_start; cg<cg_end; cg++)
4316 copy_rvec(state->x[k0],cm_new);
4323 for(k=k0; (k<k1); k++)
4325 rvec_inc(cm_new,state->x[k]);
4327 for(d=0; (d<DIM); d++)
4329 cm_new[d] = inv_ncg*cm_new[d];
4334 /* Do pbc and check DD cell boundary crossings */
4335 for(d=DIM-1; d>=0; d--)
4339 bScrew = (dd->bScrewPBC && d == XX);
4340 /* Determine the location of this cg in lattice coordinates */
4344 for(d2=d+1; d2<DIM; d2++)
4346 pos_d += cm_new[d2]*tcm[d2][d];
4349 /* Put the charge group in the triclinic unit-cell */
4350 if (pos_d >= cell_x1[d])
4352 if (pos_d >= limit1[d])
4354 cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4355 cg_cm[cg],cm_new,pos_d);
4358 if (dd->ci[d] == dd->nc[d] - 1)
4360 rvec_dec(cm_new,state->box[d]);
4363 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4364 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4366 for(k=k0; (k<k1); k++)
4368 rvec_dec(state->x[k],state->box[d]);
4371 rotate_state_atom(state,k);
4376 else if (pos_d < cell_x0[d])
4378 if (pos_d < limit0[d])
4380 cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4381 cg_cm[cg],cm_new,pos_d);
4386 rvec_inc(cm_new,state->box[d]);
4389 cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4390 cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4392 for(k=k0; (k<k1); k++)
4394 rvec_inc(state->x[k],state->box[d]);
4397 rotate_state_atom(state,k);
4403 else if (d < npbcdim)
4405 /* Put the charge group in the rectangular unit-cell */
4406 while (cm_new[d] >= state->box[d][d])
4408 rvec_dec(cm_new,state->box[d]);
4409 for(k=k0; (k<k1); k++)
4411 rvec_dec(state->x[k],state->box[d]);
4414 while (cm_new[d] < 0)
4416 rvec_inc(cm_new,state->box[d]);
4417 for(k=k0; (k<k1); k++)
4419 rvec_inc(state->x[k],state->box[d]);
4425 copy_rvec(cm_new,cg_cm[cg]);
4427 /* Determine where this cg should go */
4430 for(d=0; d<dd->ndim; d++)
4435 flag |= DD_FLAG_FW(d);
4441 else if (dev[dim] == -1)
4443 flag |= DD_FLAG_BW(d);
4445 if (dd->nc[dim] > 2)
4456 /* Temporarily store the flag in move */
4457 move[cg] = mc + flag;
4461 static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4462 gmx_domdec_t *dd,ivec tric_dir,
4463 t_state *state,rvec **f,
4464 t_forcerec *fr,t_mdatoms *md,
4472 int ncg[DIM*2],nat[DIM*2];
4473 int c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4474 int mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4475 int sbuf[2],rbuf[2];
4476 int home_pos_cg,home_pos_at,buf_pos;
4478 gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4483 rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4485 cginfo_mb_t *cginfo_mb;
4486 gmx_domdec_comm_t *comm;
4492 check_screw_box(state->box);
4496 if (fr->cutoff_scheme == ecutsGROUP)
4501 for(i=0; i<estNR; i++)
4507 case estX: /* Always present */ break;
4508 case estV: bV = (state->flags & (1<<i)); break;
4509 case estSDX: bSDX = (state->flags & (1<<i)); break;
4510 case estCGP: bCGP = (state->flags & (1<<i)); break;
4513 case estDISRE_INITF:
4514 case estDISRE_RM3TAV:
4515 case estORIRE_INITF:
4517 /* No processing required */
4520 gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4525 if (dd->ncg_tot > comm->nalloc_int)
4527 comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4528 srenew(comm->buf_int,comm->nalloc_int);
4530 move = comm->buf_int;
4532 /* Clear the count */
4533 for(c=0; c<dd->ndim*2; c++)
4539 npbcdim = dd->npbcdim;
4541 for(d=0; (d<DIM); d++)
4543 limitd[d] = dd->comm->cellsize_min[d];
4544 if (d >= npbcdim && dd->ci[d] == 0)
4546 cell_x0[d] = -GMX_FLOAT_MAX;
4550 cell_x0[d] = comm->cell_x0[d];
4552 if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4554 cell_x1[d] = GMX_FLOAT_MAX;
4558 cell_x1[d] = comm->cell_x1[d];
4562 limit0[d] = comm->old_cell_x0[d] - limitd[d];
4563 limit1[d] = comm->old_cell_x1[d] + limitd[d];
4567 /* We check after communication if a charge group moved
4568 * more than one cell. Set the pre-comm check limit to float_max.
4570 limit0[d] = -GMX_FLOAT_MAX;
4571 limit1[d] = GMX_FLOAT_MAX;
4575 make_tric_corr_matrix(npbcdim,state->box,tcm);
4577 cgindex = dd->cgindex;
4579 nthread = gmx_omp_nthreads_get(emntDomdec);
4581 /* Compute the center of geometry for all home charge groups
4582 * and put them in the box and determine where they should go.
4584 #pragma omp parallel for num_threads(nthread) schedule(static)
4585 for(thread=0; thread<nthread; thread++)
4587 calc_cg_move(fplog,step,dd,state,tric_dir,tcm,
4588 cell_x0,cell_x1,limitd,limit0,limit1,
4590 ( thread *dd->ncg_home)/nthread,
4591 ((thread+1)*dd->ncg_home)/nthread,
4592 fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x,
4596 for(cg=0; cg<dd->ncg_home; cg++)
4601 flag = mc & ~DD_FLAG_NRCG;
4602 mc = mc & DD_FLAG_NRCG;
4605 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4607 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4608 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4610 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS ] = dd->index_gl[cg];
4611 /* We store the cg size in the lower 16 bits
4612 * and the place where the charge group should go
4613 * in the next 6 bits. This saves some communication volume.
4615 nrcg = cgindex[cg+1] - cgindex[cg];
4616 comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4622 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4623 inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4626 for(i=0; i<dd->ndim*2; i++)
4628 *ncg_moved += ncg[i];
4645 /* Make sure the communication buffers are large enough */
4646 for(mc=0; mc<dd->ndim*2; mc++)
4648 nvr = ncg[mc] + nat[mc]*nvec;
4649 if (nvr > comm->cgcm_state_nalloc[mc])
4651 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4652 srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4656 switch (fr->cutoff_scheme)
4659 /* Recalculating cg_cm might be cheaper than communicating,
4660 * but that could give rise to rounding issues.
4663 compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4664 nvec,cg_cm,comm,bCompact);
4667 /* Without charge groups we send the moved atom coordinates
4668 * over twice. This is so the code below can be used without
4669 * many conditionals for both for with and without charge groups.
4672 compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4673 nvec,state->x,comm,FALSE);
4676 home_pos_cg -= *ncg_moved;
4680 gmx_incons("unimplemented");
4686 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4687 nvec,vec++,state->x,comm,bCompact);
4690 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4691 nvec,vec++,state->v,comm,bCompact);
4695 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4696 nvec,vec++,state->sd_X,comm,bCompact);
4700 compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4701 nvec,vec++,state->cg_p,comm,bCompact);
4706 compact_ind(dd->ncg_home,move,
4707 dd->index_gl,dd->cgindex,dd->gatindex,
4708 dd->ga2la,comm->bLocalCG,
4713 if (fr->cutoff_scheme == ecutsVERLET)
4715 moved = get_moved(comm,dd->ncg_home);
4717 for(k=0; k<dd->ncg_home; k++)
4724 moved = fr->ns.grid->cell_index;
4727 clear_and_mark_ind(dd->ncg_home,move,
4728 dd->index_gl,dd->cgindex,dd->gatindex,
4729 dd->ga2la,comm->bLocalCG,
4733 cginfo_mb = fr->cginfo_mb;
4735 *ncg_stay_home = home_pos_cg;
4736 for(d=0; d<dd->ndim; d++)
4742 for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4745 /* Communicate the cg and atom counts */
4750 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4751 d,dir,sbuf[0],sbuf[1]);
4753 dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4755 if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4757 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4758 srenew(comm->buf_int,comm->nalloc_int);
4761 /* Communicate the charge group indices, sizes and flags */
4762 dd_sendrecv_int(dd, d, dir,
4763 comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4764 comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4766 nvs = ncg[cdd] + nat[cdd]*nvec;
4767 i = rbuf[0] + rbuf[1] *nvec;
4768 vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4770 /* Communicate cgcm and state */
4771 dd_sendrecv_rvec(dd, d, dir,
4772 comm->cgcm_state[cdd], nvs,
4773 comm->vbuf.v+nvr, i);
4774 ncg_recv += rbuf[0];
4775 nat_recv += rbuf[1];
4779 /* Process the received charge groups */
4781 for(cg=0; cg<ncg_recv; cg++)
4783 flag = comm->buf_int[cg*DD_CGIBS+1];
4785 if (dim >= npbcdim && dd->nc[dim] > 2)
4787 /* No pbc in this dim and more than one domain boundary.
4788 * We do a separate check if a charge group didn't move too far.
4790 if (((flag & DD_FLAG_FW(d)) &&
4791 comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4792 ((flag & DD_FLAG_BW(d)) &&
4793 comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4795 cg_move_error(fplog,dd,step,cg,dim,
4796 (flag & DD_FLAG_FW(d)) ? 1 : 0,
4798 comm->vbuf.v[buf_pos],
4799 comm->vbuf.v[buf_pos],
4800 comm->vbuf.v[buf_pos][dim]);
4807 /* Check which direction this cg should go */
4808 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4812 /* The cell boundaries for dimension d2 are not equal
4813 * for each cell row of the lower dimension(s),
4814 * therefore we might need to redetermine where
4815 * this cg should go.
4818 /* If this cg crosses the box boundary in dimension d2
4819 * we can use the communicated flag, so we do not
4820 * have to worry about pbc.
4822 if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4823 (flag & DD_FLAG_FW(d2))) ||
4824 (dd->ci[dim2] == 0 &&
4825 (flag & DD_FLAG_BW(d2)))))
4827 /* Clear the two flags for this dimension */
4828 flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4829 /* Determine the location of this cg
4830 * in lattice coordinates
4832 pos_d = comm->vbuf.v[buf_pos][dim2];
4835 for(d3=dim2+1; d3<DIM; d3++)
4838 comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4841 /* Check of we are not at the box edge.
4842 * pbc is only handled in the first step above,
4843 * but this check could move over pbc while
4844 * the first step did not due to different rounding.
4846 if (pos_d >= cell_x1[dim2] &&
4847 dd->ci[dim2] != dd->nc[dim2]-1)
4849 flag |= DD_FLAG_FW(d2);
4851 else if (pos_d < cell_x0[dim2] &&
4854 flag |= DD_FLAG_BW(d2);
4856 comm->buf_int[cg*DD_CGIBS+1] = flag;
4859 /* Set to which neighboring cell this cg should go */
4860 if (flag & DD_FLAG_FW(d2))
4864 else if (flag & DD_FLAG_BW(d2))
4866 if (dd->nc[dd->dim[d2]] > 2)
4878 nrcg = flag & DD_FLAG_NRCG;
4881 if (home_pos_cg+1 > dd->cg_nalloc)
4883 dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4884 srenew(dd->index_gl,dd->cg_nalloc);
4885 srenew(dd->cgindex,dd->cg_nalloc+1);
4887 /* Set the global charge group index and size */
4888 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4889 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4890 /* Copy the state from the buffer */
4891 dd_check_alloc_ncg(fr,state,f,home_pos_cg+1);
4892 if (fr->cutoff_scheme == ecutsGROUP)
4895 copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]);
4899 /* Set the cginfo */
4900 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4901 dd->index_gl[home_pos_cg]);
4904 comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4907 if (home_pos_at+nrcg > state->nalloc)
4909 dd_realloc_state(state,f,home_pos_at+nrcg);
4911 for(i=0; i<nrcg; i++)
4913 copy_rvec(comm->vbuf.v[buf_pos++],
4914 state->x[home_pos_at+i]);
4918 for(i=0; i<nrcg; i++)
4920 copy_rvec(comm->vbuf.v[buf_pos++],
4921 state->v[home_pos_at+i]);
4926 for(i=0; i<nrcg; i++)
4928 copy_rvec(comm->vbuf.v[buf_pos++],
4929 state->sd_X[home_pos_at+i]);
4934 for(i=0; i<nrcg; i++)
4936 copy_rvec(comm->vbuf.v[buf_pos++],
4937 state->cg_p[home_pos_at+i]);
4941 home_pos_at += nrcg;
4945 /* Reallocate the buffers if necessary */
4946 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4948 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4949 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4951 nvr = ncg[mc] + nat[mc]*nvec;
4952 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4954 comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4955 srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4957 /* Copy from the receive to the send buffers */
4958 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4959 comm->buf_int + cg*DD_CGIBS,
4960 DD_CGIBS*sizeof(int));
4961 memcpy(comm->cgcm_state[mc][nvr],
4962 comm->vbuf.v[buf_pos],
4963 (1+nrcg*nvec)*sizeof(rvec));
4964 buf_pos += 1 + nrcg*nvec;
4971 /* With sorting (!bCompact) the indices are now only partially up to date
4972 * and ncg_home and nat_home are not the real count, since there are
4973 * "holes" in the arrays for the charge groups that moved to neighbors.
4975 if (fr->cutoff_scheme == ecutsVERLET)
4977 moved = get_moved(comm,home_pos_cg);
4979 for(i=dd->ncg_home; i<home_pos_cg; i++)
4984 dd->ncg_home = home_pos_cg;
4985 dd->nat_home = home_pos_at;
4990 "Finished repartitioning: cgs moved out %d, new home %d\n",
4991 *ncg_moved,dd->ncg_home-*ncg_moved);
4996 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4998 dd->comm->cycl[ddCycl] += cycles;
4999 dd->comm->cycl_n[ddCycl]++;
5000 if (cycles > dd->comm->cycl_max[ddCycl])
5002 dd->comm->cycl_max[ddCycl] = cycles;
5006 static double force_flop_count(t_nrnb *nrnb)
5013 for(i=0; i<eNR_NBKERNEL_FREE_ENERGY; i++)
5015 /* To get closer to the real timings, we half the count
5016 * for the normal loops and again half it for water loops.
5019 if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5021 sum += nrnb->n[i]*0.25*cost_nrnb(i);
5025 sum += nrnb->n[i]*0.50*cost_nrnb(i);
5028 for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
5031 if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5032 sum += nrnb->n[i]*cost_nrnb(i);
5034 for(i=eNR_BONDS; i<=eNR_WALLS; i++)
5036 sum += nrnb->n[i]*cost_nrnb(i);
5042 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
5044 if (dd->comm->eFlop)
5046 dd->comm->flop -= force_flop_count(nrnb);
5049 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
5051 if (dd->comm->eFlop)
5053 dd->comm->flop += force_flop_count(nrnb);
5058 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5062 for(i=0; i<ddCyclNr; i++)
5064 dd->comm->cycl[i] = 0;
5065 dd->comm->cycl_n[i] = 0;
5066 dd->comm->cycl_max[i] = 0;
5069 dd->comm->flop_n = 0;
5072 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
5074 gmx_domdec_comm_t *comm;
5075 gmx_domdec_load_t *load;
5076 gmx_domdec_root_t *root=NULL;
5077 int d,dim,cid,i,pos;
5078 float cell_frac=0,sbuf[DD_NLOAD_MAX];
5083 fprintf(debug,"get_load_distribution start\n");
5086 wallcycle_start(wcycle,ewcDDCOMMLOAD);
5090 bSepPME = (dd->pme_nodeid >= 0);
5092 for(d=dd->ndim-1; d>=0; d--)
5095 /* Check if we participate in the communication in this dimension */
5096 if (d == dd->ndim-1 ||
5097 (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
5099 load = &comm->load[d];
5102 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5105 if (d == dd->ndim-1)
5107 sbuf[pos++] = dd_force_load(comm);
5108 sbuf[pos++] = sbuf[0];
5111 sbuf[pos++] = sbuf[0];
5112 sbuf[pos++] = cell_frac;
5115 sbuf[pos++] = comm->cell_f_max0[d];
5116 sbuf[pos++] = comm->cell_f_min1[d];
5121 sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5122 sbuf[pos++] = comm->cycl[ddCyclPME];
5127 sbuf[pos++] = comm->load[d+1].sum;
5128 sbuf[pos++] = comm->load[d+1].max;
5131 sbuf[pos++] = comm->load[d+1].sum_m;
5132 sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5133 sbuf[pos++] = comm->load[d+1].flags;
5136 sbuf[pos++] = comm->cell_f_max0[d];
5137 sbuf[pos++] = comm->cell_f_min1[d];
5142 sbuf[pos++] = comm->load[d+1].mdf;
5143 sbuf[pos++] = comm->load[d+1].pme;
5147 /* Communicate a row in DD direction d.
5148 * The communicators are setup such that the root always has rank 0.
5151 MPI_Gather(sbuf ,load->nload*sizeof(float),MPI_BYTE,
5152 load->load,load->nload*sizeof(float),MPI_BYTE,
5153 0,comm->mpi_comm_load[d]);
5155 if (dd->ci[dim] == dd->master_ci[dim])
5157 /* We are the root, process this row */
5158 if (comm->bDynLoadBal)
5160 root = comm->root[d];
5170 for(i=0; i<dd->nc[dim]; i++)
5172 load->sum += load->load[pos++];
5173 load->max = max(load->max,load->load[pos]);
5179 /* This direction could not be load balanced properly,
5180 * therefore we need to use the maximum iso the average load.
5182 load->sum_m = max(load->sum_m,load->load[pos]);
5186 load->sum_m += load->load[pos];
5189 load->cvol_min = min(load->cvol_min,load->load[pos]);
5193 load->flags = (int)(load->load[pos++] + 0.5);
5197 root->cell_f_max0[i] = load->load[pos++];
5198 root->cell_f_min1[i] = load->load[pos++];
5203 load->mdf = max(load->mdf,load->load[pos]);
5205 load->pme = max(load->pme,load->load[pos]);
5209 if (comm->bDynLoadBal && root->bLimited)
5211 load->sum_m *= dd->nc[dim];
5212 load->flags |= (1<<d);
5220 comm->nload += dd_load_count(comm);
5221 comm->load_step += comm->cycl[ddCyclStep];
5222 comm->load_sum += comm->load[0].sum;
5223 comm->load_max += comm->load[0].max;
5224 if (comm->bDynLoadBal)
5226 for(d=0; d<dd->ndim; d++)
5228 if (comm->load[0].flags & (1<<d))
5230 comm->load_lim[d]++;
5236 comm->load_mdf += comm->load[0].mdf;
5237 comm->load_pme += comm->load[0].pme;
5241 wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5245 fprintf(debug,"get_load_distribution finished\n");
5249 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5251 /* Return the relative performance loss on the total run time
5252 * due to the force calculation load imbalance.
5254 if (dd->comm->nload > 0)
5257 (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5258 (dd->comm->load_step*dd->nnodes);
5266 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5269 int npp,npme,nnodes,d,limp;
5270 float imbal,pme_f_ratio,lossf,lossp=0;
5272 gmx_domdec_comm_t *comm;
5275 if (DDMASTER(dd) && comm->nload > 0)
5278 npme = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5279 nnodes = npp + npme;
5280 imbal = comm->load_max*npp/comm->load_sum - 1;
5281 lossf = dd_force_imb_perf_loss(dd);
5282 sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5283 fprintf(fplog,"%s",buf);
5284 fprintf(stderr,"\n");
5285 fprintf(stderr,"%s",buf);
5286 sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5287 fprintf(fplog,"%s",buf);
5288 fprintf(stderr,"%s",buf);
5290 if (comm->bDynLoadBal)
5292 sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5293 for(d=0; d<dd->ndim; d++)
5295 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5296 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5302 sprintf(buf+strlen(buf),"\n");
5303 fprintf(fplog,"%s",buf);
5304 fprintf(stderr,"%s",buf);
5308 pme_f_ratio = comm->load_pme/comm->load_mdf;
5309 lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5312 lossp *= (float)npme/(float)nnodes;
5316 lossp *= (float)npp/(float)nnodes;
5318 sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5319 fprintf(fplog,"%s",buf);
5320 fprintf(stderr,"%s",buf);
5321 sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5322 fprintf(fplog,"%s",buf);
5323 fprintf(stderr,"%s",buf);
5325 fprintf(fplog,"\n");
5326 fprintf(stderr,"\n");
5328 if (lossf >= DD_PERF_LOSS)
5331 "NOTE: %.1f %% performance was lost due to load imbalance\n"
5332 " in the domain decomposition.\n",lossf*100);
5333 if (!comm->bDynLoadBal)
5335 sprintf(buf+strlen(buf)," You might want to use dynamic load balancing (option -dlb.)\n");
5339 sprintf(buf+strlen(buf)," You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5341 fprintf(fplog,"%s\n",buf);
5342 fprintf(stderr,"%s\n",buf);
5344 if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5347 "NOTE: %.1f %% performance was lost because the PME nodes\n"
5348 " had %s work to do than the PP nodes.\n"
5349 " You might want to %s the number of PME nodes\n"
5350 " or %s the cut-off and the grid spacing.\n",
5352 (lossp < 0) ? "less" : "more",
5353 (lossp < 0) ? "decrease" : "increase",
5354 (lossp < 0) ? "decrease" : "increase");
5355 fprintf(fplog,"%s\n",buf);
5356 fprintf(stderr,"%s\n",buf);
5361 static float dd_vol_min(gmx_domdec_t *dd)
5363 return dd->comm->load[0].cvol_min*dd->nnodes;
5366 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5368 return dd->comm->load[0].flags;
5371 static float dd_f_imbal(gmx_domdec_t *dd)
5373 return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5376 float dd_pme_f_ratio(gmx_domdec_t *dd)
5378 if (dd->comm->cycl_n[ddCyclPME] > 0)
5380 return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5388 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5393 flags = dd_load_flags(dd);
5397 "DD load balancing is limited by minimum cell size in dimension");
5398 for(d=0; d<dd->ndim; d++)
5402 fprintf(fplog," %c",dim2char(dd->dim[d]));
5405 fprintf(fplog,"\n");
5407 fprintf(fplog,"DD step %s",gmx_step_str(step,buf));
5408 if (dd->comm->bDynLoadBal)
5410 fprintf(fplog," vol min/aver %5.3f%c",
5411 dd_vol_min(dd),flags ? '!' : ' ');
5413 fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5414 if (dd->comm->cycl_n[ddCyclPME])
5416 fprintf(fplog," pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5418 fprintf(fplog,"\n\n");
5421 static void dd_print_load_verbose(gmx_domdec_t *dd)
5423 if (dd->comm->bDynLoadBal)
5425 fprintf(stderr,"vol %4.2f%c ",
5426 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5428 fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5429 if (dd->comm->cycl_n[ddCyclPME])
5431 fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5436 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
5441 gmx_domdec_root_t *root;
5442 gmx_bool bPartOfGroup = FALSE;
5444 dim = dd->dim[dim_ind];
5445 copy_ivec(loc,loc_c);
5446 for(i=0; i<dd->nc[dim]; i++)
5449 rank = dd_index(dd->nc,loc_c);
5450 if (rank == dd->rank)
5452 /* This process is part of the group */
5453 bPartOfGroup = TRUE;
5456 MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup?0:MPI_UNDEFINED, dd->rank,
5460 dd->comm->mpi_comm_load[dim_ind] = c_row;
5461 if (dd->comm->eDLB != edlbNO)
5463 if (dd->ci[dim] == dd->master_ci[dim])
5465 /* This is the root process of this row */
5466 snew(dd->comm->root[dim_ind],1);
5467 root = dd->comm->root[dim_ind];
5468 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5469 snew(root->old_cell_f,dd->nc[dim]+1);
5470 snew(root->bCellMin,dd->nc[dim]);
5473 snew(root->cell_f_max0,dd->nc[dim]);
5474 snew(root->cell_f_min1,dd->nc[dim]);
5475 snew(root->bound_min,dd->nc[dim]);
5476 snew(root->bound_max,dd->nc[dim]);
5478 snew(root->buf_ncd,dd->nc[dim]);
5482 /* This is not a root process, we only need to receive cell_f */
5483 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5486 if (dd->ci[dim] == dd->master_ci[dim])
5488 snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5494 static void make_load_communicators(gmx_domdec_t *dd)
5501 fprintf(debug,"Making load communicators\n");
5503 snew(dd->comm->load,dd->ndim);
5504 snew(dd->comm->mpi_comm_load,dd->ndim);
5507 make_load_communicator(dd,0,loc);
5510 for(i=0; i<dd->nc[dim0]; i++) {
5512 make_load_communicator(dd,1,loc);
5517 for(i=0; i<dd->nc[dim0]; i++) {
5520 for(j=0; j<dd->nc[dim1]; j++) {
5522 make_load_communicator(dd,2,loc);
5528 fprintf(debug,"Finished making load communicators\n");
5532 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5538 ivec dd_zp[DD_MAXIZONE];
5539 gmx_domdec_zones_t *zones;
5540 gmx_domdec_ns_ranges_t *izone;
5542 for(d=0; d<dd->ndim; d++)
5545 copy_ivec(dd->ci,tmp);
5546 tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5547 dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5548 copy_ivec(dd->ci,tmp);
5549 tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5550 dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5553 fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5556 dd->neighbor[d][1]);
5562 fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5563 dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5567 fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5569 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5570 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5577 for(i=0; i<nzonep; i++)
5579 copy_ivec(dd_zp3[i],dd_zp[i]);
5585 for(i=0; i<nzonep; i++)
5587 copy_ivec(dd_zp2[i],dd_zp[i]);
5593 for(i=0; i<nzonep; i++)
5595 copy_ivec(dd_zp1[i],dd_zp[i]);
5599 gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5604 zones = &dd->comm->zones;
5606 for(i=0; i<nzone; i++)
5609 clear_ivec(zones->shift[i]);
5610 for(d=0; d<dd->ndim; d++)
5612 zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5617 for(i=0; i<nzone; i++)
5619 for(d=0; d<DIM; d++)
5621 s[d] = dd->ci[d] - zones->shift[i][d];
5626 else if (s[d] >= dd->nc[d])
5632 zones->nizone = nzonep;
5633 for(i=0; i<zones->nizone; i++)
5635 if (dd_zp[i][0] != i)
5637 gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5639 izone = &zones->izone[i];
5640 izone->j0 = dd_zp[i][1];
5641 izone->j1 = dd_zp[i][2];
5642 for(dim=0; dim<DIM; dim++)
5644 if (dd->nc[dim] == 1)
5646 /* All shifts should be allowed */
5647 izone->shift0[dim] = -1;
5648 izone->shift1[dim] = 1;
5653 izone->shift0[d] = 0;
5654 izone->shift1[d] = 0;
5655 for(j=izone->j0; j<izone->j1; j++) {
5656 if (dd->shift[j][d] > dd->shift[i][d])
5657 izone->shift0[d] = -1;
5658 if (dd->shift[j][d] < dd->shift[i][d])
5659 izone->shift1[d] = 1;
5665 /* Assume the shift are not more than 1 cell */
5666 izone->shift0[dim] = 1;
5667 izone->shift1[dim] = -1;
5668 for(j=izone->j0; j<izone->j1; j++)
5670 shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5671 if (shift_diff < izone->shift0[dim])
5673 izone->shift0[dim] = shift_diff;
5675 if (shift_diff > izone->shift1[dim])
5677 izone->shift1[dim] = shift_diff;
5684 if (dd->comm->eDLB != edlbNO)
5686 snew(dd->comm->root,dd->ndim);
5689 if (dd->comm->bRecordLoad)
5691 make_load_communicators(dd);
5695 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5698 gmx_domdec_comm_t *comm;
5709 if (comm->bCartesianPP)
5711 /* Set up cartesian communication for the particle-particle part */
5714 fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5715 dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5718 for(i=0; i<DIM; i++)
5722 MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5724 /* We overwrite the old communicator with the new cartesian one */
5725 cr->mpi_comm_mygroup = comm_cart;
5728 dd->mpi_comm_all = cr->mpi_comm_mygroup;
5729 MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5731 if (comm->bCartesianPP_PME)
5733 /* Since we want to use the original cartesian setup for sim,
5734 * and not the one after split, we need to make an index.
5736 snew(comm->ddindex2ddnodeid,dd->nnodes);
5737 comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5738 gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5739 /* Get the rank of the DD master,
5740 * above we made sure that the master node is a PP node.
5750 MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5752 else if (comm->bCartesianPP)
5754 if (cr->npmenodes == 0)
5756 /* The PP communicator is also
5757 * the communicator for this simulation
5759 cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5761 cr->nodeid = dd->rank;
5763 MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5765 /* We need to make an index to go from the coordinates
5766 * to the nodeid of this simulation.
5768 snew(comm->ddindex2simnodeid,dd->nnodes);
5769 snew(buf,dd->nnodes);
5770 if (cr->duty & DUTY_PP)
5772 buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5774 /* Communicate the ddindex to simulation nodeid index */
5775 MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5776 cr->mpi_comm_mysim);
5779 /* Determine the master coordinates and rank.
5780 * The DD master should be the same node as the master of this sim.
5782 for(i=0; i<dd->nnodes; i++)
5784 if (comm->ddindex2simnodeid[i] == 0)
5786 ddindex2xyz(dd->nc,i,dd->master_ci);
5787 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5792 fprintf(debug,"The master rank is %d\n",dd->masterrank);
5797 /* No Cartesian communicators */
5798 /* We use the rank in dd->comm->all as DD index */
5799 ddindex2xyz(dd->nc,dd->rank,dd->ci);
5800 /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5802 clear_ivec(dd->master_ci);
5809 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5810 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5815 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5816 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5820 static void receive_ddindex2simnodeid(t_commrec *cr)
5824 gmx_domdec_comm_t *comm;
5831 if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5833 snew(comm->ddindex2simnodeid,dd->nnodes);
5834 snew(buf,dd->nnodes);
5835 if (cr->duty & DUTY_PP)
5837 buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5840 /* Communicate the ddindex to simulation nodeid index */
5841 MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5842 cr->mpi_comm_mysim);
5849 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5852 gmx_domdec_master_t *ma;
5857 snew(ma->ncg,dd->nnodes);
5858 snew(ma->index,dd->nnodes+1);
5860 snew(ma->nat,dd->nnodes);
5861 snew(ma->ibuf,dd->nnodes*2);
5862 snew(ma->cell_x,DIM);
5863 for(i=0; i<DIM; i++)
5865 snew(ma->cell_x[i],dd->nc[i]+1);
5868 if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5874 snew(ma->vbuf,natoms);
5880 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5884 gmx_domdec_comm_t *comm;
5895 if (comm->bCartesianPP)
5897 for(i=1; i<DIM; i++)
5899 bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5901 if (bDiv[YY] || bDiv[ZZ])
5903 comm->bCartesianPP_PME = TRUE;
5904 /* If we have 2D PME decomposition, which is always in x+y,
5905 * we stack the PME only nodes in z.
5906 * Otherwise we choose the direction that provides the thinnest slab
5907 * of PME only nodes as this will have the least effect
5908 * on the PP communication.
5909 * But for the PME communication the opposite might be better.
5911 if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5913 dd->nc[YY] > dd->nc[ZZ]))
5915 comm->cartpmedim = ZZ;
5919 comm->cartpmedim = YY;
5921 comm->ntot[comm->cartpmedim]
5922 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5926 fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5928 "Will not use a Cartesian communicator for PP <-> PME\n\n");
5933 if (comm->bCartesianPP_PME)
5937 fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5940 for(i=0; i<DIM; i++)
5944 MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5947 MPI_Comm_rank(comm_cart,&rank);
5948 if (MASTERNODE(cr) && rank != 0)
5950 gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5953 /* With this assigment we loose the link to the original communicator
5954 * which will usually be MPI_COMM_WORLD, unless have multisim.
5956 cr->mpi_comm_mysim = comm_cart;
5957 cr->sim_nodeid = rank;
5959 MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5963 fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5964 cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5967 if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5971 if (cr->npmenodes == 0 ||
5972 dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5974 cr->duty = DUTY_PME;
5977 /* Split the sim communicator into PP and PME only nodes */
5978 MPI_Comm_split(cr->mpi_comm_mysim,
5980 dd_index(comm->ntot,dd->ci),
5981 &cr->mpi_comm_mygroup);
5985 switch (dd_node_order)
5990 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5993 case ddnoINTERLEAVE:
5994 /* Interleave the PP-only and PME-only nodes,
5995 * as on clusters with dual-core machines this will double
5996 * the communication bandwidth of the PME processes
5997 * and thus speed up the PP <-> PME and inter PME communication.
6001 fprintf(fplog,"Interleaving PP and PME nodes\n");
6003 comm->pmenodes = dd_pmenodes(cr);
6008 gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
6011 if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
6013 cr->duty = DUTY_PME;
6020 /* Split the sim communicator into PP and PME only nodes */
6021 MPI_Comm_split(cr->mpi_comm_mysim,
6024 &cr->mpi_comm_mygroup);
6025 MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
6031 fprintf(fplog,"This is a %s only node\n\n",
6032 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6036 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
6039 gmx_domdec_comm_t *comm;
6045 copy_ivec(dd->nc,comm->ntot);
6047 comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
6048 comm->bCartesianPP_PME = FALSE;
6050 /* Reorder the nodes by default. This might change the MPI ranks.
6051 * Real reordering is only supported on very few architectures,
6052 * Blue Gene is one of them.
6054 CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6056 if (cr->npmenodes > 0)
6058 /* Split the communicator into a PP and PME part */
6059 split_communicator(fplog,cr,dd_node_order,CartReorder);
6060 if (comm->bCartesianPP_PME)
6062 /* We (possibly) reordered the nodes in split_communicator,
6063 * so it is no longer required in make_pp_communicator.
6065 CartReorder = FALSE;
6070 /* All nodes do PP and PME */
6072 /* We do not require separate communicators */
6073 cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6077 if (cr->duty & DUTY_PP)
6079 /* Copy or make a new PP communicator */
6080 make_pp_communicator(fplog,cr,CartReorder);
6084 receive_ddindex2simnodeid(cr);
6087 if (!(cr->duty & DUTY_PME))
6089 /* Set up the commnuication to our PME node */
6090 dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
6091 dd->pme_receive_vir_ener = receive_vir_ener(cr);
6094 fprintf(debug,"My pme_nodeid %d receive ener %d\n",
6095 dd->pme_nodeid,dd->pme_receive_vir_ener);
6100 dd->pme_nodeid = -1;
6105 dd->ma = init_gmx_domdec_master_t(dd,
6107 comm->cgs_gl.index[comm->cgs_gl.nr]);
6111 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
6118 if (nc > 1 && size_string != NULL)
6122 fprintf(fplog,"Using static load balancing for the %s direction\n",
6127 for (i=0; i<nc; i++)
6130 sscanf(size_string,"%lf%n",&dbl,&n);
6133 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
6142 fprintf(fplog,"Relative cell sizes:");
6144 for (i=0; i<nc; i++)
6149 fprintf(fplog," %5.3f",slb_frac[i]);
6154 fprintf(fplog,"\n");
6161 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6164 gmx_mtop_ilistloop_t iloop;
6168 iloop = gmx_mtop_ilistloop_init(mtop);
6169 while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
6171 for(ftype=0; ftype<F_NRE; ftype++)
6173 if ((interaction_function[ftype].flags & IF_BOND) &&
6176 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6184 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
6190 val = getenv(env_var);
6193 if (sscanf(val,"%d",&nst) <= 0)
6199 fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
6207 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
6211 fprintf(stderr,"\n%s\n",warn_string);
6215 fprintf(fplog,"\n%s\n",warn_string);
6219 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
6220 t_inputrec *ir,FILE *fplog)
6222 if (ir->ePBC == epbcSCREW &&
6223 (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6225 gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6228 if (ir->ns_type == ensSIMPLE)
6230 gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6233 if (ir->nstlist == 0)
6235 gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6238 if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6240 dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6244 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6249 r = ddbox->box_size[XX];
6250 for(di=0; di<dd->ndim; di++)
6253 /* Check using the initial average cell size */
6254 r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6260 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6261 const char *dlb_opt,gmx_bool bRecordLoad,
6262 unsigned long Flags,t_inputrec *ir)
6270 case 'a': eDLB = edlbAUTO; break;
6271 case 'n': eDLB = edlbNO; break;
6272 case 'y': eDLB = edlbYES; break;
6273 default: gmx_incons("Unknown dlb_opt");
6276 if (Flags & MD_RERUN)
6281 if (!EI_DYNAMICS(ir->eI))
6283 if (eDLB == edlbYES)
6285 sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6286 dd_warning(cr,fplog,buf);
6294 dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6299 if (Flags & MD_REPRODUCIBLE)
6306 dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6310 dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6313 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6321 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6326 if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6328 /* Decomposition order z,y,x */
6331 fprintf(fplog,"Using domain decomposition order z, y, x\n");
6333 for(dim=DIM-1; dim>=0; dim--)
6335 if (dd->nc[dim] > 1)
6337 dd->dim[dd->ndim++] = dim;
6343 /* Decomposition order x,y,z */
6344 for(dim=0; dim<DIM; dim++)
6346 if (dd->nc[dim] > 1)
6348 dd->dim[dd->ndim++] = dim;
6354 static gmx_domdec_comm_t *init_dd_comm()
6356 gmx_domdec_comm_t *comm;
6360 snew(comm->cggl_flag,DIM*2);
6361 snew(comm->cgcm_state,DIM*2);
6362 for(i=0; i<DIM*2; i++)
6364 comm->cggl_flag_nalloc[i] = 0;
6365 comm->cgcm_state_nalloc[i] = 0;
6368 comm->nalloc_int = 0;
6369 comm->buf_int = NULL;
6371 vec_rvec_init(&comm->vbuf);
6373 comm->n_load_have = 0;
6374 comm->n_load_collect = 0;
6376 for(i=0; i<ddnatNR-ddnatZONE; i++)
6378 comm->sum_nat[i] = 0;
6382 comm->load_step = 0;
6385 clear_ivec(comm->load_lim);
6392 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6393 unsigned long Flags,
6395 real comm_distance_min,real rconstr,
6396 const char *dlb_opt,real dlb_scale,
6397 const char *sizex,const char *sizey,const char *sizez,
6398 gmx_mtop_t *mtop,t_inputrec *ir,
6401 int *npme_x,int *npme_y)
6404 gmx_domdec_comm_t *comm;
6407 real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6414 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6419 dd->comm = init_dd_comm();
6421 snew(comm->cggl_flag,DIM*2);
6422 snew(comm->cgcm_state,DIM*2);
6424 dd->npbcdim = ePBC2npbcdim(ir->ePBC);
6425 dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6427 dd->bSendRecv2 = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6428 comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6429 comm->eFlop = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6430 recload = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6431 comm->nstSortCG = dd_nst_env(fplog,"GMX_DD_SORT",1);
6432 comm->nstDDDump = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6433 comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6434 comm->DD_debug = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6436 dd->pme_recv_f_alloc = 0;
6437 dd->pme_recv_f_buf = NULL;
6439 if (dd->bSendRecv2 && fplog)
6441 fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6447 fprintf(fplog,"Will load balance based on FLOP count\n");
6449 if (comm->eFlop > 1)
6451 srand(1+cr->nodeid);
6453 comm->bRecordLoad = TRUE;
6457 comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6461 comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6463 comm->bDynLoadBal = (comm->eDLB == edlbYES);
6466 fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6468 dd->bGridJump = comm->bDynLoadBal;
6470 if (comm->nstSortCG)
6474 if (comm->nstSortCG == 1)
6476 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6480 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6490 fprintf(fplog,"Will not sort the charge groups\n");
6494 comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6496 comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6497 if (comm->bInterCGBondeds)
6499 comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6503 comm->bInterCGMultiBody = FALSE;
6506 dd->bInterCGcons = inter_charge_group_constraints(mtop);
6507 dd->bInterCGsettles = inter_charge_group_settles(mtop);
6509 if (ir->rlistlong == 0)
6511 /* Set the cut-off to some very large value,
6512 * so we don't need if statements everywhere in the code.
6513 * We use sqrt, since the cut-off is squared in some places.
6515 comm->cutoff = GMX_CUTOFF_INF;
6519 comm->cutoff = ir->rlistlong;
6521 comm->cutoff_mbody = 0;
6523 comm->cellsize_limit = 0;
6524 comm->bBondComm = FALSE;
6526 if (comm->bInterCGBondeds)
6528 if (comm_distance_min > 0)
6530 comm->cutoff_mbody = comm_distance_min;
6531 if (Flags & MD_DDBONDCOMM)
6533 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6537 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6539 r_bonded_limit = comm->cutoff_mbody;
6541 else if (ir->bPeriodicMols)
6543 /* Can not easily determine the required cut-off */
6544 dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6545 comm->cutoff_mbody = comm->cutoff/2;
6546 r_bonded_limit = comm->cutoff_mbody;
6552 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6553 Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6555 gmx_bcast(sizeof(r_2b),&r_2b,cr);
6556 gmx_bcast(sizeof(r_mb),&r_mb,cr);
6558 /* We use an initial margin of 10% for the minimum cell size,
6559 * except when we are just below the non-bonded cut-off.
6561 if (Flags & MD_DDBONDCOMM)
6563 if (max(r_2b,r_mb) > comm->cutoff)
6565 r_bonded = max(r_2b,r_mb);
6566 r_bonded_limit = 1.1*r_bonded;
6567 comm->bBondComm = TRUE;
6572 r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6574 /* We determine cutoff_mbody later */
6578 /* No special bonded communication,
6579 * simply increase the DD cut-off.
6581 r_bonded_limit = 1.1*max(r_2b,r_mb);
6582 comm->cutoff_mbody = r_bonded_limit;
6583 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6586 comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6590 "Minimum cell size due to bonded interactions: %.3f nm\n",
6591 comm->cellsize_limit);
6595 if (dd->bInterCGcons && rconstr <= 0)
6597 /* There is a cell size limit due to the constraints (P-LINCS) */
6598 rconstr = constr_r_max(fplog,mtop,ir);
6602 "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6604 if (rconstr > comm->cellsize_limit)
6606 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6610 else if (rconstr > 0 && fplog)
6612 /* Here we do not check for dd->bInterCGcons,
6613 * because one can also set a cell size limit for virtual sites only
6614 * and at this point we don't know yet if there are intercg v-sites.
6617 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6620 comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6622 comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6626 copy_ivec(nc,dd->nc);
6627 set_dd_dim(fplog,dd);
6628 set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6630 if (cr->npmenodes == -1)
6634 acs = average_cellsize_min(dd,ddbox);
6635 if (acs < comm->cellsize_limit)
6639 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6641 gmx_fatal_collective(FARGS,cr,NULL,
6642 "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6643 acs,comm->cellsize_limit);
6648 set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6650 /* We need to choose the optimal DD grid and possibly PME nodes */
6651 limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6652 comm->eDLB!=edlbNO,dlb_scale,
6653 comm->cellsize_limit,comm->cutoff,
6654 comm->bInterCGBondeds,comm->bInterCGMultiBody);
6656 if (dd->nc[XX] == 0)
6658 bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6659 sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6660 !bC ? "-rdd" : "-rcon",
6661 comm->eDLB!=edlbNO ? " or -dds" : "",
6662 bC ? " or your LINCS settings" : "");
6664 gmx_fatal_collective(FARGS,cr,NULL,
6665 "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6667 "Look in the log file for details on the domain decomposition",
6668 cr->nnodes-cr->npmenodes,limit,buf);
6670 set_dd_dim(fplog,dd);
6676 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6677 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6680 dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6681 if (cr->nnodes - dd->nnodes != cr->npmenodes)
6683 gmx_fatal_collective(FARGS,cr,NULL,
6684 "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6685 dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6687 if (cr->npmenodes > dd->nnodes)
6689 gmx_fatal_collective(FARGS,cr,NULL,
6690 "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6692 if (cr->npmenodes > 0)
6694 comm->npmenodes = cr->npmenodes;
6698 comm->npmenodes = dd->nnodes;
6701 if (EEL_PME(ir->coulombtype))
6703 /* The following choices should match those
6704 * in comm_cost_est in domdec_setup.c.
6705 * Note that here the checks have to take into account
6706 * that the decomposition might occur in a different order than xyz
6707 * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6708 * in which case they will not match those in comm_cost_est,
6709 * but since that is mainly for testing purposes that's fine.
6711 if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6712 comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6713 getenv("GMX_PMEONEDD") == NULL)
6715 comm->npmedecompdim = 2;
6716 comm->npmenodes_x = dd->nc[XX];
6717 comm->npmenodes_y = comm->npmenodes/comm->npmenodes_x;
6721 /* In case nc is 1 in both x and y we could still choose to
6722 * decompose pme in y instead of x, but we use x for simplicity.
6724 comm->npmedecompdim = 1;
6725 if (dd->dim[0] == YY)
6727 comm->npmenodes_x = 1;
6728 comm->npmenodes_y = comm->npmenodes;
6732 comm->npmenodes_x = comm->npmenodes;
6733 comm->npmenodes_y = 1;
6738 fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6739 comm->npmenodes_x,comm->npmenodes_y,1);
6744 comm->npmedecompdim = 0;
6745 comm->npmenodes_x = 0;
6746 comm->npmenodes_y = 0;
6749 /* Technically we don't need both of these,
6750 * but it simplifies code not having to recalculate it.
6752 *npme_x = comm->npmenodes_x;
6753 *npme_y = comm->npmenodes_y;
6755 snew(comm->slb_frac,DIM);
6756 if (comm->eDLB == edlbNO)
6758 comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6759 comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6760 comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6763 if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6765 if (comm->bBondComm || comm->eDLB != edlbNO)
6767 /* Set the bonded communication distance to halfway
6768 * the minimum and the maximum,
6769 * since the extra communication cost is nearly zero.
6771 acs = average_cellsize_min(dd,ddbox);
6772 comm->cutoff_mbody = 0.5*(r_bonded + acs);
6773 if (comm->eDLB != edlbNO)
6775 /* Check if this does not limit the scaling */
6776 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6778 if (!comm->bBondComm)
6780 /* Without bBondComm do not go beyond the n.b. cut-off */
6781 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6782 if (comm->cellsize_limit >= comm->cutoff)
6784 /* We don't loose a lot of efficieny
6785 * when increasing it to the n.b. cut-off.
6786 * It can even be slightly faster, because we need
6787 * less checks for the communication setup.
6789 comm->cutoff_mbody = comm->cutoff;
6792 /* Check if we did not end up below our original limit */
6793 comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6795 if (comm->cutoff_mbody > comm->cellsize_limit)
6797 comm->cellsize_limit = comm->cutoff_mbody;
6800 /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6805 fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6806 "cellsize limit %f\n",
6807 comm->bBondComm,comm->cellsize_limit);
6812 check_dd_restrictions(cr,dd,ir,fplog);
6815 comm->partition_step = INT_MIN;
6818 clear_dd_cycle_counts(dd);
6823 static void set_dlb_limits(gmx_domdec_t *dd)
6828 for(d=0; d<dd->ndim; d++)
6830 dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6831 dd->comm->cellsize_min[dd->dim[d]] =
6832 dd->comm->cellsize_min_dlb[dd->dim[d]];
6837 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6840 gmx_domdec_comm_t *comm;
6850 fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6853 cellsize_min = comm->cellsize_min[dd->dim[0]];
6854 for(d=1; d<dd->ndim; d++)
6856 cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6859 if (cellsize_min < comm->cellsize_limit*1.05)
6861 dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6863 /* Change DLB from "auto" to "no". */
6864 comm->eDLB = edlbNO;
6869 dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6870 comm->bDynLoadBal = TRUE;
6871 dd->bGridJump = TRUE;
6875 /* We can set the required cell size info here,
6876 * so we do not need to communicate this.
6877 * The grid is completely uniform.
6879 for(d=0; d<dd->ndim; d++)
6883 comm->load[d].sum_m = comm->load[d].sum;
6885 nc = dd->nc[dd->dim[d]];
6888 comm->root[d]->cell_f[i] = i/(real)nc;
6891 comm->root[d]->cell_f_max0[i] = i /(real)nc;
6892 comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6895 comm->root[d]->cell_f[nc] = 1.0;
6900 static char *init_bLocalCG(gmx_mtop_t *mtop)
6905 ncg = ncg_mtop(mtop);
6907 for(cg=0; cg<ncg; cg++)
6909 bLocalCG[cg] = FALSE;
6915 void dd_init_bondeds(FILE *fplog,
6916 gmx_domdec_t *dd,gmx_mtop_t *mtop,
6917 gmx_vsite_t *vsite,gmx_constr_t constr,
6918 t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6920 gmx_domdec_comm_t *comm;
6924 dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6928 if (comm->bBondComm)
6930 /* Communicate atoms beyond the cut-off for bonded interactions */
6933 comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6935 comm->bLocalCG = init_bLocalCG(mtop);
6939 /* Only communicate atoms based on cut-off */
6940 comm->cglink = NULL;
6941 comm->bLocalCG = NULL;
6945 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6947 gmx_bool bDynLoadBal,real dlb_scale,
6950 gmx_domdec_comm_t *comm;
6965 fprintf(fplog,"The maximum number of communication pulses is:");
6966 for(d=0; d<dd->ndim; d++)
6968 fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6970 fprintf(fplog,"\n");
6971 fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6972 fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6973 fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6974 for(d=0; d<DIM; d++)
6978 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6985 comm->cellsize_min_dlb[d]/
6986 (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6988 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6991 fprintf(fplog,"\n");
6995 set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6996 fprintf(fplog,"The initial number of communication pulses is:");
6997 for(d=0; d<dd->ndim; d++)
6999 fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
7001 fprintf(fplog,"\n");
7002 fprintf(fplog,"The initial domain decomposition cell size is:");
7003 for(d=0; d<DIM; d++) {
7006 fprintf(fplog," %c %.2f nm",
7007 dim2char(d),dd->comm->cellsize_min[d]);
7010 fprintf(fplog,"\n\n");
7013 if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7015 fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
7016 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7017 "non-bonded interactions","",comm->cutoff);
7021 limit = dd->comm->cellsize_limit;
7025 if (dynamic_dd_box(ddbox,ir))
7027 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
7029 limit = dd->comm->cellsize_min[XX];
7030 for(d=1; d<DIM; d++)
7032 limit = min(limit,dd->comm->cellsize_min[d]);
7036 if (comm->bInterCGBondeds)
7038 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7039 "two-body bonded interactions","(-rdd)",
7040 max(comm->cutoff,comm->cutoff_mbody));
7041 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7042 "multi-body bonded interactions","(-rdd)",
7043 (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
7047 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7048 "virtual site constructions","(-rcon)",limit);
7050 if (dd->constraint_comm)
7052 sprintf(buf,"atoms separated by up to %d constraints",
7054 fprintf(fplog,"%40s %-7s %6.3f nm\n",
7055 buf,"(-rcon)",limit);
7057 fprintf(fplog,"\n");
7063 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7065 const t_inputrec *ir,
7066 const gmx_ddbox_t *ddbox)
7068 gmx_domdec_comm_t *comm;
7069 int d,dim,npulse,npulse_d_max,npulse_d;
7074 bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7076 /* Determine the maximum number of comm. pulses in one dimension */
7078 comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7080 /* Determine the maximum required number of grid pulses */
7081 if (comm->cellsize_limit >= comm->cutoff)
7083 /* Only a single pulse is required */
7086 else if (!bNoCutOff && comm->cellsize_limit > 0)
7088 /* We round down slightly here to avoid overhead due to the latency
7089 * of extra communication calls when the cut-off
7090 * would be only slightly longer than the cell size.
7091 * Later cellsize_limit is redetermined,
7092 * so we can not miss interactions due to this rounding.
7094 npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7098 /* There is no cell size limit */
7099 npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
7102 if (!bNoCutOff && npulse > 1)
7104 /* See if we can do with less pulses, based on dlb_scale */
7106 for(d=0; d<dd->ndim; d++)
7109 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7110 /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7111 npulse_d_max = max(npulse_d_max,npulse_d);
7113 npulse = min(npulse,npulse_d_max);
7116 /* This env var can override npulse */
7117 d = dd_nst_env(debug,"GMX_DD_NPULSE",0);
7124 comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7125 for(d=0; d<dd->ndim; d++)
7127 comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
7128 comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7129 snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
7130 comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
7131 if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7133 comm->bVacDLBNoLimit = FALSE;
7137 /* cellsize_limit is set for LINCS in init_domain_decomposition */
7138 if (!comm->bVacDLBNoLimit)
7140 comm->cellsize_limit = max(comm->cellsize_limit,
7141 comm->cutoff/comm->maxpulse);
7143 comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7144 /* Set the minimum cell size for each DD dimension */
7145 for(d=0; d<dd->ndim; d++)
7147 if (comm->bVacDLBNoLimit ||
7148 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7150 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7154 comm->cellsize_min_dlb[dd->dim[d]] =
7155 comm->cutoff/comm->cd[d].np_dlb;
7158 if (comm->cutoff_mbody <= 0)
7160 comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
7162 if (comm->bDynLoadBal)
7168 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC)
7170 /* If each molecule is a single charge group
7171 * or we use domain decomposition for each periodic dimension,
7172 * we do not need to take pbc into account for the bonded interactions.
7174 return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7177 (dd->nc[ZZ]>1 || ePBC==epbcXY)));
7180 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
7181 t_inputrec *ir,t_forcerec *fr,
7184 gmx_domdec_comm_t *comm;
7190 /* Initialize the thread data.
7191 * This can not be done in init_domain_decomposition,
7192 * as the numbers of threads is determined later.
7194 comm->nth = gmx_omp_nthreads_get(emntDomdec);
7197 snew(comm->dth,comm->nth);
7200 if (EEL_PME(ir->coulombtype))
7202 init_ddpme(dd,&comm->ddpme[0],0);
7203 if (comm->npmedecompdim >= 2)
7205 init_ddpme(dd,&comm->ddpme[1],1);
7210 comm->npmenodes = 0;
7211 if (dd->pme_nodeid >= 0)
7213 gmx_fatal_collective(FARGS,NULL,dd,
7214 "Can not have separate PME nodes without PME electrostatics");
7220 fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
7222 if (comm->eDLB != edlbNO)
7224 set_cell_limits_dlb(dd,dlb_scale,ir,ddbox);
7227 print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
7228 if (comm->eDLB == edlbAUTO)
7232 fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
7234 print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
7237 if (ir->ePBC == epbcNONE)
7239 vol_frac = 1 - 1/(double)dd->nnodes;
7244 (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7248 fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7250 natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7252 dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7255 gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
7266 set_ddbox(dd,FALSE,cr,ir,state->box,
7267 TRUE,&dd->comm->cgs_gl,state->x,&ddbox);
7271 for(d=0; d<dd->ndim; d++)
7275 inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7276 if (dynamic_dd_box(&ddbox,ir))
7278 inv_cell_size *= DD_PRES_SCALE_MARGIN;
7281 np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7283 if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7284 dd->comm->cd[d].np_dlb > 0)
7286 if (np > dd->comm->cd[d].np_dlb)
7291 /* If a current local cell size is smaller than the requested
7292 * cut-off, we could still fix it, but this gets very complicated.
7293 * Without fixing here, we might actually need more checks.
7295 if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7302 if (dd->comm->eDLB != edlbNO)
7304 /* If DLB is not active yet, we don't need to check the grid jumps.
7305 * Actually we shouldn't, because then the grid jump data is not set.
7307 if (dd->comm->bDynLoadBal &&
7308 check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE))
7313 gmx_sumi(1,&LocallyLimited,cr);
7315 if (LocallyLimited > 0)
7321 dd->comm->cutoff = cutoff_req;
7326 static void merge_cg_buffers(int ncell,
7327 gmx_domdec_comm_dim_t *cd, int pulse,
7329 int *index_gl, int *recv_i,
7330 rvec *cg_cm, rvec *recv_vr,
7332 cginfo_mb_t *cginfo_mb,int *cginfo)
7334 gmx_domdec_ind_t *ind,*ind_p;
7335 int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7338 ind = &cd->ind[pulse];
7340 /* First correct the already stored data */
7341 shift = ind->nrecv[ncell];
7342 for(cell=ncell-1; cell>=0; cell--)
7344 shift -= ind->nrecv[cell];
7347 /* Move the cg's present from previous grid pulses */
7348 cg0 = ncg_cell[ncell+cell];
7349 cg1 = ncg_cell[ncell+cell+1];
7350 cgindex[cg1+shift] = cgindex[cg1];
7351 for(cg=cg1-1; cg>=cg0; cg--)
7353 index_gl[cg+shift] = index_gl[cg];
7354 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7355 cgindex[cg+shift] = cgindex[cg];
7356 cginfo[cg+shift] = cginfo[cg];
7358 /* Correct the already stored send indices for the shift */
7359 for(p=1; p<=pulse; p++)
7361 ind_p = &cd->ind[p];
7363 for(c=0; c<cell; c++)
7365 cg0 += ind_p->nsend[c];
7367 cg1 = cg0 + ind_p->nsend[cell];
7368 for(cg=cg0; cg<cg1; cg++)
7370 ind_p->index[cg] += shift;
7376 /* Merge in the communicated buffers */
7380 for(cell=0; cell<ncell; cell++)
7382 cg1 = ncg_cell[ncell+cell+1] + shift;
7385 /* Correct the old cg indices */
7386 for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7388 cgindex[cg+1] += shift_at;
7391 for(cg=0; cg<ind->nrecv[cell]; cg++)
7393 /* Copy this charge group from the buffer */
7394 index_gl[cg1] = recv_i[cg0];
7395 copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7396 /* Add it to the cgindex */
7397 cg_gl = index_gl[cg1];
7398 cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7399 nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7400 cgindex[cg1+1] = cgindex[cg1] + nat;
7405 shift += ind->nrecv[cell];
7406 ncg_cell[ncell+cell+1] = cg1;
7410 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7411 int nzone,int cg0,const int *cgindex)
7415 /* Store the atom block boundaries for easy copying of communication buffers
7418 for(zone=0; zone<nzone; zone++)
7420 for(p=0; p<cd->np; p++) {
7421 cd->ind[p].cell2at0[zone] = cgindex[cg];
7422 cg += cd->ind[p].nrecv[zone];
7423 cd->ind[p].cell2at1[zone] = cgindex[cg];
7428 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7434 for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7436 if (!bLocalCG[link->a[i]])
7445 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7447 real c[DIM][4]; /* the corners for the non-bonded communication */
7448 real cr0; /* corner for rounding */
7449 real cr1[4]; /* corners for rounding */
7450 real bc[DIM]; /* corners for bounded communication */
7451 real bcr1; /* corner for rounding for bonded communication */
7454 /* Determine the corners of the domain(s) we are communicating with */
7456 set_dd_corners(const gmx_domdec_t *dd,
7457 int dim0, int dim1, int dim2,
7461 const gmx_domdec_comm_t *comm;
7462 const gmx_domdec_zones_t *zones;
7467 zones = &comm->zones;
7469 /* Keep the compiler happy */
7473 /* The first dimension is equal for all cells */
7474 c->c[0][0] = comm->cell_x0[dim0];
7477 c->bc[0] = c->c[0][0];
7482 /* This cell row is only seen from the first row */
7483 c->c[1][0] = comm->cell_x0[dim1];
7484 /* All rows can see this row */
7485 c->c[1][1] = comm->cell_x0[dim1];
7488 c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7491 /* For the multi-body distance we need the maximum */
7492 c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7495 /* Set the upper-right corner for rounding */
7496 c->cr0 = comm->cell_x1[dim0];
7503 c->c[2][j] = comm->cell_x0[dim2];
7507 /* Use the maximum of the i-cells that see a j-cell */
7508 for(i=0; i<zones->nizone; i++)
7510 for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7516 comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7522 /* For the multi-body distance we need the maximum */
7523 c->bc[2] = comm->cell_x0[dim2];
7528 c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0);
7534 /* Set the upper-right corner for rounding */
7535 /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7536 * Only cell (0,0,0) can see cell 7 (1,1,1)
7538 c->cr1[0] = comm->cell_x1[dim1];
7539 c->cr1[3] = comm->cell_x1[dim1];
7542 c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1);
7545 /* For the multi-body distance we need the maximum */
7546 c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1);
7553 /* Determine which cg's we need to send in this pulse from this zone */
7555 get_zone_pulse_cgs(gmx_domdec_t *dd,
7556 int zonei, int zone,
7558 const int *index_gl,
7560 int dim, int dim_ind,
7561 int dim0, int dim1, int dim2,
7562 real r_comm2, real r_bcomm2,
7566 real skew_fac2_d, real skew_fac_01,
7567 rvec *v_d, rvec *v_0, rvec *v_1,
7568 const dd_corners_t *c,
7570 gmx_bool bDistBonded,
7576 gmx_domdec_ind_t *ind,
7577 int **ibuf, int *ibuf_nalloc,
7583 gmx_domdec_comm_t *comm;
7585 gmx_bool bDistMB_pulse;
7587 real r2,rb2,r,tric_sh;
7590 int nsend_z,nsend,nat;
7594 bScrew = (dd->bScrewPBC && dim == XX);
7596 bDistMB_pulse = (bDistMB && bDistBonded);
7602 for(cg=cg0; cg<cg1; cg++)
7606 if (tric_dist[dim_ind] == 0)
7608 /* Rectangular direction, easy */
7609 r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7616 r = cg_cm[cg][dim] - c->bc[dim_ind];
7622 /* Rounding gives at most a 16% reduction
7623 * in communicated atoms
7625 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7627 r = cg_cm[cg][dim0] - c->cr0;
7628 /* This is the first dimension, so always r >= 0 */
7635 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7637 r = cg_cm[cg][dim1] - c->cr1[zone];
7644 r = cg_cm[cg][dim1] - c->bcr1;
7654 /* Triclinic direction, more complicated */
7657 /* Rounding, conservative as the skew_fac multiplication
7658 * will slightly underestimate the distance.
7660 if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7662 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7663 for(i=dim0+1; i<DIM; i++)
7665 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7667 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7670 rb[dim0] = rn[dim0];
7673 /* Take care that the cell planes along dim0 might not
7674 * be orthogonal to those along dim1 and dim2.
7676 for(i=1; i<=dim_ind; i++)
7679 if (normal[dim0][dimd] > 0)
7681 rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7684 rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7689 if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7691 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7693 for(i=dim1+1; i<DIM; i++)
7695 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7697 rn[dim1] += tric_sh;
7700 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7701 /* Take care of coupling of the distances
7702 * to the planes along dim0 and dim1 through dim2.
7704 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7705 /* Take care that the cell planes along dim1
7706 * might not be orthogonal to that along dim2.
7708 if (normal[dim1][dim2] > 0)
7710 rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7716 cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7719 rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7720 /* Take care of coupling of the distances
7721 * to the planes along dim0 and dim1 through dim2.
7723 rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7724 /* Take care that the cell planes along dim1
7725 * might not be orthogonal to that along dim2.
7727 if (normal[dim1][dim2] > 0)
7729 rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7734 /* The distance along the communication direction */
7735 rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7737 for(i=dim+1; i<DIM; i++)
7739 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7744 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7745 /* Take care of coupling of the distances
7746 * to the planes along dim0 and dim1 through dim2.
7748 if (dim_ind == 1 && zonei == 1)
7750 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7756 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7759 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7760 /* Take care of coupling of the distances
7761 * to the planes along dim0 and dim1 through dim2.
7763 if (dim_ind == 1 && zonei == 1)
7765 rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7773 ((bDistMB && rb2 < r_bcomm2) ||
7774 (bDist2B && r2 < r_bcomm2)) &&
7776 (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7777 missing_link(comm->cglink,index_gl[cg],
7780 /* Make an index to the local charge groups */
7781 if (nsend+1 > ind->nalloc)
7783 ind->nalloc = over_alloc_large(nsend+1);
7784 srenew(ind->index,ind->nalloc);
7786 if (nsend+1 > *ibuf_nalloc)
7788 *ibuf_nalloc = over_alloc_large(nsend+1);
7789 srenew(*ibuf,*ibuf_nalloc);
7791 ind->index[nsend] = cg;
7792 (*ibuf)[nsend] = index_gl[cg];
7794 vec_rvec_check_alloc(vbuf,nsend+1);
7796 if (dd->ci[dim] == 0)
7798 /* Correct cg_cm for pbc */
7799 rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]);
7802 vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7803 vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7808 copy_rvec(cg_cm[cg],vbuf->v[nsend]);
7811 nat += cgindex[cg+1] - cgindex[cg];
7817 *nsend_z_ptr = nsend_z;
7820 static void setup_dd_communication(gmx_domdec_t *dd,
7821 matrix box,gmx_ddbox_t *ddbox,
7822 t_forcerec *fr,t_state *state,rvec **f)
7824 int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot;
7825 int nzone,nzone_send,zone,zonei,cg0,cg1;
7826 int c,i,j,cg,cg_gl,nrcg;
7827 int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7828 gmx_domdec_comm_t *comm;
7829 gmx_domdec_zones_t *zones;
7830 gmx_domdec_comm_dim_t *cd;
7831 gmx_domdec_ind_t *ind;
7832 cginfo_mb_t *cginfo_mb;
7833 gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded;
7834 real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg;
7835 dd_corners_t corners;
7837 rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7838 real skew_fac2_d,skew_fac_01;
7845 fprintf(debug,"Setting up DD communication\n");
7850 switch (fr->cutoff_scheme)
7859 gmx_incons("unimplemented");
7863 for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7865 dim = dd->dim[dim_ind];
7867 /* Check if we need to use triclinic distances */
7868 tric_dist[dim_ind] = 0;
7869 for(i=0; i<=dim_ind; i++)
7871 if (ddbox->tric_dir[dd->dim[i]])
7873 tric_dist[dim_ind] = 1;
7878 bBondComm = comm->bBondComm;
7880 /* Do we need to determine extra distances for multi-body bondeds? */
7881 bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7883 /* Do we need to determine extra distances for only two-body bondeds? */
7884 bDist2B = (bBondComm && !bDistMB);
7886 r_comm2 = sqr(comm->cutoff);
7887 r_bcomm2 = sqr(comm->cutoff_mbody);
7891 fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7894 zones = &comm->zones;
7897 dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
7898 dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
7900 set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners);
7902 /* Triclinic stuff */
7903 normal = ddbox->normal;
7907 v_0 = ddbox->v[dim0];
7908 if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7910 /* Determine the coupling coefficient for the distances
7911 * to the cell planes along dim0 and dim1 through dim2.
7912 * This is required for correct rounding.
7915 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7918 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7924 v_1 = ddbox->v[dim1];
7927 zone_cg_range = zones->cg_range;
7928 index_gl = dd->index_gl;
7929 cgindex = dd->cgindex;
7930 cginfo_mb = fr->cginfo_mb;
7932 zone_cg_range[0] = 0;
7933 zone_cg_range[1] = dd->ncg_home;
7934 comm->zone_ncg1[0] = dd->ncg_home;
7935 pos_cg = dd->ncg_home;
7937 nat_tot = dd->nat_home;
7939 for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7941 dim = dd->dim[dim_ind];
7942 cd = &comm->cd[dim_ind];
7944 if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7946 /* No pbc in this dimension, the first node should not comm. */
7954 v_d = ddbox->v[dim];
7955 skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7957 cd->bInPlace = TRUE;
7958 for(p=0; p<cd->np; p++)
7960 /* Only atoms communicated in the first pulse are used
7961 * for multi-body bonded interactions or for bBondComm.
7963 bDistBonded = ((bDistMB || bDist2B) && p == 0);
7968 for(zone=0; zone<nzone_send; zone++)
7970 if (tric_dist[dim_ind] && dim_ind > 0)
7972 /* Determine slightly more optimized skew_fac's
7974 * This reduces the number of communicated atoms
7975 * by about 10% for 3D DD of rhombic dodecahedra.
7977 for(dimd=0; dimd<dim; dimd++)
7979 sf2_round[dimd] = 1;
7980 if (ddbox->tric_dir[dimd])
7982 for(i=dd->dim[dimd]+1; i<DIM; i++)
7984 /* If we are shifted in dimension i
7985 * and the cell plane is tilted forward
7986 * in dimension i, skip this coupling.
7988 if (!(zones->shift[nzone+zone][i] &&
7989 ddbox->v[dimd][i][dimd] >= 0))
7992 sqr(ddbox->v[dimd][i][dimd]);
7995 sf2_round[dimd] = 1/sf2_round[dimd];
8000 zonei = zone_perm[dim_ind][zone];
8003 /* Here we permutate the zones to obtain a convenient order
8004 * for neighbor searching
8006 cg0 = zone_cg_range[zonei];
8007 cg1 = zone_cg_range[zonei+1];
8011 /* Look only at the cg's received in the previous grid pulse
8013 cg1 = zone_cg_range[nzone+zone+1];
8014 cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8017 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8018 for(th=0; th<comm->nth; th++)
8020 gmx_domdec_ind_t *ind_p;
8021 int **ibuf_p,*ibuf_nalloc_p;
8023 int *nsend_p,*nat_p;
8029 /* Thread 0 writes in the comm buffers */
8031 ibuf_p = &comm->buf_int;
8032 ibuf_nalloc_p = &comm->nalloc_int;
8033 vbuf_p = &comm->vbuf;
8036 nsend_zone_p = &ind->nsend[zone];
8040 /* Other threads write into temp buffers */
8041 ind_p = &comm->dth[th].ind;
8042 ibuf_p = &comm->dth[th].ibuf;
8043 ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8044 vbuf_p = &comm->dth[th].vbuf;
8045 nsend_p = &comm->dth[th].nsend;
8046 nat_p = &comm->dth[th].nat;
8047 nsend_zone_p = &comm->dth[th].nsend_zone;
8049 comm->dth[th].nsend = 0;
8050 comm->dth[th].nat = 0;
8051 comm->dth[th].nsend_zone = 0;
8061 cg0_th = cg0 + ((cg1 - cg0)* th )/comm->nth;
8062 cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8065 /* Get the cg's for this pulse in this zone */
8066 get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th,
8068 dim,dim_ind,dim0,dim1,dim2,
8071 normal,skew_fac2_d,skew_fac_01,
8072 v_d,v_0,v_1,&corners,sf2_round,
8073 bDistBonded,bBondComm,
8077 ibuf_p,ibuf_nalloc_p,
8083 /* Append data of threads>=1 to the communication buffers */
8084 for(th=1; th<comm->nth; th++)
8086 dd_comm_setup_work_t *dth;
8089 dth = &comm->dth[th];
8091 ns1 = nsend + dth->nsend_zone;
8092 if (ns1 > ind->nalloc)
8094 ind->nalloc = over_alloc_dd(ns1);
8095 srenew(ind->index,ind->nalloc);
8097 if (ns1 > comm->nalloc_int)
8099 comm->nalloc_int = over_alloc_dd(ns1);
8100 srenew(comm->buf_int,comm->nalloc_int);
8102 if (ns1 > comm->vbuf.nalloc)
8104 comm->vbuf.nalloc = over_alloc_dd(ns1);
8105 srenew(comm->vbuf.v,comm->vbuf.nalloc);
8108 for(i=0; i<dth->nsend_zone; i++)
8110 ind->index[nsend] = dth->ind.index[i];
8111 comm->buf_int[nsend] = dth->ibuf[i];
8112 copy_rvec(dth->vbuf.v[i],
8113 comm->vbuf.v[nsend]);
8117 ind->nsend[zone] += dth->nsend_zone;
8120 /* Clear the counts in case we do not have pbc */
8121 for(zone=nzone_send; zone<nzone; zone++)
8123 ind->nsend[zone] = 0;
8125 ind->nsend[nzone] = nsend;
8126 ind->nsend[nzone+1] = nat;
8127 /* Communicate the number of cg's and atoms to receive */
8128 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8129 ind->nsend, nzone+2,
8130 ind->nrecv, nzone+2);
8132 /* The rvec buffer is also required for atom buffers of size nsend
8133 * in dd_move_x and dd_move_f.
8135 vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
8139 /* We can receive in place if only the last zone is not empty */
8140 for(zone=0; zone<nzone-1; zone++)
8142 if (ind->nrecv[zone] > 0)
8144 cd->bInPlace = FALSE;
8149 /* The int buffer is only required here for the cg indices */
8150 if (ind->nrecv[nzone] > comm->nalloc_int2)
8152 comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8153 srenew(comm->buf_int2,comm->nalloc_int2);
8155 /* The rvec buffer is also required for atom buffers
8156 * of size nrecv in dd_move_x and dd_move_f.
8158 i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
8159 vec_rvec_check_alloc(&comm->vbuf2,i);
8163 /* Make space for the global cg indices */
8164 if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8165 || dd->cg_nalloc == 0)
8167 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8168 srenew(index_gl,dd->cg_nalloc);
8169 srenew(cgindex,dd->cg_nalloc+1);
8171 /* Communicate the global cg indices */
8174 recv_i = index_gl + pos_cg;
8178 recv_i = comm->buf_int2;
8180 dd_sendrecv_int(dd, dim_ind, dddirBackward,
8181 comm->buf_int, nsend,
8182 recv_i, ind->nrecv[nzone]);
8184 /* Make space for cg_cm */
8185 dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]);
8186 if (fr->cutoff_scheme == ecutsGROUP)
8194 /* Communicate cg_cm */
8197 recv_vr = cg_cm + pos_cg;
8201 recv_vr = comm->vbuf2.v;
8203 dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8204 comm->vbuf.v, nsend,
8205 recv_vr, ind->nrecv[nzone]);
8207 /* Make the charge group index */
8210 zone = (p == 0 ? 0 : nzone - 1);
8211 while (zone < nzone)
8213 for(cg=0; cg<ind->nrecv[zone]; cg++)
8215 cg_gl = index_gl[pos_cg];
8216 fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
8217 nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8218 cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8221 /* Update the charge group presence,
8222 * so we can use it in the next pass of the loop.
8224 comm->bLocalCG[cg_gl] = TRUE;
8230 comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8233 zone_cg_range[nzone+zone] = pos_cg;
8238 /* This part of the code is never executed with bBondComm. */
8239 merge_cg_buffers(nzone,cd,p,zone_cg_range,
8240 index_gl,recv_i,cg_cm,recv_vr,
8241 cgindex,fr->cginfo_mb,fr->cginfo);
8242 pos_cg += ind->nrecv[nzone];
8244 nat_tot += ind->nrecv[nzone+1];
8248 /* Store the atom block for easy copying of communication buffers */
8249 make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
8253 dd->index_gl = index_gl;
8254 dd->cgindex = cgindex;
8256 dd->ncg_tot = zone_cg_range[zones->n];
8257 dd->nat_tot = nat_tot;
8258 comm->nat[ddnatHOME] = dd->nat_home;
8259 for(i=ddnatZONE; i<ddnatNR; i++)
8261 comm->nat[i] = dd->nat_tot;
8266 /* We don't need to update cginfo, since that was alrady done above.
8267 * So we pass NULL for the forcerec.
8269 dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
8270 NULL,comm->bLocalCG);
8275 fprintf(debug,"Finished setting up DD communication, zones:");
8276 for(c=0; c<zones->n; c++)
8278 fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
8280 fprintf(debug,"\n");
8284 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8288 for(c=0; c<zones->nizone; c++)
8290 zones->izone[c].cg1 = zones->cg_range[c+1];
8291 zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8292 zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8296 static void set_zones_size(gmx_domdec_t *dd,
8297 matrix box,const gmx_ddbox_t *ddbox,
8298 int zone_start,int zone_end)
8300 gmx_domdec_comm_t *comm;
8301 gmx_domdec_zones_t *zones;
8303 int z,zi,zj0,zj1,d,dim;
8306 real size_j,add_tric;
8311 zones = &comm->zones;
8313 /* Do we need to determine extra distances for multi-body bondeds? */
8314 bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8316 for(z=zone_start; z<zone_end; z++)
8318 /* Copy cell limits to zone limits.
8319 * Valid for non-DD dims and non-shifted dims.
8321 copy_rvec(comm->cell_x0,zones->size[z].x0);
8322 copy_rvec(comm->cell_x1,zones->size[z].x1);
8325 for(d=0; d<dd->ndim; d++)
8329 for(z=0; z<zones->n; z++)
8331 /* With a staggered grid we have different sizes
8332 * for non-shifted dimensions.
8334 if (dd->bGridJump && zones->shift[z][dim] == 0)
8338 zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8339 zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8343 zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8344 zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8350 rcmbs = comm->cutoff_mbody;
8351 if (ddbox->tric_dir[dim])
8353 rcs /= ddbox->skew_fac[dim];
8354 rcmbs /= ddbox->skew_fac[dim];
8357 /* Set the lower limit for the shifted zone dimensions */
8358 for(z=zone_start; z<zone_end; z++)
8360 if (zones->shift[z][dim] > 0)
8363 if (!dd->bGridJump || d == 0)
8365 zones->size[z].x0[dim] = comm->cell_x1[dim];
8366 zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8370 /* Here we take the lower limit of the zone from
8371 * the lowest domain of the zone below.
8375 zones->size[z].x0[dim] =
8376 comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8382 zones->size[z].x0[dim] =
8383 zones->size[zone_perm[2][z-4]].x0[dim];
8387 zones->size[z].x0[dim] =
8388 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8391 /* A temporary limit, is updated below */
8392 zones->size[z].x1[dim] = zones->size[z].x0[dim];
8396 for(zi=0; zi<zones->nizone; zi++)
8398 if (zones->shift[zi][dim] == 0)
8400 /* This takes the whole zone into account.
8401 * With multiple pulses this will lead
8402 * to a larger zone then strictly necessary.
8404 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8405 zones->size[zi].x1[dim]+rcmbs);
8413 /* Loop over the i-zones to set the upper limit of each
8416 for(zi=0; zi<zones->nizone; zi++)
8418 if (zones->shift[zi][dim] == 0)
8420 for(z=zones->izone[zi].j0; z<zones->izone[zi].j1; z++)
8422 if (zones->shift[z][dim] > 0)
8424 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8425 zones->size[zi].x1[dim]+rcs);
8432 for(z=zone_start; z<zone_end; z++)
8434 for(i=0; i<DIM; i++)
8436 zones->size[z].bb_x0[i] = zones->size[z].x0[i];
8437 zones->size[z].bb_x1[i] = zones->size[z].x1[i];
8439 for(j=i+1; j<ddbox->npbcdim; j++)
8441 /* With 1D domain decomposition the cg's are not in
8442 * the triclinic box, but trilinic x-y and rectangular y-z.
8444 if (box[j][i] != 0 &&
8445 !(dd->ndim == 1 && i == YY && j == ZZ))
8447 /* Correct for triclinic offset of the lower corner */
8448 add_tric = zones->size[z].x0[j]*box[j][i]/box[j][j];
8449 zones->size[z].bb_x0[i] += add_tric;
8450 zones->size[z].bb_x1[i] += add_tric;
8452 /* Correct for triclinic offset of the upper corner */
8453 size_j = zones->size[z].x1[j] - zones->size[z].x0[j];
8454 add_tric = size_j*box[j][i]/box[j][j];
8458 zones->size[z].bb_x0[i] += add_tric;
8462 zones->size[z].bb_x1[i] += add_tric;
8469 if (zone_start == 0)
8472 for(dim=0; dim<DIM; dim++)
8474 vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8476 zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8481 for(z=zone_start; z<zone_end; z++)
8483 fprintf(debug,"zone %d %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8485 zones->size[z].x0[XX],zones->size[z].x1[XX],
8486 zones->size[z].x0[YY],zones->size[z].x1[YY],
8487 zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]);
8488 fprintf(debug,"zone %d bb %6.3f - %6.3f %6.3f - %6.3f %6.3f - %6.3f\n",
8490 zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX],
8491 zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY],
8492 zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]);
8497 static int comp_cgsort(const void *a,const void *b)
8501 gmx_cgsort_t *cga,*cgb;
8502 cga = (gmx_cgsort_t *)a;
8503 cgb = (gmx_cgsort_t *)b;
8505 comp = cga->nsc - cgb->nsc;
8508 comp = cga->ind_gl - cgb->ind_gl;
8514 static void order_int_cg(int n,const gmx_cgsort_t *sort,
8519 /* Order the data */
8522 buf[i] = a[sort[i].ind];
8525 /* Copy back to the original array */
8532 static void order_vec_cg(int n,const gmx_cgsort_t *sort,
8537 /* Order the data */
8540 copy_rvec(v[sort[i].ind],buf[i]);
8543 /* Copy back to the original array */
8546 copy_rvec(buf[i],v[i]);
8550 static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort,
8553 int a,atot,cg,cg0,cg1,i;
8555 if (cgindex == NULL)
8557 /* Avoid the useless loop of the atoms within a cg */
8558 order_vec_cg(ncg,sort,v,buf);
8563 /* Order the data */
8565 for(cg=0; cg<ncg; cg++)
8567 cg0 = cgindex[sort[cg].ind];
8568 cg1 = cgindex[sort[cg].ind+1];
8569 for(i=cg0; i<cg1; i++)
8571 copy_rvec(v[i],buf[a]);
8577 /* Copy back to the original array */
8578 for(a=0; a<atot; a++)
8580 copy_rvec(buf[a],v[a]);
8584 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
8585 int nsort_new,gmx_cgsort_t *sort_new,
8586 gmx_cgsort_t *sort1)
8590 /* The new indices are not very ordered, so we qsort them */
8591 qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
8593 /* sort2 is already ordered, so now we can merge the two arrays */
8597 while(i2 < nsort2 || i_new < nsort_new)
8601 sort1[i1++] = sort_new[i_new++];
8603 else if (i_new == nsort_new)
8605 sort1[i1++] = sort2[i2++];
8607 else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8608 (sort2[i2].nsc == sort_new[i_new].nsc &&
8609 sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8611 sort1[i1++] = sort2[i2++];
8615 sort1[i1++] = sort_new[i_new++];
8620 static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
8622 gmx_domdec_sort_t *sort;
8623 gmx_cgsort_t *cgsort,*sort_i;
8624 int ncg_new,nsort2,nsort_new,i,*a,moved,*ibuf;
8625 int sort_last,sort_skip;
8627 sort = dd->comm->sort;
8629 a = fr->ns.grid->cell_index;
8631 moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8633 if (ncg_home_old >= 0)
8635 /* The charge groups that remained in the same ns grid cell
8636 * are completely ordered. So we can sort efficiently by sorting
8637 * the charge groups that did move into the stationary list.
8642 for(i=0; i<dd->ncg_home; i++)
8644 /* Check if this cg did not move to another node */
8647 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8649 /* This cg is new on this node or moved ns grid cell */
8650 if (nsort_new >= sort->sort_new_nalloc)
8652 sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8653 srenew(sort->sort_new,sort->sort_new_nalloc);
8655 sort_i = &(sort->sort_new[nsort_new++]);
8659 /* This cg did not move */
8660 sort_i = &(sort->sort2[nsort2++]);
8662 /* Sort on the ns grid cell indices
8663 * and the global topology index.
8664 * index_gl is irrelevant with cell ns,
8665 * but we set it here anyhow to avoid a conditional.
8668 sort_i->ind_gl = dd->index_gl[i];
8675 fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
8678 /* Sort efficiently */
8679 ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,
8684 cgsort = sort->sort;
8686 for(i=0; i<dd->ncg_home; i++)
8688 /* Sort on the ns grid cell indices
8689 * and the global topology index
8691 cgsort[i].nsc = a[i];
8692 cgsort[i].ind_gl = dd->index_gl[i];
8694 if (cgsort[i].nsc < moved)
8701 fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
8703 /* Determine the order of the charge groups using qsort */
8704 qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
8710 static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
8713 int ncg_new,i,*a,na;
8715 sort = dd->comm->sort->sort;
8717 nbnxn_get_atomorder(fr->nbv->nbs,&a,&na);
8724 sort[ncg_new].ind = a[i];
8732 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
8733 rvec *cgcm,t_forcerec *fr,t_state *state,
8736 gmx_domdec_sort_t *sort;
8737 gmx_cgsort_t *cgsort,*sort_i;
8739 int ncg_new,i,*ibuf,cgsize;
8742 sort = dd->comm->sort;
8744 if (dd->ncg_home > sort->sort_nalloc)
8746 sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8747 srenew(sort->sort,sort->sort_nalloc);
8748 srenew(sort->sort2,sort->sort_nalloc);
8750 cgsort = sort->sort;
8752 switch (fr->cutoff_scheme)
8755 ncg_new = dd_sort_order(dd,fr,ncg_home_old);
8758 ncg_new = dd_sort_order_nbnxn(dd,fr);
8761 gmx_incons("unimplemented");
8765 /* We alloc with the old size, since cgindex is still old */
8766 vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
8767 vbuf = dd->comm->vbuf.v;
8771 cgindex = dd->cgindex;
8778 /* Remove the charge groups which are no longer at home here */
8779 dd->ncg_home = ncg_new;
8782 fprintf(debug,"Set the new home charge group count to %d\n",
8786 /* Reorder the state */
8787 for(i=0; i<estNR; i++)
8789 if (EST_DISTR(i) && (state->flags & (1<<i)))
8794 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf);
8797 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf);
8800 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf);
8803 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf);
8807 case estDISRE_INITF:
8808 case estDISRE_RM3TAV:
8809 case estORIRE_INITF:
8811 /* No ordering required */
8814 gmx_incons("Unknown state entry encountered in dd_sort_state");
8819 if (fr->cutoff_scheme == ecutsGROUP)
8822 order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8825 if (dd->ncg_home+1 > sort->ibuf_nalloc)
8827 sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8828 srenew(sort->ibuf,sort->ibuf_nalloc);
8831 /* Reorder the global cg index */
8832 order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8833 /* Reorder the cginfo */
8834 order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8835 /* Rebuild the local cg index */
8839 for(i=0; i<dd->ncg_home; i++)
8841 cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8842 ibuf[i+1] = ibuf[i] + cgsize;
8844 for(i=0; i<dd->ncg_home+1; i++)
8846 dd->cgindex[i] = ibuf[i];
8851 for(i=0; i<dd->ncg_home+1; i++)
8856 /* Set the home atom number */
8857 dd->nat_home = dd->cgindex[dd->ncg_home];
8859 if (fr->cutoff_scheme == ecutsVERLET)
8861 /* The atoms are now exactly in grid order, update the grid order */
8862 nbnxn_set_atomorder(fr->nbv->nbs);
8866 /* Copy the sorted ns cell indices back to the ns grid struct */
8867 for(i=0; i<dd->ncg_home; i++)
8869 fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8871 fr->ns.grid->nr = dd->ncg_home;
8875 static void add_dd_statistics(gmx_domdec_t *dd)
8877 gmx_domdec_comm_t *comm;
8882 for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8884 comm->sum_nat[ddnat-ddnatZONE] +=
8885 comm->nat[ddnat] - comm->nat[ddnat-1];
8890 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8892 gmx_domdec_comm_t *comm;
8897 /* Reset all the statistics and counters for total run counting */
8898 for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8900 comm->sum_nat[ddnat-ddnatZONE] = 0;
8904 comm->load_step = 0;
8907 clear_ivec(comm->load_lim);
8912 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8914 gmx_domdec_comm_t *comm;
8918 comm = cr->dd->comm;
8920 gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8927 fprintf(fplog,"\n D O M A I N D E C O M P O S I T I O N S T A T I S T I C S\n\n");
8929 for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8931 av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8936 " av. #atoms communicated per step for force: %d x %.1f\n",
8940 if (cr->dd->vsite_comm)
8943 " av. #atoms communicated per step for vsites: %d x %.1f\n",
8944 (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8949 if (cr->dd->constraint_comm)
8952 " av. #atoms communicated per step for LINCS: %d x %.1f\n",
8953 1 + ir->nLincsIter,av);
8957 gmx_incons(" Unknown type for DD statistics");
8960 fprintf(fplog,"\n");
8962 if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8964 print_dd_load_av(fplog,cr->dd);
8968 void dd_partition_system(FILE *fplog,
8969 gmx_large_int_t step,
8971 gmx_bool bMasterState,
8973 t_state *state_global,
8974 gmx_mtop_t *top_global,
8976 t_state *state_local,
8979 gmx_localtop_t *top_local,
8982 gmx_shellfc_t shellfc,
8983 gmx_constr_t constr,
8985 gmx_wallcycle_t wcycle,
8989 gmx_domdec_comm_t *comm;
8990 gmx_ddbox_t ddbox={0};
8992 gmx_large_int_t step_pcoupl;
8993 rvec cell_ns_x0,cell_ns_x1;
8994 int i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum;
8995 gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8996 gmx_bool bRedist,bSortCG,bResortAll;
8997 ivec ncells_old={0,0,0},ncells_new={0,0,0},np;
9004 bBoxChanged = (bMasterState || DEFORM(*ir));
9005 if (ir->epc != epcNO)
9007 /* With nstpcouple > 1 pressure coupling happens.
9008 * one step after calculating the pressure.
9009 * Box scaling happens at the end of the MD step,
9010 * after the DD partitioning.
9011 * We therefore have to do DLB in the first partitioning
9012 * after an MD step where P-coupling occured.
9013 * We need to determine the last step in which p-coupling occurred.
9014 * MRS -- need to validate this for vv?
9019 step_pcoupl = step - 1;
9023 step_pcoupl = ((step - 1)/n)*n + 1;
9025 if (step_pcoupl >= comm->partition_step)
9031 bNStGlobalComm = (step % nstglobalcomm == 0);
9033 if (!comm->bDynLoadBal)
9039 /* Should we do dynamic load balacing this step?
9040 * Since it requires (possibly expensive) global communication,
9041 * we might want to do DLB less frequently.
9043 if (bBoxChanged || ir->epc != epcNO)
9045 bDoDLB = bBoxChanged;
9049 bDoDLB = bNStGlobalComm;
9053 /* Check if we have recorded loads on the nodes */
9054 if (comm->bRecordLoad && dd_load_count(comm))
9056 if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9058 /* Check if we should use DLB at the second partitioning
9059 * and every 100 partitionings,
9060 * so the extra communication cost is negligible.
9062 n = max(100,nstglobalcomm);
9063 bCheckDLB = (comm->n_load_collect == 0 ||
9064 comm->n_load_have % n == n-1);
9071 /* Print load every nstlog, first and last step to the log file */
9072 bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9073 comm->n_load_collect == 0 ||
9075 (step + ir->nstlist > ir->init_step + ir->nsteps)));
9077 /* Avoid extra communication due to verbose screen output
9078 * when nstglobalcomm is set.
9080 if (bDoDLB || bLogLoad || bCheckDLB ||
9081 (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9083 get_load_distribution(dd,wcycle);
9088 dd_print_load(fplog,dd,step-1);
9092 dd_print_load_verbose(dd);
9095 comm->n_load_collect++;
9098 /* Since the timings are node dependent, the master decides */
9102 (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9105 fprintf(debug,"step %s, imb loss %f\n",
9106 gmx_step_str(step,sbuf),
9107 dd_force_imb_perf_loss(dd));
9110 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
9113 turn_on_dlb(fplog,cr,step);
9118 comm->n_load_have++;
9121 cgs_gl = &comm->cgs_gl;
9126 /* Clear the old state */
9127 clear_dd_indices(dd,0,0);
9129 set_ddbox(dd,bMasterState,cr,ir,state_global->box,
9130 TRUE,cgs_gl,state_global->x,&ddbox);
9132 get_cg_distribution(fplog,step,dd,cgs_gl,
9133 state_global->box,&ddbox,state_global->x);
9135 dd_distribute_state(dd,cgs_gl,
9136 state_global,state_local,f);
9138 dd_make_local_cgs(dd,&top_local->cgs);
9140 /* Ensure that we have space for the new distribution */
9141 dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home);
9143 if (fr->cutoff_scheme == ecutsGROUP)
9145 calc_cgcm(fplog,0,dd->ncg_home,
9146 &top_local->cgs,state_local->x,fr->cg_cm);
9149 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9151 dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9155 else if (state_local->ddp_count != dd->ddp_count)
9157 if (state_local->ddp_count > dd->ddp_count)
9159 gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
9162 if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9164 gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
9167 /* Clear the old state */
9168 clear_dd_indices(dd,0,0);
9170 /* Build the new indices */
9171 rebuild_cgindex(dd,cgs_gl->index,state_local);
9172 make_dd_indices(dd,cgs_gl->index,0);
9174 if (fr->cutoff_scheme == ecutsGROUP)
9176 /* Redetermine the cg COMs */
9177 calc_cgcm(fplog,0,dd->ncg_home,
9178 &top_local->cgs,state_local->x,fr->cg_cm);
9181 inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9183 dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9185 set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9186 TRUE,&top_local->cgs,state_local->x,&ddbox);
9188 bRedist = comm->bDynLoadBal;
9192 /* We have the full state, only redistribute the cgs */
9194 /* Clear the non-home indices */
9195 clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
9197 /* Avoid global communication for dim's without pbc and -gcom */
9198 if (!bNStGlobalComm)
9200 copy_rvec(comm->box0 ,ddbox.box0 );
9201 copy_rvec(comm->box_size,ddbox.box_size);
9203 set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9204 bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
9209 /* For dim's without pbc and -gcom */
9210 copy_rvec(ddbox.box0 ,comm->box0 );
9211 copy_rvec(ddbox.box_size,comm->box_size);
9213 set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
9216 if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9218 write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
9221 /* Check if we should sort the charge groups */
9222 if (comm->nstSortCG > 0)
9224 bSortCG = (bMasterState ||
9225 (bRedist && (step % comm->nstSortCG == 0)));
9232 ncg_home_old = dd->ncg_home;
9237 wallcycle_sub_start(wcycle,ewcsDD_REDIST);
9239 dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
9240 state_local,f,fr,mdatoms,
9241 !bSortCG,nrnb,&cg0,&ncg_moved);
9243 wallcycle_sub_stop(wcycle,ewcsDD_REDIST);
9246 get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box,
9248 &comm->cell_x0,&comm->cell_x1,
9249 dd->ncg_home,fr->cg_cm,
9250 cell_ns_x0,cell_ns_x1,&grid_density);
9254 comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
9257 switch (fr->cutoff_scheme)
9260 copy_ivec(fr->ns.grid->n,ncells_old);
9261 grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
9262 state_local->box,cell_ns_x0,cell_ns_x1,
9263 fr->rlistlong,grid_density);
9266 nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]);
9269 gmx_incons("unimplemented");
9271 /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9272 copy_ivec(ddbox.tric_dir,comm->tric_dir);
9276 wallcycle_sub_start(wcycle,ewcsDD_GRID);
9278 /* Sort the state on charge group position.
9279 * This enables exact restarts from this step.
9280 * It also improves performance by about 15% with larger numbers
9281 * of atoms per node.
9284 /* Fill the ns grid with the home cell,
9285 * so we can sort with the indices.
9287 set_zones_ncg_home(dd);
9289 switch (fr->cutoff_scheme)
9292 set_zones_size(dd,state_local->box,&ddbox,0,1);
9294 nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box,
9296 comm->zones.size[0].bb_x0,
9297 comm->zones.size[0].bb_x1,
9299 comm->zones.dens_zone0,
9302 ncg_moved,comm->moved,
9303 fr->nbv->grp[eintLocal].kernel_type,
9304 fr->nbv->grp[eintLocal].nbat);
9306 nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]);
9309 fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
9310 0,dd->ncg_home,fr->cg_cm);
9312 copy_ivec(fr->ns.grid->n,ncells_new);
9315 gmx_incons("unimplemented");
9318 bResortAll = bMasterState;
9320 /* Check if we can user the old order and ns grid cell indices
9321 * of the charge groups to sort the charge groups efficiently.
9323 if (ncells_new[XX] != ncells_old[XX] ||
9324 ncells_new[YY] != ncells_old[YY] ||
9325 ncells_new[ZZ] != ncells_old[ZZ])
9332 fprintf(debug,"Step %s, sorting the %d home charge groups\n",
9333 gmx_step_str(step,sbuf),dd->ncg_home);
9335 dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
9336 bResortAll ? -1 : ncg_home_old);
9337 /* Rebuild all the indices */
9339 ga2la_clear(dd->ga2la);
9341 wallcycle_sub_stop(wcycle,ewcsDD_GRID);
9344 wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM);
9346 /* Setup up the communication and communicate the coordinates */
9347 setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f);
9349 /* Set the indices */
9350 make_dd_indices(dd,cgs_gl->index,cg0);
9352 /* Set the charge group boundaries for neighbor searching */
9353 set_cg_boundaries(&comm->zones);
9355 if (fr->cutoff_scheme == ecutsVERLET)
9357 set_zones_size(dd,state_local->box,&ddbox,
9358 bSortCG ? 1 : 0,comm->zones.n);
9361 wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM);
9364 write_dd_pdb("dd_home",step,"dump",top_global,cr,
9365 -1,state_local->x,state_local->box);
9368 wallcycle_sub_start(wcycle,ewcsDD_MAKETOP);
9370 /* Extract a local topology from the global topology */
9371 for(i=0; i<dd->ndim; i++)
9373 np[dd->dim[i]] = comm->cd[i].np;
9375 dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
9376 comm->cellsize_min,np,
9378 fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x,
9379 vsite,top_global,top_local);
9381 wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP);
9383 wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR);
9385 /* Set up the special atom communication */
9386 n = comm->nat[ddnatZONE];
9387 for(i=ddnatZONE+1; i<ddnatNR; i++)
9392 if (vsite && vsite->n_intercg_vsite)
9394 n = dd_make_local_vsites(dd,n,top_local->idef.il);
9398 if (dd->bInterCGcons || dd->bInterCGsettles)
9400 /* Only for inter-cg constraints we need special code */
9401 n = dd_make_local_constraints(dd,n,top_global,fr->cginfo,
9402 constr,ir->nProjOrder,
9403 top_local->idef.il);
9407 gmx_incons("Unknown special atom type setup");
9412 wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR);
9414 wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER);
9416 /* Make space for the extra coordinates for virtual site
9417 * or constraint communication.
9419 state_local->natoms = comm->nat[ddnatNR-1];
9420 if (state_local->natoms > state_local->nalloc)
9422 dd_realloc_state(state_local,f,state_local->natoms);
9425 if (fr->bF_NoVirSum)
9427 if (vsite && vsite->n_intercg_vsite)
9429 nat_f_novirsum = comm->nat[ddnatVSITE];
9433 if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9435 nat_f_novirsum = dd->nat_tot;
9439 nat_f_novirsum = dd->nat_home;
9448 /* Set the number of atoms required for the force calculation.
9449 * Forces need to be constrained when using a twin-range setup
9450 * or with energy minimization. For simple simulations we could
9451 * avoid some allocation, zeroing and copying, but this is
9452 * probably not worth the complications ande checking.
9454 forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
9455 dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
9457 /* We make the all mdatoms up to nat_tot_con.
9458 * We could save some work by only setting invmass
9459 * between nat_tot and nat_tot_con.
9461 /* This call also sets the new number of home particles to dd->nat_home */
9462 atoms2md(top_global,ir,
9463 comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
9465 /* Now we have the charges we can sort the FE interactions */
9466 dd_sort_local_top(dd,mdatoms,top_local);
9470 /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9471 split_vsites_over_threads(top_local->idef.il,mdatoms,FALSE,vsite);
9476 /* Make the local shell stuff, currently no communication is done */
9477 make_local_shells(cr,mdatoms,shellfc);
9480 if (ir->implicit_solvent)
9482 make_local_gb(cr,fr->born,ir->gb_algorithm);
9485 init_bonded_thread_force_reduction(fr,&top_local->idef);
9487 if (!(cr->duty & DUTY_PME))
9489 /* Send the charges to our PME only node */
9490 gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
9491 mdatoms->chargeA,mdatoms->chargeB,
9492 dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
9497 set_constraints(constr,top_local,ir,mdatoms,cr);
9500 if (ir->ePull != epullNO)
9502 /* Update the local pull groups */
9503 dd_make_local_pull_groups(dd,ir->pull,mdatoms);
9508 /* Update the local rotation groups */
9509 dd_make_local_rotation_groups(dd,ir->rot);
9513 add_dd_statistics(dd);
9515 /* Make sure we only count the cycles for this DD partitioning */
9516 clear_dd_cycle_counts(dd);
9518 /* Because the order of the atoms might have changed since
9519 * the last vsite construction, we need to communicate the constructing
9520 * atom coordinates again (for spreading the forces this MD step).
9522 dd_move_x_vsites(dd,state_local->box,state_local->x);
9524 wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER);
9526 if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9528 dd_move_x(dd,state_local->box,state_local->x);
9529 write_dd_pdb("dd_dump",step,"dump",top_global,cr,
9530 -1,state_local->x,state_local->box);
9533 /* Store the partitioning step */
9534 comm->partition_step = step;
9536 /* Increase the DD partitioning counter */
9538 /* The state currently matches this DD partitioning count, store it */
9539 state_local->ddp_count = dd->ddp_count;
9542 /* The DD master node knows the complete cg distribution,
9543 * store the count so we can possibly skip the cg info communication.
9545 comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9548 if (comm->DD_debug > 0)
9550 /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9551 check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
9552 "after partitioning");