2f19d21b3b39fbdd665e5720ac333fa245dcffc9
[alexxy/gromacs.git] / src / mdlib / domdec.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2008
5  * Copyright (c) 2012, by the GROMACS development team, led by
6  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
7  * others, as listed in the AUTHORS file in the top-level source
8  * directory and at http://www.gromacs.org.
9  *
10  * GROMACS is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public License
12  * as published by the Free Software Foundation; either version 2.1
13  * of the License, or (at your option) any later version.
14  *
15  * GROMACS is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with GROMACS; if not, see
22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
24  *
25  * If you want to redistribute modifications to GROMACS, please
26  * consider that scientific software is very special. Version
27  * control is crucial - bugs must be traceable. We will be happy to
28  * consider code for inclusion in the official distribution, but
29  * derived work must not be called official GROMACS. Details are found
30  * in the README & COPYING files - if they are missing, get the
31  * official version at http://www.gromacs.org.
32  *
33  * To help us fund GROMACS development, we humbly ask that you cite
34  * the research papers on the package. Check out http://www.gromacs.org.
35  */
36
37 #ifdef HAVE_CONFIG_H
38 #include <config.h>
39 #endif
40
41 #include <stdio.h>
42 #include <time.h>
43 #include <math.h>
44 #include <string.h>
45 #include <stdlib.h>
46 #include "typedefs.h"
47 #include "smalloc.h"
48 #include "gmx_fatal.h"
49 #include "gmx_fatal_collective.h"
50 #include "vec.h"
51 #include "domdec.h"
52 #include "domdec_network.h"
53 #include "nrnb.h"
54 #include "pbc.h"
55 #include "chargegroup.h"
56 #include "constr.h"
57 #include "mdatoms.h"
58 #include "names.h"
59 #include "pdbio.h"
60 #include "futil.h"
61 #include "force.h"
62 #include "pme.h"
63 #include "pull.h"
64 #include "pull_rotation.h"
65 #include "gmx_wallcycle.h"
66 #include "mdrun.h"
67 #include "nsgrid.h"
68 #include "shellfc.h"
69 #include "mtop_util.h"
70 #include "gmxfio.h"
71 #include "gmx_ga2la.h"
72 #include "gmx_sort.h"
73 #include "nbnxn_search.h"
74 #include "bondf.h"
75 #include "gmx_omp_nthreads.h"
76
77 #ifdef GMX_LIB_MPI
78 #include <mpi.h>
79 #endif
80 #ifdef GMX_THREAD_MPI
81 #include "tmpi.h"
82 #endif
83
/* Rank lookup for DD communication: expands to its rank argument,
 * the dd argument is accepted but not used.
 */
#define DDRANK(dd,rank)    (rank)
/* The masterrank member of the struct that dd points to.
 * The argument is parenthesized so that expressions such as &obj or p+1
 * can be passed safely.
 */
#define DDMASTERRANK(dd)   ((dd)->masterrank)
86
87 typedef struct gmx_domdec_master
88 {
89     /* The cell boundaries */
90     real **cell_x;
91     /* The global charge group division */
92     int  *ncg;     /* Number of home charge groups for each node */
93     int  *index;   /* Index of nnodes+1 into cg */
94     int  *cg;      /* Global charge group index */
95     int  *nat;     /* Number of home atoms for each node. */
96     int  *ibuf;    /* Buffer for communication */
97     rvec *vbuf;    /* Buffer for state scattering and gathering */
98 } gmx_domdec_master_t;
99
100 typedef struct
101 {
102     /* The numbers of charge groups to send and receive for each cell
103      * that requires communication, the last entry contains the total
104      * number of atoms that needs to be communicated.
105      */
106     int nsend[DD_MAXIZONE+2];
107     int nrecv[DD_MAXIZONE+2];
108     /* The charge groups to send */
109     int *index;
110     int nalloc;
111     /* The atom range for non-in-place communication */
112     int cell2at0[DD_MAXIZONE];
113     int cell2at1[DD_MAXIZONE];
114 } gmx_domdec_ind_t;
115
116 typedef struct
117 {
118     int  np;                   /* Number of grid pulses in this dimension */
119     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
120     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
121     int  np_nalloc;
122     gmx_bool bInPlace;             /* Can we communicate in place?            */
123 } gmx_domdec_comm_dim_t;
124
125 typedef struct
126 {
127     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
128     real *cell_f;      /* State var.: cell boundaries, box relative      */
129     real *old_cell_f;  /* Temp. var.: old cell size                      */
130     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
131     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
132     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
133     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
134     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
135     real *buf_ncd;     /* Temp. var.                                     */
136 } gmx_domdec_root_t;
137
#define DD_NLOAD_MAX 9

/* Single precision is sufficient for these variables: they only
 * influence the load balancing and not the actual MD results.
 */
typedef struct
{
    int    nload;
    float *load;
    float  sum;
    float  max;
    float  sum_m;
    float  cvol_min;
    float  mdf;
    float  pme;
    int    flags;
} gmx_domdec_load_t;
155
/* One sort entry, consisting of three integers */
typedef struct
{
    int nsc;
    int ind_gl;
    int ind;
} gmx_cgsort_t;
162
163 typedef struct
164 {
165     gmx_cgsort_t *sort;
166     gmx_cgsort_t *sort2;
167     int  sort_nalloc;
168     gmx_cgsort_t *sort_new;
169     int  sort_new_nalloc;
170     int  *ibuf;
171     int  ibuf_nalloc;
172 } gmx_domdec_sort_t;
173
174 typedef struct
175 {
176     rvec *v;
177     int  nalloc;
178 } vec_rvec_t;
179
/* The order of the coordinates is set by this enum.
 * ddnatHOME must come first and ddnatZONE second,
 * the remaining entries may be ordered freely.
 */
enum {
    ddnatHOME,
    ddnatZONE,
    ddnatVSITE,
    ddnatCON,
    ddnatNR
};

enum {
    edlbAUTO,
    edlbNO,
    edlbYES,
    edlbNR
};
/* The names belonging to the edlb enum entries, in enum order */
const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
188
189 typedef struct
190 {
191     int  dim;      /* The dimension                                          */
192     gmx_bool dim_match;/* Tells if DD and PME dims match                         */
193     int  nslab;    /* The number of PME slabs in this dimension              */
194     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
195     int  *pp_min;  /* The minimum pp node location, size nslab               */
196     int  *pp_max;  /* The maximum pp node location,size nslab                */
197     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
198 } gmx_ddpme_t;
199
200 typedef struct
201 {
202     real min0;    /* The minimum bottom of this zone                        */
203     real max1;    /* The maximum top of this zone                           */
204     real min1;    /* The minimum top of this zone                           */
205     real mch0;    /* The maximum bottom communicaton height for this zone   */
206     real mch1;    /* The maximum top communicaton height for this zone      */
207     real p1_0;    /* The bottom value of the first cell in this zone        */
208     real p1_1;    /* The top value of the first cell in this zone           */
209 } gmx_ddzone_t;
210
211 typedef struct
212 {
213     gmx_domdec_ind_t ind;
214     int *ibuf;
215     int ibuf_nalloc;
216     vec_rvec_t vbuf;
217     int nsend;
218     int nat;
219     int nsend_zone;
220 } dd_comm_setup_work_t;
221
222 typedef struct gmx_domdec_comm
223 {
224     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
225      * unless stated otherwise.
226      */
227
228     /* The number of decomposition dimensions for PME, 0: no PME */
229     int  npmedecompdim;
230     /* The number of nodes doing PME (PP/PME or only PME) */
231     int  npmenodes;
232     int  npmenodes_x;
233     int  npmenodes_y;
234     /* The communication setup including the PME only nodes */
235     gmx_bool bCartesianPP_PME;
236     ivec ntot;
237     int  cartpmedim;
238     int  *pmenodes;          /* size npmenodes                         */
239     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
240                               * but with bCartesianPP_PME              */
241     gmx_ddpme_t ddpme[2];
242     
243     /* The DD particle-particle nodes only */
244     gmx_bool bCartesianPP;
245     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
246     
247     /* The global charge groups */
248     t_block cgs_gl;
249
250     /* Should we sort the cgs */
251     int  nstSortCG;
252     gmx_domdec_sort_t *sort;
253     
254     /* Are there charge groups? */
255     gmx_bool bCGs;
256
257     /* Are there bonded and multi-body interactions between charge groups? */
258     gmx_bool bInterCGBondeds;
259     gmx_bool bInterCGMultiBody;
260
261     /* Data for the optional bonded interaction atom communication range */
262     gmx_bool bBondComm;
263     t_blocka *cglink;
264     char *bLocalCG;
265
266     /* The DLB option */
267     int  eDLB;
268     /* Are we actually using DLB? */
269     gmx_bool bDynLoadBal;
270
271     /* Cell sizes for static load balancing, first index cartesian */
272     real **slb_frac;
273
274     /* The width of the communicated boundaries */
275     real cutoff_mbody;
276     real cutoff;
277     /* The minimum cell size (including triclinic correction) */
278     rvec cellsize_min;
279     /* For dlb, for use with edlbAUTO */
280     rvec cellsize_min_dlb;
281     /* The lower limit for the DD cell size with DLB */
282     real cellsize_limit;
283     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
284     gmx_bool bVacDLBNoLimit;
285
286     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
287     ivec tric_dir;
288     /* box0 and box_size are required with dim's without pbc and -gcom */
289     rvec box0;
290     rvec box_size;
291     
292     /* The cell boundaries */
293     rvec cell_x0;
294     rvec cell_x1;
295
296     /* The old location of the cell boundaries, to check cg displacements */
297     rvec old_cell_x0;
298     rvec old_cell_x1;
299
300     /* The communication setup and charge group boundaries for the zones */
301     gmx_domdec_zones_t zones;
302     
303     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
304      * cell boundaries of neighboring cells for dynamic load balancing.
305      */
306     gmx_ddzone_t zone_d1[2];
307     gmx_ddzone_t zone_d2[2][2];
308     
309     /* The coordinate/force communication setup and indices */
310     gmx_domdec_comm_dim_t cd[DIM];
311     /* The maximum number of cells to communicate with in one dimension */
312     int  maxpulse;
313     
314     /* Which cg distribution is stored on the master node */
315     int master_cg_ddp_count;
316     
317     /* The number of cg's received from the direct neighbors */
318     int  zone_ncg1[DD_MAXZONE];
319     
320     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
321     int  nat[ddnatNR];
322
323     /* Array for signalling if atoms have moved to another domain */
324     int  *moved;
325     int  moved_nalloc;
326     
327     /* Communication buffer for general use */
328     int  *buf_int;
329     int  nalloc_int;
330
331     /* Communication buffer for general use */
332     vec_rvec_t vbuf;
333
334     /* Temporary storage for thread parallel communication setup */
335     int nth;
336     dd_comm_setup_work_t *dth;
337
338     /* Communication buffers only used with multiple grid pulses */
339     int  *buf_int2;
340     int  nalloc_int2;
341     vec_rvec_t vbuf2;
342     
343     /* Communication buffers for local redistribution */
344     int  **cggl_flag;
345     int  cggl_flag_nalloc[DIM*2];
346     rvec **cgcm_state;
347     int  cgcm_state_nalloc[DIM*2];
348     
349     /* Cell sizes for dynamic load balancing */
350     gmx_domdec_root_t **root;
351     real *cell_f_row;
352     real cell_f0[DIM];
353     real cell_f1[DIM];
354     real cell_f_max0[DIM];
355     real cell_f_min1[DIM];
356     
357     /* Stuff for load communication */
358     gmx_bool bRecordLoad;
359     gmx_domdec_load_t *load;
360 #ifdef GMX_MPI
361     MPI_Comm *mpi_comm_load;
362 #endif
363
364     /* Maximum DLB scaling per load balancing step in percent */
365     int dlb_scale_lim;
366
367     /* Cycle counters */
368     float cycl[ddCyclNr];
369     int   cycl_n[ddCyclNr];
370     float cycl_max[ddCyclNr];
371     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
372     int eFlop;
373     double flop;
374     int    flop_n;
375     /* Have often have did we have load measurements */
376     int    n_load_have;
377     /* Have often have we collected the load measurements */
378     int    n_load_collect;
379     
380     /* Statistics */
381     double sum_nat[ddnatNR-ddnatZONE];
382     int    ndecomp;
383     int    nload;
384     double load_step;
385     double load_sum;
386     double load_max;
387     ivec   load_lim;
388     double load_mdf;
389     double load_pme;
390
391     /* The last partition step */
392     gmx_large_int_t partition_step;
393
394     /* Debugging */
395     int  nstDDDump;
396     int  nstDDDumpGrid;
397     int  DD_debug;
398 } gmx_domdec_comm_t;
399
/* Number of buffer entries per charge group
 * in the cggl_flag buffer of gmx_domdec_comm_t
 */
#define DD_CGIBS 2

/* Flags for the cggl_flag buffer of gmx_domdec_comm_t:
 * DD_FLAG_NRCG covers the low 16 bits, the forward and backward flags
 * for dimension index d are the bits 16+2*d and 17+2*d.
 */
#define DD_FLAG_NRCG  65535
#define DD_FLAG_FW(d) (1 << (16 + (d)*2))
#define DD_FLAG_BW(d) (1 << (16 + (d)*2 + 1))
407
408 /* Zone permutation required to obtain consecutive charge groups
409  * for neighbor searching.
410  */
411 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
412
413 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
414  * components see only j zones with that component 0.
415  */
416
417 /* The DD zone order */
418 static const ivec dd_zo[DD_MAXZONE] =
419   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
420
421 /* The 3D setup */
422 #define dd_z3n  8
423 #define dd_zp3n 4
424 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
425
426 /* The 2D setup */
427 #define dd_z2n  4
428 #define dd_zp2n 2
429 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
430
431 /* The 1D setup */
432 #define dd_z1n  2
433 #define dd_zp1n 1
434 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
435
/* Margin factors that avoid problems caused by rounding */
#define DD_CELL_MARGIN       1.0001
#define DD_CELL_MARGIN2      1.00005
/* Margin factor accounting for pressure scaling during nstlist steps */
#define DD_PRES_SCALE_MARGIN 1.02

/* The performance loss we allow before we DLB or warn */
#define DD_PERF_LOSS 0.05

/* Evaluates to nc[dim[di]] + 1 + di*2 + 1 + di for the struct dd points to */
#define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]] + 1 + (di)*2 + 1 + (di))

/* When nnodes <= GMX_DD_NNODES_SENDRECV separate MPI send and receive
 * commands are used, which saves memory (and some copying for small
 * nnodes). Scatter and gather calls are used for high parallelization.
 */
#define GMX_DD_NNODES_SENDRECV 4
453
454
455 /*
456 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
457
458 static void index2xyz(ivec nc,int ind,ivec xyz)
459 {
460   xyz[XX] = ind % nc[XX];
461   xyz[YY] = (ind / nc[XX]) % nc[YY];
462   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
463 }
464 */
465
/* Linear index of cell i in a grid of n cells, with x running slowest
 * and z fastest. This order is required to minimize the coordinate
 * communication in PME, which uses decomposition in the x direction.
 */
#define dd_index(n,i) (((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ] + (i)[ZZ])
470
471 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
472 {
473     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
474     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
475     xyz[ZZ] = ind % nc[ZZ];
476 }
477
478 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
479 {
480     int ddindex;
481     int ddnodeid=-1;
482     
483     ddindex = dd_index(dd->nc,c);
484     if (dd->comm->bCartesianPP_PME)
485     {
486         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
487     }
488     else if (dd->comm->bCartesianPP)
489     {
490 #ifdef GMX_MPI
491         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
492 #endif
493     }
494     else
495     {
496         ddnodeid = ddindex;
497     }
498     
499     return ddnodeid;
500 }
501
502 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
503 {
504     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
505 }
506
507 int ddglatnr(gmx_domdec_t *dd,int i)
508 {
509     int atnr;
510     
511     if (dd == NULL)
512     {
513         atnr = i + 1;
514     }
515     else
516     {
517         if (i >= dd->comm->nat[ddnatNR-1])
518         {
519             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
520         }
521         atnr = dd->gatindex[i] + 1;
522     }
523     
524     return atnr;
525 }
526
527 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
528 {
529     return &dd->comm->cgs_gl;
530 }
531
532 static void vec_rvec_init(vec_rvec_t *v)
533 {
534     v->nalloc = 0;
535     v->v      = NULL;
536 }
537
538 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
539 {
540     if (n > v->nalloc)
541     {
542         v->nalloc = over_alloc_dd(n);
543         srenew(v->v,v->nalloc);
544     }
545 }
546
547 void dd_store_state(gmx_domdec_t *dd,t_state *state)
548 {
549     int i;
550     
551     if (state->ddp_count != dd->ddp_count)
552     {
553         gmx_incons("The state does not the domain decomposition state");
554     }
555     
556     state->ncg_gl = dd->ncg_home;
557     if (state->ncg_gl > state->cg_gl_nalloc)
558     {
559         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
560         srenew(state->cg_gl,state->cg_gl_nalloc);
561     }
562     for(i=0; i<state->ncg_gl; i++)
563     {
564         state->cg_gl[i] = dd->index_gl[i];
565     }
566     
567     state->ddp_count_cg_gl = dd->ddp_count;
568 }
569
570 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
571 {
572     return &dd->comm->zones;
573 }
574
575 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
576                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
577 {
578     gmx_domdec_zones_t *zones;
579     int izone,d,dim;
580
581     zones = &dd->comm->zones;
582
583     izone = 0;
584     while (icg >= zones->izone[izone].cg1)
585     {
586         izone++;
587     }
588     
589     if (izone == 0)
590     {
591         *jcg0 = icg;
592     }
593     else if (izone < zones->nizone)
594     {
595         *jcg0 = zones->izone[izone].jcg0;
596     }
597     else
598     {
599         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
600                   icg,izone,zones->nizone);
601     }
602         
603     *jcg1 = zones->izone[izone].jcg1;
604     
605     for(d=0; d<dd->ndim; d++)
606     {
607         dim = dd->dim[d];
608         shift0[dim] = zones->izone[izone].shift0[dim];
609         shift1[dim] = zones->izone[izone].shift1[dim];
610         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
611         {
612             /* A conservative approach, this can be optimized */
613             shift0[dim] -= 1;
614             shift1[dim] += 1;
615         }
616     }
617 }
618
619 int dd_natoms_vsite(gmx_domdec_t *dd)
620 {
621     return dd->comm->nat[ddnatVSITE];
622 }
623
624 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
625 {
626     *at_start = dd->comm->nat[ddnatCON-1];
627     *at_end   = dd->comm->nat[ddnatCON];
628 }
629
630 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
631 {
632     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
633     int  *index,*cgindex;
634     gmx_domdec_comm_t *comm;
635     gmx_domdec_comm_dim_t *cd;
636     gmx_domdec_ind_t *ind;
637     rvec shift={0,0,0},*buf,*rbuf;
638     gmx_bool bPBC,bScrew;
639     
640     comm = dd->comm;
641     
642     cgindex = dd->cgindex;
643     
644     buf = comm->vbuf.v;
645
646     nzone = 1;
647     nat_tot = dd->nat_home;
648     for(d=0; d<dd->ndim; d++)
649     {
650         bPBC   = (dd->ci[dd->dim[d]] == 0);
651         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
652         if (bPBC)
653         {
654             copy_rvec(box[dd->dim[d]],shift);
655         }
656         cd = &comm->cd[d];
657         for(p=0; p<cd->np; p++)
658         {
659             ind = &cd->ind[p];
660             index = ind->index;
661             n = 0;
662             if (!bPBC)
663             {
664                 for(i=0; i<ind->nsend[nzone]; i++)
665                 {
666                     at0 = cgindex[index[i]];
667                     at1 = cgindex[index[i]+1];
668                     for(j=at0; j<at1; j++)
669                     {
670                         copy_rvec(x[j],buf[n]);
671                         n++;
672                     }
673                 }
674             }
675             else if (!bScrew)
676             {
677                 for(i=0; i<ind->nsend[nzone]; i++)
678                 {
679                     at0 = cgindex[index[i]];
680                     at1 = cgindex[index[i]+1];
681                     for(j=at0; j<at1; j++)
682                     {
683                         /* We need to shift the coordinates */
684                         rvec_add(x[j],shift,buf[n]);
685                         n++;
686                     }
687                 }
688             }
689             else
690             {
691                 for(i=0; i<ind->nsend[nzone]; i++)
692                 {
693                     at0 = cgindex[index[i]];
694                     at1 = cgindex[index[i]+1];
695                     for(j=at0; j<at1; j++)
696                     {
697                         /* Shift x */
698                         buf[n][XX] = x[j][XX] + shift[XX];
699                         /* Rotate y and z.
700                          * This operation requires a special shift force
701                          * treatment, which is performed in calc_vir.
702                          */
703                         buf[n][YY] = box[YY][YY] - x[j][YY];
704                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
705                         n++;
706                     }
707                 }
708             }
709             
710             if (cd->bInPlace)
711             {
712                 rbuf = x + nat_tot;
713             }
714             else
715             {
716                 rbuf = comm->vbuf2.v;
717             }
718             /* Send and receive the coordinates */
719             dd_sendrecv_rvec(dd, d, dddirBackward,
720                              buf,  ind->nsend[nzone+1],
721                              rbuf, ind->nrecv[nzone+1]);
722             if (!cd->bInPlace)
723             {
724                 j = 0;
725                 for(zone=0; zone<nzone; zone++)
726                 {
727                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
728                     {
729                         copy_rvec(rbuf[j],x[i]);
730                         j++;
731                     }
732                 }
733             }
734             nat_tot += ind->nrecv[nzone+1];
735         }
736         nzone += nzone;
737     }
738 }
739
740 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
741 {
742     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
743     int  *index,*cgindex;
744     gmx_domdec_comm_t *comm;
745     gmx_domdec_comm_dim_t *cd;
746     gmx_domdec_ind_t *ind;
747     rvec *buf,*sbuf;
748     ivec vis;
749     int  is;
750     gmx_bool bPBC,bScrew;
751     
752     comm = dd->comm;
753     
754     cgindex = dd->cgindex;
755
756     buf = comm->vbuf.v;
757
758     n = 0;
759     nzone = comm->zones.n/2;
760     nat_tot = dd->nat_tot;
761     for(d=dd->ndim-1; d>=0; d--)
762     {
763         bPBC   = (dd->ci[dd->dim[d]] == 0);
764         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
765         if (fshift == NULL && !bScrew)
766         {
767             bPBC = FALSE;
768         }
769         /* Determine which shift vector we need */
770         clear_ivec(vis);
771         vis[dd->dim[d]] = 1;
772         is = IVEC2IS(vis);
773         
774         cd = &comm->cd[d];
775         for(p=cd->np-1; p>=0; p--) {
776             ind = &cd->ind[p];
777             nat_tot -= ind->nrecv[nzone+1];
778             if (cd->bInPlace)
779             {
780                 sbuf = f + nat_tot;
781             }
782             else
783             {
784                 sbuf = comm->vbuf2.v;
785                 j = 0;
786                 for(zone=0; zone<nzone; zone++)
787                 {
788                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
789                     {
790                         copy_rvec(f[i],sbuf[j]);
791                         j++;
792                     }
793                 }
794             }
795             /* Communicate the forces */
796             dd_sendrecv_rvec(dd, d, dddirForward,
797                              sbuf, ind->nrecv[nzone+1],
798                              buf,  ind->nsend[nzone+1]);
799             index = ind->index;
800             /* Add the received forces */
801             n = 0;
802             if (!bPBC)
803             {
804                 for(i=0; i<ind->nsend[nzone]; i++)
805                 {
806                     at0 = cgindex[index[i]];
807                     at1 = cgindex[index[i]+1];
808                     for(j=at0; j<at1; j++)
809                     {
810                         rvec_inc(f[j],buf[n]);
811                         n++;
812                     }
813                 } 
814             }
815             else if (!bScrew)
816             {
817                 for(i=0; i<ind->nsend[nzone]; i++)
818                 {
819                     at0 = cgindex[index[i]];
820                     at1 = cgindex[index[i]+1];
821                     for(j=at0; j<at1; j++)
822                     {
823                         rvec_inc(f[j],buf[n]);
824                         /* Add this force to the shift force */
825                         rvec_inc(fshift[is],buf[n]);
826                         n++;
827                     }
828                 }
829             }
830             else
831             {
832                 for(i=0; i<ind->nsend[nzone]; i++)
833                 {
834                     at0 = cgindex[index[i]];
835                     at1 = cgindex[index[i]+1];
836                     for(j=at0; j<at1; j++)
837                     {
838                         /* Rotate the force */
839                         f[j][XX] += buf[n][XX];
840                         f[j][YY] -= buf[n][YY];
841                         f[j][ZZ] -= buf[n][ZZ];
842                         if (fshift)
843                         {
844                             /* Add this force to the shift force */
845                             rvec_inc(fshift[is],buf[n]);
846                         }
847                         n++;
848                     }
849                 }
850             }
851         }
852         nzone /= 2;
853     }
854 }
855
856 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
857 {
858     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
859     int  *index,*cgindex;
860     gmx_domdec_comm_t *comm;
861     gmx_domdec_comm_dim_t *cd;
862     gmx_domdec_ind_t *ind;
863     real *buf,*rbuf;
864     
865     comm = dd->comm;
866     
867     cgindex = dd->cgindex;
868     
869     buf = &comm->vbuf.v[0][0];
870
871     nzone = 1;
872     nat_tot = dd->nat_home;
873     for(d=0; d<dd->ndim; d++)
874     {
875         cd = &comm->cd[d];
876         for(p=0; p<cd->np; p++)
877         {
878             ind = &cd->ind[p];
879             index = ind->index;
880             n = 0;
881             for(i=0; i<ind->nsend[nzone]; i++)
882             {
883                 at0 = cgindex[index[i]];
884                 at1 = cgindex[index[i]+1];
885                 for(j=at0; j<at1; j++)
886                 {
887                     buf[n] = v[j];
888                     n++;
889                 }
890             }
891             
892             if (cd->bInPlace)
893             {
894                 rbuf = v + nat_tot;
895             }
896             else
897             {
898                 rbuf = &comm->vbuf2.v[0][0];
899             }
900             /* Send and receive the coordinates */
901             dd_sendrecv_real(dd, d, dddirBackward,
902                              buf,  ind->nsend[nzone+1],
903                              rbuf, ind->nrecv[nzone+1]);
904             if (!cd->bInPlace)
905             {
906                 j = 0;
907                 for(zone=0; zone<nzone; zone++)
908                 {
909                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
910                     {
911                         v[i] = rbuf[j];
912                         j++;
913                     }
914                 }
915             }
916             nat_tot += ind->nrecv[nzone+1];
917         }
918         nzone += nzone;
919     }
920 }
921
922 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
923 {
924     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
925     int  *index,*cgindex;
926     gmx_domdec_comm_t *comm;
927     gmx_domdec_comm_dim_t *cd;
928     gmx_domdec_ind_t *ind;
929     real *buf,*sbuf;
930     
931     comm = dd->comm;
932     
933     cgindex = dd->cgindex;
934
935     buf = &comm->vbuf.v[0][0];
936
937     n = 0;
938     nzone = comm->zones.n/2;
939     nat_tot = dd->nat_tot;
940     for(d=dd->ndim-1; d>=0; d--)
941     {
942         cd = &comm->cd[d];
943         for(p=cd->np-1; p>=0; p--) {
944             ind = &cd->ind[p];
945             nat_tot -= ind->nrecv[nzone+1];
946             if (cd->bInPlace)
947             {
948                 sbuf = v + nat_tot;
949             }
950             else
951             {
952                 sbuf = &comm->vbuf2.v[0][0];
953                 j = 0;
954                 for(zone=0; zone<nzone; zone++)
955                 {
956                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
957                     {
958                         sbuf[j] = v[i];
959                         j++;
960                     }
961                 }
962             }
963             /* Communicate the forces */
964             dd_sendrecv_real(dd, d, dddirForward,
965                              sbuf, ind->nrecv[nzone+1],
966                              buf,  ind->nsend[nzone+1]);
967             index = ind->index;
968             /* Add the received forces */
969             n = 0;
970             for(i=0; i<ind->nsend[nzone]; i++)
971             {
972                 at0 = cgindex[index[i]];
973                 at1 = cgindex[index[i]+1];
974                 for(j=at0; j<at1; j++)
975                 {
976                     v[j] += buf[n];
977                     n++;
978                 }
979             } 
980         }
981         nzone /= 2;
982     }
983 }
984
985 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
986 {
987     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
988             d,i,j,
989             zone->min0,zone->max1,
990             zone->mch0,zone->mch0,
991             zone->p1_0,zone->p1_1);
992 }
993
994
995 #define DDZONECOMM_MAXZONE  5
996 #define DDZONECOMM_BUFSIZE  3
997
998 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
999                                int ddimind,int direction,
1000                                gmx_ddzone_t *buf_s,int n_s,
1001                                gmx_ddzone_t *buf_r,int n_r)
1002 {
1003 #define ZBS  DDZONECOMM_BUFSIZE
1004     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1005     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1006     int i;
1007
1008     for(i=0; i<n_s; i++)
1009     {
1010         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1011         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1012         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1013         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1014         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1015         vbuf_s[i*ZBS+1][2] = 0;
1016         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1017         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1018         vbuf_s[i*ZBS+2][2] = 0;
1019     }
1020
1021     dd_sendrecv_rvec(dd, ddimind, direction,
1022                      vbuf_s, n_s*ZBS,
1023                      vbuf_r, n_r*ZBS);
1024
1025     for(i=0; i<n_r; i++)
1026     {
1027         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1028         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1029         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1030         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1031         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1032         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1033         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1034     }
1035
1036 #undef ZBS
1037 }
1038
/* Communicate cell boundary information between neighboring DD cells.
 *
 * On entry cell_ns_x0/cell_ns_x1 hold lower/upper boundaries per Cartesian
 * dimension. For every DD dimension index d from dd->ndim-2 down to 0 the
 * cell fraction extremes are sent forward and the zone limits are sent
 * backward along d, pulse by pulse, while the extremes are accumulated.
 *
 * Results:
 *  - comm->zone_d1[] and comm->zone_d2[][] are filled,
 *  - cell_ns_x0/cell_ns_x1 are extended (min/max) with the zone limits
 *    for dd->dim[1] and dd->dim[2],
 *  - comm->cell_f_max0[d] and comm->cell_f_min1[d] are set for d >= 1.
 */
static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
                          rvec cell_ns_x0,rvec cell_ns_x1)
{
    int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
    gmx_ddzone_t *zp;
    gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
    gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
    gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
    rvec extr_s[2],extr_r[2];
    rvec dh;
    real dist_d,c=0,det;
    gmx_domdec_comm_t *comm;
    gmx_bool bPBC,bUse;

    comm = dd->comm;

    /* Initialize the first zone entry for the second and third
     * DD dimension with the boundaries of our own cell.
     */
    for(d=1; d<dd->ndim; d++)
    {
        dim = dd->dim[d];
        zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
        zp->min0 = cell_ns_x0[dim];
        zp->max1 = cell_ns_x1[dim];
        zp->min1 = cell_ns_x1[dim];
        zp->mch0 = cell_ns_x0[dim];
        zp->mch1 = cell_ns_x1[dim];
        zp->p1_0 = cell_ns_x0[dim];
        zp->p1_1 = cell_ns_x1[dim];
    }
    
    /* Communicate along the DD dimensions, starting with the last but one */
    for(d=dd->ndim-2; d>=0; d--)
    {
        dim  = dd->dim[d];
        bPBC = (dim < ddbox->npbcdim);

        /* Use an rvec to store three reals:
         * cell_f0 and (twice) cell_f1 of DD dimension index d+1.
         */
        extr_s[d][0] = comm->cell_f0[d+1];
        extr_s[d][1] = comm->cell_f1[d+1];
        extr_s[d][2] = comm->cell_f1[d+1];

        pos = 0;
        /* Store the extremes in the backward sending buffer,
         * so they get updated separately from the forward communication.
         */
        for(d1=d; d1<dd->ndim-1; d1++)
        {
            /* We invert the order to be able to use the same loop for buf_e */
            buf_s[pos].min0 = extr_s[d1][1];
            buf_s[pos].max1 = extr_s[d1][0];
            buf_s[pos].min1 = extr_s[d1][2];
            buf_s[pos].mch0 = 0;
            buf_s[pos].mch1 = 0;
            /* Store the cell corner of the dimension we communicate along */
            buf_s[pos].p1_0 = comm->cell_x0[dim];
            buf_s[pos].p1_1 = 0;
            pos++;
        }

        /* Append our own zone limits for the last DD dimension */
        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
        pos++;

        /* With 3 DD dimensions the first dimension additionally passes on
         * zone_d2[0][1] and zone_d1[0].
         */
        if (dd->ndim == 3 && d == 0)
        {
            buf_s[pos] = comm->zone_d2[0][1];
            pos++;
            buf_s[pos] = comm->zone_d1[0];
            pos++;
        }

        /* We only need to communicate the extremes
         * in the forward direction
         */
        npulse = comm->cd[d].np;
        if (bPBC)
        {
            /* Take the minimum to avoid double communication */
            npulse_min = min(npulse,dd->nc[dim]-1-npulse);
        }
        else
        {
            /* Without PBC we should really not communicate over
             * the boundaries, but implementing that complicates
             * the communication setup and therefore we simply
             * do all communication, but ignore some data.
             */
            npulse_min = npulse;
        }
        for(p=0; p<npulse_min; p++)
        {
            /* Communicate the extremes forward */
            /* Without PBC the first cell along dim ignores what it receives */
            bUse = (bPBC || dd->ci[dim] > 0);

            dd_sendrecv_rvec(dd, d, dddirForward,
                             extr_s+d, dd->ndim-d-1,
                             extr_r+d, dd->ndim-d-1);

            if (bUse)
            {
                for(d1=d; d1<dd->ndim-1; d1++)
                {
                    extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
                    extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
                    extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]);
                }
            }
        }

        /* Number of zone entries filled in buf_s above */
        buf_size = pos;
        for(p=0; p<npulse; p++)
        {
            /* Communicate all the zone information backward */
            /* Without PBC the last cell along dim ignores what it receives */
            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);

            dd_sendrecv_ddzone(dd, d, dddirBackward,
                               buf_s, buf_size,
                               buf_r, buf_size);

            /* dh is zero for the first pulse, so no height correction then */
            clear_rvec(dh);
            if (p > 0)
            {
                for(d1=d+1; d1<dd->ndim; d1++)
                {
                    /* Determine the decrease of maximum required
                     * communication height along d1 due to the distance along d,
                     * this avoids a lot of useless atom communication.
                     */
                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;

                    if (ddbox->tric_dir[dim])
                    {
                        /* c is the off-diagonal coupling between the cell planes
                         * along directions d and d1.
                         */
                        c = ddbox->v[dim][dd->dim[d1]][dim];
                    }
                    else
                    {
                        c = 0;
                    }
                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
                    if (det > 0)
                    {
                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
                    }
                    else
                    {
                        /* A negative value signals out of range */
                        dh[d1] = -1;
                    }
                }
            }

            /* Accumulate the extremes over all pulses */
            for(i=0; i<buf_size; i++)
            {
                if (p == 0)
                {
                    /* First pulse: take the received data as is */
                    buf_e[i] = buf_r[i];
                }
                else
                {
                    if (bUse)
                    {
                        buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
                        buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
                        buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1);
                    }

                    /* Select the DD dimension index whose height correction
                     * applies to this entry: the last entry sent along the
                     * first of three DD dimensions is zone_d1[0], all other
                     * entries use d+1.
                     */
                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
                    {
                        d1 = 1;
                    }
                    else
                    {
                        d1 = d + 1;
                    }
                    if (bUse && dh[d1] >= 0)
                    {
                        buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
                        buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
                    }
                }
                /* Copy the received buffer to the send buffer,
                 * to pass the data through with the next pulse.
                 */
                buf_s[i] = buf_r[i];
            }
            /* Store the accumulated data after the last pulse, or, without
             * PBC, after the pulse that delivered the data of the last cell
             * along dim.
             */
            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
            {
                /* Store the extremes */ 
                pos = 0;

                for(d1=d; d1<dd->ndim-1; d1++)
                {
                    extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
                    extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
                    extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1);
                    pos++;
                }

                /* The remaining entries of buf_e are zone limits,
                 * in the order in which they were put into buf_s.
                 */
                if (d == 1 || (d == 0 && dd->ndim == 3))
                {
                    for(i=d; i<2; i++)
                    {
                        comm->zone_d2[1-d][i] = buf_e[pos];
                        pos++;
                    }
                }
                if (d == 0)
                {
                    comm->zone_d1[1] = buf_e[pos];
                    pos++;
                }
            }
        }
    }
    
    /* Extend the ns boundaries along the second DD dimension
     * with the limits of both zone_d1 entries.
     */
    if (dd->ndim >= 2)
    {
        dim = dd->dim[1];
        for(i=0; i<2; i++)
        {
            if (debug)
            {
                print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
            }
            cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
            cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
        }
    }
    /* Extend the ns boundaries along the third DD dimension
     * with the limits of all four zone_d2 entries.
     */
    if (dd->ndim >= 3)
    {
        dim = dd->dim[2];
        for(i=0; i<2; i++)
        {
            for(j=0; j<2; j++)
            {
                if (debug)
                {
                    print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
                }
                cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
                cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
            }
        }
    }
    /* Store the accumulated cell fraction extremes */
    for(d=1; d<dd->ndim; d++)
    {
        comm->cell_f_max0[d] = extr_s[d-1][0];
        comm->cell_f_min1[d] = extr_s[d-1][1];
        if (debug)
        {
            fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
                    d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
        }
    }
}
1296
1297 static void dd_collect_cg(gmx_domdec_t *dd,
1298                           t_state *state_local)
1299 {
1300     gmx_domdec_master_t *ma=NULL;
1301     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1302     t_block *cgs_gl;
1303
1304     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1305     {
1306         /* The master has the correct distribution */
1307         return;
1308     }
1309     
1310     if (state_local->ddp_count == dd->ddp_count)
1311     {
1312         ncg_home = dd->ncg_home;
1313         cg       = dd->index_gl;
1314         nat_home = dd->nat_home;
1315     } 
1316     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1317     {
1318         cgs_gl = &dd->comm->cgs_gl;
1319
1320         ncg_home = state_local->ncg_gl;
1321         cg       = state_local->cg_gl;
1322         nat_home = 0;
1323         for(i=0; i<ncg_home; i++)
1324         {
1325             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1326         }
1327     }
1328     else
1329     {
1330         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1331     }
1332     
1333     buf2[0] = dd->ncg_home;
1334     buf2[1] = dd->nat_home;
1335     if (DDMASTER(dd))
1336     {
1337         ma = dd->ma;
1338         ibuf = ma->ibuf;
1339     }
1340     else
1341     {
1342         ibuf = NULL;
1343     }
1344     /* Collect the charge group and atom counts on the master */
1345     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1346     
1347     if (DDMASTER(dd))
1348     {
1349         ma->index[0] = 0;
1350         for(i=0; i<dd->nnodes; i++)
1351         {
1352             ma->ncg[i] = ma->ibuf[2*i];
1353             ma->nat[i] = ma->ibuf[2*i+1];
1354             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1355             
1356         }
1357         /* Make byte counts and indices */
1358         for(i=0; i<dd->nnodes; i++)
1359         {
1360             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1361             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1362         }
1363         if (debug)
1364         {
1365             fprintf(debug,"Initial charge group distribution: ");
1366             for(i=0; i<dd->nnodes; i++)
1367                 fprintf(debug," %d",ma->ncg[i]);
1368             fprintf(debug,"\n");
1369         }
1370     }
1371     
1372     /* Collect the charge group indices on the master */
1373     dd_gatherv(dd,
1374                dd->ncg_home*sizeof(int),dd->index_gl,
1375                DDMASTER(dd) ? ma->ibuf : NULL,
1376                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1377                DDMASTER(dd) ? ma->cg : NULL);
1378     
1379     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1380 }
1381
1382 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1383                                     rvec *lv,rvec *v)
1384 {
1385     gmx_domdec_master_t *ma;
1386     int  n,i,c,a,nalloc=0;
1387     rvec *buf=NULL;
1388     t_block *cgs_gl;
1389
1390     ma = dd->ma;
1391     
1392     if (!DDMASTER(dd))
1393     {
1394 #ifdef GMX_MPI
1395         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1396                  dd->rank,dd->mpi_comm_all);
1397 #endif
1398     } else {
1399         /* Copy the master coordinates to the global array */
1400         cgs_gl = &dd->comm->cgs_gl;
1401
1402         n = DDMASTERRANK(dd);
1403         a = 0;
1404         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1405         {
1406             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1407             {
1408                 copy_rvec(lv[a++],v[c]);
1409             }
1410         }
1411         
1412         for(n=0; n<dd->nnodes; n++)
1413         {
1414             if (n != dd->rank)
1415             {
1416                 if (ma->nat[n] > nalloc)
1417                 {
1418                     nalloc = over_alloc_dd(ma->nat[n]);
1419                     srenew(buf,nalloc);
1420                 }
1421 #ifdef GMX_MPI
1422                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1423                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1424 #endif
1425                 a = 0;
1426                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1427                 {
1428                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1429                     {
1430                         copy_rvec(buf[a++],v[c]);
1431                     }
1432                 }
1433             }
1434         }
1435         sfree(buf);
1436     }
1437 }
1438
1439 static void get_commbuffer_counts(gmx_domdec_t *dd,
1440                                   int **counts,int **disps)
1441 {
1442     gmx_domdec_master_t *ma;
1443     int n;
1444
1445     ma = dd->ma;
1446     
1447     /* Make the rvec count and displacment arrays */
1448     *counts  = ma->ibuf;
1449     *disps   = ma->ibuf + dd->nnodes;
1450     for(n=0; n<dd->nnodes; n++)
1451     {
1452         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1453         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1454     }
1455 }
1456
1457 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1458                                    rvec *lv,rvec *v)
1459 {
1460     gmx_domdec_master_t *ma;
1461     int  *rcounts=NULL,*disps=NULL;
1462     int  n,i,c,a;
1463     rvec *buf=NULL;
1464     t_block *cgs_gl;
1465     
1466     ma = dd->ma;
1467     
1468     if (DDMASTER(dd))
1469     {
1470         get_commbuffer_counts(dd,&rcounts,&disps);
1471
1472         buf = ma->vbuf;
1473     }
1474     
1475     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1476
1477     if (DDMASTER(dd))
1478     {
1479         cgs_gl = &dd->comm->cgs_gl;
1480
1481         a = 0;
1482         for(n=0; n<dd->nnodes; n++)
1483         {
1484             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1485             {
1486                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1487                 {
1488                     copy_rvec(buf[a++],v[c]);
1489                 }
1490             }
1491         }
1492     }
1493 }
1494
1495 void dd_collect_vec(gmx_domdec_t *dd,
1496                     t_state *state_local,rvec *lv,rvec *v)
1497 {
1498     gmx_domdec_master_t *ma;
1499     int  n,i,c,a,nalloc=0;
1500     rvec *buf=NULL;
1501     
1502     dd_collect_cg(dd,state_local);
1503
1504     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1505     {
1506         dd_collect_vec_sendrecv(dd,lv,v);
1507     }
1508     else
1509     {
1510         dd_collect_vec_gatherv(dd,lv,v);
1511     }
1512 }
1513
1514
1515 void dd_collect_state(gmx_domdec_t *dd,
1516                       t_state *state_local,t_state *state)
1517 {
1518     int est,i,j,nh;
1519
1520     nh = state->nhchainlength;
1521
1522     if (DDMASTER(dd))
1523     {
1524         for (i=0;i<efptNR;i++) {
1525             state->lambda[i] = state_local->lambda[i];
1526         }
1527         state->fep_state = state_local->fep_state;
1528         state->veta = state_local->veta;
1529         state->vol0 = state_local->vol0;
1530         copy_mat(state_local->box,state->box);
1531         copy_mat(state_local->boxv,state->boxv);
1532         copy_mat(state_local->svir_prev,state->svir_prev);
1533         copy_mat(state_local->fvir_prev,state->fvir_prev);
1534         copy_mat(state_local->pres_prev,state->pres_prev);
1535
1536
1537         for(i=0; i<state_local->ngtc; i++)
1538         {
1539             for(j=0; j<nh; j++) {
1540                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1541                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1542             }
1543             state->therm_integral[i] = state_local->therm_integral[i];            
1544         }
1545         for(i=0; i<state_local->nnhpres; i++) 
1546         {
1547             for(j=0; j<nh; j++) {
1548                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1549                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1550             }
1551         }
1552     }
1553     for(est=0; est<estNR; est++)
1554     {
1555         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1556         {
1557             switch (est) {
1558             case estX:
1559                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1560                 break;
1561             case estV:
1562                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1563                 break;
1564             case estSDX:
1565                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1566                 break;
1567             case estCGP:
1568                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1569                 break;
1570             case estLD_RNG:
1571                 if (state->nrngi == 1)
1572                 {
1573                     if (DDMASTER(dd))
1574                     {
1575                         for(i=0; i<state_local->nrng; i++)
1576                         {
1577                             state->ld_rng[i] = state_local->ld_rng[i];
1578                         }
1579                     }
1580                 }
1581                 else
1582                 {
1583                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1584                               state_local->ld_rng,state->ld_rng);
1585                 }
1586                 break;
1587             case estLD_RNGI:
1588                 if (state->nrngi == 1)
1589                 {
1590                    if (DDMASTER(dd))
1591                     {
1592                         state->ld_rngi[0] = state_local->ld_rngi[0];
1593                     } 
1594                 }
1595                 else
1596                 {
1597                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1598                               state_local->ld_rngi,state->ld_rngi);
1599                 }
1600                 break;
1601             case estDISRE_INITF:
1602             case estDISRE_RM3TAV:
1603             case estORIRE_INITF:
1604             case estORIRE_DTAV:
1605                 break;
1606             default:
1607                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1608             }
1609         }
1610     }
1611 }
1612
1613 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1614 {
1615     int est;
1616
1617     if (debug)
1618     {
1619         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1620     }
1621
1622     state->nalloc = over_alloc_dd(nalloc);
1623     
1624     for(est=0; est<estNR; est++)
1625     {
1626         if (EST_DISTR(est) && (state->flags & (1<<est)))
1627         {
1628             switch(est) {
1629             case estX:
1630                 srenew(state->x,state->nalloc);
1631                 break;
1632             case estV:
1633                 srenew(state->v,state->nalloc);
1634                 break;
1635             case estSDX:
1636                 srenew(state->sd_X,state->nalloc);
1637                 break;
1638             case estCGP:
1639                 srenew(state->cg_p,state->nalloc);
1640                 break;
1641             case estLD_RNG:
1642             case estLD_RNGI:
1643             case estDISRE_INITF:
1644             case estDISRE_RM3TAV:
1645             case estORIRE_INITF:
1646             case estORIRE_DTAV:
1647                 /* No reallocation required */
1648                 break;
1649             default:
1650                 gmx_incons("Unknown state entry encountered in dd_realloc_state");            
1651             }
1652         }
1653     }
1654     
1655     if (f != NULL)
1656     {
1657         srenew(*f,state->nalloc);
1658     }
1659 }
1660
1661 static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
1662                                int nalloc)
1663 {
1664     if (nalloc > fr->cg_nalloc)
1665     {
1666         if (debug)
1667         {
1668             fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1669         }
1670         fr->cg_nalloc = over_alloc_dd(nalloc);
1671         srenew(fr->cginfo,fr->cg_nalloc);
1672         if (fr->cutoff_scheme == ecutsGROUP)
1673         {
1674             srenew(fr->cg_cm,fr->cg_nalloc);
1675         }
1676     }
1677     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1678     {
1679         /* We don't use charge groups, we use x in state to set up
1680          * the atom communication.
1681          */
1682         dd_realloc_state(state,f,nalloc);
1683     }
1684 }
1685
1686 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1687                                        rvec *v,rvec *lv)
1688 {
1689     gmx_domdec_master_t *ma;
1690     int  n,i,c,a,nalloc=0;
1691     rvec *buf=NULL;
1692     
1693     if (DDMASTER(dd))
1694     {
1695         ma  = dd->ma;
1696         
1697         for(n=0; n<dd->nnodes; n++)
1698         {
1699             if (n != dd->rank)
1700             {
1701                 if (ma->nat[n] > nalloc)
1702                 {
1703                     nalloc = over_alloc_dd(ma->nat[n]);
1704                     srenew(buf,nalloc);
1705                 }
1706                 /* Use lv as a temporary buffer */
1707                 a = 0;
1708                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1709                 {
1710                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1711                     {
1712                         copy_rvec(v[c],buf[a++]);
1713                     }
1714                 }
1715                 if (a != ma->nat[n])
1716                 {
1717                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1718                               a,ma->nat[n]);
1719                 }
1720                 
1721 #ifdef GMX_MPI
1722                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1723                          DDRANK(dd,n),n,dd->mpi_comm_all);
1724 #endif
1725             }
1726         }
1727         sfree(buf);
1728         n = DDMASTERRANK(dd);
1729         a = 0;
1730         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1731         {
1732             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1733             {
1734                 copy_rvec(v[c],lv[a++]);
1735             }
1736         }
1737     }
1738     else
1739     {
1740 #ifdef GMX_MPI
1741         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1742                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1743 #endif
1744     }
1745 }
1746
1747 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1748                                        rvec *v,rvec *lv)
1749 {
1750     gmx_domdec_master_t *ma;
1751     int  *scounts=NULL,*disps=NULL;
1752     int  n,i,c,a,nalloc=0;
1753     rvec *buf=NULL;
1754     
1755     if (DDMASTER(dd))
1756     {
1757         ma  = dd->ma;
1758      
1759         get_commbuffer_counts(dd,&scounts,&disps);
1760
1761         buf = ma->vbuf;
1762         a = 0;
1763         for(n=0; n<dd->nnodes; n++)
1764         {
1765             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1766             {
1767                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1768                 {
1769                     copy_rvec(v[c],buf[a++]);
1770                 }
1771             }
1772         }
1773     }
1774
1775     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1776 }
1777
1778 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1779 {
1780     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1781     {
1782         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1783     }
1784     else
1785     {
1786         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1787     }
1788 }
1789
1790 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1791                                 t_state *state,t_state *state_local,
1792                                 rvec **f)
1793 {
1794     int  i,j,nh;
1795
1796     nh = state->nhchainlength;
1797
1798     if (DDMASTER(dd))
1799     {
1800         for(i=0;i<efptNR;i++)
1801         {
1802             state_local->lambda[i] = state->lambda[i];
1803         }
1804         state_local->fep_state = state->fep_state;
1805         state_local->veta   = state->veta;
1806         state_local->vol0   = state->vol0;
1807         copy_mat(state->box,state_local->box);
1808         copy_mat(state->box_rel,state_local->box_rel);
1809         copy_mat(state->boxv,state_local->boxv);
1810         copy_mat(state->svir_prev,state_local->svir_prev);
1811         copy_mat(state->fvir_prev,state_local->fvir_prev);
1812         for(i=0; i<state_local->ngtc; i++)
1813         {
1814             for(j=0; j<nh; j++) {
1815                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1816                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1817             }
1818             state_local->therm_integral[i] = state->therm_integral[i];
1819         }
1820         for(i=0; i<state_local->nnhpres; i++)
1821         {
1822             for(j=0; j<nh; j++) {
1823                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1824                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1825             }
1826         }
1827     }
1828     dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
1829     dd_bcast(dd,sizeof(int),&state_local->fep_state);
1830     dd_bcast(dd,sizeof(real),&state_local->veta);
1831     dd_bcast(dd,sizeof(real),&state_local->vol0);
1832     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1833     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1834     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1835     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1836     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1837     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1838     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1839     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1840     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1841     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1842
1843     if (dd->nat_home > state_local->nalloc)
1844     {
1845         dd_realloc_state(state_local,f,dd->nat_home);
1846     }
1847     for(i=0; i<estNR; i++)
1848     {
1849         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1850         {
1851             switch (i) {
1852             case estX:
1853                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1854                 break;
1855             case estV:
1856                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1857                 break;
1858             case estSDX:
1859                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1860                 break;
1861             case estCGP:
1862                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1863                 break;
1864             case estLD_RNG:
1865                 if (state->nrngi == 1)
1866                 {
1867                     dd_bcastc(dd,
1868                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1869                               state->ld_rng,state_local->ld_rng);
1870                 }
1871                 else
1872                 {
1873                     dd_scatter(dd,
1874                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1875                                state->ld_rng,state_local->ld_rng);
1876                 }
1877                 break;
1878             case estLD_RNGI:
1879                 if (state->nrngi == 1)
1880                 {
1881                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1882                               state->ld_rngi,state_local->ld_rngi);
1883                 }
1884                 else
1885                 {
1886                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1887                                state->ld_rngi,state_local->ld_rngi);
1888                 }   
1889                 break;
1890             case estDISRE_INITF:
1891             case estDISRE_RM3TAV:
1892             case estORIRE_INITF:
1893             case estORIRE_DTAV:
1894                 /* Not implemented yet */
1895                 break;
1896             default:
1897                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1898             }
1899         }
1900     }
1901 }
1902
1903 static char dim2char(int dim)
1904 {
1905     char c='?';
1906     
1907     switch (dim)
1908     {
1909     case XX: c = 'X'; break;
1910     case YY: c = 'Y'; break;
1911     case ZZ: c = 'Z'; break;
1912     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1913     }
1914     
1915     return c;
1916 }
1917
1918 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1919                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1920 {
1921     rvec grid_s[2],*grid_r=NULL,cx,r;
1922     char fname[STRLEN],format[STRLEN],buf[22];
1923     FILE *out;
1924     int  a,i,d,z,y,x;
1925     matrix tric;
1926     real vol;
1927
1928     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1929     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1930     
1931     if (DDMASTER(dd))
1932     {
1933         snew(grid_r,2*dd->nnodes);
1934     }
1935     
1936     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1937     
1938     if (DDMASTER(dd))
1939     {
1940         for(d=0; d<DIM; d++)
1941         {
1942             for(i=0; i<DIM; i++)
1943             {
1944                 if (d == i)
1945                 {
1946                     tric[d][i] = 1;
1947                 }
1948                 else
1949                 {
1950                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1951                     {
1952                         tric[d][i] = box[i][d]/box[i][i];
1953                     }
1954                     else
1955                     {
1956                         tric[d][i] = 0;
1957                     }
1958                 }
1959             }
1960         }
1961         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1962         sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1963         out = gmx_fio_fopen(fname,"w");
1964         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1965         a = 1;
1966         for(i=0; i<dd->nnodes; i++)
1967         {
1968             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1969             for(d=0; d<DIM; d++)
1970             {
1971                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1972             }
1973             for(z=0; z<2; z++)
1974             {
1975                 for(y=0; y<2; y++)
1976                 {
1977                     for(x=0; x<2; x++)
1978                     {
1979                         cx[XX] = grid_r[i*2+x][XX];
1980                         cx[YY] = grid_r[i*2+y][YY];
1981                         cx[ZZ] = grid_r[i*2+z][ZZ];
1982                         mvmul(tric,cx,r);
1983                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1984                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1985                     }
1986                 }
1987             }
1988             for(d=0; d<DIM; d++)
1989             {
1990                 for(x=0; x<4; x++)
1991                 {
1992                     switch(d)
1993                     {
1994                     case 0: y = 1 + i*8 + 2*x; break;
1995                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1996                     case 2: y = 1 + i*8 + x; break;
1997                     }
1998                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1999                 }
2000             }
2001         }
2002         gmx_fio_fclose(out);
2003         sfree(grid_r);
2004     }
2005 }
2006
2007 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
2008                   gmx_mtop_t *mtop,t_commrec *cr,
2009                   int natoms,rvec x[],matrix box)
2010 {
2011     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
2012     FILE *out;
2013     int  i,ii,resnr,c;
2014     char *atomname,*resname;
2015     real b;
2016     gmx_domdec_t *dd;
2017     
2018     dd = cr->dd;
2019     if (natoms == -1)
2020     {
2021         natoms = dd->comm->nat[ddnatVSITE];
2022     }
2023     
2024     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
2025     
2026     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
2027     sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");
2028     
2029     out = gmx_fio_fopen(fname,"w");
2030     
2031     fprintf(out,"TITLE     %s\n",title);
2032     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
2033     for(i=0; i<natoms; i++)
2034     {
2035         ii = dd->gatindex[i];
2036         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
2037         if (i < dd->comm->nat[ddnatZONE])
2038         {
2039             c = 0;
2040             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2041             {
2042                 c++;
2043             }
2044             b = c;
2045         }
2046         else if (i < dd->comm->nat[ddnatVSITE])
2047         {
2048             b = dd->comm->zones.n;
2049         }
2050         else
2051         {
2052             b = dd->comm->zones.n + 1;
2053         }
2054         fprintf(out,strlen(atomname)<4 ? format : format4,
2055                 "ATOM",(ii+1)%100000,
2056                 atomname,resname,' ',resnr%10000,' ',
2057                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
2058     }
2059     fprintf(out,"TER\n");
2060     
2061     gmx_fio_fclose(out);
2062 }
2063
2064 real dd_cutoff_mbody(gmx_domdec_t *dd)
2065 {
2066     gmx_domdec_comm_t *comm;
2067     int  di;
2068     real r;
2069
2070     comm = dd->comm;
2071
2072     r = -1;
2073     if (comm->bInterCGBondeds)
2074     {
2075         if (comm->cutoff_mbody > 0)
2076         {
2077             r = comm->cutoff_mbody;
2078         }
2079         else
2080         {
2081             /* cutoff_mbody=0 means we do not have DLB */
2082             r = comm->cellsize_min[dd->dim[0]];
2083             for(di=1; di<dd->ndim; di++)
2084             {
2085                 r = min(r,comm->cellsize_min[dd->dim[di]]);
2086             }
2087             if (comm->bBondComm)
2088             {
2089                 r = max(r,comm->cutoff_mbody);
2090             }
2091             else
2092             {
2093                 r = min(r,comm->cutoff);
2094             }
2095         }
2096     }
2097
2098     return r;
2099 }
2100
2101 real dd_cutoff_twobody(gmx_domdec_t *dd)
2102 {
2103     real r_mb;
2104
2105     r_mb = dd_cutoff_mbody(dd);
2106
2107     return max(dd->comm->cutoff,r_mb);
2108 }
2109
2110
2111 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2112 {
2113     int nc,ntot;
2114     
2115     nc   = dd->nc[dd->comm->cartpmedim];
2116     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2117     copy_ivec(coord,coord_pme);
2118     coord_pme[dd->comm->cartpmedim] =
2119         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2120 }
2121
/* Map DD index ddindex, out of ndd DD nodes, onto one of npme PME nodes.
 * This assumes that the major index of both the DD and the PME nodes is x.
 * The term npme/2 is added before the integer division by ndd
 * to obtain an even distribution.
 */
static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
{
    int scaled;

    scaled = ddindex*npme + npme/2;

    return scaled/ndd;
}
2130
2131 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2132 {
2133     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2134 }
2135
2136 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2137 {
2138     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2139 }
2140
2141 static int *dd_pmenodes(t_commrec *cr)
2142 {
2143     int *pmenodes;
2144     int n,i,p0,p1;
2145     
2146     snew(pmenodes,cr->npmenodes);
2147     n = 0;
2148     for(i=0; i<cr->dd->nnodes; i++) {
2149         p0 = cr_ddindex2pmeindex(cr,i);
2150         p1 = cr_ddindex2pmeindex(cr,i+1);
2151         if (i+1 == cr->dd->nnodes || p1 > p0) {
2152             if (debug)
2153                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2154             pmenodes[n] = i + 1 + n;
2155             n++;
2156         }
2157     }
2158
2159     return pmenodes;
2160 }
2161
2162 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2163 {
2164     gmx_domdec_t *dd;
2165     ivec coords,coords_pme,nc;
2166     int  slab;
2167     
2168     dd = cr->dd;
2169     /*
2170       if (dd->comm->bCartesian) {
2171       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2172       dd_coords2pmecoords(dd,coords,coords_pme);
2173       copy_ivec(dd->ntot,nc);
2174       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2175       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2176       
2177       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2178       } else {
2179       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2180       }
2181     */
2182     coords[XX] = x;
2183     coords[YY] = y;
2184     coords[ZZ] = z;
2185     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2186     
2187     return slab;
2188 }
2189
2190 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2191 {
2192     gmx_domdec_comm_t *comm;
2193     ivec coords;
2194     int  ddindex,nodeid=-1;
2195     
2196     comm = cr->dd->comm;
2197     
2198     coords[XX] = x;
2199     coords[YY] = y;
2200     coords[ZZ] = z;
2201     if (comm->bCartesianPP_PME)
2202     {
2203 #ifdef GMX_MPI
2204         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2205 #endif
2206     }
2207     else
2208     {
2209         ddindex = dd_index(cr->dd->nc,coords);
2210         if (comm->bCartesianPP)
2211         {
2212             nodeid = comm->ddindex2simnodeid[ddindex];
2213         }
2214         else
2215         {
2216             if (comm->pmenodes)
2217             {
2218                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2219             }
2220             else
2221             {
2222                 nodeid = ddindex;
2223             }
2224         }
2225     }
2226   
2227     return nodeid;
2228 }
2229
2230 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2231 {
2232     gmx_domdec_t *dd;
2233     gmx_domdec_comm_t *comm;
2234     ivec coord,coord_pme;
2235     int  i;
2236     int  pmenode=-1;
2237     
2238     dd = cr->dd;
2239     comm = dd->comm;
2240     
2241     /* This assumes a uniform x domain decomposition grid cell size */
2242     if (comm->bCartesianPP_PME)
2243     {
2244 #ifdef GMX_MPI
2245         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2246         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2247         {
2248             /* This is a PP node */
2249             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2250             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2251         }
2252 #endif
2253     }
2254     else if (comm->bCartesianPP)
2255     {
2256         if (sim_nodeid < dd->nnodes)
2257         {
2258             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2259         }
2260     }
2261     else
2262     {
2263         /* This assumes DD cells with identical x coordinates
2264          * are numbered sequentially.
2265          */
2266         if (dd->comm->pmenodes == NULL)
2267         {
2268             if (sim_nodeid < dd->nnodes)
2269             {
2270                 /* The DD index equals the nodeid */
2271                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2272             }
2273         }
2274         else
2275         {
2276             i = 0;
2277             while (sim_nodeid > dd->comm->pmenodes[i])
2278             {
2279                 i++;
2280             }
2281             if (sim_nodeid < dd->comm->pmenodes[i])
2282             {
2283                 pmenode = dd->comm->pmenodes[i];
2284             }
2285         }
2286     }
2287     
2288     return pmenode;
2289 }
2290
2291 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2292 {
2293     gmx_bool bPMEOnlyNode;
2294     
2295     if (DOMAINDECOMP(cr))
2296     {
2297         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2298     }
2299     else
2300     {
2301         bPMEOnlyNode = FALSE;
2302     }
2303     
2304     return bPMEOnlyNode;
2305 }
2306
2307 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2308                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2309 {
2310     gmx_domdec_t *dd;
2311     int x,y,z;
2312     ivec coord,coord_pme;
2313     
2314     dd = cr->dd;
2315     
2316     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2317     
2318     *nmy_ddnodes = 0;
2319     for(x=0; x<dd->nc[XX]; x++)
2320     {
2321         for(y=0; y<dd->nc[YY]; y++)
2322         {
2323             for(z=0; z<dd->nc[ZZ]; z++)
2324             {
2325                 if (dd->comm->bCartesianPP_PME)
2326                 {
2327                     coord[XX] = x;
2328                     coord[YY] = y;
2329                     coord[ZZ] = z;
2330                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2331                     if (dd->ci[XX] == coord_pme[XX] &&
2332                         dd->ci[YY] == coord_pme[YY] &&
2333                         dd->ci[ZZ] == coord_pme[ZZ])
2334                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2335                 }
2336                 else
2337                 {
2338                     /* The slab corresponds to the nodeid in the PME group */
2339                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2340                     {
2341                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2342                     }
2343                 }
2344             }
2345         }
2346     }
2347     
2348     /* The last PP-only node is the peer node */
2349     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2350     
2351     if (debug)
2352     {
2353         fprintf(debug,"Receive coordinates from PP nodes:");
2354         for(x=0; x<*nmy_ddnodes; x++)
2355         {
2356             fprintf(debug," %d",(*my_ddnodes)[x]);
2357         }
2358         fprintf(debug,"\n");
2359     }
2360 }
2361
2362 static gmx_bool receive_vir_ener(t_commrec *cr)
2363 {
2364     gmx_domdec_comm_t *comm;
2365     int  pmenode,coords[DIM],rank;
2366     gmx_bool bReceive;
2367     
2368     bReceive = TRUE;
2369     if (cr->npmenodes < cr->dd->nnodes)
2370     {
2371         comm = cr->dd->comm;
2372         if (comm->bCartesianPP_PME)
2373         {
2374             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2375 #ifdef GMX_MPI
2376             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2377             coords[comm->cartpmedim]++;
2378             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2379             {
2380                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2381                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2382                 {
2383                     /* This is not the last PP node for pmenode */
2384                     bReceive = FALSE;
2385                 }
2386             }
2387 #endif  
2388         }
2389         else
2390         {
2391             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2392             if (cr->sim_nodeid+1 < cr->nnodes &&
2393                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2394             {
2395                 /* This is not the last PP node for pmenode */
2396                 bReceive = FALSE;
2397             }
2398         }
2399     }
2400     
2401     return bReceive;
2402 }
2403
2404 static void set_zones_ncg_home(gmx_domdec_t *dd)
2405 {
2406     gmx_domdec_zones_t *zones;
2407     int i;
2408
2409     zones = &dd->comm->zones;
2410
2411     zones->cg_range[0] = 0;
2412     for(i=1; i<zones->n+1; i++)
2413     {
2414         zones->cg_range[i] = dd->ncg_home;
2415     }
2416 }
2417
2418 static void rebuild_cgindex(gmx_domdec_t *dd,
2419                             const int *gcgs_index,t_state *state)
2420 {
2421     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2422     
2423     ind = state->cg_gl;
2424     dd_cg_gl = dd->index_gl;
2425     cgindex  = dd->cgindex;
2426     nat = 0;
2427     cgindex[0] = nat;
2428     for(i=0; i<state->ncg_gl; i++)
2429     {
2430         cgindex[i] = nat;
2431         cg_gl = ind[i];
2432         dd_cg_gl[i] = cg_gl;
2433         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2434     }
2435     cgindex[i] = nat;
2436     
2437     dd->ncg_home = state->ncg_gl;
2438     dd->nat_home = nat;
2439
2440     set_zones_ncg_home(dd);
2441 }
2442
2443 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2444 {
2445     while (cg >= cginfo_mb->cg_end)
2446     {
2447         cginfo_mb++;
2448     }
2449
2450     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2451 }
2452
2453 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2454                           t_forcerec *fr,char *bLocalCG)
2455 {
2456     cginfo_mb_t *cginfo_mb;
2457     int *cginfo;
2458     int cg;
2459
2460     if (fr != NULL)
2461     {
2462         cginfo_mb = fr->cginfo_mb;
2463         cginfo    = fr->cginfo;
2464
2465         for(cg=cg0; cg<cg1; cg++)
2466         {
2467             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2468         }
2469     }
2470
2471     if (bLocalCG != NULL)
2472     {
2473         for(cg=cg0; cg<cg1; cg++)
2474         {
2475             bLocalCG[index_gl[cg]] = TRUE;
2476         }
2477     }
2478 }
2479
2480 static void make_dd_indices(gmx_domdec_t *dd,
2481                             const int *gcgs_index,int cg_start)
2482 {
2483     int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl;
2484     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2485     gmx_ga2la_t *ga2la;
2486     char *bLocalCG;
2487     gmx_bool bCGs;
2488
2489     bLocalCG = dd->comm->bLocalCG;
2490
2491     if (dd->nat_tot > dd->gatindex_nalloc)
2492     {
2493         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2494         srenew(dd->gatindex,dd->gatindex_nalloc);
2495     }
2496
2497     nzone      = dd->comm->zones.n;
2498     zone2cg    = dd->comm->zones.cg_range;
2499     zone_ncg1  = dd->comm->zone_ncg1;
2500     index_gl   = dd->index_gl;
2501     gatindex   = dd->gatindex;
2502     bCGs       = dd->comm->bCGs;
2503
2504     if (zone2cg[1] != dd->ncg_home)
2505     {
2506         gmx_incons("dd->ncg_zone is not up to date");
2507     }
2508     
2509     /* Make the local to global and global to local atom index */
2510     a = dd->cgindex[cg_start];
2511     for(zone=0; zone<nzone; zone++)
2512     {
2513         if (zone == 0)
2514         {
2515             cg0 = cg_start;
2516         }
2517         else
2518         {
2519             cg0 = zone2cg[zone];
2520         }
2521         cg1    = zone2cg[zone+1];
2522         cg1_p1 = cg0 + zone_ncg1[zone];
2523
2524         for(cg=cg0; cg<cg1; cg++)
2525         {
2526             zone1 = zone;
2527             if (cg >= cg1_p1)
2528             {
2529                 /* Signal that this cg is from more than one pulse away */
2530                 zone1 += nzone;
2531             }
2532             cg_gl = index_gl[cg];
2533             if (bCGs)
2534             {
2535                 for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2536                 {
2537                     gatindex[a] = a_gl;
2538                     ga2la_set(dd->ga2la,a_gl,a,zone1);
2539                     a++;
2540                 }
2541             }
2542             else
2543             {
2544                 gatindex[a] = cg_gl;
2545                 ga2la_set(dd->ga2la,cg_gl,a,zone1);
2546                 a++;
2547             }
2548         }
2549     }
2550 }
2551
2552 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2553                           const char *where)
2554 {
2555     int ncg,i,ngl,nerr;
2556
2557     nerr = 0;
2558     if (bLocalCG == NULL)
2559     {
2560         return nerr;
2561     }
2562     for(i=0; i<dd->ncg_tot; i++)
2563     {
2564         if (!bLocalCG[dd->index_gl[i]])
2565         {
2566             fprintf(stderr,
2567                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2568             nerr++;
2569         }
2570     }
2571     ngl = 0;
2572     for(i=0; i<ncg_sys; i++)
2573     {
2574         if (bLocalCG[i])
2575         {
2576             ngl++;
2577         }
2578     }
2579     if (ngl != dd->ncg_tot)
2580     {
2581         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2582         nerr++;
2583     }
2584
2585     return nerr;
2586 }
2587
2588 static void check_index_consistency(gmx_domdec_t *dd,
2589                                     int natoms_sys,int ncg_sys,
2590                                     const char *where)
2591 {
2592     int  nerr,ngl,i,a,cell;
2593     int  *have;
2594
2595     nerr = 0;
2596
2597     if (dd->comm->DD_debug > 1)
2598     {
2599         snew(have,natoms_sys);
2600         for(a=0; a<dd->nat_tot; a++)
2601         {
2602             if (have[dd->gatindex[a]] > 0)
2603             {
2604                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2605             }
2606             else
2607             {
2608                 have[dd->gatindex[a]] = a + 1;
2609             }
2610         }
2611         sfree(have);
2612     }
2613
2614     snew(have,dd->nat_tot);
2615
2616     ngl  = 0;
2617     for(i=0; i<natoms_sys; i++)
2618     {
2619         if (ga2la_get(dd->ga2la,i,&a,&cell))
2620         {
2621             if (a >= dd->nat_tot)
2622             {
2623                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2624                 nerr++;
2625             }
2626             else
2627             {
2628                 have[a] = 1;
2629                 if (dd->gatindex[a] != i)
2630                 {
2631                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2632                     nerr++;
2633                 }
2634             }
2635             ngl++;
2636         }
2637     }
2638     if (ngl != dd->nat_tot)
2639     {
2640         fprintf(stderr,
2641                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2642                 dd->rank,where,ngl,dd->nat_tot);
2643     }
2644     for(a=0; a<dd->nat_tot; a++)
2645     {
2646         if (have[a] == 0)
2647         {
2648             fprintf(stderr,
2649                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2650                     dd->rank,where,a+1,dd->gatindex[a]+1);
2651         }
2652     }
2653     sfree(have);
2654
2655     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2656
2657     if (nerr > 0) {
2658         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2659                   dd->rank,where,nerr);
2660     }
2661 }
2662
2663 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2664 {
2665     int  i;
2666     char *bLocalCG;
2667
2668     if (a_start == 0)
2669     {
2670         /* Clear the whole list without searching */
2671         ga2la_clear(dd->ga2la);
2672     }
2673     else
2674     {
2675         for(i=a_start; i<dd->nat_tot; i++)
2676         {
2677             ga2la_del(dd->ga2la,dd->gatindex[i]);
2678         }
2679     }
2680
2681     bLocalCG = dd->comm->bLocalCG;
2682     if (bLocalCG)
2683     {
2684         for(i=cg_start; i<dd->ncg_tot; i++)
2685         {
2686             bLocalCG[dd->index_gl[i]] = FALSE;
2687         }
2688     }
2689
2690     dd_clear_local_vsite_indices(dd);
2691     
2692     if (dd->constraints)
2693     {
2694         dd_clear_local_constraint_indices(dd);
2695     }
2696 }
2697
2698 static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
2699                             int dim_ind)
2700 {
2701     real grid_jump_limit;
2702
2703     /* The distance between the boundaries of cells at distance
2704      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2705      * and by the fact that cells should not be shifted by more than
2706      * half their size, such that cg's only shift by one cell
2707      * at redecomposition.
2708      */
2709     grid_jump_limit = comm->cellsize_limit;
2710     if (!comm->bVacDLBNoLimit)
2711     {
2712         grid_jump_limit = max(grid_jump_limit,
2713                               cutoff/comm->cd[dim_ind].np);
2714     }
2715
2716     return grid_jump_limit;
2717 }
2718
2719 static gmx_bool check_grid_jump(gmx_large_int_t step,
2720                                 gmx_domdec_t *dd,
2721                                 real cutoff,
2722                                 gmx_ddbox_t *ddbox,
2723                                 gmx_bool bFatal)
2724 {
2725     gmx_domdec_comm_t *comm;
2726     int  d,dim;
2727     real limit,bfac;
2728     gmx_bool bInvalid;
2729
2730     bInvalid = FALSE;
2731
2732     comm = dd->comm;
2733     
2734     for(d=1; d<dd->ndim; d++)
2735     {
2736         dim = dd->dim[d];
2737         limit = grid_jump_limit(comm,cutoff,d);
2738         bfac = ddbox->box_size[dim];
2739         if (ddbox->tric_dir[dim])
2740         {
2741             bfac *= ddbox->skew_fac[dim];
2742         }
2743         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2744             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2745         {
2746             bInvalid = TRUE;
2747
2748             if (bFatal)
2749             {
2750                 char buf[22];
2751
2752                 /* This error should never be triggered under normal
2753                  * circumstances, but you never know ...
2754                  */
2755                 gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2756                           gmx_step_str(step,buf),
2757                           dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2758             }
2759         }
2760     }
2761
2762     return bInvalid;
2763 }
2764
2765 static int dd_load_count(gmx_domdec_comm_t *comm)
2766 {
2767     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2768 }
2769
2770 static float dd_force_load(gmx_domdec_comm_t *comm)
2771 {
2772     float load;
2773     
2774     if (comm->eFlop)
2775     {
2776         load = comm->flop;
2777         if (comm->eFlop > 1)
2778         {
2779             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2780         }
2781     } 
2782     else
2783     {
2784         load = comm->cycl[ddCyclF];
2785         if (comm->cycl_n[ddCyclF] > 1)
2786         {
2787             /* Subtract the maximum of the last n cycle counts
2788              * to get rid of possible high counts due to other soures,
2789              * for instance system activity, that would otherwise
2790              * affect the dynamic load balancing.
2791              */
2792             load -= comm->cycl_max[ddCyclF];
2793         }
2794     }
2795     
2796     return load;
2797 }
2798
2799 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2800 {
2801     gmx_domdec_comm_t *comm;
2802     int i;
2803     
2804     comm = dd->comm;
2805     
2806     snew(*dim_f,dd->nc[dim]+1);
2807     (*dim_f)[0] = 0;
2808     for(i=1; i<dd->nc[dim]; i++)
2809     {
2810         if (comm->slb_frac[dim])
2811         {
2812             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2813         }
2814         else
2815         {
2816             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2817         }
2818     }
2819     (*dim_f)[dd->nc[dim]] = 1;
2820 }
2821
2822 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2823 {
2824     int  pmeindex,slab,nso,i;
2825     ivec xyz;
2826     
2827     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2828     {
2829         ddpme->dim = YY;
2830     }
2831     else
2832     {
2833         ddpme->dim = dimind;
2834     }
2835     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2836     
2837     ddpme->nslab = (ddpme->dim == 0 ?
2838                     dd->comm->npmenodes_x :
2839                     dd->comm->npmenodes_y);
2840
2841     if (ddpme->nslab <= 1)
2842     {
2843         return;
2844     }
2845
2846     nso = dd->comm->npmenodes/ddpme->nslab;
2847     /* Determine for each PME slab the PP location range for dimension dim */
2848     snew(ddpme->pp_min,ddpme->nslab);
2849     snew(ddpme->pp_max,ddpme->nslab);
2850     for(slab=0; slab<ddpme->nslab; slab++) {
2851         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2852         ddpme->pp_max[slab] = 0;
2853     }
2854     for(i=0; i<dd->nnodes; i++) {
2855         ddindex2xyz(dd->nc,i,xyz);
2856         /* For y only use our y/z slab.
2857          * This assumes that the PME x grid size matches the DD grid size.
2858          */
2859         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2860             pmeindex = ddindex2pmeindex(dd,i);
2861             if (dimind == 0) {
2862                 slab = pmeindex/nso;
2863             } else {
2864                 slab = pmeindex % ddpme->nslab;
2865             }
2866             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2867             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2868         }
2869     }
2870
2871     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2872 }
2873
2874 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2875 {
2876     if (dd->comm->ddpme[0].dim == XX)
2877     {
2878         return dd->comm->ddpme[0].maxshift;
2879     }
2880     else
2881     {
2882         return 0;
2883     }
2884 }
2885
2886 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2887 {
2888     if (dd->comm->ddpme[0].dim == YY)
2889     {
2890         return dd->comm->ddpme[0].maxshift;
2891     }
2892     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2893     {
2894         return dd->comm->ddpme[1].maxshift;
2895     }
2896     else
2897     {
2898         return 0;
2899     }
2900 }
2901
2902 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2903                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2904 {
2905     gmx_domdec_comm_t *comm;
2906     int  nc,ns,s;
2907     int  *xmin,*xmax;
2908     real range,pme_boundary;
2909     int  sh;
2910     
2911     comm = dd->comm;
2912     nc  = dd->nc[ddpme->dim];
2913     ns  = ddpme->nslab;
2914     
2915     if (!ddpme->dim_match)
2916     {
2917         /* PP decomposition is not along dim: the worst situation */
2918         sh = ns/2;
2919     }
2920     else if (ns <= 3 || (bUniform && ns == nc))
2921     {
2922         /* The optimal situation */
2923         sh = 1;
2924     }
2925     else
2926     {
2927         /* We need to check for all pme nodes which nodes they
2928          * could possibly need to communicate with.
2929          */
2930         xmin = ddpme->pp_min;
2931         xmax = ddpme->pp_max;
2932         /* Allow for atoms to be maximally 2/3 times the cut-off
2933          * out of their DD cell. This is a reasonable balance between
2934          * between performance and support for most charge-group/cut-off
2935          * combinations.
2936          */
2937         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2938         /* Avoid extra communication when we are exactly at a boundary */
2939         range *= 0.999;
2940         
2941         sh = 1;
2942         for(s=0; s<ns; s++)
2943         {
2944             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2945             pme_boundary = (real)s/ns;
2946             while (sh+1 < ns &&
2947                    ((s-(sh+1) >= 0 &&
2948                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2949                     (s-(sh+1) <  0 &&
2950                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2951             {
2952                 sh++;
2953             }
2954             pme_boundary = (real)(s+1)/ns;
2955             while (sh+1 < ns &&
2956                    ((s+(sh+1) <  ns &&
2957                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2958                     (s+(sh+1) >= ns &&
2959                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2960             {
2961                 sh++;
2962             }
2963         }
2964     }
2965     
2966     ddpme->maxshift = sh;
2967     
2968     if (debug)
2969     {
2970         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2971                 ddpme->dim,ddpme->maxshift);
2972     }
2973 }
2974
2975 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2976 {
2977     int d,dim;
2978     
2979     for(d=0; d<dd->ndim; d++)
2980     {
2981         dim = dd->dim[d];
2982         if (dim < ddbox->nboundeddim &&
2983             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2984             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2985         {
2986             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2987                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2988                       dd->nc[dim],dd->comm->cellsize_limit);
2989         }
2990     }
2991 }
2992
2993 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2994                                   gmx_bool bMaster,ivec npulse)
2995 {
2996     gmx_domdec_comm_t *comm;
2997     int  d,j;
2998     rvec cellsize_min;
2999     real *cell_x,cell_dx,cellsize;
3000     
3001     comm = dd->comm;
3002     
3003     for(d=0; d<DIM; d++)
3004     {
3005         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3006         npulse[d] = 1;
3007         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3008         {
3009             /* Uniform grid */
3010             cell_dx = ddbox->box_size[d]/dd->nc[d];
3011             if (bMaster)
3012             {
3013                 for(j=0; j<dd->nc[d]+1; j++)
3014                 {
3015                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3016                 }
3017             }
3018             else
3019             {
3020                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3021                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3022             }
3023             cellsize = cell_dx*ddbox->skew_fac[d];
3024             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3025             {
3026                 npulse[d]++;
3027             }
3028             cellsize_min[d] = cellsize;
3029         }
3030         else
3031         {
3032             /* Statically load balanced grid */
3033             /* Also when we are not doing a master distribution we determine
3034              * all cell borders in a loop to obtain identical values
3035              * to the master distribution case and to determine npulse.
3036              */
3037             if (bMaster)
3038             {
3039                 cell_x = dd->ma->cell_x[d];
3040             }
3041             else
3042             {
3043                 snew(cell_x,dd->nc[d]+1);
3044             }
3045             cell_x[0] = ddbox->box0[d];
3046             for(j=0; j<dd->nc[d]; j++)
3047             {
3048                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
3049                 cell_x[j+1] = cell_x[j] + cell_dx;
3050                 cellsize = cell_dx*ddbox->skew_fac[d];
3051                 while (cellsize*npulse[d] < comm->cutoff &&
3052                        npulse[d] < dd->nc[d]-1)
3053                 {
3054                     npulse[d]++;
3055                 }
3056                 cellsize_min[d] = min(cellsize_min[d],cellsize);
3057             }
3058             if (!bMaster)
3059             {
3060                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3061                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3062                 sfree(cell_x);
3063             }
3064         }
3065         /* The following limitation is to avoid that a cell would receive
3066          * some of its own home charge groups back over the periodic boundary.
3067          * Double charge groups cause trouble with the global indices.
3068          */
3069         if (d < ddbox->npbcdim &&
3070             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3071         {
3072             gmx_fatal_collective(FARGS,NULL,dd,
3073                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3074                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
3075                                  comm->cutoff,
3076                                  dd->nc[d],dd->nc[d],
3077                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
3078         }
3079     }
3080     
3081     if (!comm->bDynLoadBal)
3082     {
3083         copy_rvec(cellsize_min,comm->cellsize_min);
3084     }
3085    
3086     for(d=0; d<comm->npmedecompdim; d++)
3087     {
3088         set_pme_maxshift(dd,&comm->ddpme[d],
3089                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
3090                          comm->ddpme[d].slb_dim_f);
3091     }
3092 }
3093
3094
3095 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3096                                        int d,int dim,gmx_domdec_root_t *root,
3097                                        gmx_ddbox_t *ddbox,
3098                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
3099 {
3100     gmx_domdec_comm_t *comm;
3101     int  ncd,i,j,nmin,nmin_old;
3102     gmx_bool bLimLo,bLimHi;
3103     real *cell_size;
3104     real fac,halfway,cellsize_limit_f_i,region_size;
3105     gmx_bool bPBC,bLastHi=FALSE;
3106     int nrange[]={range[0],range[1]};
3107
3108     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
3109
3110     comm = dd->comm;
3111
3112     ncd = dd->nc[dim];
3113
3114     bPBC = (dim < ddbox->npbcdim);
3115
3116     cell_size = root->buf_ncd;
3117
3118     if (debug) 
3119     {
3120         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
3121     }
3122
3123     /* First we need to check if the scaling does not make cells
3124      * smaller than the smallest allowed size.
3125      * We need to do this iteratively, since if a cell is too small,
3126      * it needs to be enlarged, which makes all the other cells smaller,
3127      * which could in turn make another cell smaller than allowed.
3128      */
3129     for(i=range[0]; i<range[1]; i++)
3130     {
3131         root->bCellMin[i] = FALSE;
3132     }
3133     nmin = 0;
3134     do
3135     {
3136         nmin_old = nmin;
3137         /* We need the total for normalization */
3138         fac = 0;
3139         for(i=range[0]; i<range[1]; i++)
3140         {
3141             if (root->bCellMin[i] == FALSE)
3142             {
3143                 fac += cell_size[i];
3144             }
3145         }
3146         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3147         /* Determine the cell boundaries */
3148         for(i=range[0]; i<range[1]; i++)
3149         {
3150             if (root->bCellMin[i] == FALSE)
3151             {
3152                 cell_size[i] *= fac;
3153                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3154                 {
3155                     cellsize_limit_f_i = 0;
3156                 }
3157                 else
3158                 {
3159                     cellsize_limit_f_i = cellsize_limit_f;
3160                 }
3161                 if (cell_size[i] < cellsize_limit_f_i)
3162                 {
3163                     root->bCellMin[i] = TRUE;
3164                     cell_size[i] = cellsize_limit_f_i;
3165                     nmin++;
3166                 }
3167             }
3168             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3169         }
3170     }
3171     while (nmin > nmin_old);
3172     
3173     i=range[1]-1;
3174     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3175     /* For this check we should not use DD_CELL_MARGIN,
3176      * but a slightly smaller factor,
3177      * since rounding could get use below the limit.
3178      */
3179     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3180     {
3181         char buf[22];
3182         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3183                   gmx_step_str(step,buf),
3184                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3185                   ncd,comm->cellsize_min[dim]);
3186     }
3187     
3188     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3189     
3190     if (!bUniform)
3191     {
3192         /* Check if the boundary did not displace more than halfway
3193          * each of the cells it bounds, as this could cause problems,
3194          * especially when the differences between cell sizes are large.
3195          * If changes are applied, they will not make cells smaller
3196          * than the cut-off, as we check all the boundaries which
3197          * might be affected by a change and if the old state was ok,
3198          * the cells will at most be shrunk back to their old size.
3199          */
3200         for(i=range[0]+1; i<range[1]; i++)
3201         {
3202             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3203             if (root->cell_f[i] < halfway)
3204             {
3205                 root->cell_f[i] = halfway;
3206                 /* Check if the change also causes shifts of the next boundaries */
3207                 for(j=i+1; j<range[1]; j++)
3208                 {
3209                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3210                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3211                 }
3212             }
3213             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3214             if (root->cell_f[i] > halfway)
3215             {
3216                 root->cell_f[i] = halfway;
3217                 /* Check if the change also causes shifts of the next boundaries */
3218                 for(j=i-1; j>=range[0]+1; j--)
3219                 {
3220                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3221                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3222                 }
3223             }
3224         }
3225     }
3226     
3227     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3228     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3229      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3230      * for a and b nrange is used */
3231     if (d > 0)
3232     {
3233         /* Take care of the staggering of the cell boundaries */
3234         if (bUniform)
3235         {
3236             for(i=range[0]; i<range[1]; i++)
3237             {
3238                 root->cell_f_max0[i] = root->cell_f[i];
3239                 root->cell_f_min1[i] = root->cell_f[i+1];
3240             }
3241         }
3242         else
3243         {
3244             for(i=range[0]+1; i<range[1]; i++)
3245             {
3246                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3247                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3248                 if (bLimLo && bLimHi)
3249                 {
3250                     /* Both limits violated, try the best we can */
3251                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3252                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3253                     nrange[0]=range[0];
3254                     nrange[1]=i;
3255                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3256
3257                     nrange[0]=i;
3258                     nrange[1]=range[1];
3259                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3260
3261                     return;
3262                 }
3263                 else if (bLimLo)
3264                 {
3265                     /* root->cell_f[i] = root->bound_min[i]; */
3266                     nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3267                     bLastHi=FALSE;
3268                 }
3269                 else if (bLimHi && !bLastHi)
3270                 {
3271                     bLastHi=TRUE;
3272                     if (nrange[1] < range[1])   /* found a LimLo before */
3273                     {
3274                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3275                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3276                         nrange[0]=nrange[1];
3277                     }
3278                     root->cell_f[i] = root->bound_max[i];
3279                     nrange[1]=i; 
3280                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3281                     nrange[0]=i;
3282                     nrange[1]=range[1];
3283                 }
3284             }
3285             if (nrange[1] < range[1])   /* found last a LimLo */
3286             {
3287                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3288                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3289                 nrange[0]=nrange[1];
3290                 nrange[1]=range[1];
3291                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3292             } 
3293             else if (nrange[0] > range[0]) /* found at least one LimHi */
3294             {
3295                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3296             }
3297         }
3298     }
3299 }
3300
3301
3302 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3303                                        int d,int dim,gmx_domdec_root_t *root,
3304                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3305                                        gmx_bool bUniform,gmx_large_int_t step)
3306 {
3307     gmx_domdec_comm_t *comm;
3308     int  ncd,d1,i,j,pos;
3309     real *cell_size;
3310     real load_aver,load_i,imbalance,change,change_max,sc;
3311     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3312     real change_limit;
3313     real relax = 0.5;
3314     gmx_bool bPBC;
3315     int range[] = { 0, 0 };
3316
3317     comm = dd->comm;
3318
3319     /* Convert the maximum change from the input percentage to a fraction */
3320     change_limit = comm->dlb_scale_lim*0.01;
3321
3322     ncd = dd->nc[dim];
3323
3324     bPBC = (dim < ddbox->npbcdim);
3325
3326     cell_size = root->buf_ncd;
3327
3328     /* Store the original boundaries */
3329     for(i=0; i<ncd+1; i++)
3330     {
3331         root->old_cell_f[i] = root->cell_f[i];
3332     }
3333     if (bUniform) {
3334         for(i=0; i<ncd; i++)
3335         {
3336             cell_size[i] = 1.0/ncd;
3337         }
3338     }
3339     else if (dd_load_count(comm))
3340     {
3341         load_aver = comm->load[d].sum_m/ncd;
3342         change_max = 0;
3343         for(i=0; i<ncd; i++)
3344         {
3345             /* Determine the relative imbalance of cell i */
3346             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3347             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3348             /* Determine the change of the cell size using underrelaxation */
3349             change = -relax*imbalance;
3350             change_max = max(change_max,max(change,-change));
3351         }
3352         /* Limit the amount of scaling.
3353          * We need to use the same rescaling for all cells in one row,
3354          * otherwise the load balancing might not converge.
3355          */
3356         sc = relax;
3357         if (change_max > change_limit)
3358         {
3359             sc *= change_limit/change_max;
3360         }
3361         for(i=0; i<ncd; i++)
3362         {
3363             /* Determine the relative imbalance of cell i */
3364             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3365             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3366             /* Determine the change of the cell size using underrelaxation */
3367             change = -sc*imbalance;
3368             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3369         }
3370     }
3371     
3372     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3373     cellsize_limit_f *= DD_CELL_MARGIN;
3374     dist_min_f_hard   = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim];
3375     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3376     if (ddbox->tric_dir[dim])
3377     {
3378         cellsize_limit_f /= ddbox->skew_fac[dim];
3379         dist_min_f       /= ddbox->skew_fac[dim];
3380     }
3381     if (bDynamicBox && d > 0)
3382     {
3383         dist_min_f *= DD_PRES_SCALE_MARGIN;
3384     }
3385     if (d > 0 && !bUniform)
3386     {
3387         /* Make sure that the grid is not shifted too much */
3388         for(i=1; i<ncd; i++) {
3389             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
3390             {
3391                 gmx_incons("Inconsistent DD boundary staggering limits!");
3392             }
3393             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3394             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3395             if (space > 0) {
3396                 root->bound_min[i] += 0.5*space;
3397             }
3398             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3399             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3400             if (space < 0) {
3401                 root->bound_max[i] += 0.5*space;
3402             }
3403             if (debug)
3404             {
3405                 fprintf(debug,
3406                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3407                         d,i,
3408                         root->cell_f_max0[i-1] + dist_min_f,
3409                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3410                         root->cell_f_min1[i] - dist_min_f);
3411             }
3412         }
3413     }
3414     range[1]=ncd;
3415     root->cell_f[0] = 0;
3416     root->cell_f[ncd] = 1;
3417     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3418
3419
3420     /* After the checks above, the cells should obey the cut-off
3421      * restrictions, but it does not hurt to check.
3422      */
3423     for(i=0; i<ncd; i++)
3424     {
3425         if (debug)
3426         {
3427             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3428                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3429         }
3430
3431         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3432             root->cell_f[i+1] - root->cell_f[i] <
3433             cellsize_limit_f/DD_CELL_MARGIN)
3434         {
3435             char buf[22];
3436             fprintf(stderr,
3437                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3438                     gmx_step_str(step,buf),dim2char(dim),i,
3439                     (root->cell_f[i+1] - root->cell_f[i])
3440                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3441         }
3442     }
3443     
3444     pos = ncd + 1;
3445     /* Store the cell boundaries of the lower dimensions at the end */
3446     for(d1=0; d1<d; d1++)
3447     {
3448         root->cell_f[pos++] = comm->cell_f0[d1];
3449         root->cell_f[pos++] = comm->cell_f1[d1];
3450     }
3451     
3452     if (d < comm->npmedecompdim)
3453     {
3454         /* The master determines the maximum shift for
3455          * the coordinate communication between separate PME nodes.
3456          */
3457         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3458     }
3459     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3460     if (d >= 1)
3461     {
3462         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3463     }
3464 }    
3465
3466 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3467                                              gmx_ddbox_t *ddbox,int dimind)
3468 {
3469     gmx_domdec_comm_t *comm;
3470     int dim;
3471
3472     comm = dd->comm;
3473
3474     /* Set the cell dimensions */
3475     dim = dd->dim[dimind];
3476     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3477     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3478     if (dim >= ddbox->nboundeddim)
3479     {
3480         comm->cell_x0[dim] += ddbox->box0[dim];
3481         comm->cell_x1[dim] += ddbox->box0[dim];
3482     }
3483 }
3484
3485 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3486                                          int d,int dim,real *cell_f_row,
3487                                          gmx_ddbox_t *ddbox)
3488 {
3489     gmx_domdec_comm_t *comm;
3490     int d1,dim1,pos;
3491
3492     comm = dd->comm;
3493
3494 #ifdef GMX_MPI
3495     /* Each node would only need to know two fractions,
3496      * but it is probably cheaper to broadcast the whole array.
3497      */
3498     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3499               0,comm->mpi_comm_load[d]);
3500 #endif
3501     /* Copy the fractions for this dimension from the buffer */
3502     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3503     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3504     /* The whole array was communicated, so set the buffer position */
3505     pos = dd->nc[dim] + 1;
3506     for(d1=0; d1<=d; d1++)
3507     {
3508         if (d1 < d)
3509         {
3510             /* Copy the cell fractions of the lower dimensions */
3511             comm->cell_f0[d1] = cell_f_row[pos++];
3512             comm->cell_f1[d1] = cell_f_row[pos++];
3513         }
3514         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3515     }
3516     /* Convert the communicated shift from float to int */
3517     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3518     if (d >= 1)
3519     {
3520         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3521     }
3522 }
3523
3524 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3525                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3526                                          gmx_bool bUniform,gmx_large_int_t step)
3527 {
3528     gmx_domdec_comm_t *comm;
3529     int d,dim,d1;
3530     gmx_bool bRowMember,bRowRoot;
3531     real *cell_f_row;
3532     
3533     comm = dd->comm;
3534
3535     for(d=0; d<dd->ndim; d++)
3536     {
3537         dim = dd->dim[d];
3538         bRowMember = TRUE;
3539         bRowRoot = TRUE;
3540         for(d1=d; d1<dd->ndim; d1++)
3541         {
3542             if (dd->ci[dd->dim[d1]] > 0)
3543             {
3544                 if (d1 > d)
3545                 {
3546                     bRowMember = FALSE;
3547                 }
3548                 bRowRoot = FALSE;
3549             }
3550         }
3551         if (bRowMember)
3552         {
3553             if (bRowRoot)
3554             {
3555                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3556                                            ddbox,bDynamicBox,bUniform,step);
3557                 cell_f_row = comm->root[d]->cell_f;
3558             }
3559             else
3560             {
3561                 cell_f_row = comm->cell_f_row;
3562             }
3563             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3564         }
3565     }
3566 }    
3567
3568 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3569 {
3570     int d;
3571
3572     /* This function assumes the box is static and should therefore
3573      * not be called when the box has changed since the last
3574      * call to dd_partition_system.
3575      */
3576     for(d=0; d<dd->ndim; d++)
3577     {
3578         relative_to_absolute_cell_bounds(dd,ddbox,d); 
3579     }
3580 }
3581
3582
3583
3584 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3585                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3586                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3587                                   gmx_wallcycle_t wcycle)
3588 {
3589     gmx_domdec_comm_t *comm;
3590     int dim;
3591
3592     comm = dd->comm;
3593     
3594     if (bDoDLB)
3595     {
3596         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3597         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3598         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3599     }
3600     else if (bDynamicBox)
3601     {
3602         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3603     }
3604     
3605     /* Set the dimensions for which no DD is used */
3606     for(dim=0; dim<DIM; dim++) {
3607         if (dd->nc[dim] == 1) {
3608             comm->cell_x0[dim] = 0;
3609             comm->cell_x1[dim] = ddbox->box_size[dim];
3610             if (dim >= ddbox->nboundeddim)
3611             {
3612                 comm->cell_x0[dim] += ddbox->box0[dim];
3613                 comm->cell_x1[dim] += ddbox->box0[dim];
3614             }
3615         }
3616     }
3617 }
3618
3619 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3620 {
3621     int d,np,i;
3622     gmx_domdec_comm_dim_t *cd;
3623     
3624     for(d=0; d<dd->ndim; d++)
3625     {
3626         cd = &dd->comm->cd[d];
3627         np = npulse[dd->dim[d]];
3628         if (np > cd->np_nalloc)
3629         {
3630             if (debug)
3631             {
3632                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3633                         dim2char(dd->dim[d]),np);
3634             }
3635             if (DDMASTER(dd) && cd->np_nalloc > 0)
3636             {
3637                 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3638             }
3639             srenew(cd->ind,np);
3640             for(i=cd->np_nalloc; i<np; i++)
3641             {
3642                 cd->ind[i].index  = NULL;
3643                 cd->ind[i].nalloc = 0;
3644             }
3645             cd->np_nalloc = np;
3646         }
3647         cd->np = np;
3648     }
3649 }
3650
3651
3652 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3653                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3654                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3655                               gmx_wallcycle_t wcycle)
3656 {
3657     gmx_domdec_comm_t *comm;
3658     int  d;
3659     ivec npulse;
3660     
3661     comm = dd->comm;
3662
3663     /* Copy the old cell boundaries for the cg displacement check */
3664     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3665     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3666     
3667     if (comm->bDynLoadBal)
3668     {
3669         if (DDMASTER(dd))
3670         {
3671             check_box_size(dd,ddbox);
3672         }
3673         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3674     }
3675     else
3676     {
3677         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3678         realloc_comm_ind(dd,npulse);
3679     }
3680     
3681     if (debug)
3682     {
3683         for(d=0; d<DIM; d++)
3684         {
3685             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3686                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3687         }
3688     }
3689 }
3690
3691 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3692                                   gmx_ddbox_t *ddbox,
3693                                   rvec cell_ns_x0,rvec cell_ns_x1,
3694                                   gmx_large_int_t step)
3695 {
3696     gmx_domdec_comm_t *comm;
3697     int dim_ind,dim;
3698     
3699     comm = dd->comm;
3700
3701     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3702     {
3703         dim = dd->dim[dim_ind];
3704         
3705         /* Without PBC we don't have restrictions on the outer cells */
3706         if (!(dim >= ddbox->npbcdim && 
3707               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3708             comm->bDynLoadBal &&
3709             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3710             comm->cellsize_min[dim])
3711         {
3712             char buf[22];
3713             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3714                       gmx_step_str(step,buf),dim2char(dim),
3715                       comm->cell_x1[dim] - comm->cell_x0[dim],
3716                       ddbox->skew_fac[dim],
3717                       dd->comm->cellsize_min[dim],
3718                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3719         }
3720     }
3721     
3722     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3723     {
3724         /* Communicate the boundaries and update cell_ns_x0/1 */
3725         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3726         if (dd->bGridJump && dd->ndim > 1)
3727         {
3728             check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE);
3729         }
3730     }
3731 }
3732
3733 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3734 {
3735     if (YY < npbcdim)
3736     {
3737         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3738     }
3739     else
3740     {
3741         tcm[YY][XX] = 0;
3742     }
3743     if (ZZ < npbcdim)
3744     {
3745         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3746         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3747     }
3748     else
3749     {
3750         tcm[ZZ][XX] = 0;
3751         tcm[ZZ][YY] = 0;
3752     }
3753 }
3754
3755 static void check_screw_box(matrix box)
3756 {
3757     /* Mathematical limitation */
3758     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3759     {
3760         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3761     }
3762     
3763     /* Limitation due to the asymmetry of the eighth shell method */
3764     if (box[ZZ][YY] != 0)
3765     {
3766         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3767     }
3768 }
3769
3770 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3771                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3772                           gmx_domdec_t *dd)
3773 {
3774     gmx_domdec_master_t *ma;
3775     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3776     int  i,icg,j,k,k0,k1,d,npbcdim;
3777     matrix tcm;
3778     rvec box_size,cg_cm;
3779     ivec ind;
3780     real nrcg,inv_ncg,pos_d;
3781     atom_id *cgindex;
3782     gmx_bool bUnbounded,bScrew;
3783
3784     ma = dd->ma;
3785     
3786     if (tmp_ind == NULL)
3787     {
3788         snew(tmp_nalloc,dd->nnodes);
3789         snew(tmp_ind,dd->nnodes);
3790         for(i=0; i<dd->nnodes; i++)
3791         {
3792             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3793             snew(tmp_ind[i],tmp_nalloc[i]);
3794         }
3795     }
3796     
3797     /* Clear the count */
3798     for(i=0; i<dd->nnodes; i++)
3799     {
3800         ma->ncg[i] = 0;
3801         ma->nat[i] = 0;
3802     }
3803     
3804     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3805     
3806     cgindex = cgs->index;
3807     
3808     /* Compute the center of geometry for all charge groups */
3809     for(icg=0; icg<cgs->nr; icg++)
3810     {
3811         k0      = cgindex[icg];
3812         k1      = cgindex[icg+1];
3813         nrcg    = k1 - k0;
3814         if (nrcg == 1)
3815         {
3816             copy_rvec(pos[k0],cg_cm);
3817         }
3818         else
3819         {
3820             inv_ncg = 1.0/nrcg;
3821             
3822             clear_rvec(cg_cm);
3823             for(k=k0; (k<k1); k++)
3824             {
3825                 rvec_inc(cg_cm,pos[k]);
3826             }
3827             for(d=0; (d<DIM); d++)
3828             {
3829                 cg_cm[d] *= inv_ncg;
3830             }
3831         }
3832         /* Put the charge group in the box and determine the cell index */
3833         for(d=DIM-1; d>=0; d--) {
3834             pos_d = cg_cm[d];
3835             if (d < dd->npbcdim)
3836             {
3837                 bScrew = (dd->bScrewPBC && d == XX);
3838                 if (tric_dir[d] && dd->nc[d] > 1)
3839                 {
3840                     /* Use triclinic coordintates for this dimension */
3841                     for(j=d+1; j<DIM; j++)
3842                     {
3843                         pos_d += cg_cm[j]*tcm[j][d];
3844                     }
3845                 }
3846                 while(pos_d >= box[d][d])
3847                 {
3848                     pos_d -= box[d][d];
3849                     rvec_dec(cg_cm,box[d]);
3850                     if (bScrew)
3851                     {
3852                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3853                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3854                     }
3855                     for(k=k0; (k<k1); k++)
3856                     {
3857                         rvec_dec(pos[k],box[d]);
3858                         if (bScrew)
3859                         {
3860                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3861                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3862                         }
3863                     }
3864                 }
3865                 while(pos_d < 0)
3866                 {
3867                     pos_d += box[d][d];
3868                     rvec_inc(cg_cm,box[d]);
3869                     if (bScrew)
3870                     {
3871                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3872                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3873                     }
3874                     for(k=k0; (k<k1); k++)
3875                     {
3876                         rvec_inc(pos[k],box[d]);
3877                         if (bScrew) {
3878                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3879                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3880                         }
3881                     }
3882                 }
3883             }
3884             /* This could be done more efficiently */
3885             ind[d] = 0;
3886             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3887             {
3888                 ind[d]++;
3889             }
3890         }
3891         i = dd_index(dd->nc,ind);
3892         if (ma->ncg[i] == tmp_nalloc[i])
3893         {
3894             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3895             srenew(tmp_ind[i],tmp_nalloc[i]);
3896         }
3897         tmp_ind[i][ma->ncg[i]] = icg;
3898         ma->ncg[i]++;
3899         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3900     }
3901     
3902     k1 = 0;
3903     for(i=0; i<dd->nnodes; i++)
3904     {
3905         ma->index[i] = k1;
3906         for(k=0; k<ma->ncg[i]; k++)
3907         {
3908             ma->cg[k1++] = tmp_ind[i][k];
3909         }
3910     }
3911     ma->index[dd->nnodes] = k1;
3912     
3913     for(i=0; i<dd->nnodes; i++)
3914     {
3915         sfree(tmp_ind[i]);
3916     }
3917     sfree(tmp_ind);
3918     sfree(tmp_nalloc);
3919     
3920     if (fplog)
3921     {
3922         char buf[22];
3923         fprintf(fplog,"Charge group distribution at step %s:",
3924                 gmx_step_str(step,buf));
3925         for(i=0; i<dd->nnodes; i++)
3926         {
3927             fprintf(fplog," %d",ma->ncg[i]);
3928         }
3929         fprintf(fplog,"\n");
3930     }
3931 }
3932
3933 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3934                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3935                                 rvec pos[])
3936 {
3937     gmx_domdec_master_t *ma=NULL;
3938     ivec npulse;
3939     int  i,cg_gl;
3940     int  *ibuf,buf2[2] = { 0, 0 };
3941     gmx_bool bMaster = DDMASTER(dd);
3942     if (bMaster)
3943     {
3944         ma = dd->ma;
3945         
3946         if (dd->bScrewPBC)
3947         {
3948             check_screw_box(box);
3949         }
3950     
3951         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3952     
3953         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3954         for(i=0; i<dd->nnodes; i++)
3955         {
3956             ma->ibuf[2*i]   = ma->ncg[i];
3957             ma->ibuf[2*i+1] = ma->nat[i];
3958         }
3959         ibuf = ma->ibuf;
3960     }
3961     else
3962     {
3963         ibuf = NULL;
3964     }
3965     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3966     
3967     dd->ncg_home = buf2[0];
3968     dd->nat_home = buf2[1];
3969     dd->ncg_tot  = dd->ncg_home;
3970     dd->nat_tot  = dd->nat_home;
3971     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3972     {
3973         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3974         srenew(dd->index_gl,dd->cg_nalloc);
3975         srenew(dd->cgindex,dd->cg_nalloc+1);
3976     }
3977     if (bMaster)
3978     {
3979         for(i=0; i<dd->nnodes; i++)
3980         {
3981             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3982             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3983         }
3984     }
3985     
3986     dd_scatterv(dd,
3987                 DDMASTER(dd) ? ma->ibuf : NULL,
3988                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3989                 DDMASTER(dd) ? ma->cg : NULL,
3990                 dd->ncg_home*sizeof(int),dd->index_gl);
3991     
3992     /* Determine the home charge group sizes */
3993     dd->cgindex[0] = 0;
3994     for(i=0; i<dd->ncg_home; i++)
3995     {
3996         cg_gl = dd->index_gl[i];
3997         dd->cgindex[i+1] =
3998             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3999     }
4000     
4001     if (debug)
4002     {
4003         fprintf(debug,"Home charge groups:\n");
4004         for(i=0; i<dd->ncg_home; i++)
4005         {
4006             fprintf(debug," %d",dd->index_gl[i]);
4007             if (i % 10 == 9) 
4008                 fprintf(debug,"\n");
4009         }
4010         fprintf(debug,"\n");
4011     }
4012 }
4013
4014 static int compact_and_copy_vec_at(int ncg,int *move,
4015                                    int *cgindex,
4016                                    int nvec,int vec,
4017                                    rvec *src,gmx_domdec_comm_t *comm,
4018                                    gmx_bool bCompact)
4019 {
4020     int m,icg,i,i0,i1,nrcg;
4021     int home_pos;
4022     int pos_vec[DIM*2];
4023     
4024     home_pos = 0;
4025
4026     for(m=0; m<DIM*2; m++)
4027     {
4028         pos_vec[m] = 0;
4029     }
4030     
4031     i0 = 0;
4032     for(icg=0; icg<ncg; icg++)
4033     {
4034         i1 = cgindex[icg+1];
4035         m = move[icg];
4036         if (m == -1)
4037         {
4038             if (bCompact)
4039             {
4040                 /* Compact the home array in place */
4041                 for(i=i0; i<i1; i++)
4042                 {
4043                     copy_rvec(src[i],src[home_pos++]);
4044                 }
4045             }
4046         }
4047         else
4048         {
4049             /* Copy to the communication buffer */
4050             nrcg = i1 - i0;
4051             pos_vec[m] += 1 + vec*nrcg;
4052             for(i=i0; i<i1; i++)
4053             {
4054                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
4055             }
4056             pos_vec[m] += (nvec - vec - 1)*nrcg;
4057         }
4058         if (!bCompact)
4059         {
4060             home_pos += i1 - i0;
4061         }
4062         i0 = i1;
4063     }
4064     
4065     return home_pos;
4066 }
4067
4068 static int compact_and_copy_vec_cg(int ncg,int *move,
4069                                    int *cgindex,
4070                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
4071                                    gmx_bool bCompact)
4072 {
4073     int m,icg,i0,i1,nrcg;
4074     int home_pos;
4075     int pos_vec[DIM*2];
4076     
4077     home_pos = 0;
4078     
4079     for(m=0; m<DIM*2; m++)
4080     {
4081         pos_vec[m] = 0;
4082     }
4083     
4084     i0 = 0;
4085     for(icg=0; icg<ncg; icg++)
4086     {
4087         i1 = cgindex[icg+1];
4088         m = move[icg];
4089         if (m == -1)
4090         {
4091             if (bCompact)
4092             {
4093                 /* Compact the home array in place */
4094                 copy_rvec(src[icg],src[home_pos++]);
4095             }
4096         }
4097         else
4098         {
4099             nrcg = i1 - i0;
4100             /* Copy to the communication buffer */
4101             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
4102             pos_vec[m] += 1 + nrcg*nvec;
4103         }
4104         i0 = i1;
4105     }
4106     if (!bCompact)
4107     {
4108         home_pos = ncg;
4109     }
4110     
4111     return home_pos;
4112 }
4113
4114 static int compact_ind(int ncg,int *move,
4115                        int *index_gl,int *cgindex,
4116                        int *gatindex,
4117                        gmx_ga2la_t ga2la,char *bLocalCG,
4118                        int *cginfo)
4119 {
4120     int cg,nat,a0,a1,a,a_gl;
4121     int home_pos;
4122
4123     home_pos = 0;
4124     nat = 0;
4125     for(cg=0; cg<ncg; cg++)
4126     {
4127         a0 = cgindex[cg];
4128         a1 = cgindex[cg+1];
4129         if (move[cg] == -1)
4130         {
4131             /* Compact the home arrays in place.
4132              * Anything that can be done here avoids access to global arrays.
4133              */
4134             cgindex[home_pos] = nat;
4135             for(a=a0; a<a1; a++)
4136             {
4137                 a_gl = gatindex[a];
4138                 gatindex[nat] = a_gl;
4139                 /* The cell number stays 0, so we don't need to set it */
4140                 ga2la_change_la(ga2la,a_gl,nat);
4141                 nat++;
4142             }
4143             index_gl[home_pos] = index_gl[cg];
4144             cginfo[home_pos]   = cginfo[cg];
4145             /* The charge group remains local, so bLocalCG does not change */
4146             home_pos++;
4147         }
4148         else
4149         {
4150             /* Clear the global indices */
4151             for(a=a0; a<a1; a++)
4152             {
4153                 ga2la_del(ga2la,gatindex[a]);
4154             }
4155             if (bLocalCG)
4156             {
4157                 bLocalCG[index_gl[cg]] = FALSE;
4158             }
4159         }
4160     }
4161     cgindex[home_pos] = nat;
4162     
4163     return home_pos;
4164 }
4165
4166 static void clear_and_mark_ind(int ncg,int *move,
4167                                int *index_gl,int *cgindex,int *gatindex,
4168                                gmx_ga2la_t ga2la,char *bLocalCG,
4169                                int *cell_index)
4170 {
4171     int cg,a0,a1,a;
4172     
4173     for(cg=0; cg<ncg; cg++)
4174     {
4175         if (move[cg] >= 0)
4176         {
4177             a0 = cgindex[cg];
4178             a1 = cgindex[cg+1];
4179             /* Clear the global indices */
4180             for(a=a0; a<a1; a++)
4181             {
4182                 ga2la_del(ga2la,gatindex[a]);
4183             }
4184             if (bLocalCG)
4185             {
4186                 bLocalCG[index_gl[cg]] = FALSE;
4187             }
4188             /* Signal that this cg has moved using the ns cell index.
4189              * Here we set it to -1. fill_grid will change it
4190              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4191              */
4192             cell_index[cg] = -1;
4193         }
4194     }
4195 }
4196
4197 static void print_cg_move(FILE *fplog,
4198                           gmx_domdec_t *dd,
4199                           gmx_large_int_t step,int cg,int dim,int dir,
4200                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4201                           rvec cm_old,rvec cm_new,real pos_d)
4202 {
4203     gmx_domdec_comm_t *comm;
4204     char buf[22];
4205
4206     comm = dd->comm;
4207
4208     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4209     if (bHaveLimitdAndCMOld)
4210     {
4211         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4212                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4213     }
4214     else
4215     {
4216         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4217                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4218     }
4219     fprintf(fplog,"distance out of cell %f\n",
4220             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4221     if (bHaveLimitdAndCMOld)
4222     {
4223         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4224                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4225     }
4226     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4227             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4228     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4229             dim2char(dim),
4230             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4231     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4232             dim2char(dim),
4233             comm->cell_x0[dim],comm->cell_x1[dim]);
4234 }
4235
4236 static void cg_move_error(FILE *fplog,
4237                           gmx_domdec_t *dd,
4238                           gmx_large_int_t step,int cg,int dim,int dir,
4239                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4240                           rvec cm_old,rvec cm_new,real pos_d)
4241 {
4242     if (fplog)
4243     {
4244         print_cg_move(fplog, dd,step,cg,dim,dir,
4245                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4246     }
4247     print_cg_move(stderr,dd,step,cg,dim,dir,
4248                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4249     gmx_fatal(FARGS,
4250               "A charge group moved too far between two domain decomposition steps\n"
4251               "This usually means that your system is not well equilibrated");
4252 }
4253
4254 static void rotate_state_atom(t_state *state,int a)
4255 {
4256     int est;
4257
4258     for(est=0; est<estNR; est++)
4259     {
4260         if (EST_DISTR(est) && (state->flags & (1<<est))) {
4261             switch (est) {
4262             case estX:
4263                 /* Rotate the complete state; for a rectangular box only */
4264                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4265                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4266                 break;
4267             case estV:
4268                 state->v[a][YY] = -state->v[a][YY];
4269                 state->v[a][ZZ] = -state->v[a][ZZ];
4270                 break;
4271             case estSDX:
4272                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4273                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4274                 break;
4275             case estCGP:
4276                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4277                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4278                 break;
4279             case estDISRE_INITF:
4280             case estDISRE_RM3TAV:
4281             case estORIRE_INITF:
4282             case estORIRE_DTAV:
4283                 /* These are distances, so not affected by rotation */
4284                 break;
4285             default:
4286                 gmx_incons("Unknown state entry encountered in rotate_state_atom");            
4287             }
4288         }
4289     }
4290 }
4291
4292 static int *get_moved(gmx_domdec_comm_t *comm,int natoms)
4293 {
4294     if (natoms > comm->moved_nalloc)
4295     {
4296         /* Contents should be preserved here */
4297         comm->moved_nalloc = over_alloc_dd(natoms);
4298         srenew(comm->moved,comm->moved_nalloc);
4299     }
4300
4301     return comm->moved;
4302 }
4303
4304 static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
4305                          gmx_domdec_t *dd,
4306                          t_state *state,
4307                          ivec tric_dir,matrix tcm,
4308                          rvec cell_x0,rvec cell_x1,
4309                          rvec limitd,rvec limit0,rvec limit1,
4310                          const int *cgindex,
4311                          int cg_start,int cg_end,
4312                          rvec *cg_cm,
4313                          int *move)
4314 {
4315     int  npbcdim;
4316     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4317     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4318     int  flag;
4319     gmx_bool bScrew;
4320     ivec dev;
4321     real inv_ncg,pos_d;
4322     rvec cm_new;
4323
4324     npbcdim = dd->npbcdim;
4325
4326     for(cg=cg_start; cg<cg_end; cg++)
4327     {
4328         k0   = cgindex[cg];
4329         k1   = cgindex[cg+1];
4330         nrcg = k1 - k0;
4331         if (nrcg == 1)
4332         {
4333             copy_rvec(state->x[k0],cm_new);
4334         }
4335         else
4336         {
4337             inv_ncg = 1.0/nrcg;
4338             
4339             clear_rvec(cm_new);
4340             for(k=k0; (k<k1); k++)
4341             {
4342                 rvec_inc(cm_new,state->x[k]);
4343             }
4344             for(d=0; (d<DIM); d++)
4345             {
4346                 cm_new[d] = inv_ncg*cm_new[d];
4347             }
4348         }
4349         
4350         clear_ivec(dev);
4351         /* Do pbc and check DD cell boundary crossings */
4352         for(d=DIM-1; d>=0; d--)
4353         {
4354             if (dd->nc[d] > 1)
4355             {
4356                 bScrew = (dd->bScrewPBC && d == XX);
4357                 /* Determine the location of this cg in lattice coordinates */
4358                 pos_d = cm_new[d];
4359                 if (tric_dir[d])
4360                 {
4361                     for(d2=d+1; d2<DIM; d2++)
4362                     {
4363                         pos_d += cm_new[d2]*tcm[d2][d];
4364                     }
4365                 }
4366                 /* Put the charge group in the triclinic unit-cell */
4367                 if (pos_d >= cell_x1[d])
4368                 {
4369                     if (pos_d >= limit1[d])
4370                     {
4371                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4372                                       cg_cm[cg],cm_new,pos_d);
4373                     }
4374                     dev[d] = 1;
4375                     if (dd->ci[d] == dd->nc[d] - 1)
4376                     {
4377                         rvec_dec(cm_new,state->box[d]);
4378                         if (bScrew)
4379                         {
4380                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4381                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4382                         }
4383                         for(k=k0; (k<k1); k++)
4384                         {
4385                             rvec_dec(state->x[k],state->box[d]);
4386                             if (bScrew)
4387                             {
4388                                 rotate_state_atom(state,k);
4389                             }
4390                         }
4391                     }
4392                 }
4393                 else if (pos_d < cell_x0[d])
4394                 {
4395                     if (pos_d < limit0[d])
4396                     {
4397                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4398                                       cg_cm[cg],cm_new,pos_d);
4399                     }
4400                     dev[d] = -1;
4401                     if (dd->ci[d] == 0)
4402                     {
4403                         rvec_inc(cm_new,state->box[d]);
4404                         if (bScrew)
4405                         {
4406                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4407                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4408                         }
4409                         for(k=k0; (k<k1); k++)
4410                         {
4411                             rvec_inc(state->x[k],state->box[d]);
4412                             if (bScrew)
4413                             {
4414                                 rotate_state_atom(state,k);
4415                             }
4416                         }
4417                     }
4418                 }
4419             }
4420             else if (d < npbcdim)
4421             {
4422                 /* Put the charge group in the rectangular unit-cell */
4423                 while (cm_new[d] >= state->box[d][d])
4424                 {
4425                     rvec_dec(cm_new,state->box[d]);
4426                     for(k=k0; (k<k1); k++)
4427                     {
4428                         rvec_dec(state->x[k],state->box[d]);
4429                     }
4430                 }
4431                 while (cm_new[d] < 0)
4432                 {
4433                     rvec_inc(cm_new,state->box[d]);
4434                     for(k=k0; (k<k1); k++)
4435                     {
4436                         rvec_inc(state->x[k],state->box[d]);
4437                     }
4438                 }
4439             }
4440         }
4441     
4442         copy_rvec(cm_new,cg_cm[cg]);
4443         
4444         /* Determine where this cg should go */
4445         flag = 0;
4446         mc = -1;
4447         for(d=0; d<dd->ndim; d++)
4448         {
4449             dim = dd->dim[d];
4450             if (dev[dim] == 1)
4451             {
4452                 flag |= DD_FLAG_FW(d);
4453                 if (mc == -1)
4454                 {
4455                     mc = d*2;
4456                 }
4457             }
4458             else if (dev[dim] == -1)
4459             {
4460                 flag |= DD_FLAG_BW(d);
4461                 if (mc == -1) {
4462                     if (dd->nc[dim] > 2)
4463                     {
4464                         mc = d*2 + 1;
4465                     }
4466                     else
4467                     {
4468                         mc = d*2;
4469                     }
4470                 }
4471             }
4472         }
4473         /* Temporarily store the flag in move */
4474         move[cg] = mc + flag;
4475     }
4476 }
4477
4478 static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4479                                gmx_domdec_t *dd,ivec tric_dir,
4480                                t_state *state,rvec **f,
4481                                t_forcerec *fr,t_mdatoms *md,
4482                                gmx_bool bCompact,
4483                                t_nrnb *nrnb,
4484                                int *ncg_stay_home,
4485                                int *ncg_moved)
4486 {
4487     int  *move;
4488     int  npbcdim;
4489     int  ncg[DIM*2],nat[DIM*2];
4490     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4491     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4492     int  sbuf[2],rbuf[2];
4493     int  home_pos_cg,home_pos_at,buf_pos;
4494     int  flag;
4495     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4496     gmx_bool bScrew;
4497     ivec dev;
4498     real inv_ncg,pos_d;
4499     matrix tcm;
4500     rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4501     atom_id *cgindex;
4502     cginfo_mb_t *cginfo_mb;
4503     gmx_domdec_comm_t *comm;
4504     int  *moved;
4505     int  nthread,thread;
4506     
4507     if (dd->bScrewPBC)
4508     {
4509         check_screw_box(state->box);
4510     }
4511     
4512     comm  = dd->comm;
4513     if (fr->cutoff_scheme == ecutsGROUP)
4514     {
4515         cg_cm = fr->cg_cm;
4516     }
4517     
4518     for(i=0; i<estNR; i++)
4519     {
4520         if (EST_DISTR(i))
4521         {
4522             switch (i)
4523             {
4524             case estX:   /* Always present */            break;
4525             case estV:   bV   = (state->flags & (1<<i)); break;
4526             case estSDX: bSDX = (state->flags & (1<<i)); break;
4527             case estCGP: bCGP = (state->flags & (1<<i)); break;
4528             case estLD_RNG:
4529             case estLD_RNGI:
4530             case estDISRE_INITF:
4531             case estDISRE_RM3TAV:
4532             case estORIRE_INITF:
4533             case estORIRE_DTAV:
4534                 /* No processing required */
4535                 break;
4536             default:
4537             gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4538             }
4539         }
4540     }
4541     
4542     if (dd->ncg_tot > comm->nalloc_int)
4543     {
4544         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4545         srenew(comm->buf_int,comm->nalloc_int);
4546     }
4547     move = comm->buf_int;
4548     
4549     /* Clear the count */
4550     for(c=0; c<dd->ndim*2; c++)
4551     {
4552         ncg[c] = 0;
4553         nat[c] = 0;
4554     }
4555
4556     npbcdim = dd->npbcdim;
4557
4558     for(d=0; (d<DIM); d++)
4559     {
4560         limitd[d] = dd->comm->cellsize_min[d];
4561         if (d >= npbcdim && dd->ci[d] == 0)
4562         {
4563             cell_x0[d] = -GMX_FLOAT_MAX;
4564         }
4565         else
4566         {
4567             cell_x0[d] = comm->cell_x0[d];
4568         }
4569         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4570         {
4571             cell_x1[d] = GMX_FLOAT_MAX;
4572         }
4573         else
4574         {
4575             cell_x1[d] = comm->cell_x1[d];
4576         }
4577         if (d < npbcdim)
4578         {
4579             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4580             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4581         }
4582         else
4583         {
4584             /* We check after communication if a charge group moved
4585              * more than one cell. Set the pre-comm check limit to float_max.
4586              */
4587             limit0[d] = -GMX_FLOAT_MAX;
4588             limit1[d] =  GMX_FLOAT_MAX;
4589         }
4590     }
4591     
4592     make_tric_corr_matrix(npbcdim,state->box,tcm);
4593     
4594     cgindex = dd->cgindex;
4595
4596     nthread = gmx_omp_nthreads_get(emntDomdec);
4597
4598     /* Compute the center of geometry for all home charge groups
4599      * and put them in the box and determine where they should go.
4600      */
4601 #pragma omp parallel for num_threads(nthread) schedule(static)
4602     for(thread=0; thread<nthread; thread++)
4603     {
4604         calc_cg_move(fplog,step,dd,state,tric_dir,tcm,
4605                      cell_x0,cell_x1,limitd,limit0,limit1,
4606                      cgindex,
4607                      ( thread   *dd->ncg_home)/nthread,
4608                      ((thread+1)*dd->ncg_home)/nthread,
4609                      fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x,
4610                      move);
4611     }
4612
4613     for(cg=0; cg<dd->ncg_home; cg++)
4614     {
4615         if (move[cg] >= 0)
4616         {
4617             mc = move[cg];
4618             flag     = mc & ~DD_FLAG_NRCG;
4619             mc       = mc & DD_FLAG_NRCG;
4620             move[cg] = mc;
4621
4622             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4623             {
4624                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4625                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4626             }
4627             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4628             /* We store the cg size in the lower 16 bits
4629              * and the place where the charge group should go
4630              * in the next 6 bits. This saves some communication volume.
4631              */
4632             nrcg = cgindex[cg+1] - cgindex[cg];
4633             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4634             ncg[mc] += 1;
4635             nat[mc] += nrcg;
4636         }
4637     }
4638     
4639     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4640     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4641
4642     *ncg_moved = 0;
4643     for(i=0; i<dd->ndim*2; i++)
4644     {
4645         *ncg_moved += ncg[i];
4646     }
4647     
4648     nvec = 1;
4649     if (bV)
4650     {
4651         nvec++;
4652     }
4653     if (bSDX)
4654     {
4655         nvec++;
4656     }
4657     if (bCGP)
4658     {
4659         nvec++;
4660     }
4661     
4662     /* Make sure the communication buffers are large enough */
4663     for(mc=0; mc<dd->ndim*2; mc++)
4664     {
4665         nvr = ncg[mc] + nat[mc]*nvec;
4666         if (nvr > comm->cgcm_state_nalloc[mc])
4667         {
4668             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4669             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4670         }
4671     }
4672     
4673     switch (fr->cutoff_scheme)
4674     {
4675     case ecutsGROUP:
4676         /* Recalculating cg_cm might be cheaper than communicating,
4677          * but that could give rise to rounding issues.
4678          */
4679         home_pos_cg =
4680             compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4681                                     nvec,cg_cm,comm,bCompact);
4682     break;
4683     case ecutsVERLET:
4684         /* Without charge groups we send the moved atom coordinates
4685          * over twice. This is so the code below can be used without
4686          * many conditionals for both for with and without charge groups.
4687          */
4688         home_pos_cg =
4689             compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4690                                     nvec,state->x,comm,FALSE);
4691         if (bCompact)
4692         {
4693             home_pos_cg -= *ncg_moved;
4694         }
4695         break;
4696     default:
4697         gmx_incons("unimplemented");
4698         home_pos_cg = 0;
4699     }
4700     
4701     vec = 0;
4702     home_pos_at =
4703         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4704                                 nvec,vec++,state->x,comm,bCompact);
4705     if (bV)
4706     {
4707         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4708                                 nvec,vec++,state->v,comm,bCompact);
4709     }
4710     if (bSDX)
4711     {
4712         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4713                                 nvec,vec++,state->sd_X,comm,bCompact);
4714     }
4715     if (bCGP)
4716     {
4717         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4718                                 nvec,vec++,state->cg_p,comm,bCompact);
4719     }
4720     
4721     if (bCompact)
4722     {
4723         compact_ind(dd->ncg_home,move,
4724                     dd->index_gl,dd->cgindex,dd->gatindex,
4725                     dd->ga2la,comm->bLocalCG,
4726                     fr->cginfo);
4727     }
4728     else
4729     {
4730         if (fr->cutoff_scheme == ecutsVERLET)
4731         {
4732             moved = get_moved(comm,dd->ncg_home);
4733
4734             for(k=0; k<dd->ncg_home; k++)
4735             {
4736                 moved[k] = 0;
4737             }
4738         }
4739         else
4740         {
4741             moved = fr->ns.grid->cell_index;
4742         }
4743
4744         clear_and_mark_ind(dd->ncg_home,move,
4745                            dd->index_gl,dd->cgindex,dd->gatindex,
4746                            dd->ga2la,comm->bLocalCG,
4747                            moved);
4748     }
4749     
4750     cginfo_mb = fr->cginfo_mb;
4751
4752     *ncg_stay_home = home_pos_cg;
4753     for(d=0; d<dd->ndim; d++)
4754     {
4755         dim = dd->dim[d];
4756         ncg_recv = 0;
4757         nat_recv = 0;
4758         nvr      = 0;
4759         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4760         {
4761             cdd = d*2 + dir;
4762             /* Communicate the cg and atom counts */
4763             sbuf[0] = ncg[cdd];
4764             sbuf[1] = nat[cdd];
4765             if (debug)
4766             {
4767                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4768                         d,dir,sbuf[0],sbuf[1]);
4769             }
4770             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4771             
4772             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4773             {
4774                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4775                 srenew(comm->buf_int,comm->nalloc_int);
4776             }
4777             
4778             /* Communicate the charge group indices, sizes and flags */
4779             dd_sendrecv_int(dd, d, dir,
4780                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4781                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4782             
4783             nvs = ncg[cdd] + nat[cdd]*nvec;
4784             i   = rbuf[0]  + rbuf[1] *nvec;
4785             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4786             
4787             /* Communicate cgcm and state */
4788             dd_sendrecv_rvec(dd, d, dir,
4789                              comm->cgcm_state[cdd], nvs,
4790                              comm->vbuf.v+nvr, i);
4791             ncg_recv += rbuf[0];
4792             nat_recv += rbuf[1];
4793             nvr      += i;
4794         }
4795         
4796         /* Process the received charge groups */
4797         buf_pos = 0;
4798         for(cg=0; cg<ncg_recv; cg++)
4799         {
4800             flag = comm->buf_int[cg*DD_CGIBS+1];
4801
4802             if (dim >= npbcdim && dd->nc[dim] > 2)
4803             {
4804                 /* No pbc in this dim and more than one domain boundary.
4805                  * We do a separate check if a charge group didn't move too far.
4806                  */
4807                 if (((flag & DD_FLAG_FW(d)) &&
4808                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4809                     ((flag & DD_FLAG_BW(d)) &&
4810                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4811                 {
4812                     cg_move_error(fplog,dd,step,cg,dim,
4813                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4814                                    FALSE,0,
4815                                    comm->vbuf.v[buf_pos],
4816                                    comm->vbuf.v[buf_pos],
4817                                    comm->vbuf.v[buf_pos][dim]);
4818                 }
4819             }
4820
4821             mc = -1;
4822             if (d < dd->ndim-1)
4823             {
4824                 /* Check which direction this cg should go */
4825                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4826                 {
4827                     if (dd->bGridJump)
4828                     {
4829                         /* The cell boundaries for dimension d2 are not equal
4830                          * for each cell row of the lower dimension(s),
4831                          * therefore we might need to redetermine where
4832                          * this cg should go.
4833                          */
4834                         dim2 = dd->dim[d2];
4835                         /* If this cg crosses the box boundary in dimension d2
4836                          * we can use the communicated flag, so we do not
4837                          * have to worry about pbc.
4838                          */
4839                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4840                                (flag & DD_FLAG_FW(d2))) ||
4841                               (dd->ci[dim2] == 0 &&
4842                                (flag & DD_FLAG_BW(d2)))))
4843                         {
4844                             /* Clear the two flags for this dimension */
4845                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4846                             /* Determine the location of this cg
4847                              * in lattice coordinates
4848                              */
4849                             pos_d = comm->vbuf.v[buf_pos][dim2];
4850                             if (tric_dir[dim2])
4851                             {
4852                                 for(d3=dim2+1; d3<DIM; d3++)
4853                                 {
4854                                     pos_d +=
4855                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4856                                 }
4857                             }
4858                             /* Check of we are not at the box edge.
4859                              * pbc is only handled in the first step above,
4860                              * but this check could move over pbc while
4861                              * the first step did not due to different rounding.
4862                              */
4863                             if (pos_d >= cell_x1[dim2] &&
4864                                 dd->ci[dim2] != dd->nc[dim2]-1)
4865                             {
4866                                 flag |= DD_FLAG_FW(d2);
4867                             }
4868                             else if (pos_d < cell_x0[dim2] &&
4869                                      dd->ci[dim2] != 0)
4870                             {
4871                                 flag |= DD_FLAG_BW(d2);
4872                             }
4873                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4874                         }
4875                     }
4876                     /* Set to which neighboring cell this cg should go */
4877                     if (flag & DD_FLAG_FW(d2))
4878                     {
4879                         mc = d2*2;
4880                     }
4881                     else if (flag & DD_FLAG_BW(d2))
4882                     {
4883                         if (dd->nc[dd->dim[d2]] > 2)
4884                         {
4885                             mc = d2*2+1;
4886                         }
4887                         else
4888                         {
4889                             mc = d2*2;
4890                         }
4891                     }
4892                 }
4893             }
4894             
4895             nrcg = flag & DD_FLAG_NRCG;
4896             if (mc == -1)
4897             {
4898                 if (home_pos_cg+1 > dd->cg_nalloc)
4899                 {
4900                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4901                     srenew(dd->index_gl,dd->cg_nalloc);
4902                     srenew(dd->cgindex,dd->cg_nalloc+1);
4903                 }
4904                 /* Set the global charge group index and size */
4905                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4906                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4907                 /* Copy the state from the buffer */
4908                 dd_check_alloc_ncg(fr,state,f,home_pos_cg+1);
4909                 if (fr->cutoff_scheme == ecutsGROUP)
4910                 {
4911                     cg_cm = fr->cg_cm;
4912                     copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]);
4913                 }
4914                 buf_pos++;
4915
4916                 /* Set the cginfo */
4917                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4918                                                    dd->index_gl[home_pos_cg]);
4919                 if (comm->bLocalCG)
4920                 {
4921                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4922                 }
4923
4924                 if (home_pos_at+nrcg > state->nalloc)
4925                 {
4926                     dd_realloc_state(state,f,home_pos_at+nrcg);
4927                 }
4928                 for(i=0; i<nrcg; i++)
4929                 {
4930                     copy_rvec(comm->vbuf.v[buf_pos++],
4931                               state->x[home_pos_at+i]);
4932                 }
4933                 if (bV)
4934                 {
4935                     for(i=0; i<nrcg; i++)
4936                     {
4937                         copy_rvec(comm->vbuf.v[buf_pos++],
4938                                   state->v[home_pos_at+i]);
4939                     }
4940                 }
4941                 if (bSDX)
4942                 {
4943                     for(i=0; i<nrcg; i++)
4944                     {
4945                         copy_rvec(comm->vbuf.v[buf_pos++],
4946                                   state->sd_X[home_pos_at+i]);
4947                     }
4948                 }
4949                 if (bCGP)
4950                 {
4951                     for(i=0; i<nrcg; i++)
4952                     {
4953                         copy_rvec(comm->vbuf.v[buf_pos++],
4954                                   state->cg_p[home_pos_at+i]);
4955                     }
4956                 }
4957                 home_pos_cg += 1;
4958                 home_pos_at += nrcg;
4959             }
4960             else
4961             {
4962                 /* Reallocate the buffers if necessary  */
4963                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4964                 {
4965                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4966                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4967                 }
4968                 nvr = ncg[mc] + nat[mc]*nvec;
4969                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4970                 {
4971                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4972                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4973                 }
4974                 /* Copy from the receive to the send buffers */
4975                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4976                        comm->buf_int + cg*DD_CGIBS,
4977                        DD_CGIBS*sizeof(int));
4978                 memcpy(comm->cgcm_state[mc][nvr],
4979                        comm->vbuf.v[buf_pos],
4980                        (1+nrcg*nvec)*sizeof(rvec));
4981                 buf_pos += 1 + nrcg*nvec;
4982                 ncg[mc] += 1;
4983                 nat[mc] += nrcg;
4984             }
4985         }
4986     }
4987     
4988     /* With sorting (!bCompact) the indices are now only partially up to date
4989      * and ncg_home and nat_home are not the real count, since there are
4990      * "holes" in the arrays for the charge groups that moved to neighbors.
4991      */
4992     if (fr->cutoff_scheme == ecutsVERLET)
4993     {
4994         moved = get_moved(comm,home_pos_cg);
4995
4996         for(i=dd->ncg_home; i<home_pos_cg; i++)
4997         {
4998             moved[i] = 0;
4999         }
5000     }
5001     dd->ncg_home = home_pos_cg;
5002     dd->nat_home = home_pos_at;
5003
5004     if (debug)
5005     {
5006         fprintf(debug,
5007                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5008                 *ncg_moved,dd->ncg_home-*ncg_moved);
5009                 
5010     }
5011 }
5012
5013 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
5014 {
5015     dd->comm->cycl[ddCycl] += cycles;
5016     dd->comm->cycl_n[ddCycl]++;
5017     if (cycles > dd->comm->cycl_max[ddCycl])
5018     {
5019         dd->comm->cycl_max[ddCycl] = cycles;
5020     }
5021 }
5022
5023 static double force_flop_count(t_nrnb *nrnb)
5024 {
5025     int i;
5026     double sum;
5027     const char *name;
5028
5029     sum = 0;
5030     for(i=0; i<eNR_NBKERNEL_FREE_ENERGY; i++)
5031     {
5032         /* To get closer to the real timings, we half the count
5033          * for the normal loops and again half it for water loops.
5034          */
5035         name = nrnb_str(i);
5036         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5037         {
5038             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5039         }
5040         else
5041         {
5042             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5043         }
5044     }
5045     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
5046     {
5047         name = nrnb_str(i);
5048         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5049         sum += nrnb->n[i]*cost_nrnb(i);
5050     }
5051     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
5052     {
5053         sum += nrnb->n[i]*cost_nrnb(i);
5054     }
5055
5056     return sum;
5057 }
5058
5059 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
5060 {
5061     if (dd->comm->eFlop)
5062     {
5063         dd->comm->flop -= force_flop_count(nrnb);
5064     }
5065 }
5066 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
5067 {
5068     if (dd->comm->eFlop)
5069     {
5070         dd->comm->flop += force_flop_count(nrnb);
5071         dd->comm->flop_n++;
5072     }
5073 }  
5074
5075 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5076 {
5077     int i;
5078     
5079     for(i=0; i<ddCyclNr; i++)
5080     {
5081         dd->comm->cycl[i] = 0;
5082         dd->comm->cycl_n[i] = 0;
5083         dd->comm->cycl_max[i] = 0;
5084     }
5085     dd->comm->flop = 0;
5086     dd->comm->flop_n = 0;
5087 }
5088
5089 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
5090 {
5091     gmx_domdec_comm_t *comm;
5092     gmx_domdec_load_t *load;
5093     gmx_domdec_root_t *root=NULL;
5094     int  d,dim,cid,i,pos;
5095     float cell_frac=0,sbuf[DD_NLOAD_MAX];
5096     gmx_bool bSepPME;
5097     
5098     if (debug)
5099     {
5100         fprintf(debug,"get_load_distribution start\n");
5101     }
5102
5103     wallcycle_start(wcycle,ewcDDCOMMLOAD);
5104     
5105     comm = dd->comm;
5106     
5107     bSepPME = (dd->pme_nodeid >= 0);
5108     
5109     for(d=dd->ndim-1; d>=0; d--)
5110     {
5111         dim = dd->dim[d];
5112         /* Check if we participate in the communication in this dimension */
5113         if (d == dd->ndim-1 || 
5114             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
5115         {
5116             load = &comm->load[d];
5117             if (dd->bGridJump)
5118             {
5119                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5120             }
5121             pos = 0;
5122             if (d == dd->ndim-1)
5123             {
5124                 sbuf[pos++] = dd_force_load(comm);
5125                 sbuf[pos++] = sbuf[0];
5126                 if (dd->bGridJump)
5127                 {
5128                     sbuf[pos++] = sbuf[0];
5129                     sbuf[pos++] = cell_frac;
5130                     if (d > 0)
5131                     {
5132                         sbuf[pos++] = comm->cell_f_max0[d];
5133                         sbuf[pos++] = comm->cell_f_min1[d];
5134                     }
5135                 }
5136                 if (bSepPME)
5137                 {
5138                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5139                     sbuf[pos++] = comm->cycl[ddCyclPME];
5140                 }
5141             }
5142             else
5143             {
5144                 sbuf[pos++] = comm->load[d+1].sum;
5145                 sbuf[pos++] = comm->load[d+1].max;
5146                 if (dd->bGridJump)
5147                 {
5148                     sbuf[pos++] = comm->load[d+1].sum_m;
5149                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5150                     sbuf[pos++] = comm->load[d+1].flags;
5151                     if (d > 0)
5152                     {
5153                         sbuf[pos++] = comm->cell_f_max0[d];
5154                         sbuf[pos++] = comm->cell_f_min1[d];
5155                     }
5156                 }
5157                 if (bSepPME)
5158                 {
5159                     sbuf[pos++] = comm->load[d+1].mdf;
5160                     sbuf[pos++] = comm->load[d+1].pme;
5161                 }
5162             }
5163             load->nload = pos;
5164             /* Communicate a row in DD direction d.
5165              * The communicators are setup such that the root always has rank 0.
5166              */
5167 #ifdef GMX_MPI
5168             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
5169                        load->load,load->nload*sizeof(float),MPI_BYTE,
5170                        0,comm->mpi_comm_load[d]);
5171 #endif
5172             if (dd->ci[dim] == dd->master_ci[dim])
5173             {
5174                 /* We are the root, process this row */
5175                 if (comm->bDynLoadBal)
5176                 {
5177                     root = comm->root[d];
5178                 }
5179                 load->sum = 0;
5180                 load->max = 0;
5181                 load->sum_m = 0;
5182                 load->cvol_min = 1;
5183                 load->flags = 0;
5184                 load->mdf = 0;
5185                 load->pme = 0;
5186                 pos = 0;
5187                 for(i=0; i<dd->nc[dim]; i++)
5188                 {
5189                     load->sum += load->load[pos++];
5190                     load->max = max(load->max,load->load[pos]);
5191                     pos++;
5192                     if (dd->bGridJump)
5193                     {
5194                         if (root->bLimited)
5195                         {
5196                             /* This direction could not be load balanced properly,
5197                              * therefore we need to use the maximum iso the average load.
5198                              */
5199                             load->sum_m = max(load->sum_m,load->load[pos]);
5200                         }
5201                         else
5202                         {
5203                             load->sum_m += load->load[pos];
5204                         }
5205                         pos++;
5206                         load->cvol_min = min(load->cvol_min,load->load[pos]);
5207                         pos++;
5208                         if (d < dd->ndim-1)
5209                         {
5210                             load->flags = (int)(load->load[pos++] + 0.5);
5211                         }
5212                         if (d > 0)
5213                         {
5214                             root->cell_f_max0[i] = load->load[pos++];
5215                             root->cell_f_min1[i] = load->load[pos++];
5216                         }
5217                     }
5218                     if (bSepPME)
5219                     {
5220                         load->mdf = max(load->mdf,load->load[pos]);
5221                         pos++;
5222                         load->pme = max(load->pme,load->load[pos]);
5223                         pos++;
5224                     }
5225                 }
5226                 if (comm->bDynLoadBal && root->bLimited)
5227                 {
5228                     load->sum_m *= dd->nc[dim];
5229                     load->flags |= (1<<d);
5230                 }
5231             }
5232         }
5233     }
5234
5235     if (DDMASTER(dd))
5236     {
5237         comm->nload      += dd_load_count(comm);
5238         comm->load_step  += comm->cycl[ddCyclStep];
5239         comm->load_sum   += comm->load[0].sum;
5240         comm->load_max   += comm->load[0].max;
5241         if (comm->bDynLoadBal)
5242         {
5243             for(d=0; d<dd->ndim; d++)
5244             {
5245                 if (comm->load[0].flags & (1<<d))
5246                 {
5247                     comm->load_lim[d]++;
5248                 }
5249             }
5250         }
5251         if (bSepPME)
5252         {
5253             comm->load_mdf += comm->load[0].mdf;
5254             comm->load_pme += comm->load[0].pme;
5255         }
5256     }
5257
5258     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5259     
5260     if (debug)
5261     {
5262         fprintf(debug,"get_load_distribution finished\n");
5263     }
5264 }
5265
5266 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5267 {
5268     /* Return the relative performance loss on the total run time
5269      * due to the force calculation load imbalance.
5270      */
5271     if (dd->comm->nload > 0)
5272     {
5273         return
5274             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5275             (dd->comm->load_step*dd->nnodes);
5276     }
5277     else
5278     {
5279         return 0;
5280     }
5281 }
5282
5283 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5284 {
5285     char  buf[STRLEN];
5286     int   npp,npme,nnodes,d,limp;
5287     float imbal,pme_f_ratio,lossf,lossp=0;
5288     gmx_bool  bLim;
5289     gmx_domdec_comm_t *comm;
5290
5291     comm = dd->comm;
5292     if (DDMASTER(dd) && comm->nload > 0)
5293     {
5294         npp    = dd->nnodes;
5295         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5296         nnodes = npp + npme;
5297         imbal = comm->load_max*npp/comm->load_sum - 1;
5298         lossf = dd_force_imb_perf_loss(dd);
5299         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5300         fprintf(fplog,"%s",buf);
5301         fprintf(stderr,"\n");
5302         fprintf(stderr,"%s",buf);
5303         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5304         fprintf(fplog,"%s",buf);
5305         fprintf(stderr,"%s",buf);
5306         bLim = FALSE;
5307         if (comm->bDynLoadBal)
5308         {
5309             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5310             for(d=0; d<dd->ndim; d++)
5311             {
5312                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5313                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5314                 if (limp >= 50)
5315                 {
5316                     bLim = TRUE;
5317                 }
5318             }
5319             sprintf(buf+strlen(buf),"\n");
5320             fprintf(fplog,"%s",buf);
5321             fprintf(stderr,"%s",buf);
5322         }
5323         if (npme > 0)
5324         {
5325             pme_f_ratio = comm->load_pme/comm->load_mdf;
5326             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5327             if (lossp <= 0)
5328             {
5329                 lossp *= (float)npme/(float)nnodes;
5330             }
5331             else
5332             {
5333                 lossp *= (float)npp/(float)nnodes;
5334             }
5335             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5336             fprintf(fplog,"%s",buf);
5337             fprintf(stderr,"%s",buf);
5338             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5339             fprintf(fplog,"%s",buf);
5340             fprintf(stderr,"%s",buf);
5341         }
5342         fprintf(fplog,"\n");
5343         fprintf(stderr,"\n");
5344         
5345         if (lossf >= DD_PERF_LOSS)
5346         {
5347             sprintf(buf,
5348                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
5349                     "      in the domain decomposition.\n",lossf*100);
5350             if (!comm->bDynLoadBal)
5351             {
5352                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
5353             }
5354             else if (bLim)
5355             {
5356                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5357             }
5358             fprintf(fplog,"%s\n",buf);
5359             fprintf(stderr,"%s\n",buf);
5360         }
5361         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5362         {
5363             sprintf(buf,
5364                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5365                     "      had %s work to do than the PP nodes.\n"
5366                     "      You might want to %s the number of PME nodes\n"
5367                     "      or %s the cut-off and the grid spacing.\n",
5368                     fabs(lossp*100),
5369                     (lossp < 0) ? "less"     : "more",
5370                     (lossp < 0) ? "decrease" : "increase",
5371                     (lossp < 0) ? "decrease" : "increase");
5372             fprintf(fplog,"%s\n",buf);
5373             fprintf(stderr,"%s\n",buf);
5374         }
5375     }
5376 }
5377
5378 static float dd_vol_min(gmx_domdec_t *dd)
5379 {
5380     return dd->comm->load[0].cvol_min*dd->nnodes;
5381 }
5382
5383 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5384 {
5385     return dd->comm->load[0].flags;
5386 }
5387
5388 static float dd_f_imbal(gmx_domdec_t *dd)
5389 {
5390     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5391 }
5392
5393 float dd_pme_f_ratio(gmx_domdec_t *dd)
5394 {
5395     if (dd->comm->cycl_n[ddCyclPME] > 0)
5396     {
5397         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5398     }
5399     else
5400     {
5401         return -1.0;
5402     }
5403 }
5404
5405 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5406 {
5407     int flags,d;
5408     char buf[22];
5409     
5410     flags = dd_load_flags(dd);
5411     if (flags)
5412     {
5413         fprintf(fplog,
5414                 "DD  load balancing is limited by minimum cell size in dimension");
5415         for(d=0; d<dd->ndim; d++)
5416         {
5417             if (flags & (1<<d))
5418             {
5419                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5420             }
5421         }
5422         fprintf(fplog,"\n");
5423     }
5424     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5425     if (dd->comm->bDynLoadBal)
5426     {
5427         fprintf(fplog,"  vol min/aver %5.3f%c",
5428                 dd_vol_min(dd),flags ? '!' : ' ');
5429     }
5430     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5431     if (dd->comm->cycl_n[ddCyclPME])
5432     {
5433         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5434     }
5435     fprintf(fplog,"\n\n");
5436 }
5437
5438 static void dd_print_load_verbose(gmx_domdec_t *dd)
5439 {
5440     if (dd->comm->bDynLoadBal)
5441     {
5442         fprintf(stderr,"vol %4.2f%c ",
5443                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5444     }
5445     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5446     if (dd->comm->cycl_n[ddCyclPME])
5447     {
5448         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5449     }
5450 }
5451
#ifdef GMX_MPI
/* Create the load communicator for DD dimension index dim_ind for the
 * row of cells obtained from loc by varying the coordinate along
 * dd->dim[dim_ind].
 *
 * MPI_Comm_split is called on dd->mpi_comm_all by every caller; ranks
 * outside the row pass MPI_UNDEFINED as color. Ranks inside the row store
 * the communicator and allocate the buffers they need: with eDLB not
 * equal to edlbNO the row root gets its root struct and the other ranks
 * the cell_f_row receive buffer; the row root also gets the load buffer.
 */
static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
{
    gmx_domdec_comm_t *comm = dd->comm;
    gmx_domdec_root_t *root;
    MPI_Comm  c_row;
    ivec      cell;
    int       dim,i,color;
    gmx_bool  bPartOfGroup = FALSE;
    gmx_bool  bRowRoot;

    dim = dd->dim[dim_ind];

    /* Walk along the row to see if our rank is one of its members */
    copy_ivec(loc,cell);
    for(i=0; i<dd->nc[dim]; i++)
    {
        cell[dim] = i;
        if (dd_index(dd->nc,cell) == dd->rank)
        {
            /* This process is part of the group */
            bPartOfGroup = TRUE;
        }
    }

    color = bPartOfGroup ? 0 : MPI_UNDEFINED;
    MPI_Comm_split(dd->mpi_comm_all, color, dd->rank, &c_row);

    if (!bPartOfGroup)
    {
        return;
    }

    comm->mpi_comm_load[dim_ind] = c_row;
    bRowRoot = (dd->ci[dim] == dd->master_ci[dim]);

    if (comm->eDLB != edlbNO)
    {
        if (bRowRoot)
        {
            /* This is the root process of this row */
            snew(comm->root[dim_ind],1);
            root = comm->root[dim_ind];
            snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
            snew(root->old_cell_f,dd->nc[dim]+1);
            snew(root->bCellMin,dd->nc[dim]);
            if (dim_ind > 0)
            {
                snew(root->cell_f_max0,dd->nc[dim]);
                snew(root->cell_f_min1,dd->nc[dim]);
                snew(root->bound_min,dd->nc[dim]);
                snew(root->bound_max,dd->nc[dim]);
            }
            snew(root->buf_ncd,dd->nc[dim]);
        }
        else
        {
            /* This is not a root process, we only need to receive cell_f */
            snew(comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
        }
    }

    if (bRowRoot)
    {
        snew(comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
    }
}
#endif
5510
5511 static void make_load_communicators(gmx_domdec_t *dd)
5512 {
5513 #ifdef GMX_MPI
5514   int  dim0,dim1,i,j;
5515   ivec loc;
5516
5517   if (debug)
5518     fprintf(debug,"Making load communicators\n");
5519
5520   snew(dd->comm->load,dd->ndim);
5521   snew(dd->comm->mpi_comm_load,dd->ndim);
5522   
5523   clear_ivec(loc);
5524   make_load_communicator(dd,0,loc);
5525   if (dd->ndim > 1) {
5526     dim0 = dd->dim[0];
5527     for(i=0; i<dd->nc[dim0]; i++) {
5528       loc[dim0] = i;
5529       make_load_communicator(dd,1,loc);
5530     }
5531   }
5532   if (dd->ndim > 2) {
5533     dim0 = dd->dim[0];
5534     for(i=0; i<dd->nc[dim0]; i++) {
5535       loc[dim0] = i;
5536       dim1 = dd->dim[1];
5537       for(j=0; j<dd->nc[dim1]; j++) {
5538           loc[dim1] = j;
5539           make_load_communicator(dd,2,loc);
5540       }
5541     }
5542   }
5543
5544   if (debug)
5545     fprintf(debug,"Finished making load communicators\n");
5546 #endif
5547 }
5548
5549 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5550 {
5551     gmx_bool bZYX;
5552     int  d,dim,i,j,m;
5553     ivec tmp,s;
5554     int  nzone,nzonep;
5555     ivec dd_zp[DD_MAXIZONE];
5556     gmx_domdec_zones_t *zones;
5557     gmx_domdec_ns_ranges_t *izone;
5558     
5559     for(d=0; d<dd->ndim; d++)
5560     {
5561         dim = dd->dim[d];
5562         copy_ivec(dd->ci,tmp);
5563         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5564         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5565         copy_ivec(dd->ci,tmp);
5566         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5567         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5568         if (debug)
5569         {
5570             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5571                     dd->rank,dim,
5572                     dd->neighbor[d][0],
5573                     dd->neighbor[d][1]);
5574         }
5575     }
5576     
5577     if (DDMASTER(dd))
5578     {
5579         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5580             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5581     }
5582     if (fplog)
5583     {
5584         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5585                 dd->ndim,
5586                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5587                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5588     }
5589     switch (dd->ndim)
5590     {
5591     case 3:
5592         nzone  = dd_z3n;
5593         nzonep = dd_zp3n;
5594         for(i=0; i<nzonep; i++)
5595         {
5596             copy_ivec(dd_zp3[i],dd_zp[i]);
5597         }
5598         break;
5599     case 2:
5600         nzone  = dd_z2n;
5601         nzonep = dd_zp2n;
5602         for(i=0; i<nzonep; i++)
5603         {
5604             copy_ivec(dd_zp2[i],dd_zp[i]);
5605         }
5606         break;
5607     case 1:
5608         nzone  = dd_z1n;
5609         nzonep = dd_zp1n;
5610         for(i=0; i<nzonep; i++)
5611         {
5612             copy_ivec(dd_zp1[i],dd_zp[i]);
5613         }
5614         break;
5615     default:
5616         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5617         nzone = 0;
5618         nzonep = 0;
5619     }
5620
5621     zones = &dd->comm->zones;
5622
5623     for(i=0; i<nzone; i++)
5624     {
5625         m = 0;
5626         clear_ivec(zones->shift[i]);
5627         for(d=0; d<dd->ndim; d++)
5628         {
5629             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5630         }
5631     }
5632     
5633     zones->n = nzone;
5634     for(i=0; i<nzone; i++)
5635     {
5636         for(d=0; d<DIM; d++)
5637         {
5638             s[d] = dd->ci[d] - zones->shift[i][d];
5639             if (s[d] < 0)
5640             {
5641                 s[d] += dd->nc[d];
5642             }
5643             else if (s[d] >= dd->nc[d])
5644             {
5645                 s[d] -= dd->nc[d];
5646             }
5647         }
5648     }
5649     zones->nizone = nzonep;
5650     for(i=0; i<zones->nizone; i++)
5651     {
5652         if (dd_zp[i][0] != i)
5653         {
5654             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5655         }
5656         izone = &zones->izone[i];
5657         izone->j0 = dd_zp[i][1];
5658         izone->j1 = dd_zp[i][2];
5659         for(dim=0; dim<DIM; dim++)
5660         {
5661             if (dd->nc[dim] == 1)
5662             {
5663                 /* All shifts should be allowed */
5664                 izone->shift0[dim] = -1;
5665                 izone->shift1[dim] = 1;
5666             }
5667             else
5668             {
5669                 /*
5670                   izone->shift0[d] = 0;
5671                   izone->shift1[d] = 0;
5672                   for(j=izone->j0; j<izone->j1; j++) {
5673                   if (dd->shift[j][d] > dd->shift[i][d])
5674                   izone->shift0[d] = -1;
5675                   if (dd->shift[j][d] < dd->shift[i][d])
5676                   izone->shift1[d] = 1;
5677                   }
5678                 */
5679                 
5680                 int shift_diff;
5681                 
5682                 /* Assume the shift are not more than 1 cell */
5683                 izone->shift0[dim] = 1;
5684                 izone->shift1[dim] = -1;
5685                 for(j=izone->j0; j<izone->j1; j++)
5686                 {
5687                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5688                     if (shift_diff < izone->shift0[dim])
5689                     {
5690                         izone->shift0[dim] = shift_diff;
5691                     }
5692                     if (shift_diff > izone->shift1[dim])
5693                     {
5694                         izone->shift1[dim] = shift_diff;
5695                     }
5696                 }
5697             }
5698         }
5699     }
5700     
5701     if (dd->comm->eDLB != edlbNO)
5702     {
5703         snew(dd->comm->root,dd->ndim);
5704     }
5705     
5706     if (dd->comm->bRecordLoad)
5707     {
5708         make_load_communicators(dd);
5709     }
5710 }
5711
5712 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5713 {
5714     gmx_domdec_t *dd;
5715     gmx_domdec_comm_t *comm;
5716     int  i,rank,*buf;
5717     ivec periods;
5718 #ifdef GMX_MPI
5719     MPI_Comm comm_cart;
5720 #endif
5721     
5722     dd = cr->dd;
5723     comm = dd->comm;
5724     
5725 #ifdef GMX_MPI
5726     if (comm->bCartesianPP)
5727     {
5728         /* Set up cartesian communication for the particle-particle part */
5729         if (fplog)
5730         {
5731             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5732                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5733         }
5734         
5735         for(i=0; i<DIM; i++)
5736         {
5737             periods[i] = TRUE;
5738         }
5739         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5740                         &comm_cart);
5741         /* We overwrite the old communicator with the new cartesian one */
5742         cr->mpi_comm_mygroup = comm_cart;
5743     }
5744     
5745     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5746     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5747     
5748     if (comm->bCartesianPP_PME)
5749     {
5750         /* Since we want to use the original cartesian setup for sim,
5751          * and not the one after split, we need to make an index.
5752          */
5753         snew(comm->ddindex2ddnodeid,dd->nnodes);
5754         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5755         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5756         /* Get the rank of the DD master,
5757          * above we made sure that the master node is a PP node.
5758          */
5759         if (MASTER(cr))
5760         {
5761             rank = dd->rank;
5762         }
5763         else
5764         {
5765             rank = 0;
5766         }
5767         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5768     }
5769     else if (comm->bCartesianPP)
5770     {
5771         if (cr->npmenodes == 0)
5772         {
5773             /* The PP communicator is also
5774              * the communicator for this simulation
5775              */
5776             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5777         }
5778         cr->nodeid = dd->rank;
5779         
5780         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5781         
5782         /* We need to make an index to go from the coordinates
5783          * to the nodeid of this simulation.
5784          */
5785         snew(comm->ddindex2simnodeid,dd->nnodes);
5786         snew(buf,dd->nnodes);
5787         if (cr->duty & DUTY_PP)
5788         {
5789             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5790         }
5791         /* Communicate the ddindex to simulation nodeid index */
5792         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5793                       cr->mpi_comm_mysim);
5794         sfree(buf);
5795         
5796         /* Determine the master coordinates and rank.
5797          * The DD master should be the same node as the master of this sim.
5798          */
5799         for(i=0; i<dd->nnodes; i++)
5800         {
5801             if (comm->ddindex2simnodeid[i] == 0)
5802             {
5803                 ddindex2xyz(dd->nc,i,dd->master_ci);
5804                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5805             }
5806         }
5807         if (debug)
5808         {
5809             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5810         }
5811     }
5812     else
5813     {
5814         /* No Cartesian communicators */
5815         /* We use the rank in dd->comm->all as DD index */
5816         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5817         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5818         dd->masterrank = 0;
5819         clear_ivec(dd->master_ci);
5820     }
5821 #endif
5822   
5823     if (fplog)
5824     {
5825         fprintf(fplog,
5826                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5827                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5828     }
5829     if (debug)
5830     {
5831         fprintf(debug,
5832                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5833                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5834     }
5835 }
5836
5837 static void receive_ddindex2simnodeid(t_commrec *cr)
5838 {
5839     gmx_domdec_t *dd;
5840     
5841     gmx_domdec_comm_t *comm;
5842     int  *buf;
5843     
5844     dd = cr->dd;
5845     comm = dd->comm;
5846     
5847 #ifdef GMX_MPI
5848     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5849     {
5850         snew(comm->ddindex2simnodeid,dd->nnodes);
5851         snew(buf,dd->nnodes);
5852         if (cr->duty & DUTY_PP)
5853         {
5854             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5855         }
5856 #ifdef GMX_MPI
5857         /* Communicate the ddindex to simulation nodeid index */
5858         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5859                       cr->mpi_comm_mysim);
5860 #endif
5861         sfree(buf);
5862     }
5863 #endif
5864 }
5865
5866 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5867                                                      int ncg,int natoms)
5868 {
5869     gmx_domdec_master_t *ma;
5870     int i;
5871
5872     snew(ma,1);
5873     
5874     snew(ma->ncg,dd->nnodes);
5875     snew(ma->index,dd->nnodes+1);
5876     snew(ma->cg,ncg);
5877     snew(ma->nat,dd->nnodes);
5878     snew(ma->ibuf,dd->nnodes*2);
5879     snew(ma->cell_x,DIM);
5880     for(i=0; i<DIM; i++)
5881     {
5882         snew(ma->cell_x[i],dd->nc[i]+1);
5883     }
5884
5885     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5886     {
5887         ma->vbuf = NULL;
5888     }
5889     else
5890     {
5891         snew(ma->vbuf,natoms);
5892     }
5893
5894     return ma;
5895 }
5896
5897 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5898                                int reorder)
5899 {
5900     gmx_domdec_t *dd;
5901     gmx_domdec_comm_t *comm;
5902     int  i,rank;
5903     gmx_bool bDiv[DIM];
5904     ivec periods;
5905 #ifdef GMX_MPI
5906     MPI_Comm comm_cart;
5907 #endif
5908     
5909     dd = cr->dd;
5910     comm = dd->comm;
5911     
5912     if (comm->bCartesianPP)
5913     {
5914         for(i=1; i<DIM; i++)
5915         {
5916             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5917         }
5918         if (bDiv[YY] || bDiv[ZZ])
5919         {
5920             comm->bCartesianPP_PME = TRUE;
5921             /* If we have 2D PME decomposition, which is always in x+y,
5922              * we stack the PME only nodes in z.
5923              * Otherwise we choose the direction that provides the thinnest slab
5924              * of PME only nodes as this will have the least effect
5925              * on the PP communication.
5926              * But for the PME communication the opposite might be better.
5927              */
5928             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5929                              !bDiv[YY] ||
5930                              dd->nc[YY] > dd->nc[ZZ]))
5931             {
5932                 comm->cartpmedim = ZZ;
5933             }
5934             else
5935             {
5936                 comm->cartpmedim = YY;
5937             }
5938             comm->ntot[comm->cartpmedim]
5939                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5940         }
5941         else if (fplog)
5942         {
5943             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5944             fprintf(fplog,
5945                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5946         }
5947     }
5948     
5949 #ifdef GMX_MPI
5950     if (comm->bCartesianPP_PME)
5951     {
5952         if (fplog)
5953         {
5954             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5955         }
5956         
5957         for(i=0; i<DIM; i++)
5958         {
5959             periods[i] = TRUE;
5960         }
5961         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5962                         &comm_cart);
5963         
5964         MPI_Comm_rank(comm_cart,&rank);
5965         if (MASTERNODE(cr) && rank != 0)
5966         {
5967             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5968         }
5969         
5970         /* With this assigment we loose the link to the original communicator
5971          * which will usually be MPI_COMM_WORLD, unless have multisim.
5972          */
5973         cr->mpi_comm_mysim = comm_cart;
5974         cr->sim_nodeid = rank;
5975         
5976         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5977         
5978         if (fplog)
5979         {
5980             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5981                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5982         }
5983         
5984         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5985         {
5986             cr->duty = DUTY_PP;
5987         }
5988         if (cr->npmenodes == 0 ||
5989             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5990         {
5991             cr->duty = DUTY_PME;
5992         }
5993         
5994         /* Split the sim communicator into PP and PME only nodes */
5995         MPI_Comm_split(cr->mpi_comm_mysim,
5996                        cr->duty,
5997                        dd_index(comm->ntot,dd->ci),
5998                        &cr->mpi_comm_mygroup);
5999     }
6000     else
6001     {
6002         switch (dd_node_order)
6003         {
6004         case ddnoPP_PME:
6005             if (fplog)
6006             {
6007                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
6008             }
6009             break;
6010         case ddnoINTERLEAVE:
6011             /* Interleave the PP-only and PME-only nodes,
6012              * as on clusters with dual-core machines this will double
6013              * the communication bandwidth of the PME processes
6014              * and thus speed up the PP <-> PME and inter PME communication.
6015              */
6016             if (fplog)
6017             {
6018                 fprintf(fplog,"Interleaving PP and PME nodes\n");
6019             }
6020             comm->pmenodes = dd_pmenodes(cr);
6021             break;
6022         case ddnoCARTESIAN:
6023             break;
6024         default:
6025             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
6026         }
6027     
6028         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
6029         {
6030             cr->duty = DUTY_PME;
6031         }
6032         else
6033         {
6034             cr->duty = DUTY_PP;
6035         }
6036         
6037         /* Split the sim communicator into PP and PME only nodes */
6038         MPI_Comm_split(cr->mpi_comm_mysim,
6039                        cr->duty,
6040                        cr->nodeid,
6041                        &cr->mpi_comm_mygroup);
6042         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
6043     }
6044 #endif
6045
6046     if (fplog)
6047     {
6048         fprintf(fplog,"This is a %s only node\n\n",
6049                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6050     }
6051 }
6052
6053 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
6054 {
6055     gmx_domdec_t *dd;
6056     gmx_domdec_comm_t *comm;
6057     int CartReorder;
6058     
6059     dd = cr->dd;
6060     comm = dd->comm;
6061     
6062     copy_ivec(dd->nc,comm->ntot);
6063     
6064     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
6065     comm->bCartesianPP_PME = FALSE;
6066     
6067     /* Reorder the nodes by default. This might change the MPI ranks.
6068      * Real reordering is only supported on very few architectures,
6069      * Blue Gene is one of them.
6070      */
6071     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6072     
6073     if (cr->npmenodes > 0)
6074     {
6075         /* Split the communicator into a PP and PME part */
6076         split_communicator(fplog,cr,dd_node_order,CartReorder);
6077         if (comm->bCartesianPP_PME)
6078         {
6079             /* We (possibly) reordered the nodes in split_communicator,
6080              * so it is no longer required in make_pp_communicator.
6081              */
6082             CartReorder = FALSE;
6083         }
6084     }
6085     else
6086     {
6087         /* All nodes do PP and PME */
6088 #ifdef GMX_MPI    
6089         /* We do not require separate communicators */
6090         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6091 #endif
6092     }
6093     
6094     if (cr->duty & DUTY_PP)
6095     {
6096         /* Copy or make a new PP communicator */
6097         make_pp_communicator(fplog,cr,CartReorder);
6098     }
6099     else
6100     {
6101         receive_ddindex2simnodeid(cr);
6102     }
6103     
6104     if (!(cr->duty & DUTY_PME))
6105     {
6106         /* Set up the commnuication to our PME node */
6107         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
6108         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6109         if (debug)
6110         {
6111             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
6112                     dd->pme_nodeid,dd->pme_receive_vir_ener);
6113         }
6114     }
6115     else
6116     {
6117         dd->pme_nodeid = -1;
6118     }
6119
6120     if (DDMASTER(dd))
6121     {
6122         dd->ma = init_gmx_domdec_master_t(dd,
6123                                           comm->cgs_gl.nr,
6124                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6125     }
6126 }
6127
6128 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
6129 {
6130     real *slb_frac,tot;
6131     int  i,n;
6132     double dbl;
6133     
6134     slb_frac = NULL;
6135     if (nc > 1 && size_string != NULL)
6136     {
6137         if (fplog)
6138         {
6139             fprintf(fplog,"Using static load balancing for the %s direction\n",
6140                     dir);
6141         }
6142         snew(slb_frac,nc);
6143         tot = 0;
6144         for (i=0; i<nc; i++)
6145         {
6146             dbl = 0;
6147             sscanf(size_string,"%lf%n",&dbl,&n);
6148             if (dbl == 0)
6149             {
6150                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
6151             }
6152             slb_frac[i] = dbl;
6153             size_string += n;
6154             tot += slb_frac[i];
6155         }
6156         /* Normalize */
6157         if (fplog)
6158         {
6159             fprintf(fplog,"Relative cell sizes:");
6160         }
6161         for (i=0; i<nc; i++)
6162         {
6163             slb_frac[i] /= tot;
6164             if (fplog)
6165             {
6166                 fprintf(fplog," %5.3f",slb_frac[i]);
6167             }
6168         }
6169         if (fplog)
6170         {
6171             fprintf(fplog,"\n");
6172         }
6173     }
6174     
6175     return slb_frac;
6176 }
6177
6178 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6179 {
6180     int n,nmol,ftype;
6181     gmx_mtop_ilistloop_t iloop;
6182     t_ilist *il;
6183     
6184     n = 0;
6185     iloop = gmx_mtop_ilistloop_init(mtop);
6186     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
6187     {
6188         for(ftype=0; ftype<F_NRE; ftype++)
6189         {
6190             if ((interaction_function[ftype].flags & IF_BOND) &&
6191                 NRAL(ftype) >  2)
6192             {
6193                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6194             }
6195         }
6196   }
6197
6198   return n;
6199 }
6200
/* Read an integer setting from the environment.
 *
 * fplog    log file, may be NULL; the value found is reported here
 * env_var  name of the environment variable
 * def      value returned when the variable is not set
 *
 * Returns def when env_var is not set, the parsed integer when its
 * value starts with a number, and 1 when it is set but no integer
 * can be read from it.
 */
static int dd_nst_env(FILE *fplog,const char *env_var,int def)
{
    const char *env_val;
    int  result;

    env_val = getenv(env_var);
    if (env_val == NULL)
    {
        return def;
    }

    result = def;
    if (sscanf(env_val,"%d",&result) < 1)
    {
        /* Set, but not a number */
        result = 1;
    }
    if (fplog != NULL)
    {
        fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
                env_var,env_val,result);
    }

    return result;
}
6223
6224 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
6225 {
6226     if (MASTER(cr))
6227     {
6228         fprintf(stderr,"\n%s\n",warn_string);
6229     }
6230     if (fplog)
6231     {
6232         fprintf(fplog,"\n%s\n",warn_string);
6233     }
6234 }
6235
6236 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
6237                                   t_inputrec *ir,FILE *fplog)
6238 {
6239     if (ir->ePBC == epbcSCREW &&
6240         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6241     {
6242         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6243     }
6244
6245     if (ir->ns_type == ensSIMPLE)
6246     {
6247         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6248     }
6249
6250     if (ir->nstlist == 0)
6251     {
6252         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6253     }
6254
6255     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6256     {
6257         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6258     }
6259 }
6260
6261 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6262 {
6263     int  di,d;
6264     real r;
6265
6266     r = ddbox->box_size[XX];
6267     for(di=0; di<dd->ndim; di++)
6268     {
6269         d = dd->dim[di];
6270         /* Check using the initial average cell size */
6271         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6272     }
6273
6274     return r;
6275 }
6276
6277 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6278                              const char *dlb_opt,gmx_bool bRecordLoad,
6279                              unsigned long Flags,t_inputrec *ir)
6280 {
6281     gmx_domdec_t *dd;
6282     int  eDLB=-1;
6283     char buf[STRLEN];
6284
6285     switch (dlb_opt[0])
6286     {
6287     case 'a': eDLB = edlbAUTO; break;
6288     case 'n': eDLB = edlbNO;   break;
6289     case 'y': eDLB = edlbYES;  break;
6290     default: gmx_incons("Unknown dlb_opt");
6291     }
6292
6293     if (Flags & MD_RERUN)
6294     {
6295         return edlbNO;
6296     }
6297
6298     if (!EI_DYNAMICS(ir->eI))
6299     {
6300         if (eDLB == edlbYES)
6301         {
6302             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6303             dd_warning(cr,fplog,buf);
6304         }
6305             
6306         return edlbNO;
6307     }
6308
6309     if (!bRecordLoad)
6310     {
6311         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6312
6313         return edlbNO;
6314     }
6315
6316     if (Flags & MD_REPRODUCIBLE)
6317     {
6318         switch (eDLB)
6319         {
6320                         case edlbNO: 
6321                                 break;
6322                         case edlbAUTO:
6323                                 dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6324                                 eDLB = edlbNO;
6325                                 break;
6326                         case edlbYES:
6327                                 dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6328                                 break;
6329                         default:
6330                                 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6331                                 break;
6332         }
6333     }
6334
6335     return eDLB;
6336 }
6337
6338 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6339 {
6340     int dim;
6341
6342     dd->ndim = 0;
6343     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6344     {
6345         /* Decomposition order z,y,x */
6346         if (fplog)
6347         {
6348             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6349         }
6350         for(dim=DIM-1; dim>=0; dim--)
6351         {
6352             if (dd->nc[dim] > 1)
6353             {
6354                 dd->dim[dd->ndim++] = dim;
6355             }
6356         }
6357     }
6358     else
6359     {
6360         /* Decomposition order x,y,z */
6361         for(dim=0; dim<DIM; dim++)
6362         {
6363             if (dd->nc[dim] > 1)
6364             {
6365                 dd->dim[dd->ndim++] = dim;
6366             }
6367         }
6368     }
6369 }
6370
6371 static gmx_domdec_comm_t *init_dd_comm()
6372 {
6373     gmx_domdec_comm_t *comm;
6374     int  i;
6375
6376     snew(comm,1);
6377     snew(comm->cggl_flag,DIM*2);
6378     snew(comm->cgcm_state,DIM*2);
6379     for(i=0; i<DIM*2; i++)
6380     {
6381         comm->cggl_flag_nalloc[i]  = 0;
6382         comm->cgcm_state_nalloc[i] = 0;
6383     }
6384     
6385     comm->nalloc_int = 0;
6386     comm->buf_int    = NULL;
6387
6388     vec_rvec_init(&comm->vbuf);
6389
6390     comm->n_load_have    = 0;
6391     comm->n_load_collect = 0;
6392
6393     for(i=0; i<ddnatNR-ddnatZONE; i++)
6394     {
6395         comm->sum_nat[i] = 0;
6396     }
6397     comm->ndecomp = 0;
6398     comm->nload   = 0;
6399     comm->load_step = 0;
6400     comm->load_sum  = 0;
6401     comm->load_max  = 0;
6402     clear_ivec(comm->load_lim);
6403     comm->load_mdf  = 0;
6404     comm->load_pme  = 0;
6405
6406     return comm;
6407 }
6408
6409 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6410                                         unsigned long Flags,
6411                                         ivec nc,
6412                                         real comm_distance_min,real rconstr,
6413                                         const char *dlb_opt,real dlb_scale,
6414                                         const char *sizex,const char *sizey,const char *sizez,
6415                                         gmx_mtop_t *mtop,t_inputrec *ir,
6416                                         matrix box,rvec *x,
6417                                         gmx_ddbox_t *ddbox,
6418                                         int *npme_x,int *npme_y)
6419 {
6420     gmx_domdec_t *dd;
6421     gmx_domdec_comm_t *comm;
6422     int  recload;
6423     int  d,i,j;
6424     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6425     gmx_bool bC;
6426     char buf[STRLEN];
6427     
6428     if (fplog)
6429     {
6430         fprintf(fplog,
6431                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6432     }
6433     
6434     snew(dd,1);
6435
6436     dd->comm = init_dd_comm();
6437     comm = dd->comm;
6438     snew(comm->cggl_flag,DIM*2);
6439     snew(comm->cgcm_state,DIM*2);
6440
6441     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6442     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6443     
6444     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6445     comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6446     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6447     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6448     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6449     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6450     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6451     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6452
6453     dd->pme_recv_f_alloc = 0;
6454     dd->pme_recv_f_buf = NULL;
6455
6456     if (dd->bSendRecv2 && fplog)
6457     {
6458         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6459     }
6460     if (comm->eFlop)
6461     {
6462         if (fplog)
6463         {
6464             fprintf(fplog,"Will load balance based on FLOP count\n");
6465         }
6466         if (comm->eFlop > 1)
6467         {
6468             srand(1+cr->nodeid);
6469         }
6470         comm->bRecordLoad = TRUE;
6471     }
6472     else
6473     {
6474         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6475                              
6476     }
6477     
6478     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6479     
6480     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6481     if (fplog)
6482     {
6483         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6484     }
6485     dd->bGridJump = comm->bDynLoadBal;
6486     
6487     if (comm->nstSortCG)
6488     {
6489         if (fplog)
6490         {
6491             if (comm->nstSortCG == 1)
6492             {
6493                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6494             }
6495             else
6496             {
6497                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6498                         comm->nstSortCG);
6499             }
6500         }
6501         snew(comm->sort,1);
6502     }
6503     else
6504     {
6505         if (fplog)
6506         {
6507             fprintf(fplog,"Will not sort the charge groups\n");
6508         }
6509     }
6510
6511     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6512     
6513     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6514     if (comm->bInterCGBondeds)
6515     {
6516         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6517     }
6518     else
6519     {
6520         comm->bInterCGMultiBody = FALSE;
6521     }
6522     
6523     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6524     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6525
6526     if (ir->rlistlong == 0)
6527     {
6528         /* Set the cut-off to some very large value,
6529          * so we don't need if statements everywhere in the code.
6530          * We use sqrt, since the cut-off is squared in some places.
6531          */
6532         comm->cutoff   = GMX_CUTOFF_INF;
6533     }
6534     else
6535     {
6536         comm->cutoff   = ir->rlistlong;
6537     }
6538     comm->cutoff_mbody = 0;
6539     
6540     comm->cellsize_limit = 0;
6541     comm->bBondComm = FALSE;
6542
6543     if (comm->bInterCGBondeds)
6544     {
6545         if (comm_distance_min > 0)
6546         {
6547             comm->cutoff_mbody = comm_distance_min;
6548             if (Flags & MD_DDBONDCOMM)
6549             {
6550                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6551             }
6552             else
6553             {
6554                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6555             }
6556             r_bonded_limit = comm->cutoff_mbody;
6557         }
6558         else if (ir->bPeriodicMols)
6559         {
6560             /* Can not easily determine the required cut-off */
6561             dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6562             comm->cutoff_mbody = comm->cutoff/2;
6563             r_bonded_limit = comm->cutoff_mbody;
6564         }
6565         else
6566         {
6567             if (MASTER(cr))
6568             {
6569                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6570                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6571             }
6572             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6573             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6574
6575             /* We use an initial margin of 10% for the minimum cell size,
6576              * except when we are just below the non-bonded cut-off.
6577              */
6578             if (Flags & MD_DDBONDCOMM)
6579             {
6580                 if (max(r_2b,r_mb) > comm->cutoff)
6581                 {
6582                     r_bonded       = max(r_2b,r_mb);
6583                     r_bonded_limit = 1.1*r_bonded;
6584                     comm->bBondComm = TRUE;
6585                 }
6586                 else
6587                 {
6588                     r_bonded       = r_mb;
6589                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6590                 }
6591                 /* We determine cutoff_mbody later */
6592             }
6593             else
6594             {
6595                 /* No special bonded communication,
6596                  * simply increase the DD cut-off.
6597                  */
6598                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6599                 comm->cutoff_mbody = r_bonded_limit;
6600                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6601             }
6602         }
6603         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6604         if (fplog)
6605         {
6606             fprintf(fplog,
6607                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6608                     comm->cellsize_limit);
6609         }
6610     }
6611
6612     if (dd->bInterCGcons && rconstr <= 0)
6613     {
6614         /* There is a cell size limit due to the constraints (P-LINCS) */
6615         rconstr = constr_r_max(fplog,mtop,ir);
6616         if (fplog)
6617         {
6618             fprintf(fplog,
6619                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6620                     rconstr);
6621             if (rconstr > comm->cellsize_limit)
6622             {
6623                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6624             }
6625         }
6626     }
6627     else if (rconstr > 0 && fplog)
6628     {
6629         /* Here we do not check for dd->bInterCGcons,
6630          * because one can also set a cell size limit for virtual sites only
6631          * and at this point we don't know yet if there are intercg v-sites.
6632          */
6633         fprintf(fplog,
6634                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6635                 rconstr);
6636     }
6637     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6638
6639     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6640
6641     if (nc[XX] > 0)
6642     {
6643         copy_ivec(nc,dd->nc);
6644         set_dd_dim(fplog,dd);
6645         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6646
6647         if (cr->npmenodes == -1)
6648         {
6649             cr->npmenodes = 0;
6650         }
6651         acs = average_cellsize_min(dd,ddbox);
6652         if (acs < comm->cellsize_limit)
6653         {
6654             if (fplog)
6655             {
6656                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6657             }
6658             gmx_fatal_collective(FARGS,cr,NULL,
6659                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6660                                  acs,comm->cellsize_limit);
6661         }
6662     }
6663     else
6664     {
6665         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6666
6667         /* We need to choose the optimal DD grid and possibly PME nodes */
6668         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6669                                comm->eDLB!=edlbNO,dlb_scale,
6670                                comm->cellsize_limit,comm->cutoff,
6671                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6672         
6673         if (dd->nc[XX] == 0)
6674         {
6675             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6676             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6677                     !bC ? "-rdd" : "-rcon",
6678                     comm->eDLB!=edlbNO ? " or -dds" : "",
6679                     bC ? " or your LINCS settings" : "");
6680
6681             gmx_fatal_collective(FARGS,cr,NULL,
6682                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6683                                  "%s\n"
6684                                  "Look in the log file for details on the domain decomposition",
6685                                  cr->nnodes-cr->npmenodes,limit,buf);
6686         }
6687         set_dd_dim(fplog,dd);
6688     }
6689
6690     if (fplog)
6691     {
6692         fprintf(fplog,
6693                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6694                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6695     }
6696     
6697     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6698     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6699     {
6700         gmx_fatal_collective(FARGS,cr,NULL,
6701                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6702                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6703     }
6704     if (cr->npmenodes > dd->nnodes)
6705     {
6706         gmx_fatal_collective(FARGS,cr,NULL,
6707                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6708     }
6709     if (cr->npmenodes > 0)
6710     {
6711         comm->npmenodes = cr->npmenodes;
6712     }
6713     else
6714     {
6715         comm->npmenodes = dd->nnodes;
6716     }
6717
6718     if (EEL_PME(ir->coulombtype))
6719     {
6720         /* The following choices should match those
6721          * in comm_cost_est in domdec_setup.c.
6722          * Note that here the checks have to take into account
6723          * that the decomposition might occur in a different order than xyz
6724          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6725          * in which case they will not match those in comm_cost_est,
6726          * but since that is mainly for testing purposes that's fine.
6727          */
6728         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6729             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6730             getenv("GMX_PMEONEDD") == NULL)
6731         {
6732             comm->npmedecompdim = 2;
6733             comm->npmenodes_x   = dd->nc[XX];
6734             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6735         }
6736         else
6737         {
6738             /* In case nc is 1 in both x and y we could still choose to
6739              * decompose pme in y instead of x, but we use x for simplicity.
6740              */
6741             comm->npmedecompdim = 1;
6742             if (dd->dim[0] == YY)
6743             {
6744                 comm->npmenodes_x = 1;
6745                 comm->npmenodes_y = comm->npmenodes;
6746             }
6747             else
6748             {
6749                 comm->npmenodes_x = comm->npmenodes;
6750                 comm->npmenodes_y = 1;
6751             }
6752         }    
6753         if (fplog)
6754         {
6755             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6756                     comm->npmenodes_x,comm->npmenodes_y,1);
6757         }
6758     }
6759     else
6760     {
6761         comm->npmedecompdim = 0;
6762         comm->npmenodes_x   = 0;
6763         comm->npmenodes_y   = 0;
6764     }
6765     
6766     /* Technically we don't need both of these,
6767      * but it simplifies code not having to recalculate it.
6768      */
6769     *npme_x = comm->npmenodes_x;
6770     *npme_y = comm->npmenodes_y;
6771         
6772     snew(comm->slb_frac,DIM);
6773     if (comm->eDLB == edlbNO)
6774     {
6775         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6776         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6777         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6778     }
6779
6780     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6781     {
6782         if (comm->bBondComm || comm->eDLB != edlbNO)
6783         {
6784             /* Set the bonded communication distance to halfway
6785              * the minimum and the maximum,
6786              * since the extra communication cost is nearly zero.
6787              */
6788             acs = average_cellsize_min(dd,ddbox);
6789             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6790             if (comm->eDLB != edlbNO)
6791             {
6792                 /* Check if this does not limit the scaling */
6793                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6794             }
6795             if (!comm->bBondComm)
6796             {
6797                 /* Without bBondComm do not go beyond the n.b. cut-off */
6798                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6799                 if (comm->cellsize_limit >= comm->cutoff)
6800                 {
6801                     /* We don't lose a lot of efficiency
6802                      * when increasing it to the n.b. cut-off.
6803                      * It can even be slightly faster, because we need
6804                      * less checks for the communication setup.
6805                      */
6806                     comm->cutoff_mbody = comm->cutoff;
6807                 }
6808             }
6809             /* Check if we did not end up below our original limit */
6810             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6811
6812             if (comm->cutoff_mbody > comm->cellsize_limit)
6813             {
6814                 comm->cellsize_limit = comm->cutoff_mbody;
6815             }
6816         }
6817         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6818     }
6819
6820     if (debug)
6821     {
6822         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6823                 "cellsize limit %f\n",
6824                 comm->bBondComm,comm->cellsize_limit);
6825     }
6826     
6827     if (MASTER(cr))
6828     {
6829         check_dd_restrictions(cr,dd,ir,fplog);
6830     }
6831
6832     comm->partition_step = INT_MIN;
6833     dd->ddp_count = 0;
6834
6835     clear_dd_cycle_counts(dd);
6836
6837     return dd;
6838 }
6839
6840 static void set_dlb_limits(gmx_domdec_t *dd)
6841
6842 {
6843     int d;
6844
6845     for(d=0; d<dd->ndim; d++)
6846     {
6847         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6848         dd->comm->cellsize_min[dd->dim[d]] =
6849             dd->comm->cellsize_min_dlb[dd->dim[d]];
6850     }
6851 }
6852
6853
6854 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6855 {
6856     gmx_domdec_t *dd;
6857     gmx_domdec_comm_t *comm;
6858     real cellsize_min;
6859     int  d,nc,i;
6860     char buf[STRLEN];
6861     
6862     dd = cr->dd;
6863     comm = dd->comm;
6864     
6865     if (fplog)
6866     {
6867         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6868     }
6869
6870     cellsize_min = comm->cellsize_min[dd->dim[0]];
6871     for(d=1; d<dd->ndim; d++)
6872     {
6873         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6874     }
6875
6876     if (cellsize_min < comm->cellsize_limit*1.05)
6877     {
6878         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6879
6880         /* Change DLB from "auto" to "no". */
6881         comm->eDLB = edlbNO;
6882
6883         return;
6884     }
6885
6886     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6887     comm->bDynLoadBal = TRUE;
6888     dd->bGridJump = TRUE;
6889     
6890     set_dlb_limits(dd);
6891
6892     /* We can set the required cell size info here,
6893      * so we do not need to communicate this.
6894      * The grid is completely uniform.
6895      */
6896     for(d=0; d<dd->ndim; d++)
6897     {
6898         if (comm->root[d])
6899         {
6900             comm->load[d].sum_m = comm->load[d].sum;
6901
6902             nc = dd->nc[dd->dim[d]];
6903             for(i=0; i<nc; i++)
6904             {
6905                 comm->root[d]->cell_f[i]    = i/(real)nc;
6906                 if (d > 0)
6907                 {
6908                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6909                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6910                 }
6911             }
6912             comm->root[d]->cell_f[nc] = 1.0;
6913         }
6914     }
6915 }
6916
6917 static char *init_bLocalCG(gmx_mtop_t *mtop)
6918 {
6919     int  ncg,cg;
6920     char *bLocalCG;
6921     
6922     ncg = ncg_mtop(mtop);
6923     snew(bLocalCG,ncg);
6924     for(cg=0; cg<ncg; cg++)
6925     {
6926         bLocalCG[cg] = FALSE;
6927     }
6928
6929     return bLocalCG;
6930 }
6931
6932 void dd_init_bondeds(FILE *fplog,
6933                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6934                      gmx_vsite_t *vsite,gmx_constr_t constr,
6935                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6936 {
6937     gmx_domdec_comm_t *comm;
6938     gmx_bool bBondComm;
6939     int  d;
6940
6941     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6942
6943     comm = dd->comm;
6944
6945     if (comm->bBondComm)
6946     {
6947         /* Communicate atoms beyond the cut-off for bonded interactions */
6948         comm = dd->comm;
6949
6950         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6951
6952         comm->bLocalCG = init_bLocalCG(mtop);
6953     }
6954     else
6955     {
6956         /* Only communicate atoms based on cut-off */
6957         comm->cglink   = NULL;
6958         comm->bLocalCG = NULL;
6959     }
6960 }
6961
6962 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6963                               t_inputrec *ir,
6964                               gmx_bool bDynLoadBal,real dlb_scale,
6965                               gmx_ddbox_t *ddbox)
6966 {
6967     gmx_domdec_comm_t *comm;
6968     int  d;
6969     ivec np;
6970     real limit,shrink;
6971     char buf[64];
6972
6973     if (fplog == NULL)
6974     {
6975         return;
6976     }
6977
6978     comm = dd->comm;
6979
6980     if (bDynLoadBal)
6981     {
6982         fprintf(fplog,"The maximum number of communication pulses is:");
6983         for(d=0; d<dd->ndim; d++)
6984         {
6985             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6986         }
6987         fprintf(fplog,"\n");
6988         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6989         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6990         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6991         for(d=0; d<DIM; d++)
6992         {
6993             if (dd->nc[d] > 1)
6994             {
6995                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6996                 {
6997                     shrink = 0;
6998                 }
6999                 else
7000                 {
7001                     shrink =
7002                         comm->cellsize_min_dlb[d]/
7003                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7004                 }
7005                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
7006             }
7007         }
7008         fprintf(fplog,"\n");
7009     }
7010     else
7011     {
7012         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
7013         fprintf(fplog,"The initial number of communication pulses is:");
7014         for(d=0; d<dd->ndim; d++)
7015         {
7016             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
7017         }
7018         fprintf(fplog,"\n");
7019         fprintf(fplog,"The initial domain decomposition cell size is:");
7020         for(d=0; d<DIM; d++) {
7021             if (dd->nc[d] > 1)
7022             {
7023                 fprintf(fplog," %c %.2f nm",
7024                         dim2char(d),dd->comm->cellsize_min[d]);
7025             }
7026         }
7027         fprintf(fplog,"\n\n");
7028     }
7029     
7030     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7031     {
7032         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
7033         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7034                 "non-bonded interactions","",comm->cutoff);
7035
7036         if (bDynLoadBal)
7037         {
7038             limit = dd->comm->cellsize_limit;
7039         }
7040         else
7041         {
7042             if (dynamic_dd_box(ddbox,ir))
7043             {
7044                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
7045             }
7046             limit = dd->comm->cellsize_min[XX];
7047             for(d=1; d<DIM; d++)
7048             {
7049                 limit = min(limit,dd->comm->cellsize_min[d]);
7050             }
7051         }
7052
7053         if (comm->bInterCGBondeds)
7054         {
7055             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7056                     "two-body bonded interactions","(-rdd)",
7057                     max(comm->cutoff,comm->cutoff_mbody));
7058             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7059                     "multi-body bonded interactions","(-rdd)",
7060                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
7061         }
7062         if (dd->vsite_comm)
7063         {
7064             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7065                     "virtual site constructions","(-rcon)",limit);
7066         }
7067         if (dd->constraint_comm)
7068         {
7069             sprintf(buf,"atoms separated by up to %d constraints",
7070                     1+ir->nProjOrder);
7071             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7072                     buf,"(-rcon)",limit);
7073         }
7074         fprintf(fplog,"\n");
7075     }
7076     
7077     fflush(fplog);
7078 }
7079
7080 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7081                                 real dlb_scale,
7082                                 const t_inputrec *ir,
7083                                 const gmx_ddbox_t *ddbox)
7084 {
7085     gmx_domdec_comm_t *comm;
7086     int  d,dim,npulse,npulse_d_max,npulse_d;
7087     gmx_bool bNoCutOff;
7088
7089     comm = dd->comm;
7090
7091     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7092
7093     /* Determine the maximum number of comm. pulses in one dimension */
7094         
7095     comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7096         
7097     /* Determine the maximum required number of grid pulses */
7098     if (comm->cellsize_limit >= comm->cutoff)
7099     {
7100         /* Only a single pulse is required */
7101         npulse = 1;
7102     }
7103     else if (!bNoCutOff && comm->cellsize_limit > 0)
7104     {
7105         /* We round down slightly here to avoid overhead due to the latency
7106          * of extra communication calls when the cut-off
7107          * would be only slightly longer than the cell size.
7108          * Later cellsize_limit is redetermined,
7109          * so we can not miss interactions due to this rounding.
7110          */
7111         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7112     }
7113     else
7114     {
7115         /* There is no cell size limit */
7116         npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
7117     }
7118
7119     if (!bNoCutOff && npulse > 1)
7120     {
7121         /* See if we can do with less pulses, based on dlb_scale */
7122         npulse_d_max = 0;
7123         for(d=0; d<dd->ndim; d++)
7124         {
7125             dim = dd->dim[d];
7126             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7127                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7128             npulse_d_max = max(npulse_d_max,npulse_d);
7129         }
7130         npulse = min(npulse,npulse_d_max);
7131     }
7132
7133     /* This env var can override npulse */
7134     d = dd_nst_env(debug,"GMX_DD_NPULSE",0);
7135     if (d > 0)
7136     {
7137         npulse = d;
7138     }
7139
7140     comm->maxpulse = 1;
7141     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7142     for(d=0; d<dd->ndim; d++)
7143     {
7144         comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
7145         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7146         snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
7147         comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
7148         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7149         {
7150             comm->bVacDLBNoLimit = FALSE;
7151         }
7152     }
7153
7154     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7155     if (!comm->bVacDLBNoLimit)
7156     {
7157         comm->cellsize_limit = max(comm->cellsize_limit,
7158                                    comm->cutoff/comm->maxpulse);
7159     }
7160     comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7161     /* Set the minimum cell size for each DD dimension */
7162     for(d=0; d<dd->ndim; d++)
7163     {
7164         if (comm->bVacDLBNoLimit ||
7165             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7166         {
7167             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7168         }
7169         else
7170         {
7171             comm->cellsize_min_dlb[dd->dim[d]] =
7172                 comm->cutoff/comm->cd[d].np_dlb;
7173         }
7174     }
7175     if (comm->cutoff_mbody <= 0)
7176     {
7177         comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
7178     }
7179     if (comm->bDynLoadBal)
7180     {
7181         set_dlb_limits(dd);
7182     }
7183 }
7184
7185 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC)
7186 {
7187     /* If each molecule is a single charge group
7188      * or we use domain decomposition for each periodic dimension,
7189      * we do not need to take pbc into account for the bonded interactions.
7190      */
7191     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7192             !(dd->nc[XX]>1 &&
7193               dd->nc[YY]>1 &&
7194               (dd->nc[ZZ]>1 || ePBC==epbcXY)));
7195 }
7196
7197 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
7198                        t_inputrec *ir,t_forcerec *fr,
7199                        gmx_ddbox_t *ddbox)
7200 {
7201     gmx_domdec_comm_t *comm;
7202     int  natoms_tot;
7203     real vol_frac;
7204
7205     comm = dd->comm;
7206
7207     /* Initialize the thread data.
7208      * This can not be done in init_domain_decomposition,
7209      * as the numbers of threads is determined later.
7210      */
7211     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7212     if (comm->nth > 1)
7213     {
7214         snew(comm->dth,comm->nth);
7215     }
7216
7217     if (EEL_PME(ir->coulombtype))
7218     {
7219         init_ddpme(dd,&comm->ddpme[0],0);
7220         if (comm->npmedecompdim >= 2)
7221         {
7222             init_ddpme(dd,&comm->ddpme[1],1);
7223         }
7224     }
7225     else
7226     {
7227         comm->npmenodes = 0;
7228         if (dd->pme_nodeid >= 0)
7229         {
7230             gmx_fatal_collective(FARGS,NULL,dd,
7231                                  "Can not have separate PME nodes without PME electrostatics");
7232         }
7233     }
7234         
7235     if (debug)
7236     {
7237         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
7238     }
7239     if (comm->eDLB != edlbNO)
7240     {
7241         set_cell_limits_dlb(dd,dlb_scale,ir,ddbox);
7242     }
7243     
7244     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
7245     if (comm->eDLB == edlbAUTO)
7246     {
7247         if (fplog)
7248         {
7249             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
7250         }
7251         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
7252     }
7253
7254     if (ir->ePBC == epbcNONE)
7255     {
7256         vol_frac = 1 - 1/(double)dd->nnodes;
7257     }
7258     else
7259     {
7260         vol_frac =
7261             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7262     }
7263     if (debug)
7264     {
7265         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7266     }
7267     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7268    
7269     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7270 }
7271
7272 gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
7273                           real cutoff_req)
7274 {
7275     gmx_domdec_t *dd;
7276     gmx_ddbox_t ddbox;
7277     int d,dim,np;
7278     real inv_cell_size;
7279     int LocallyLimited;
7280
7281     dd = cr->dd;
7282
7283     set_ddbox(dd,FALSE,cr,ir,state->box,
7284               TRUE,&dd->comm->cgs_gl,state->x,&ddbox);
7285
7286     LocallyLimited = 0;
7287
7288     for(d=0; d<dd->ndim; d++)
7289     {
7290         dim = dd->dim[d];
7291
7292         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7293         if (dynamic_dd_box(&ddbox,ir))
7294         {
7295             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7296         }
7297
7298         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7299
7300         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7301             dd->comm->cd[d].np_dlb > 0)
7302         {
7303             if (np > dd->comm->cd[d].np_dlb)
7304             {
7305                 return FALSE;
7306             }
7307
7308             /* If a current local cell size is smaller than the requested
7309              * cut-off, we could still fix it, but this gets very complicated.
7310              * Without fixing here, we might actually need more checks.
7311              */
7312             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7313             {
7314                 LocallyLimited = 1;
7315             }
7316         }
7317     }
7318
7319     if (dd->comm->eDLB != edlbNO)
7320     {
7321         /* If DLB is not active yet, we don't need to check the grid jumps.
7322          * Actually we shouldn't, because then the grid jump data is not set.
7323          */
7324         if (dd->comm->bDynLoadBal &&
7325             check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE))
7326         {
7327             LocallyLimited = 1; 
7328         }
7329
7330         gmx_sumi(1,&LocallyLimited,cr);
7331
7332         if (LocallyLimited > 0)
7333         {
7334             return FALSE;
7335         }
7336     }
7337
7338     dd->comm->cutoff = cutoff_req;
7339
7340     return TRUE;
7341 }
7342
7343 static void merge_cg_buffers(int ncell,
7344                              gmx_domdec_comm_dim_t *cd, int pulse,
7345                              int  *ncg_cell,
7346                              int  *index_gl, int  *recv_i,
7347                              rvec *cg_cm,    rvec *recv_vr,
7348                              int *cgindex,
7349                              cginfo_mb_t *cginfo_mb,int *cginfo)
7350 {
7351     gmx_domdec_ind_t *ind,*ind_p;
7352     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7353     int shift,shift_at;
7354     
7355     ind = &cd->ind[pulse];
7356     
7357     /* First correct the already stored data */
7358     shift = ind->nrecv[ncell];
7359     for(cell=ncell-1; cell>=0; cell--)
7360     {
7361         shift -= ind->nrecv[cell];
7362         if (shift > 0)
7363         {
7364             /* Move the cg's present from previous grid pulses */
7365             cg0 = ncg_cell[ncell+cell];
7366             cg1 = ncg_cell[ncell+cell+1];
7367             cgindex[cg1+shift] = cgindex[cg1];
7368             for(cg=cg1-1; cg>=cg0; cg--)
7369             {
7370                 index_gl[cg+shift] = index_gl[cg];
7371                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7372                 cgindex[cg+shift] = cgindex[cg];
7373                 cginfo[cg+shift] = cginfo[cg];
7374             }
7375             /* Correct the already stored send indices for the shift */
7376             for(p=1; p<=pulse; p++)
7377             {
7378                 ind_p = &cd->ind[p];
7379                 cg0 = 0;
7380                 for(c=0; c<cell; c++)
7381                 {
7382                     cg0 += ind_p->nsend[c];
7383                 }
7384                 cg1 = cg0 + ind_p->nsend[cell];
7385                 for(cg=cg0; cg<cg1; cg++)
7386                 {
7387                     ind_p->index[cg] += shift;
7388                 }
7389             }
7390         }
7391     }
7392
7393     /* Merge in the communicated buffers */
7394     shift = 0;
7395     shift_at = 0;
7396     cg0 = 0;
7397     for(cell=0; cell<ncell; cell++)
7398     {
7399         cg1 = ncg_cell[ncell+cell+1] + shift;
7400         if (shift_at > 0)
7401         {
7402             /* Correct the old cg indices */
7403             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7404             {
7405                 cgindex[cg+1] += shift_at;
7406             }
7407         }
7408         for(cg=0; cg<ind->nrecv[cell]; cg++)
7409         {
7410             /* Copy this charge group from the buffer */
7411             index_gl[cg1] = recv_i[cg0];
7412             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7413             /* Add it to the cgindex */
7414             cg_gl = index_gl[cg1];
7415             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7416             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7417             cgindex[cg1+1] = cgindex[cg1] + nat;
7418             cg0++;
7419             cg1++;
7420             shift_at += nat;
7421         }
7422         shift += ind->nrecv[cell];
7423         ncg_cell[ncell+cell+1] = cg1;
7424     }
7425 }
7426
7427 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7428                                int nzone,int cg0,const int *cgindex)
7429 {
7430     int cg,zone,p;
7431     
7432     /* Store the atom block boundaries for easy copying of communication buffers
7433      */
7434     cg = cg0;
7435     for(zone=0; zone<nzone; zone++)
7436     {
7437         for(p=0; p<cd->np; p++) {
7438             cd->ind[p].cell2at0[zone] = cgindex[cg];
7439             cg += cd->ind[p].nrecv[zone];
7440             cd->ind[p].cell2at1[zone] = cgindex[cg];
7441         }
7442     }
7443 }
7444
7445 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7446 {
7447     int  i;
7448     gmx_bool bMiss;
7449
7450     bMiss = FALSE;
7451     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7452     {
7453         if (!bLocalCG[link->a[i]])
7454         {
7455             bMiss = TRUE;
7456         }
7457     }
7458
7459     return bMiss;
7460 }
7461
7462 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7463 typedef struct {
7464     real c[DIM][4]; /* the corners for the non-bonded communication */
7465     real cr0;       /* corner for rounding */
7466     real cr1[4];    /* corners for rounding */
7467     real bc[DIM];   /* corners for bounded communication */
7468     real bcr1;      /* corner for rounding for bonded communication */
7469 } dd_corners_t;
7470
7471 /* Determine the corners of the domain(s) we are communicating with */
7472 static void
7473 set_dd_corners(const gmx_domdec_t *dd,
7474                int dim0, int dim1, int dim2,
7475                gmx_bool bDistMB,
7476                dd_corners_t *c)
7477 {
7478     const gmx_domdec_comm_t *comm;
7479     const gmx_domdec_zones_t *zones;
7480     int i,j;
7481
7482     comm = dd->comm;
7483
7484     zones = &comm->zones;
7485
7486     /* Keep the compiler happy */
7487     c->cr0  = 0;
7488     c->bcr1 = 0;
7489
7490     /* The first dimension is equal for all cells */
7491     c->c[0][0] = comm->cell_x0[dim0];
7492     if (bDistMB)
7493     {
7494         c->bc[0] = c->c[0][0];
7495     }
7496     if (dd->ndim >= 2)
7497     {
7498         dim1 = dd->dim[1];
7499         /* This cell row is only seen from the first row */
7500         c->c[1][0] = comm->cell_x0[dim1];
7501         /* All rows can see this row */
7502         c->c[1][1] = comm->cell_x0[dim1];
7503         if (dd->bGridJump)
7504         {
7505             c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7506             if (bDistMB)
7507             {
7508                 /* For the multi-body distance we need the maximum */
7509                 c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7510             }
7511         }
7512         /* Set the upper-right corner for rounding */
7513         c->cr0 = comm->cell_x1[dim0];
7514         
7515         if (dd->ndim >= 3)
7516         {
7517             dim2 = dd->dim[2];
7518             for(j=0; j<4; j++)
7519             {
7520                 c->c[2][j] = comm->cell_x0[dim2];
7521             }
7522             if (dd->bGridJump)
7523             {
7524                 /* Use the maximum of the i-cells that see a j-cell */
7525                 for(i=0; i<zones->nizone; i++)
7526                 {
7527                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7528                     {
7529                         if (j >= 4)
7530                         {
7531                             c->c[2][j-4] =
7532                                 max(c->c[2][j-4],
7533                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7534                         }
7535                     }
7536                 }
7537                 if (bDistMB)
7538                 {
7539                     /* For the multi-body distance we need the maximum */
7540                     c->bc[2] = comm->cell_x0[dim2];
7541                     for(i=0; i<2; i++)
7542                     {
7543                         for(j=0; j<2; j++)
7544                         {
7545                             c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0);
7546                         }
7547                     }
7548                 }
7549             }
7550             
7551             /* Set the upper-right corner for rounding */
7552             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7553              * Only cell (0,0,0) can see cell 7 (1,1,1)
7554              */
7555             c->cr1[0] = comm->cell_x1[dim1];
7556             c->cr1[3] = comm->cell_x1[dim1];
7557             if (dd->bGridJump)
7558             {
7559                 c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1);
7560                 if (bDistMB)
7561                 {
7562                     /* For the multi-body distance we need the maximum */
7563                     c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1);
7564                 }
7565             }
7566         }
7567     }
7568 }
7569
7570 /* Determine which cg's we need to send in this pulse from this zone */
7571 static void
7572 get_zone_pulse_cgs(gmx_domdec_t *dd,
7573                    int zonei, int zone,
7574                    int cg0, int cg1,
7575                    const int *index_gl,
7576                    const int *cgindex,
7577                    int dim, int dim_ind,
7578                    int dim0, int dim1, int dim2,
7579                    real r_comm2, real r_bcomm2,
7580                    matrix box,
7581                    ivec tric_dist,
7582                    rvec *normal,
7583                    real skew_fac2_d, real skew_fac_01,
7584                    rvec *v_d, rvec *v_0, rvec *v_1,
7585                    const dd_corners_t *c,
7586                    rvec sf2_round,
7587                    gmx_bool bDistBonded,
7588                    gmx_bool bBondComm,
7589                    gmx_bool bDist2B,
7590                    gmx_bool bDistMB,
7591                    rvec *cg_cm,
7592                    int *cginfo,
7593                    gmx_domdec_ind_t *ind,
7594                    int **ibuf, int *ibuf_nalloc,
7595                    vec_rvec_t *vbuf,
7596                    int *nsend_ptr,
7597                    int *nat_ptr,
7598                    int *nsend_z_ptr)
7599 {
7600     gmx_domdec_comm_t *comm;
7601     gmx_bool bScrew;
7602     gmx_bool bDistMB_pulse;
7603     int  cg,i;
7604     real r2,rb2,r,tric_sh;
7605     rvec rn,rb;
7606     int  dimd;
7607     int  nsend_z,nsend,nat;
7608
7609     comm = dd->comm;
7610
7611     bScrew = (dd->bScrewPBC && dim == XX);
7612
7613     bDistMB_pulse = (bDistMB && bDistBonded);
7614
7615     nsend_z = 0;
7616     nsend   = *nsend_ptr;
7617     nat     = *nat_ptr;
7618
7619     for(cg=cg0; cg<cg1; cg++)
7620     {
7621         r2  = 0;
7622         rb2 = 0;
7623         if (tric_dist[dim_ind] == 0)
7624         {
7625             /* Rectangular direction, easy */
7626             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7627             if (r > 0)
7628             {
7629                 r2 += r*r;
7630             }
7631             if (bDistMB_pulse)
7632             {
7633                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7634                 if (r > 0)
7635                 {
7636                     rb2 += r*r;
7637                 }
7638             }
7639             /* Rounding gives at most a 16% reduction
7640              * in communicated atoms
7641              */
7642             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7643             {
7644                 r = cg_cm[cg][dim0] - c->cr0;
7645                 /* This is the first dimension, so always r >= 0 */
7646                 r2 += r*r;
7647                 if (bDistMB_pulse)
7648                 {
7649                     rb2 += r*r;
7650                 }
7651             }
7652             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7653             {
7654                 r = cg_cm[cg][dim1] - c->cr1[zone];
7655                 if (r > 0)
7656                 {
7657                     r2 += r*r;
7658                 }
7659                 if (bDistMB_pulse)
7660                 {
7661                     r = cg_cm[cg][dim1] - c->bcr1;
7662                     if (r > 0)
7663                     {
7664                         rb2 += r*r;
7665                     }
7666                 }
7667             }
7668         }
7669         else
7670         {
7671             /* Triclinic direction, more complicated */
7672             clear_rvec(rn);
7673             clear_rvec(rb);
7674             /* Rounding, conservative as the skew_fac multiplication
7675              * will slightly underestimate the distance.
7676              */
7677             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7678             {
7679                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7680                 for(i=dim0+1; i<DIM; i++)
7681                 {
7682                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7683                 }
7684                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7685                 if (bDistMB_pulse)
7686                 {
7687                     rb[dim0] = rn[dim0];
7688                     rb2 = r2;
7689                 }
7690                 /* Take care that the cell planes along dim0 might not
7691                  * be orthogonal to those along dim1 and dim2.
7692                  */
7693                 for(i=1; i<=dim_ind; i++)
7694                 {
7695                     dimd = dd->dim[i];
7696                     if (normal[dim0][dimd] > 0)
7697                     {
7698                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7699                         if (bDistMB_pulse)
7700                         {
7701                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7702                         }
7703                     }
7704                 }
7705             }
7706             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7707             {
7708                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7709                 tric_sh = 0;
7710                 for(i=dim1+1; i<DIM; i++)
7711                 {
7712                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7713                 }
7714                 rn[dim1] += tric_sh;
7715                 if (rn[dim1] > 0)
7716                 {
7717                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7718                     /* Take care of coupling of the distances
7719                      * to the planes along dim0 and dim1 through dim2.
7720                      */
7721                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7722                     /* Take care that the cell planes along dim1
7723                      * might not be orthogonal to that along dim2.
7724                      */
7725                     if (normal[dim1][dim2] > 0)
7726                     {
7727                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7728                     }
7729                 }
7730                 if (bDistMB_pulse)
7731                 {
7732                     rb[dim1] +=
7733                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7734                     if (rb[dim1] > 0)
7735                     {
7736                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7737                         /* Take care of coupling of the distances
7738                          * to the planes along dim0 and dim1 through dim2.
7739                          */
7740                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7741                         /* Take care that the cell planes along dim1
7742                          * might not be orthogonal to that along dim2.
7743                          */
7744                         if (normal[dim1][dim2] > 0)
7745                         {
7746                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7747                         }
7748                     }
7749                 }
7750             }
7751             /* The distance along the communication direction */
7752             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7753             tric_sh = 0;
7754             for(i=dim+1; i<DIM; i++)
7755             {
7756                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7757             }
7758             rn[dim] += tric_sh;
7759             if (rn[dim] > 0)
7760             {
7761                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7762                 /* Take care of coupling of the distances
7763                  * to the planes along dim0 and dim1 through dim2.
7764                  */
7765                 if (dim_ind == 1 && zonei == 1)
7766                 {
7767                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7768                 }
7769             }
7770             if (bDistMB_pulse)
7771             {
7772                 clear_rvec(rb);
7773                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7774                 if (rb[dim] > 0)
7775                 {
7776                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7777                     /* Take care of coupling of the distances
7778                      * to the planes along dim0 and dim1 through dim2.
7779                      */
7780                     if (dim_ind == 1 && zonei == 1)
7781                     {
7782                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7783                     }
7784                 }
7785             }
7786         }
7787         
7788         if (r2 < r_comm2 ||
7789             (bDistBonded &&
7790              ((bDistMB && rb2 < r_bcomm2) ||
7791               (bDist2B && r2  < r_bcomm2)) &&
7792              (!bBondComm ||
7793               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7794                missing_link(comm->cglink,index_gl[cg],
7795                             comm->bLocalCG)))))
7796         {
7797             /* Make an index to the local charge groups */
7798             if (nsend+1 > ind->nalloc)
7799             {
7800                 ind->nalloc = over_alloc_large(nsend+1);
7801                 srenew(ind->index,ind->nalloc);
7802             }
7803             if (nsend+1 > *ibuf_nalloc)
7804             {
7805                 *ibuf_nalloc = over_alloc_large(nsend+1);
7806                 srenew(*ibuf,*ibuf_nalloc);
7807             }
7808             ind->index[nsend] = cg;
7809             (*ibuf)[nsend] = index_gl[cg];
7810             nsend_z++;
7811             vec_rvec_check_alloc(vbuf,nsend+1);
7812             
7813             if (dd->ci[dim] == 0)
7814             {
7815                 /* Correct cg_cm for pbc */
7816                 rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]);
7817                 if (bScrew)
7818                 {
7819                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7820                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7821                 }
7822             }
7823             else
7824             {
7825                 copy_rvec(cg_cm[cg],vbuf->v[nsend]);
7826             }
7827             nsend++;
7828             nat += cgindex[cg+1] - cgindex[cg];
7829         }
7830     }
7831
7832     *nsend_ptr   = nsend;
7833     *nat_ptr     = nat;
7834     *nsend_z_ptr = nsend_z;
7835 }
7836
7837 static void setup_dd_communication(gmx_domdec_t *dd,
7838                                    matrix box,gmx_ddbox_t *ddbox,
7839                                    t_forcerec *fr,t_state *state,rvec **f)
7840 {
7841     int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot;
7842     int nzone,nzone_send,zone,zonei,cg0,cg1;
7843     int c,i,j,cg,cg_gl,nrcg;
7844     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7845     gmx_domdec_comm_t *comm;
7846     gmx_domdec_zones_t *zones;
7847     gmx_domdec_comm_dim_t *cd;
7848     gmx_domdec_ind_t *ind;
7849     cginfo_mb_t *cginfo_mb;
7850     gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded;
7851     real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg;
7852     dd_corners_t corners;
7853     ivec tric_dist;
7854     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7855     real skew_fac2_d,skew_fac_01;
7856     rvec sf2_round;
7857     int  nsend,nat;
7858     int  th;
7859     
7860     if (debug)
7861     {
7862         fprintf(debug,"Setting up DD communication\n");
7863     }
7864     
7865     comm  = dd->comm;
7866
7867     switch (fr->cutoff_scheme)
7868     {
7869     case ecutsGROUP:
7870         cg_cm = fr->cg_cm;
7871         break;
7872     case ecutsVERLET:
7873         cg_cm = state->x;
7874         break;
7875     default:
7876         gmx_incons("unimplemented");
7877         cg_cm = NULL;
7878     }
7879
7880     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7881     {
7882         dim = dd->dim[dim_ind];
7883
7884         /* Check if we need to use triclinic distances */
7885         tric_dist[dim_ind] = 0;
7886         for(i=0; i<=dim_ind; i++)
7887         {
7888             if (ddbox->tric_dir[dd->dim[i]])
7889             {
7890                 tric_dist[dim_ind] = 1;
7891             }
7892         }
7893     }
7894
7895     bBondComm = comm->bBondComm;
7896
7897     /* Do we need to determine extra distances for multi-body bondeds? */
7898     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7899     
7900     /* Do we need to determine extra distances for only two-body bondeds? */
7901     bDist2B = (bBondComm && !bDistMB);
7902
7903     r_comm2  = sqr(comm->cutoff);
7904     r_bcomm2 = sqr(comm->cutoff_mbody);
7905
7906     if (debug)
7907     {
7908         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7909     }
7910
7911     zones = &comm->zones;
7912     
7913     dim0 = dd->dim[0];
7914     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
7915     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
7916
7917     set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners);
7918     
7919     /* Triclinic stuff */
7920     normal = ddbox->normal;
7921     skew_fac_01 = 0;
7922     if (dd->ndim >= 2)
7923     {
7924         v_0 = ddbox->v[dim0];
7925         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7926         {
7927             /* Determine the coupling coefficient for the distances
7928              * to the cell planes along dim0 and dim1 through dim2.
7929              * This is required for correct rounding.
7930              */
7931             skew_fac_01 =
7932                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7933             if (debug)
7934             {
7935                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7936             }
7937         }
7938     }
7939     if (dd->ndim >= 3)
7940     {
7941         v_1 = ddbox->v[dim1];
7942     }
7943     
7944     zone_cg_range = zones->cg_range;
7945     index_gl = dd->index_gl;
7946     cgindex  = dd->cgindex;
7947     cginfo_mb = fr->cginfo_mb;
7948     
7949     zone_cg_range[0]   = 0;
7950     zone_cg_range[1]   = dd->ncg_home;
7951     comm->zone_ncg1[0] = dd->ncg_home;
7952     pos_cg             = dd->ncg_home;
7953     
7954     nat_tot = dd->nat_home;
7955     nzone = 1;
7956     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7957     {
7958         dim = dd->dim[dim_ind];
7959         cd = &comm->cd[dim_ind];
7960         
7961         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7962         {
7963             /* No pbc in this dimension, the first node should not comm. */
7964             nzone_send = 0;
7965         }
7966         else
7967         {
7968             nzone_send = nzone;
7969         }
7970
7971         v_d = ddbox->v[dim];
7972         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7973
7974         cd->bInPlace = TRUE;
7975         for(p=0; p<cd->np; p++)
7976         {
7977             /* Only atoms communicated in the first pulse are used
7978              * for multi-body bonded interactions or for bBondComm.
7979              */
7980             bDistBonded = ((bDistMB || bDist2B) && p == 0);
7981
7982             ind = &cd->ind[p];
7983             nsend = 0;
7984             nat = 0;
7985             for(zone=0; zone<nzone_send; zone++)
7986             {
7987                 if (tric_dist[dim_ind] && dim_ind > 0)
7988                 {
7989                     /* Determine slightly more optimized skew_fac's
7990                      * for rounding.
7991                      * This reduces the number of communicated atoms
7992                      * by about 10% for 3D DD of rhombic dodecahedra.
7993                      */
7994                     for(dimd=0; dimd<dim; dimd++)
7995                     {
7996                         sf2_round[dimd] = 1;
7997                         if (ddbox->tric_dir[dimd])
7998                         {
7999                             for(i=dd->dim[dimd]+1; i<DIM; i++)
8000                             {
8001                                 /* If we are shifted in dimension i
8002                                  * and the cell plane is tilted forward
8003                                  * in dimension i, skip this coupling.
8004                                  */
8005                                 if (!(zones->shift[nzone+zone][i] &&
8006                                       ddbox->v[dimd][i][dimd] >= 0))
8007                                 {
8008                                     sf2_round[dimd] +=
8009                                         sqr(ddbox->v[dimd][i][dimd]);
8010                                 }
8011                             }
8012                             sf2_round[dimd] = 1/sf2_round[dimd];
8013                         }
8014                     }
8015                 }
8016
8017                 zonei = zone_perm[dim_ind][zone];
8018                 if (p == 0)
8019                 {
8020                     /* Here we permutate the zones to obtain a convenient order
8021                      * for neighbor searching
8022                      */
8023                     cg0 = zone_cg_range[zonei];
8024                     cg1 = zone_cg_range[zonei+1];
8025                 }
8026                 else
8027                 {
8028                     /* Look only at the cg's received in the previous grid pulse
8029                      */
8030                     cg1 = zone_cg_range[nzone+zone+1];
8031                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8032                 }
8033
8034 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8035                 for(th=0; th<comm->nth; th++)
8036                 {
8037                     gmx_domdec_ind_t *ind_p;
8038                     int **ibuf_p,*ibuf_nalloc_p;
8039                     vec_rvec_t *vbuf_p;
8040                     int *nsend_p,*nat_p;
8041                     int *nsend_zone_p;
8042                     int cg0_th,cg1_th;
8043
8044                     if (th == 0)
8045                     {
8046                         /* Thread 0 writes in the comm buffers */
8047                         ind_p         = ind;
8048                         ibuf_p        = &comm->buf_int;
8049                         ibuf_nalloc_p = &comm->nalloc_int;
8050                         vbuf_p        = &comm->vbuf;
8051                         nsend_p       = &nsend;
8052                         nat_p         = &nat;
8053                         nsend_zone_p  = &ind->nsend[zone];
8054                     }
8055                     else
8056                     {
8057                         /* Other threads write into temp buffers */
8058                         ind_p         = &comm->dth[th].ind;
8059                         ibuf_p        = &comm->dth[th].ibuf;
8060                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8061                         vbuf_p        = &comm->dth[th].vbuf;
8062                         nsend_p       = &comm->dth[th].nsend;
8063                         nat_p         = &comm->dth[th].nat;
8064                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8065
8066                         comm->dth[th].nsend      = 0;
8067                         comm->dth[th].nat        = 0;
8068                         comm->dth[th].nsend_zone = 0;
8069                     }
8070
8071                     if (comm->nth == 1)
8072                     {
8073                         cg0_th = cg0;
8074                         cg1_th = cg1;
8075                     }
8076                     else
8077                     {
8078                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8079                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8080                     }
8081                     
8082                     /* Get the cg's for this pulse in this zone */
8083                     get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th,
8084                                        index_gl,cgindex,
8085                                        dim,dim_ind,dim0,dim1,dim2,
8086                                        r_comm2,r_bcomm2,
8087                                        box,tric_dist,
8088                                        normal,skew_fac2_d,skew_fac_01,
8089                                        v_d,v_0,v_1,&corners,sf2_round,
8090                                        bDistBonded,bBondComm,
8091                                        bDist2B,bDistMB,
8092                                        cg_cm,fr->cginfo,
8093                                        ind_p,
8094                                        ibuf_p,ibuf_nalloc_p,
8095                                        vbuf_p,
8096                                        nsend_p,nat_p,
8097                                        nsend_zone_p);
8098                 }
8099
8100                 /* Append data of threads>=1 to the communication buffers */
8101                 for(th=1; th<comm->nth; th++)
8102                 {
8103                     dd_comm_setup_work_t *dth;
8104                     int i,ns1;
8105
8106                     dth = &comm->dth[th];
8107
8108                     ns1 = nsend + dth->nsend_zone;
8109                     if (ns1 > ind->nalloc)
8110                     {
8111                         ind->nalloc = over_alloc_dd(ns1);
8112                         srenew(ind->index,ind->nalloc);
8113                     }
8114                     if (ns1 > comm->nalloc_int)
8115                     {
8116                         comm->nalloc_int = over_alloc_dd(ns1);
8117                         srenew(comm->buf_int,comm->nalloc_int);
8118                     }
8119                     if (ns1 > comm->vbuf.nalloc)
8120                     {
8121                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8122                         srenew(comm->vbuf.v,comm->vbuf.nalloc);
8123                     }
8124
8125                     for(i=0; i<dth->nsend_zone; i++)
8126                     {
8127                         ind->index[nsend] = dth->ind.index[i];
8128                         comm->buf_int[nsend] = dth->ibuf[i];
8129                         copy_rvec(dth->vbuf.v[i],
8130                                   comm->vbuf.v[nsend]);
8131                         nsend++;
8132                     }
8133                     nat              += dth->nat;
8134                     ind->nsend[zone] += dth->nsend_zone;
8135                 }
8136             }
8137             /* Clear the counts in case we do not have pbc */
8138             for(zone=nzone_send; zone<nzone; zone++)
8139             {
8140                 ind->nsend[zone] = 0;
8141             }
8142             ind->nsend[nzone]   = nsend;
8143             ind->nsend[nzone+1] = nat;
8144             /* Communicate the number of cg's and atoms to receive */
8145             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8146                             ind->nsend, nzone+2,
8147                             ind->nrecv, nzone+2);
8148             
8149             /* The rvec buffer is also required for atom buffers of size nsend
8150              * in dd_move_x and dd_move_f.
8151              */
8152             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
8153
8154             if (p > 0)
8155             {
8156                 /* We can receive in place if only the last zone is not empty */
8157                 for(zone=0; zone<nzone-1; zone++)
8158                 {
8159                     if (ind->nrecv[zone] > 0)
8160                     {
8161                         cd->bInPlace = FALSE;
8162                     }
8163                 }
8164                 if (!cd->bInPlace)
8165                 {
8166                     /* The int buffer is only required here for the cg indices */
8167                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8168                     {
8169                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8170                         srenew(comm->buf_int2,comm->nalloc_int2);
8171                     }
8172                     /* The rvec buffer is also required for atom buffers
8173                      * of size nrecv in dd_move_x and dd_move_f.
8174                      */
8175                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
8176                     vec_rvec_check_alloc(&comm->vbuf2,i);
8177                 }
8178             }
8179             
8180             /* Make space for the global cg indices */
8181             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8182                 || dd->cg_nalloc == 0)
8183             {
8184                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8185                 srenew(index_gl,dd->cg_nalloc);
8186                 srenew(cgindex,dd->cg_nalloc+1);
8187             }
8188             /* Communicate the global cg indices */
8189             if (cd->bInPlace)
8190             {
8191                 recv_i = index_gl + pos_cg;
8192             }
8193             else
8194             {
8195                 recv_i = comm->buf_int2;
8196             }
8197             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8198                             comm->buf_int, nsend,
8199                             recv_i,        ind->nrecv[nzone]);
8200
8201             /* Make space for cg_cm */
8202             dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]);
8203             if (fr->cutoff_scheme == ecutsGROUP)
8204             {
8205                 cg_cm = fr->cg_cm;
8206             }
8207             else
8208             {
8209                 cg_cm = state->x;
8210             }
8211             /* Communicate cg_cm */
8212             if (cd->bInPlace)
8213             {
8214                 recv_vr = cg_cm + pos_cg;
8215             }
8216             else
8217             {
8218                 recv_vr = comm->vbuf2.v;
8219             }
8220             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8221                              comm->vbuf.v, nsend,
8222                              recv_vr,      ind->nrecv[nzone]);
8223             
8224             /* Make the charge group index */
8225             if (cd->bInPlace)
8226             {
8227                 zone = (p == 0 ? 0 : nzone - 1);
8228                 while (zone < nzone)
8229                 {
8230                     for(cg=0; cg<ind->nrecv[zone]; cg++)
8231                     {
8232                         cg_gl = index_gl[pos_cg];
8233                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
8234                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8235                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8236                         if (bBondComm)
8237                         {
8238                             /* Update the charge group presence,
8239                              * so we can use it in the next pass of the loop.
8240                              */
8241                             comm->bLocalCG[cg_gl] = TRUE;
8242                         }
8243                         pos_cg++;
8244                     }
8245                     if (p == 0)
8246                     {
8247                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8248                     }
8249                     zone++;
8250                     zone_cg_range[nzone+zone] = pos_cg;
8251                 }
8252             }
8253             else
8254             {
8255                 /* This part of the code is never executed with bBondComm. */
8256                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
8257                                  index_gl,recv_i,cg_cm,recv_vr,
8258                                  cgindex,fr->cginfo_mb,fr->cginfo);
8259                 pos_cg += ind->nrecv[nzone];
8260             }
8261             nat_tot += ind->nrecv[nzone+1];
8262         }
8263         if (!cd->bInPlace)
8264         {
8265             /* Store the atom block for easy copying of communication buffers */
8266             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
8267         }
8268         nzone += nzone;
8269     }
8270     dd->index_gl = index_gl;
8271     dd->cgindex  = cgindex;
8272     
8273     dd->ncg_tot = zone_cg_range[zones->n];
8274     dd->nat_tot = nat_tot;
8275     comm->nat[ddnatHOME] = dd->nat_home;
8276     for(i=ddnatZONE; i<ddnatNR; i++)
8277     {
8278         comm->nat[i] = dd->nat_tot;
8279     }
8280
8281     if (!bBondComm)
8282     {
8283         /* We don't need to update cginfo, since that was alrady done above.
8284          * So we pass NULL for the forcerec.
8285          */
8286         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
8287                       NULL,comm->bLocalCG);
8288     }
8289
8290     if (debug)
8291     {
8292         fprintf(debug,"Finished setting up DD communication, zones:");
8293         for(c=0; c<zones->n; c++)
8294         {
8295             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
8296         }
8297         fprintf(debug,"\n");
8298     }
8299 }
8300
8301 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8302 {
8303     int c;
8304     
8305     for(c=0; c<zones->nizone; c++)
8306     {
8307         zones->izone[c].cg1  = zones->cg_range[c+1];
8308         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8309         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8310     }
8311 }
8312
8313 static void set_zones_size(gmx_domdec_t *dd,
8314                            matrix box,const gmx_ddbox_t *ddbox,
8315                            int zone_start,int zone_end)
8316 {
8317     gmx_domdec_comm_t *comm;
8318     gmx_domdec_zones_t *zones;
8319     gmx_bool bDistMB;
8320     int  z,zi,zj0,zj1,d,dim;
8321     real rcs,rcmbs;
8322     int  i,j;
8323     real size_j,add_tric;
8324     real vol;
8325
8326     comm = dd->comm;
8327
8328     zones = &comm->zones;
8329
8330     /* Do we need to determine extra distances for multi-body bondeds? */
8331     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8332
8333     for(z=zone_start; z<zone_end; z++)
8334     {
8335         /* Copy cell limits to zone limits.
8336          * Valid for non-DD dims and non-shifted dims.
8337          */
8338         copy_rvec(comm->cell_x0,zones->size[z].x0);
8339         copy_rvec(comm->cell_x1,zones->size[z].x1);
8340     }
8341
8342     for(d=0; d<dd->ndim; d++)
8343     {
8344         dim = dd->dim[d];
8345
8346         for(z=0; z<zones->n; z++)
8347         {
8348             /* With a staggered grid we have different sizes
8349              * for non-shifted dimensions.
8350              */
8351             if (dd->bGridJump && zones->shift[z][dim] == 0)
8352             {
8353                 if (d == 1)
8354                 {
8355                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8356                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8357                 }
8358                 else if (d == 2)
8359                 {
8360                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8361                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8362                 }
8363             }
8364         }
8365
8366         rcs   = comm->cutoff;
8367         rcmbs = comm->cutoff_mbody;
8368         if (ddbox->tric_dir[dim])
8369         {
8370             rcs   /= ddbox->skew_fac[dim];
8371             rcmbs /= ddbox->skew_fac[dim];
8372         }
8373
8374         /* Set the lower limit for the shifted zone dimensions */
8375         for(z=zone_start; z<zone_end; z++)
8376         {
8377             if (zones->shift[z][dim] > 0)
8378             {
8379                 dim = dd->dim[d];
8380                 if (!dd->bGridJump || d == 0)
8381                 {
8382                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8383                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8384                 }
8385                 else
8386                 {
8387                     /* Here we take the lower limit of the zone from
8388                      * the lowest domain of the zone below.
8389                      */
8390                     if (z < 4)
8391                     {
8392                         zones->size[z].x0[dim] =
8393                              comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8394                     }
8395                     else
8396                     {
8397                         if (d == 1)
8398                         {
8399                             zones->size[z].x0[dim] =
8400                                 zones->size[zone_perm[2][z-4]].x0[dim];
8401                         }
8402                         else
8403                         {
8404                             zones->size[z].x0[dim] =
8405                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8406                         }
8407                     }
8408                     /* A temporary limit, is updated below */
8409                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8410
8411                     if (bDistMB)
8412                     {
8413                         for(zi=0; zi<zones->nizone; zi++)
8414                         {
8415                             if (zones->shift[zi][dim] == 0)
8416                             {
8417                                 /* This takes the whole zone into account.
8418                                  * With multiple pulses this will lead
8419                                  * to a larger zone then strictly necessary.
8420                                  */
8421                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8422                                                              zones->size[zi].x1[dim]+rcmbs);
8423                             }
8424                         }
8425                     }
8426                 }
8427             }
8428         }
8429
8430         /* Loop over the i-zones to set the upper limit of each
8431          * j-zone they see.
8432          */
8433         for(zi=0; zi<zones->nizone; zi++)
8434         {
8435             if (zones->shift[zi][dim] == 0)
8436             {
8437                 for(z=zones->izone[zi].j0; z<zones->izone[zi].j1; z++)
8438                 {
8439                     if (zones->shift[z][dim] > 0)
8440                     {
8441                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8442                                                      zones->size[zi].x1[dim]+rcs);
8443                     }
8444                 }
8445             }
8446         }
8447     }
8448
8449     for(z=zone_start; z<zone_end; z++)
8450     {
8451         /* Initialization only required to keep the compiler happy */
8452         rvec corner_min={0,0,0},corner_max={0,0,0},corner;
8453         int  nc,c;
8454
8455         /* To determine the bounding box for a zone we need to find
8456          * the extreme corners of 4, 2 or 1 corners.
8457          */
8458         nc = 1 << (ddbox->npbcdim - 1);
8459
8460         for(c=0; c<nc; c++)
8461         {
8462             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8463             corner[XX] = 0;
8464             if ((c & 1) == 0)
8465             {
8466                 corner[YY] = zones->size[z].x0[YY];
8467             }
8468             else
8469             {
8470                 corner[YY] = zones->size[z].x1[YY];
8471             }
8472             if ((c & 2) == 0)
8473             {
8474                 corner[ZZ] = zones->size[z].x0[ZZ];
8475             }
8476             else
8477             {
8478                 corner[ZZ] = zones->size[z].x1[ZZ];
8479             }
8480             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8481             {
8482                 /* With 1D domain decomposition the cg's are not in
8483                  * the triclinic box, but triclinic x-y and rectangular y-z.
8484                  * Shift y back, so it will later end up at 0.
8485                  */
8486                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8487             }
8488             /* Apply the triclinic couplings */
8489             for(i=YY; i<ddbox->npbcdim; i++)
8490             {
8491                 for(j=XX; j<i; j++)
8492                 {
8493                     corner[j] += corner[i]*box[i][j]/box[i][i];
8494                 }
8495             }
8496             if (c == 0)
8497             {
8498                 copy_rvec(corner,corner_min);
8499                 copy_rvec(corner,corner_max);
8500             }
8501             else
8502             {
8503                 for(i=0; i<DIM; i++)
8504                 {
8505                     corner_min[i] = min(corner_min[i],corner[i]);
8506                     corner_max[i] = max(corner_max[i],corner[i]);
8507                 }
8508             }
8509         }
8510         /* Copy the extreme cornes without offset along x */
8511         for(i=0; i<DIM; i++)
8512         {
8513             zones->size[z].bb_x0[i] = corner_min[i];
8514             zones->size[z].bb_x1[i] = corner_max[i];
8515         }
8516         /* Add the offset along x */
8517         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8518         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8519     }
8520
8521     if (zone_start == 0)
8522     {
8523         vol = 1;
8524         for(dim=0; dim<DIM; dim++)
8525         {
8526             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8527         }
8528         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8529     }
8530
8531     if (debug)
8532     {
8533         for(z=zone_start; z<zone_end; z++)
8534         {
8535             fprintf(debug,"zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8536                     z,
8537                     zones->size[z].x0[XX],zones->size[z].x1[XX],
8538                     zones->size[z].x0[YY],zones->size[z].x1[YY],
8539                     zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]);
8540             fprintf(debug,"zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8541                     z,
8542                     zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX],
8543                     zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY],
8544                     zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]);
8545         }
8546     }
8547 }
8548
8549 static int comp_cgsort(const void *a,const void *b)
8550 {
8551     int comp;
8552     
8553     gmx_cgsort_t *cga,*cgb;
8554     cga = (gmx_cgsort_t *)a;
8555     cgb = (gmx_cgsort_t *)b;
8556     
8557     comp = cga->nsc - cgb->nsc;
8558     if (comp == 0)
8559     {
8560         comp = cga->ind_gl - cgb->ind_gl;
8561     }
8562     
8563     return comp;
8564 }
8565
8566 static void order_int_cg(int n,const gmx_cgsort_t *sort,
8567                          int *a,int *buf)
8568 {
8569     int i;
8570     
8571     /* Order the data */
8572     for(i=0; i<n; i++)
8573     {
8574         buf[i] = a[sort[i].ind];
8575     }
8576     
8577     /* Copy back to the original array */
8578     for(i=0; i<n; i++)
8579     {
8580         a[i] = buf[i];
8581     }
8582 }
8583
8584 static void order_vec_cg(int n,const gmx_cgsort_t *sort,
8585                          rvec *v,rvec *buf)
8586 {
8587     int i;
8588     
8589     /* Order the data */
8590     for(i=0; i<n; i++)
8591     {
8592         copy_rvec(v[sort[i].ind],buf[i]);
8593     }
8594     
8595     /* Copy back to the original array */
8596     for(i=0; i<n; i++)
8597     {
8598         copy_rvec(buf[i],v[i]);
8599     }
8600 }
8601
8602 static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort,
8603                            rvec *v,rvec *buf)
8604 {
8605     int a,atot,cg,cg0,cg1,i;
8606     
8607     if (cgindex == NULL)
8608     {
8609         /* Avoid the useless loop of the atoms within a cg */
8610         order_vec_cg(ncg,sort,v,buf);
8611
8612         return;
8613     }
8614
8615     /* Order the data */
8616     a = 0;
8617     for(cg=0; cg<ncg; cg++)
8618     {
8619         cg0 = cgindex[sort[cg].ind];
8620         cg1 = cgindex[sort[cg].ind+1];
8621         for(i=cg0; i<cg1; i++)
8622         {
8623             copy_rvec(v[i],buf[a]);
8624             a++;
8625         }
8626     }
8627     atot = a;
8628     
8629     /* Copy back to the original array */
8630     for(a=0; a<atot; a++)
8631     {
8632         copy_rvec(buf[a],v[a]);
8633     }
8634 }
8635
8636 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
8637                          int nsort_new,gmx_cgsort_t *sort_new,
8638                          gmx_cgsort_t *sort1)
8639 {
8640     int i1,i2,i_new;
8641     
8642     /* The new indices are not very ordered, so we qsort them */
8643     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
8644     
8645     /* sort2 is already ordered, so now we can merge the two arrays */
8646     i1 = 0;
8647     i2 = 0;
8648     i_new = 0;
8649     while(i2 < nsort2 || i_new < nsort_new)
8650     {
8651         if (i2 == nsort2)
8652         {
8653             sort1[i1++] = sort_new[i_new++];
8654         }
8655         else if (i_new == nsort_new)
8656         {
8657             sort1[i1++] = sort2[i2++];
8658         }
8659         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8660                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8661                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8662         {
8663             sort1[i1++] = sort2[i2++];
8664         }
8665         else
8666         {
8667             sort1[i1++] = sort_new[i_new++];
8668         }
8669     }
8670 }
8671
8672 static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
8673 {
8674     gmx_domdec_sort_t *sort;
8675     gmx_cgsort_t *cgsort,*sort_i;
8676     int  ncg_new,nsort2,nsort_new,i,*a,moved,*ibuf;
8677     int  sort_last,sort_skip;
8678
8679     sort = dd->comm->sort;
8680
8681     a = fr->ns.grid->cell_index;
8682
8683     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8684
8685     if (ncg_home_old >= 0)
8686     {
8687         /* The charge groups that remained in the same ns grid cell
8688          * are completely ordered. So we can sort efficiently by sorting
8689          * the charge groups that did move into the stationary list.
8690          */
8691         ncg_new = 0;
8692         nsort2 = 0;
8693         nsort_new = 0;
8694         for(i=0; i<dd->ncg_home; i++)
8695         {
8696             /* Check if this cg did not move to another node */
8697             if (a[i] < moved)
8698             {
8699                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8700                 {
8701                     /* This cg is new on this node or moved ns grid cell */
8702                     if (nsort_new >= sort->sort_new_nalloc)
8703                     {
8704                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8705                         srenew(sort->sort_new,sort->sort_new_nalloc);
8706                     }
8707                     sort_i = &(sort->sort_new[nsort_new++]);
8708                 }
8709                 else
8710                 {
8711                     /* This cg did not move */
8712                     sort_i = &(sort->sort2[nsort2++]);
8713                 }
8714                 /* Sort on the ns grid cell indices
8715                  * and the global topology index.
8716                  * index_gl is irrelevant with cell ns,
8717                  * but we set it here anyhow to avoid a conditional.
8718                  */
8719                 sort_i->nsc    = a[i];
8720                 sort_i->ind_gl = dd->index_gl[i];
8721                 sort_i->ind    = i;
8722                 ncg_new++;
8723             }
8724         }
8725         if (debug)
8726         {
8727             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
8728                     nsort2,nsort_new);
8729         }
8730         /* Sort efficiently */
8731         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,
8732                      sort->sort);
8733     }
8734     else
8735     {
8736         cgsort = sort->sort;
8737         ncg_new = 0;
8738         for(i=0; i<dd->ncg_home; i++)
8739         {
8740             /* Sort on the ns grid cell indices
8741              * and the global topology index
8742              */
8743             cgsort[i].nsc    = a[i];
8744             cgsort[i].ind_gl = dd->index_gl[i];
8745             cgsort[i].ind    = i;
8746             if (cgsort[i].nsc < moved)
8747             {
8748                 ncg_new++;
8749             }
8750         }
8751         if (debug)
8752         {
8753             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
8754         }
8755         /* Determine the order of the charge groups using qsort */
8756         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
8757     }
8758
8759     return ncg_new;
8760 }
8761
8762 static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
8763 {
8764     gmx_cgsort_t *sort;
8765     int  ncg_new,i,*a,na;
8766
8767     sort = dd->comm->sort->sort;
8768
8769     nbnxn_get_atomorder(fr->nbv->nbs,&a,&na);
8770
8771     ncg_new = 0;
8772     for(i=0; i<na; i++)
8773     {
8774         if (a[i] >= 0)
8775         {
8776             sort[ncg_new].ind = a[i];
8777             ncg_new++;
8778         }
8779     }
8780
8781     return ncg_new;
8782 }
8783
8784 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
8785                           rvec *cgcm,t_forcerec *fr,t_state *state,
8786                           int ncg_home_old)
8787 {
8788     gmx_domdec_sort_t *sort;
8789     gmx_cgsort_t *cgsort,*sort_i;
8790     int  *cgindex;
8791     int  ncg_new,i,*ibuf,cgsize;
8792     rvec *vbuf;
8793     
8794     sort = dd->comm->sort;
8795     
8796     if (dd->ncg_home > sort->sort_nalloc)
8797     {
8798         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8799         srenew(sort->sort,sort->sort_nalloc);
8800         srenew(sort->sort2,sort->sort_nalloc);
8801     }
8802     cgsort = sort->sort;
8803
8804     switch (fr->cutoff_scheme)
8805     {
8806     case ecutsGROUP:
8807         ncg_new = dd_sort_order(dd,fr,ncg_home_old);
8808         break;
8809     case ecutsVERLET:
8810         ncg_new = dd_sort_order_nbnxn(dd,fr);
8811         break;
8812     default:
8813         gmx_incons("unimplemented");
8814         ncg_new = 0;
8815     }
8816
8817     /* We alloc with the old size, since cgindex is still old */
8818     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
8819     vbuf = dd->comm->vbuf.v;
8820     
8821     if (dd->comm->bCGs)
8822     {
8823         cgindex = dd->cgindex;
8824     }
8825     else
8826     {
8827         cgindex = NULL;
8828     }
8829
8830     /* Remove the charge groups which are no longer at home here */
8831     dd->ncg_home = ncg_new;
8832     if (debug)
8833     {
8834         fprintf(debug,"Set the new home charge group count to %d\n",
8835                 dd->ncg_home);
8836     }
8837     
8838     /* Reorder the state */
8839     for(i=0; i<estNR; i++)
8840     {
8841         if (EST_DISTR(i) && (state->flags & (1<<i)))
8842         {
8843             switch (i)
8844             {
8845             case estX:
8846                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf);
8847                 break;
8848             case estV:
8849                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf);
8850                 break;
8851             case estSDX:
8852                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf);
8853                 break;
8854             case estCGP:
8855                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf);
8856                 break;
8857             case estLD_RNG:
8858             case estLD_RNGI:
8859             case estDISRE_INITF:
8860             case estDISRE_RM3TAV:
8861             case estORIRE_INITF:
8862             case estORIRE_DTAV:
8863                 /* No ordering required */
8864                 break;
8865             default:
8866                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8867                 break;
8868             }
8869         }
8870     }
8871     if (fr->cutoff_scheme == ecutsGROUP)
8872     {
8873         /* Reorder cgcm */
8874         order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8875     }
8876     
8877     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8878     {
8879         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8880         srenew(sort->ibuf,sort->ibuf_nalloc);
8881     }
8882     ibuf = sort->ibuf;
8883     /* Reorder the global cg index */
8884     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8885     /* Reorder the cginfo */
8886     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8887     /* Rebuild the local cg index */
8888     if (dd->comm->bCGs)
8889     {
8890         ibuf[0] = 0;
8891         for(i=0; i<dd->ncg_home; i++)
8892         {
8893             cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8894             ibuf[i+1] = ibuf[i] + cgsize;
8895         }
8896         for(i=0; i<dd->ncg_home+1; i++)
8897         {
8898             dd->cgindex[i] = ibuf[i];
8899         }
8900     }
8901     else
8902     {
8903         for(i=0; i<dd->ncg_home+1; i++)
8904         {
8905             dd->cgindex[i] = i;
8906         }
8907     }
8908     /* Set the home atom number */
8909     dd->nat_home = dd->cgindex[dd->ncg_home];
8910
8911     if (fr->cutoff_scheme == ecutsVERLET)
8912     {
8913         /* The atoms are now exactly in grid order, update the grid order */
8914         nbnxn_set_atomorder(fr->nbv->nbs);
8915     }
8916     else
8917     {
8918         /* Copy the sorted ns cell indices back to the ns grid struct */
8919         for(i=0; i<dd->ncg_home; i++)
8920         {
8921             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8922         }
8923         fr->ns.grid->nr = dd->ncg_home;
8924     }
8925 }
8926
8927 static void add_dd_statistics(gmx_domdec_t *dd)
8928 {
8929     gmx_domdec_comm_t *comm;
8930     int ddnat;
8931     
8932     comm = dd->comm;
8933     
8934     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8935     {
8936         comm->sum_nat[ddnat-ddnatZONE] +=
8937             comm->nat[ddnat] - comm->nat[ddnat-1];
8938     }
8939     comm->ndecomp++;
8940 }
8941
8942 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8943 {
8944     gmx_domdec_comm_t *comm;
8945     int ddnat;
8946     
8947     comm = dd->comm;
8948
8949     /* Reset all the statistics and counters for total run counting */
8950     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8951     {
8952         comm->sum_nat[ddnat-ddnatZONE] = 0;
8953     }
8954     comm->ndecomp = 0;
8955     comm->nload = 0;
8956     comm->load_step = 0;
8957     comm->load_sum = 0;
8958     comm->load_max = 0;
8959     clear_ivec(comm->load_lim);
8960     comm->load_mdf = 0;
8961     comm->load_pme = 0;
8962 }
8963
8964 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8965 {
8966     gmx_domdec_comm_t *comm;
8967     int ddnat;
8968     double av;
8969    
8970     comm = cr->dd->comm;
8971     
8972     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8973     
8974     if (fplog == NULL)
8975     {
8976         return;
8977     }
8978     
8979     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8980             
8981     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8982     {
8983         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8984         switch(ddnat)
8985         {
8986         case ddnatZONE:
8987             fprintf(fplog,
8988                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8989                     2,av);
8990             break;
8991         case ddnatVSITE:
8992             if (cr->dd->vsite_comm)
8993             {
8994                 fprintf(fplog,
8995                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8996                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8997                         av);
8998             }
8999             break;
9000         case ddnatCON:
9001             if (cr->dd->constraint_comm)
9002             {
9003                 fprintf(fplog,
9004                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9005                         1 + ir->nLincsIter,av);
9006             }
9007             break;
9008         default:
9009             gmx_incons(" Unknown type for DD statistics");
9010         }
9011     }
9012     fprintf(fplog,"\n");
9013     
9014     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9015     {
9016         print_dd_load_av(fplog,cr->dd);
9017     }
9018 }
9019
9020 void dd_partition_system(FILE            *fplog,
9021                          gmx_large_int_t      step,
9022                          t_commrec       *cr,
9023                          gmx_bool            bMasterState,
9024                          int             nstglobalcomm,
9025                          t_state         *state_global,
9026                          gmx_mtop_t      *top_global,
9027                          t_inputrec      *ir,
9028                          t_state         *state_local,
9029                          rvec            **f,
9030                          t_mdatoms       *mdatoms,
9031                          gmx_localtop_t  *top_local,
9032                          t_forcerec      *fr,
9033                          gmx_vsite_t     *vsite,
9034                          gmx_shellfc_t   shellfc,
9035                          gmx_constr_t    constr,
9036                          t_nrnb          *nrnb,
9037                          gmx_wallcycle_t wcycle,
9038                          gmx_bool            bVerbose)
9039 {
9040     gmx_domdec_t *dd;
9041     gmx_domdec_comm_t *comm;
9042     gmx_ddbox_t ddbox={0};
9043     t_block *cgs_gl;
9044     gmx_large_int_t step_pcoupl;
9045     rvec cell_ns_x0,cell_ns_x1;
9046     int  i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum;
9047     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
9048     gmx_bool bRedist,bSortCG,bResortAll;
9049     ivec ncells_old={0,0,0},ncells_new={0,0,0},np;
9050     real grid_density;
9051     char sbuf[22];
9052         
9053     dd = cr->dd;
9054     comm = dd->comm;
9055
9056     bBoxChanged = (bMasterState || DEFORM(*ir));
9057     if (ir->epc != epcNO)
9058     {
9059         /* With nstpcouple > 1 pressure coupling happens.
9060          * one step after calculating the pressure.
9061          * Box scaling happens at the end of the MD step,
9062          * after the DD partitioning.
9063          * We therefore have to do DLB in the first partitioning
9064          * after an MD step where P-coupling occured.
9065          * We need to determine the last step in which p-coupling occurred.
9066          * MRS -- need to validate this for vv?
9067          */
9068         n = ir->nstpcouple;
9069         if (n == 1)
9070         {
9071             step_pcoupl = step - 1;
9072         }
9073         else
9074         {
9075             step_pcoupl = ((step - 1)/n)*n + 1;
9076         }
9077         if (step_pcoupl >= comm->partition_step)
9078         {
9079             bBoxChanged = TRUE;
9080         }
9081     }
9082
9083     bNStGlobalComm = (step % nstglobalcomm == 0);
9084
9085     if (!comm->bDynLoadBal)
9086     {
9087         bDoDLB = FALSE;
9088     }
9089     else
9090     {
9091         /* Should we do dynamic load balacing this step?
9092          * Since it requires (possibly expensive) global communication,
9093          * we might want to do DLB less frequently.
9094          */
9095         if (bBoxChanged || ir->epc != epcNO)
9096         {
9097             bDoDLB = bBoxChanged;
9098         }
9099         else
9100         {
9101             bDoDLB = bNStGlobalComm;
9102         }
9103     }
9104
9105     /* Check if we have recorded loads on the nodes */
9106     if (comm->bRecordLoad && dd_load_count(comm))
9107     {
9108         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9109         {
9110             /* Check if we should use DLB at the second partitioning
9111              * and every 100 partitionings,
9112              * so the extra communication cost is negligible.
9113              */
9114             n = max(100,nstglobalcomm);
9115             bCheckDLB = (comm->n_load_collect == 0 ||
9116                          comm->n_load_have % n == n-1);
9117         }
9118         else
9119         {
9120             bCheckDLB = FALSE;
9121         }
9122         
9123         /* Print load every nstlog, first and last step to the log file */
9124         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9125                     comm->n_load_collect == 0 ||
9126                     (ir->nsteps >= 0 &&
9127                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9128
9129         /* Avoid extra communication due to verbose screen output
9130          * when nstglobalcomm is set.
9131          */
9132         if (bDoDLB || bLogLoad || bCheckDLB ||
9133             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9134         {
9135             get_load_distribution(dd,wcycle);
9136             if (DDMASTER(dd))
9137             {
9138                 if (bLogLoad)
9139                 {
9140                     dd_print_load(fplog,dd,step-1);
9141                 }
9142                 if (bVerbose)
9143                 {
9144                     dd_print_load_verbose(dd);
9145                 }
9146             }
9147             comm->n_load_collect++;
9148
9149             if (bCheckDLB) {
9150                 /* Since the timings are node dependent, the master decides */
9151                 if (DDMASTER(dd))
9152                 {
9153                     bTurnOnDLB =
9154                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9155                     if (debug)
9156                     {
9157                         fprintf(debug,"step %s, imb loss %f\n",
9158                                 gmx_step_str(step,sbuf),
9159                                 dd_force_imb_perf_loss(dd));
9160                     }
9161                 }
9162                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
9163                 if (bTurnOnDLB)
9164                 {
9165                     turn_on_dlb(fplog,cr,step);
9166                     bDoDLB = TRUE;
9167                 }
9168             }
9169         }
9170         comm->n_load_have++;
9171     }
9172
9173     cgs_gl = &comm->cgs_gl;
9174
9175     bRedist = FALSE;
9176     if (bMasterState)
9177     {
9178         /* Clear the old state */
9179         clear_dd_indices(dd,0,0);
9180
9181         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
9182                   TRUE,cgs_gl,state_global->x,&ddbox);
9183     
9184         get_cg_distribution(fplog,step,dd,cgs_gl,
9185                             state_global->box,&ddbox,state_global->x);
9186         
9187         dd_distribute_state(dd,cgs_gl,
9188                             state_global,state_local,f);
9189         
9190         dd_make_local_cgs(dd,&top_local->cgs);
9191         
9192         /* Ensure that we have space for the new distribution */
9193         dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home);
9194
9195         if (fr->cutoff_scheme == ecutsGROUP)
9196         {
9197             calc_cgcm(fplog,0,dd->ncg_home,
9198                       &top_local->cgs,state_local->x,fr->cg_cm);
9199         }
9200         
9201         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9202         
9203         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9204
9205         cg0 = 0;
9206     }
9207     else if (state_local->ddp_count != dd->ddp_count)
9208     {
9209         if (state_local->ddp_count > dd->ddp_count)
9210         {
9211             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
9212         }
9213         
9214         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9215         {
9216             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
9217         }
9218         
9219         /* Clear the old state */
9220         clear_dd_indices(dd,0,0);
9221         
9222         /* Build the new indices */
9223         rebuild_cgindex(dd,cgs_gl->index,state_local);
9224         make_dd_indices(dd,cgs_gl->index,0);
9225
9226         if (fr->cutoff_scheme == ecutsGROUP)
9227         {
9228             /* Redetermine the cg COMs */
9229             calc_cgcm(fplog,0,dd->ncg_home,
9230                       &top_local->cgs,state_local->x,fr->cg_cm);
9231         }
9232         
9233         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9234
9235         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9236
9237         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9238                   TRUE,&top_local->cgs,state_local->x,&ddbox);
9239
9240         bRedist = comm->bDynLoadBal;
9241     }
9242     else
9243     {
9244         /* We have the full state, only redistribute the cgs */
9245
9246         /* Clear the non-home indices */
9247         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
9248
9249         /* Avoid global communication for dim's without pbc and -gcom */
9250         if (!bNStGlobalComm)
9251         {
9252             copy_rvec(comm->box0    ,ddbox.box0    );
9253             copy_rvec(comm->box_size,ddbox.box_size);
9254         }
9255         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9256                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
9257
9258         bBoxChanged = TRUE;
9259         bRedist = TRUE;
9260     }
9261     /* For dim's without pbc and -gcom */
9262     copy_rvec(ddbox.box0    ,comm->box0    );
9263     copy_rvec(ddbox.box_size,comm->box_size);
9264     
9265     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
9266                       step,wcycle);
9267     
9268     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9269     {
9270         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
9271     }
9272     
9273     /* Check if we should sort the charge groups */
9274     if (comm->nstSortCG > 0)
9275     {
9276         bSortCG = (bMasterState ||
9277                    (bRedist && (step % comm->nstSortCG == 0)));
9278     }
9279     else
9280     {
9281         bSortCG = FALSE;
9282     }
9283
9284     ncg_home_old = dd->ncg_home;
9285
9286     ncg_moved = 0;
9287     if (bRedist)
9288     {
9289         wallcycle_sub_start(wcycle,ewcsDD_REDIST);
9290
9291         dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
9292                            state_local,f,fr,mdatoms,
9293                            !bSortCG,nrnb,&cg0,&ncg_moved);
9294
9295         wallcycle_sub_stop(wcycle,ewcsDD_REDIST);
9296     }
9297     
9298     get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box,
9299                           dd,&ddbox,
9300                           &comm->cell_x0,&comm->cell_x1,
9301                           dd->ncg_home,fr->cg_cm,
9302                           cell_ns_x0,cell_ns_x1,&grid_density);
9303
9304     if (bBoxChanged)
9305     {
9306         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
9307     }
9308
9309     switch (fr->cutoff_scheme)
9310     {
9311     case ecutsGROUP:
9312         copy_ivec(fr->ns.grid->n,ncells_old);
9313         grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
9314                    state_local->box,cell_ns_x0,cell_ns_x1,
9315                    fr->rlistlong,grid_density);
9316         break;
9317     case ecutsVERLET:
9318         nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]);
9319         break;
9320     default:
9321         gmx_incons("unimplemented");
9322     }
9323     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9324     copy_ivec(ddbox.tric_dir,comm->tric_dir);
9325
9326     if (bSortCG)
9327     {
9328         wallcycle_sub_start(wcycle,ewcsDD_GRID);
9329
9330         /* Sort the state on charge group position.
9331          * This enables exact restarts from this step.
9332          * It also improves performance by about 15% with larger numbers
9333          * of atoms per node.
9334          */
9335         
9336         /* Fill the ns grid with the home cell,
9337          * so we can sort with the indices.
9338          */
9339         set_zones_ncg_home(dd);
9340
9341         switch (fr->cutoff_scheme)
9342         {
9343         case ecutsVERLET:
9344             set_zones_size(dd,state_local->box,&ddbox,0,1);
9345
9346             nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box,
9347                               0,
9348                               comm->zones.size[0].bb_x0,
9349                               comm->zones.size[0].bb_x1,
9350                               0,dd->ncg_home,
9351                               comm->zones.dens_zone0,
9352                               fr->cginfo,
9353                               state_local->x,
9354                               ncg_moved,comm->moved,
9355                               fr->nbv->grp[eintLocal].kernel_type,
9356                               fr->nbv->grp[eintLocal].nbat);
9357
9358             nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]);
9359             break;
9360         case ecutsGROUP:
9361             fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
9362                       0,dd->ncg_home,fr->cg_cm);
9363             
9364             copy_ivec(fr->ns.grid->n,ncells_new);
9365             break;
9366         default:
9367             gmx_incons("unimplemented");
9368         }
9369
9370         bResortAll = bMasterState;
9371    
9372         /* Check if we can use the old order and ns grid cell indices
9373          * of the charge groups to sort the charge groups efficiently.
9374          */
9375         if (ncells_new[XX] != ncells_old[XX] ||
9376             ncells_new[YY] != ncells_old[YY] ||
9377             ncells_new[ZZ] != ncells_old[ZZ])
9378         {
9379             bResortAll = TRUE;
9380         }
9381
9382         if (debug)
9383         {
9384             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
9385                     gmx_step_str(step,sbuf),dd->ncg_home);
9386         }
9387         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
9388                       bResortAll ? -1 : ncg_home_old);
9389         /* Rebuild all the indices */
9390         cg0 = 0;
9391         ga2la_clear(dd->ga2la);
9392
9393         wallcycle_sub_stop(wcycle,ewcsDD_GRID);
9394     }
9395
9396     wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM);
9397     
9398     /* Set up the communication and communicate the coordinates */
9399     setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f);
9400     
9401     /* Set the indices */
9402     make_dd_indices(dd,cgs_gl->index,cg0);
9403
9404     /* Set the charge group boundaries for neighbor searching */
9405     set_cg_boundaries(&comm->zones);
9406
9407     if (fr->cutoff_scheme == ecutsVERLET)
9408     {
9409         set_zones_size(dd,state_local->box,&ddbox,
9410                        bSortCG ? 1 : 0,comm->zones.n);
9411     }
9412
9413     wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM);
9414
9415     /*
9416     write_dd_pdb("dd_home",step,"dump",top_global,cr,
9417                  -1,state_local->x,state_local->box);
9418     */
9419
9420     wallcycle_sub_start(wcycle,ewcsDD_MAKETOP);
9421     
9422     /* Extract a local topology from the global topology */
9423     for(i=0; i<dd->ndim; i++)
9424     {
9425         np[dd->dim[i]] = comm->cd[i].np;
9426     }
9427     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
9428                       comm->cellsize_min,np,
9429                       fr,
9430                       fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x,
9431                       vsite,top_global,top_local);
9432
9433     wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP);
9434
9435     wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR);
9436     
9437     /* Set up the special atom communication */
9438     n = comm->nat[ddnatZONE];
9439     for(i=ddnatZONE+1; i<ddnatNR; i++)
9440     {
9441         switch(i)
9442         {
9443         case ddnatVSITE:
9444             if (vsite && vsite->n_intercg_vsite)
9445             {
9446                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
9447             }
9448             break;
9449         case ddnatCON:
9450             if (dd->bInterCGcons || dd->bInterCGsettles)
9451             {
9452                 /* Only for inter-cg constraints we need special code */
9453                 n = dd_make_local_constraints(dd,n,top_global,fr->cginfo,
9454                                               constr,ir->nProjOrder,
9455                                               top_local->idef.il);
9456             }
9457             break;
9458         default:
9459             gmx_incons("Unknown special atom type setup");
9460         }
9461         comm->nat[i] = n;
9462     }
9463
9464     wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR);
9465
9466     wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER);
9467
9468     /* Make space for the extra coordinates for virtual site
9469      * or constraint communication.
9470      */
9471     state_local->natoms = comm->nat[ddnatNR-1];
9472     if (state_local->natoms > state_local->nalloc)
9473     {
9474         dd_realloc_state(state_local,f,state_local->natoms);
9475     }
9476
9477     if (fr->bF_NoVirSum)
9478     {
9479         if (vsite && vsite->n_intercg_vsite)
9480         {
9481             nat_f_novirsum = comm->nat[ddnatVSITE];
9482         }
9483         else
9484         {
9485             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9486             {
9487                 nat_f_novirsum = dd->nat_tot;
9488             }
9489             else
9490             {
9491                 nat_f_novirsum = dd->nat_home;
9492             }
9493         }
9494     }
9495     else
9496     {
9497         nat_f_novirsum = 0;
9498     }
9499
9500     /* Set the number of atoms required for the force calculation.
9501      * Forces need to be constrained when using a twin-range setup
9502      * or with energy minimization. For simple simulations we could
9503      * avoid some allocation, zeroing and copying, but this is
9504      * probably not worth the complications and checking.
9505      */
9506     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
9507                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
9508
9509     /* We make all the mdatoms up to nat_tot_con.
9510      * We could save some work by only setting invmass
9511      * between nat_tot and nat_tot_con.
9512      */
9513     /* This call also sets the new number of home particles to dd->nat_home */
9514     atoms2md(top_global,ir,
9515              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
9516
9517     /* Now we have the charges we can sort the FE interactions */
9518     dd_sort_local_top(dd,mdatoms,top_local);
9519
9520     if (vsite != NULL)
9521     {
9522         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9523         split_vsites_over_threads(top_local->idef.il,mdatoms,FALSE,vsite);
9524     }
9525
9526     if (shellfc)
9527     {
9528         /* Make the local shell stuff, currently no communication is done */
9529         make_local_shells(cr,mdatoms,shellfc);
9530     }
9531     
9532         if (ir->implicit_solvent)
9533     {
9534         make_local_gb(cr,fr->born,ir->gb_algorithm);
9535     }
9536
9537     init_bonded_thread_force_reduction(fr,&top_local->idef);
9538
9539     if (!(cr->duty & DUTY_PME))
9540     {
9541         /* Send the charges to our PME only node */
9542         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
9543                        mdatoms->chargeA,mdatoms->chargeB,
9544                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
9545     }
9546     
9547     if (constr)
9548     {
9549         set_constraints(constr,top_local,ir,mdatoms,cr);
9550     }
9551     
9552     if (ir->ePull != epullNO)
9553     {
9554         /* Update the local pull groups */
9555         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
9556     }
9557     
9558     if (ir->bRot)
9559     {
9560         /* Update the local rotation groups */
9561         dd_make_local_rotation_groups(dd,ir->rot);
9562     }
9563
9564
9565     add_dd_statistics(dd);
9566     
9567     /* Make sure we only count the cycles for this DD partitioning */
9568     clear_dd_cycle_counts(dd);
9569     
9570     /* Because the order of the atoms might have changed since
9571      * the last vsite construction, we need to communicate the constructing
9572      * atom coordinates again (for spreading the forces this MD step).
9573      */
9574     dd_move_x_vsites(dd,state_local->box,state_local->x);
9575
9576     wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER);
9577     
9578     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9579     {
9580         dd_move_x(dd,state_local->box,state_local->x);
9581         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
9582                      -1,state_local->x,state_local->box);
9583     }
9584
9585     /* Store the partitioning step */
9586     comm->partition_step = step;
9587     
9588     /* Increase the DD partitioning counter */
9589     dd->ddp_count++;
9590     /* The state currently matches this DD partitioning count, store it */
9591     state_local->ddp_count = dd->ddp_count;
9592     if (bMasterState)
9593     {
9594         /* The DD master node knows the complete cg distribution,
9595          * store the count so we can possibly skip the cg info communication.
9596          */
9597         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9598     }
9599
9600     if (comm->DD_debug > 0)
9601     {
9602         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9603         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
9604                                 "after partitioning");
9605     }
9606 }