src/gromacs/mdlib/domdec.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  * 
4  * This file is part of Gromacs        Copyright (c) 1991-2008
5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * To help us fund GROMACS development, we humbly ask that you cite
13  * the research papers on the package. Check out http://www.gromacs.org
14  * 
15  * And Hey:
16  * Gnomes, ROck Monsters And Chili Sauce
17  */
18
19 #ifdef HAVE_CONFIG_H
20 #include <config.h>
21 #endif
22
23 #include <stdio.h>
24 #include <time.h>
25 #include <math.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include "typedefs.h"
29 #include "smalloc.h"
30 #include "vec.h"
31 #include "domdec.h"
32 #include "domdec_network.h"
33 #include "nrnb.h"
34 #include "pbc.h"
35 #include "chargegroup.h"
36 #include "constr.h"
37 #include "mdatoms.h"
38 #include "names.h"
39 #include "pdbio.h"
40 #include "futil.h"
41 #include "force.h"
42 #include "pme.h"
43 #include "pull.h"
44 #include "pull_rotation.h"
45 #include "gmx_wallcycle.h"
46 #include "mdrun.h"
47 #include "nsgrid.h"
48 #include "shellfc.h"
49 #include "mtop_util.h"
50 #include "gmxfio.h"
51 #include "gmx_ga2la.h"
52 #include "gmx_sort.h"
53
54 #ifdef GMX_LIB_MPI
55 #include <mpi.h>
56 #endif
57 #ifdef GMX_THREADS
58 #include "tmpi.h"
59 #endif
60
61 #define DDRANK(dd,rank)    (rank)
62 #define DDMASTERRANK(dd)   (dd->masterrank)
63
64 typedef struct gmx_domdec_master
65 {
66     /* The cell boundaries */
67     real **cell_x;
68     /* The global charge group division */
69     int  *ncg;     /* Number of home charge groups for each node */
70     int  *index;   /* Index of size nnodes+1 into cg */
71     int  *cg;      /* Global charge group index */
72     int  *nat;     /* Number of home atoms for each node. */
73     int  *ibuf;    /* Buffer for communication */
74     rvec *vbuf;    /* Buffer for state scattering and gathering */
75 } gmx_domdec_master_t;
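/* Layout sketch (illustration only, not compiled): with nnodes = 2,
 * where node 0 holds global charge groups {0,2} and node 1 holds {1,3},
 * the master arrays would look like
 *
 *   ncg   = { 2, 2 }
 *   index = { 0, 2, 4 }        (size nnodes+1, offsets into cg)
 *   cg    = { 0, 2, 1, 3 }     (per-node charge group lists, concatenated)
 *   nat   = { atoms in cg 0 and 2, atoms in cg 1 and 3 }
 *
 * so the charge groups of node n are cg[index[n]] .. cg[index[n+1]-1].
 */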
76
77 typedef struct
78 {
79     /* The numbers of charge groups to send and receive for each cell
80      * that requires communication; the last entry contains the total
81      * number of atoms that need to be communicated.
82      */
83     int nsend[DD_MAXIZONE+2];
84     int nrecv[DD_MAXIZONE+2];
85     /* The charge groups to send */
86     int *index;
87     int nalloc;
88     /* The atom range for non-in-place communication */
89     int cell2at0[DD_MAXIZONE];
90     int cell2at1[DD_MAXIZONE];
91 } gmx_domdec_ind_t;
92
93 typedef struct
94 {
95     int  np;                   /* Number of grid pulses in this dimension */
96     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
97     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
98     int  np_nalloc;
99     gmx_bool bInPlace;             /* Can we communicate in place?            */
100 } gmx_domdec_comm_dim_t;
101
102 typedef struct
103 {
104     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
105     real *cell_f;      /* State var.: cell boundaries, box relative      */
106     real *old_cell_f;  /* Temp. var.: old cell size                      */
107     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
108     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
109     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
110     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
111     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
112     real *buf_ncd;     /* Temp. var.                                     */
113 } gmx_domdec_root_t;
114
115 #define DD_NLOAD_MAX 9
116
117 /* Here floats are accurate enough, since these variables
118  * only influence the load balancing, not the actual MD results.
119  */
120 typedef struct
121 {
122     int  nload;
123     float *load;
124     float sum;
125     float max;
126     float sum_m;
127     float cvol_min;
128     float mdf;
129     float pme;
130     int   flags;
131 } gmx_domdec_load_t;
132
133 typedef struct
134 {
135     int  nsc;
136     int  ind_gl;
137     int  ind;
138 } gmx_cgsort_t;
139
140 typedef struct
141 {
142     gmx_cgsort_t *sort1,*sort2;
143     int  sort_nalloc;
144     gmx_cgsort_t *sort_new;
145     int  sort_new_nalloc;
146     int  *ibuf;
147     int  ibuf_nalloc;
148 } gmx_domdec_sort_t;
149
150 typedef struct
151 {
152     rvec *v;
153     int  nalloc;
154 } vec_rvec_t;
155
156 /* This enum determines the order of the coordinates.
157  * ddnatHOME and ddnatZONE should be first and second;
158  * the others can be ordered as desired.
159  */
160 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
161
162 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
163 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
164
165 typedef struct
166 {
167     int  dim;      /* The dimension                                          */
168     gmx_bool dim_match; /* Tells if DD and PME dims match                        */
169     int  nslab;    /* The number of PME slabs in this dimension              */
170     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
171     int  *pp_min;  /* The minimum pp node location, size nslab               */
172     int  *pp_max;  /* The maximum pp node location, size nslab               */
173     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
174 } gmx_ddpme_t;
175
176 typedef struct
177 {
178     real min0;    /* The minimum bottom of this zone                        */
179     real max1;    /* The maximum top of this zone                           */
180     real mch0;    /* The maximum bottom communication height for this zone  */
181     real mch1;    /* The maximum top communication height for this zone     */
182     real p1_0;    /* The bottom value of the first cell in this zone        */
183     real p1_1;    /* The top value of the first cell in this zone           */
184 } gmx_ddzone_t;
185
186 typedef struct gmx_domdec_comm
187 {
188     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
189      * unless stated otherwise.
190      */
191
192     /* The number of decomposition dimensions for PME, 0: no PME */
193     int  npmedecompdim;
194     /* The number of nodes doing PME (PP/PME or only PME) */
195     int  npmenodes;
196     int  npmenodes_x;
197     int  npmenodes_y;
198     /* The communication setup including the PME only nodes */
199     gmx_bool bCartesianPP_PME;
200     ivec ntot;
201     int  cartpmedim;
202     int  *pmenodes;          /* size npmenodes                         */
203     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
204                               * but not with bCartesianPP_PME          */
205     gmx_ddpme_t ddpme[2];
206     
207     /* The DD particle-particle nodes only */
208     gmx_bool bCartesianPP;
209     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
210     
211     /* The global charge groups */
212     t_block cgs_gl;
213
214     /* Should we sort the cgs */
215     int  nstSortCG;
216     gmx_domdec_sort_t *sort;
217     
218     /* Are there bonded and multi-body interactions between charge groups? */
219     gmx_bool bInterCGBondeds;
220     gmx_bool bInterCGMultiBody;
221
222     /* Data for the optional bonded interaction atom communication range */
223     gmx_bool bBondComm;
224     t_blocka *cglink;
225     char *bLocalCG;
226
227     /* The DLB option */
228     int  eDLB;
229     /* Are we actually using DLB? */
230     gmx_bool bDynLoadBal;
231
232     /* Cell sizes for static load balancing, first index cartesian */
233     real **slb_frac;
234     
235     /* The width of the communicated boundaries */
236     real cutoff_mbody;
237     real cutoff;
238     /* The minimum cell size (including triclinic correction) */
239     rvec cellsize_min;
240     /* For dlb, for use with edlbAUTO */
241     rvec cellsize_min_dlb;
242     /* The lower limit for the DD cell size with DLB */
243     real cellsize_limit;
244     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
245     gmx_bool bVacDLBNoLimit;
246
247     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
248     ivec tric_dir;
249     /* box0 and box_size are required for dimensions without pbc and when -gcom is used */
250     rvec box0;
251     rvec box_size;
252     
253     /* The cell boundaries */
254     rvec cell_x0;
255     rvec cell_x1;
256
257     /* The old location of the cell boundaries, to check cg displacements */
258     rvec old_cell_x0;
259     rvec old_cell_x1;
260
261     /* The communication setup and charge group boundaries for the zones */
262     gmx_domdec_zones_t zones;
263     
264     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
265      * cell boundaries of neighboring cells for dynamic load balancing.
266      */
267     gmx_ddzone_t zone_d1[2];
268     gmx_ddzone_t zone_d2[2][2];
269     
270     /* The coordinate/force communication setup and indices */
271     gmx_domdec_comm_dim_t cd[DIM];
272     /* The maximum number of cells to communicate with in one dimension */
273     int  maxpulse;
274     
275     /* Which cg distribution is stored on the master node */
276     int master_cg_ddp_count;
277     
278     /* The number of cg's received from the direct neighbors */
279     int  zone_ncg1[DD_MAXZONE];
280     
281     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
282     int  nat[ddnatNR];
283     
284     /* Communication buffer for general use */
285     int  *buf_int;
286     int  nalloc_int;
287
288      /* Communication buffer for general use */
289     vec_rvec_t vbuf;
290     
291     /* Communication buffers only used with multiple grid pulses */
292     int  *buf_int2;
293     int  nalloc_int2;
294     vec_rvec_t vbuf2;
295     
296     /* Communication buffers for local redistribution */
297     int  **cggl_flag;
298     int  cggl_flag_nalloc[DIM*2];
299     rvec **cgcm_state;
300     int  cgcm_state_nalloc[DIM*2];
301     
302     /* Cell sizes for dynamic load balancing */
303     gmx_domdec_root_t **root;
304     real *cell_f_row;
305     real cell_f0[DIM];
306     real cell_f1[DIM];
307     real cell_f_max0[DIM];
308     real cell_f_min1[DIM];
309     
310     /* Stuff for load communication */
311     gmx_bool bRecordLoad;
312     gmx_domdec_load_t *load;
313 #ifdef GMX_MPI
314     MPI_Comm *mpi_comm_load;
315 #endif
316
317     /* Maximum DLB scaling per load balancing step in percent */
318     int dlb_scale_lim;
319
320     /* Cycle counters */
321     float cycl[ddCyclNr];
322     int   cycl_n[ddCyclNr];
323     float cycl_max[ddCyclNr];
324     /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
325     int eFlop;
326     double flop;
327     int    flop_n;
328     /* How often did we have load measurements */
329     int    n_load_have;
330     /* How often have we collected the load measurements */
331     int    n_load_collect;
332     
333     /* Statistics */
334     double sum_nat[ddnatNR-ddnatZONE];
335     int    ndecomp;
336     int    nload;
337     double load_step;
338     double load_sum;
339     double load_max;
340     ivec   load_lim;
341     double load_mdf;
342     double load_pme;
343
344     /* The last partition step */
345     gmx_large_int_t globalcomm_step;
346
347     /* Debugging */
348     int  nstDDDump;
349     int  nstDDDumpGrid;
350     int  DD_debug;
351 } gmx_domdec_comm_t;
352
353 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
354 #define DD_CGIBS 2
355
356 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
357 #define DD_FLAG_NRCG  65535
358 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
359 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
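/* Illustration of the flag packing above (a sketch, not code used in the
 * build): the lower 16 bits of a flag word hold the charge group size,
 * extracted with the DD_FLAG_NRCG mask, while the bits above mark pending
 * forward/backward moves per dimension.
 *
 *   int flag = 3 | DD_FLAG_FW(1);             pack: size 3, forward in dim 1
 *   int nrcg = flag & DD_FLAG_NRCG;           unpack the size: 3
 *   int fw1  = (flag & DD_FLAG_FW(1)) != 0;   forward move pending in dim 1: 1
 */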
360
361 /* Zone permutation required to obtain consecutive charge groups
362  * for neighbor searching.
363  */
364 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
365
366 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
367  * components see only j zones with that component 0.
368  */
369
370 /* The DD zone order */
371 static const ivec dd_zo[DD_MAXZONE] =
372   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
373
374 /* The 3D setup */
375 #define dd_z3n  8
376 #define dd_zp3n 4
377 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
378
379 /* The 2D setup */
380 #define dd_z2n  4
381 #define dd_zp2n 2
382 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
383
384 /* The 1D setup */
385 #define dd_z1n  2
386 #define dd_zp1n 1
387 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
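/* Reading the dd_zp tables (sketch of the assumed encoding): each triplet
 * is { i-zone, first j-zone, last j-zone + 1 }.  For example dd_zp2[1] =
 * {1,3,4} means i-zone 1 is searched only against j-zone 3, while
 * dd_zp3[0] = {0,0,8} means i-zone 0 is searched against all eight zones.
 */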
388
389 /* Factors used to avoid problems due to rounding issues */
390 #define DD_CELL_MARGIN       1.0001
391 #define DD_CELL_MARGIN2      1.00005
392 /* Factor to account for pressure scaling during nstlist steps */
393 #define DD_PRES_SCALE_MARGIN 1.02
394
395 /* Allowed performance loss before we DLB or warn */
396 #define DD_PERF_LOSS 0.05
397
398 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
399
400 /* Use separate MPI send and receive calls
401  * when nnodes <= GMX_DD_NNODES_SENDRECV.
402  * This saves memory (and some copying for small nnodes).
403  * At high parallelization, scatter and gather calls are used instead.
404  */
405 #define GMX_DD_NNODES_SENDRECV 4
406
407
408 /*
409 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
410
411 static void index2xyz(ivec nc,int ind,ivec xyz)
412 {
413   xyz[XX] = ind % nc[XX];
414   xyz[YY] = (ind / nc[XX]) % nc[YY];
415   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
416 }
417 */
418
419 /* This order is required to minimize the coordinate communication in PME,
420  * which uses decomposition in the x direction.
421  */
422 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
423
424 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
425 {
426     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
427     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
428     xyz[ZZ] = ind % nc[ZZ];
429 }
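/* Worked example of this ordering (x slowest, z fastest): with
 * nc = {4,3,2} the cell at ci = {1,2,1} gets
 *
 *   dd_index(nc,ci) = ((1*3 + 2)*2) + 1 = 11
 *
 * and ddindex2xyz(nc,11,xyz) recovers xyz = {1,2,1}.
 */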
430
431 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
432 {
433     int ddindex;
434     int ddnodeid=-1;
435     
436     ddindex = dd_index(dd->nc,c);
437     if (dd->comm->bCartesianPP_PME)
438     {
439         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
440     }
441     else if (dd->comm->bCartesianPP)
442     {
443 #ifdef GMX_MPI
444         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
445 #endif
446     }
447     else
448     {
449         ddnodeid = ddindex;
450     }
451     
452     return ddnodeid;
453 }
454
455 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
456 {
457     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
458 }
459
460 int ddglatnr(gmx_domdec_t *dd,int i)
461 {
462     int atnr;
463     
464     if (dd == NULL)
465     {
466         atnr = i + 1;
467     }
468     else
469     {
470         if (i >= dd->comm->nat[ddnatNR-1])
471         {
472             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
473         }
474         atnr = dd->gatindex[i] + 1;
475     }
476     
477     return atnr;
478 }
479
480 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
481 {
482     return &dd->comm->cgs_gl;
483 }
484
485 static void vec_rvec_init(vec_rvec_t *v)
486 {
487     v->nalloc = 0;
488     v->v      = NULL;
489 }
490
491 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
492 {
493     if (n > v->nalloc)
494     {
495         v->nalloc = over_alloc_dd(n);
496         srenew(v->v,v->nalloc);
497     }
498 }
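/* Typical use (sketch; the index array a[] is hypothetical): make sure the
 * buffer can hold nsend rvecs before packing, relying on over_alloc_dd()
 * to over-allocate so that repeated small growth does not cause a
 * reallocation every step.
 *
 *   vec_rvec_check_alloc(&comm->vbuf, nsend);
 *   for(i=0; i<nsend; i++)
 *   {
 *       copy_rvec(x[a[i]], comm->vbuf.v[i]);
 *   }
 */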
499
500 void dd_store_state(gmx_domdec_t *dd,t_state *state)
501 {
502     int i;
503     
504     if (state->ddp_count != dd->ddp_count)
505     {
506         gmx_incons("The state does not match the domain decomposition state");
507     }
508     
509     state->ncg_gl = dd->ncg_home;
510     if (state->ncg_gl > state->cg_gl_nalloc)
511     {
512         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
513         srenew(state->cg_gl,state->cg_gl_nalloc);
514     }
515     for(i=0; i<state->ncg_gl; i++)
516     {
517         state->cg_gl[i] = dd->index_gl[i];
518     }
519     
520     state->ddp_count_cg_gl = dd->ddp_count;
521 }
522
523 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
524 {
525     return &dd->comm->zones;
526 }
527
528 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
529                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
530 {
531     gmx_domdec_zones_t *zones;
532     int izone,d,dim;
533
534     zones = &dd->comm->zones;
535
536     izone = 0;
537     while (icg >= zones->izone[izone].cg1)
538     {
539         izone++;
540     }
541     
542     if (izone == 0)
543     {
544         *jcg0 = icg;
545     }
546     else if (izone < zones->nizone)
547     {
548         *jcg0 = zones->izone[izone].jcg0;
549     }
550     else
551     {
552         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
553                   icg,izone,zones->nizone);
554     }
555         
556     *jcg1 = zones->izone[izone].jcg1;
557     
558     for(d=0; d<dd->ndim; d++)
559     {
560         dim = dd->dim[d];
561         shift0[dim] = zones->izone[izone].shift0[dim];
562         shift1[dim] = zones->izone[izone].shift1[dim];
563         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
564         {
565             /* A conservative approach; this can be optimized */
566             shift0[dim] -= 1;
567             shift1[dim] += 1;
568         }
569     }
570 }
571
572 int dd_natoms_vsite(gmx_domdec_t *dd)
573 {
574     return dd->comm->nat[ddnatVSITE];
575 }
576
577 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
578 {
579     *at_start = dd->comm->nat[ddnatCON-1];
580     *at_end   = dd->comm->nat[ddnatCON];
581 }
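/* The nat[] bookkeeping used above (illustration): the entries are
 * cumulative local atom counts in the order of the ddnat enum, so
 *
 *   home atoms:                      0               <= a < nat[ddnatHOME]
 *   zone (halo) atoms:               nat[ddnatHOME]  <= a < nat[ddnatZONE]
 *   atoms communicated for vsites:   nat[ddnatZONE]  <= a < nat[ddnatVSITE]
 *   atoms communicated for constr.:  nat[ddnatVSITE] <= a < nat[ddnatCON]
 *
 * which is why the constraint range is nat[ddnatCON-1] to nat[ddnatCON].
 */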
582
583 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
584 {
585     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
586     int  *index,*cgindex;
587     gmx_domdec_comm_t *comm;
588     gmx_domdec_comm_dim_t *cd;
589     gmx_domdec_ind_t *ind;
590     rvec shift={0,0,0},*buf,*rbuf;
591     gmx_bool bPBC,bScrew;
592     
593     comm = dd->comm;
594     
595     cgindex = dd->cgindex;
596     
597     buf = comm->vbuf.v;
598
599     nzone = 1;
600     nat_tot = dd->nat_home;
601     for(d=0; d<dd->ndim; d++)
602     {
603         bPBC   = (dd->ci[dd->dim[d]] == 0);
604         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
605         if (bPBC)
606         {
607             copy_rvec(box[dd->dim[d]],shift);
608         }
609         cd = &comm->cd[d];
610         for(p=0; p<cd->np; p++)
611         {
612             ind = &cd->ind[p];
613             index = ind->index;
614             n = 0;
615             if (!bPBC)
616             {
617                 for(i=0; i<ind->nsend[nzone]; i++)
618                 {
619                     at0 = cgindex[index[i]];
620                     at1 = cgindex[index[i]+1];
621                     for(j=at0; j<at1; j++)
622                     {
623                         copy_rvec(x[j],buf[n]);
624                         n++;
625                     }
626                 }
627             }
628             else if (!bScrew)
629             {
630                 for(i=0; i<ind->nsend[nzone]; i++)
631                 {
632                     at0 = cgindex[index[i]];
633                     at1 = cgindex[index[i]+1];
634                     for(j=at0; j<at1; j++)
635                     {
636                         /* We need to shift the coordinates */
637                         rvec_add(x[j],shift,buf[n]);
638                         n++;
639                     }
640                 }
641             }
642             else
643             {
644                 for(i=0; i<ind->nsend[nzone]; i++)
645                 {
646                     at0 = cgindex[index[i]];
647                     at1 = cgindex[index[i]+1];
648                     for(j=at0; j<at1; j++)
649                     {
650                         /* Shift x */
651                         buf[n][XX] = x[j][XX] + shift[XX];
652                         /* Rotate y and z.
653                          * This operation requires a special shift force
654                          * treatment, which is performed in calc_vir.
655                          */
656                         buf[n][YY] = box[YY][YY] - x[j][YY];
657                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
658                         n++;
659                     }
660                 }
661             }
662             
663             if (cd->bInPlace)
664             {
665                 rbuf = x + nat_tot;
666             }
667             else
668             {
669                 rbuf = comm->vbuf2.v;
670             }
671             /* Send and receive the coordinates */
672             dd_sendrecv_rvec(dd, d, dddirBackward,
673                              buf,  ind->nsend[nzone+1],
674                              rbuf, ind->nrecv[nzone+1]);
675             if (!cd->bInPlace)
676             {
677                 j = 0;
678                 for(zone=0; zone<nzone; zone++)
679                 {
680                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
681                     {
682                         copy_rvec(rbuf[j],x[i]);
683                         j++;
684                     }
685                 }
686             }
687             nat_tot += ind->nrecv[nzone+1];
688         }
689         nzone += nzone;
690     }
691 }
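/* Usage sketch (assumed typical caller names such as cr, state and fr):
 * within an MD step the zone coordinates are communicated forward before
 * the force calculation and the zone forces are summed back afterwards,
 *
 *   dd_move_x(cr->dd, box, state->x);    communicate zone coordinates
 *   ...                                  compute forces f
 *   dd_move_f(cr->dd, f, fr->fshift);    add back remote force contributions
 *
 * dd_move_f below walks the pulses in the reverse order of dd_move_x.
 */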
692
693 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
694 {
695     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
696     int  *index,*cgindex;
697     gmx_domdec_comm_t *comm;
698     gmx_domdec_comm_dim_t *cd;
699     gmx_domdec_ind_t *ind;
700     rvec *buf,*sbuf;
701     ivec vis;
702     int  is;
703     gmx_bool bPBC,bScrew;
704     
705     comm = dd->comm;
706     
707     cgindex = dd->cgindex;
708
709     buf = comm->vbuf.v;
710
711     n = 0;
712     nzone = comm->zones.n/2;
713     nat_tot = dd->nat_tot;
714     for(d=dd->ndim-1; d>=0; d--)
715     {
716         bPBC   = (dd->ci[dd->dim[d]] == 0);
717         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
718         if (fshift == NULL && !bScrew)
719         {
720             bPBC = FALSE;
721         }
722         /* Determine which shift vector we need */
723         clear_ivec(vis);
724         vis[dd->dim[d]] = 1;
725         is = IVEC2IS(vis);
726         
727         cd = &comm->cd[d];
728         for(p=cd->np-1; p>=0; p--) {
729             ind = &cd->ind[p];
730             nat_tot -= ind->nrecv[nzone+1];
731             if (cd->bInPlace)
732             {
733                 sbuf = f + nat_tot;
734             }
735             else
736             {
737                 sbuf = comm->vbuf2.v;
738                 j = 0;
739                 for(zone=0; zone<nzone; zone++)
740                 {
741                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
742                     {
743                         copy_rvec(f[i],sbuf[j]);
744                         j++;
745                     }
746                 }
747             }
748             /* Communicate the forces */
749             dd_sendrecv_rvec(dd, d, dddirForward,
750                              sbuf, ind->nrecv[nzone+1],
751                              buf,  ind->nsend[nzone+1]);
752             index = ind->index;
753             /* Add the received forces */
754             n = 0;
755             if (!bPBC)
756             {
757                 for(i=0; i<ind->nsend[nzone]; i++)
758                 {
759                     at0 = cgindex[index[i]];
760                     at1 = cgindex[index[i]+1];
761                     for(j=at0; j<at1; j++)
762                     {
763                         rvec_inc(f[j],buf[n]);
764                         n++;
765                     }
766                 } 
767             }
768             else if (!bScrew)
769             {
770                 for(i=0; i<ind->nsend[nzone]; i++)
771                 {
772                     at0 = cgindex[index[i]];
773                     at1 = cgindex[index[i]+1];
774                     for(j=at0; j<at1; j++)
775                     {
776                         rvec_inc(f[j],buf[n]);
777                         /* Add this force to the shift force */
778                         rvec_inc(fshift[is],buf[n]);
779                         n++;
780                     }
781                 }
782             }
783             else
784             {
785                 for(i=0; i<ind->nsend[nzone]; i++)
786                 {
787                     at0 = cgindex[index[i]];
788                     at1 = cgindex[index[i]+1];
789                     for(j=at0; j<at1; j++)
790                     {
791                         /* Rotate the force */
792                         f[j][XX] += buf[n][XX];
793                         f[j][YY] -= buf[n][YY];
794                         f[j][ZZ] -= buf[n][ZZ];
795                         if (fshift)
796                         {
797                             /* Add this force to the shift force */
798                             rvec_inc(fshift[is],buf[n]);
799                         }
800                         n++;
801                     }
802                 }
803             }
804         }
805         nzone /= 2;
806     }
807 }
808
809 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
810 {
811     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
812     int  *index,*cgindex;
813     gmx_domdec_comm_t *comm;
814     gmx_domdec_comm_dim_t *cd;
815     gmx_domdec_ind_t *ind;
816     real *buf,*rbuf;
817     
818     comm = dd->comm;
819     
820     cgindex = dd->cgindex;
821     
822     buf = &comm->vbuf.v[0][0];
823
824     nzone = 1;
825     nat_tot = dd->nat_home;
826     for(d=0; d<dd->ndim; d++)
827     {
828         cd = &comm->cd[d];
829         for(p=0; p<cd->np; p++)
830         {
831             ind = &cd->ind[p];
832             index = ind->index;
833             n = 0;
834             for(i=0; i<ind->nsend[nzone]; i++)
835             {
836                 at0 = cgindex[index[i]];
837                 at1 = cgindex[index[i]+1];
838                 for(j=at0; j<at1; j++)
839                 {
840                     buf[n] = v[j];
841                     n++;
842                 }
843             }
844             
845             if (cd->bInPlace)
846             {
847                 rbuf = v + nat_tot;
848             }
849             else
850             {
851                 rbuf = &comm->vbuf2.v[0][0];
852             }
853             /* Send and receive the values */
854             dd_sendrecv_real(dd, d, dddirBackward,
855                              buf,  ind->nsend[nzone+1],
856                              rbuf, ind->nrecv[nzone+1]);
857             if (!cd->bInPlace)
858             {
859                 j = 0;
860                 for(zone=0; zone<nzone; zone++)
861                 {
862                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
863                     {
864                         v[i] = rbuf[j];
865                         j++;
866                     }
867                 }
868             }
869             nat_tot += ind->nrecv[nzone+1];
870         }
871         nzone += nzone;
872     }
873 }
874
875 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
876 {
877     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
878     int  *index,*cgindex;
879     gmx_domdec_comm_t *comm;
880     gmx_domdec_comm_dim_t *cd;
881     gmx_domdec_ind_t *ind;
882     real *buf,*sbuf;
883     
884     comm = dd->comm;
885     
886     cgindex = dd->cgindex;
887
888     buf = &comm->vbuf.v[0][0];
889
890     n = 0;
891     nzone = comm->zones.n/2;
892     nat_tot = dd->nat_tot;
893     for(d=dd->ndim-1; d>=0; d--)
894     {
895         cd = &comm->cd[d];
896         for(p=cd->np-1; p>=0; p--) {
897             ind = &cd->ind[p];
898             nat_tot -= ind->nrecv[nzone+1];
899             if (cd->bInPlace)
900             {
901                 sbuf = v + nat_tot;
902             }
903             else
904             {
905                 sbuf = &comm->vbuf2.v[0][0];
906                 j = 0;
907                 for(zone=0; zone<nzone; zone++)
908                 {
909                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
910                     {
911                         sbuf[j] = v[i];
912                         j++;
913                     }
914                 }
915             }
916             /* Communicate the values */
917             dd_sendrecv_real(dd, d, dddirForward,
918                              sbuf, ind->nrecv[nzone+1],
919                              buf,  ind->nsend[nzone+1]);
920             index = ind->index;
921             /* Add the received values */
922             n = 0;
923             for(i=0; i<ind->nsend[nzone]; i++)
924             {
925                 at0 = cgindex[index[i]];
926                 at1 = cgindex[index[i]+1];
927                 for(j=at0; j<at1; j++)
928                 {
929                     v[j] += buf[n];
930                     n++;
931                 }
932             } 
933         }
934         nzone /= 2;
935     }
936 }
937
938 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
939 {
940     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
941             d,i,j,
942             zone->min0,zone->max1,
943             zone->min0,zone->max1,
944             zone->mch0,zone->mch1,
944             zone->p1_0,zone->p1_1);
945 }
946
947 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
948                                int ddimind,int direction,
949                                gmx_ddzone_t *buf_s,int n_s,
950                                gmx_ddzone_t *buf_r,int n_r)
951 {
952     rvec vbuf_s[5*2],vbuf_r[5*2];
953     int i;
954
955     for(i=0; i<n_s; i++)
956     {
957         vbuf_s[i*2  ][0] = buf_s[i].min0;
958         vbuf_s[i*2  ][1] = buf_s[i].max1;
959         vbuf_s[i*2  ][2] = buf_s[i].mch0;
960         vbuf_s[i*2+1][0] = buf_s[i].mch1;
961         vbuf_s[i*2+1][1] = buf_s[i].p1_0;
962         vbuf_s[i*2+1][2] = buf_s[i].p1_1;
963     }
964
965     dd_sendrecv_rvec(dd, ddimind, direction,
966                      vbuf_s, n_s*2,
967                      vbuf_r, n_r*2);
968
969     for(i=0; i<n_r; i++)
970     {
971         buf_r[i].min0 = vbuf_r[i*2  ][0];
972         buf_r[i].max1 = vbuf_r[i*2  ][1];
973         buf_r[i].mch0 = vbuf_r[i*2  ][2];
974         buf_r[i].mch1 = vbuf_r[i*2+1][0];
975         buf_r[i].p1_0 = vbuf_r[i*2+1][1];
976         buf_r[i].p1_1 = vbuf_r[i*2+1][2];
977     }
978 }
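/* Note on the packing above: each gmx_ddzone_t carries 6 reals, which are
 * spread over two rvecs (2*3 reals) so that the existing rvec send/receive
 * routine can be reused.  For instance buf_s[i].mch1 travels in
 * vbuf_s[i*2+1][0] and is unpacked from the same slot of vbuf_r on the
 * receiving side; n_s zones thus become 2*n_s rvecs on the wire.
 */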
979
980 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
981                           rvec cell_ns_x0,rvec cell_ns_x1)
982 {
983     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
984     gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
985     rvec extr_s[2],extr_r[2];
986     rvec dh;
987     real dist_d,c=0,det;
988     gmx_domdec_comm_t *comm;
989     gmx_bool bPBC,bUse;
990
991     comm = dd->comm;
992
993     for(d=1; d<dd->ndim; d++)
994     {
995         dim = dd->dim[d];
996         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
997         zp->min0 = cell_ns_x0[dim];
998         zp->max1 = cell_ns_x1[dim];
999         zp->mch0 = cell_ns_x0[dim];
1000         zp->mch1 = cell_ns_x1[dim];
1001         zp->p1_0 = cell_ns_x0[dim];
1002         zp->p1_1 = cell_ns_x1[dim];
1003     }
1004     
1005     for(d=dd->ndim-2; d>=0; d--)
1006     {
1007         dim  = dd->dim[d];
1008         bPBC = (dim < ddbox->npbcdim);
1009
1010         /* Use an rvec to store two reals */
1011         extr_s[d][0] = comm->cell_f0[d+1];
1012         extr_s[d][1] = comm->cell_f1[d+1];
1013         extr_s[d][2] = 0;
1014
1015         pos = 0;
1016         /* Store the extremes in the backward sending buffer,
1017          * so they get updated separately from the forward communication.
1018          */
1019         for(d1=d; d1<dd->ndim-1; d1++)
1020         {
1021             /* We invert the order to be able to use the same loop for buf_e */
1022             buf_s[pos].min0 = extr_s[d1][1];
1023             buf_s[pos].max1 = extr_s[d1][0];
1024             buf_s[pos].mch0 = 0;
1025             buf_s[pos].mch1 = 0;
1026             /* Store the cell corner of the dimension we communicate along */
1027             buf_s[pos].p1_0 = comm->cell_x0[dim];
1028             buf_s[pos].p1_1 = 0;
1029             pos++;
1030         }
1031
1032         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1033         pos++;
1034
1035         if (dd->ndim == 3 && d == 0)
1036         {
1037             buf_s[pos] = comm->zone_d2[0][1];
1038             pos++;
1039             buf_s[pos] = comm->zone_d1[0];
1040             pos++;
1041         }
1042
1043         /* We only need to communicate the extremes
1044          * in the forward direction
1045          */
1046         npulse = comm->cd[d].np;
1047         if (bPBC)
1048         {
1049             /* Take the minimum to avoid double communication */
1050             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1051         }
1052         else
1053         {
1054             /* Without PBC we should really not communicate over
1055              * the boundaries, but implementing that complicates
1056              * the communication setup and therefore we simply
1057              * do all communication, but ignore some data.
1058              */
1059             npulse_min = npulse;
1060         }
1061         for(p=0; p<npulse_min; p++)
1062         {
1063             /* Communicate the extremes forward */
1064             bUse = (bPBC || dd->ci[dim] > 0);
1065
1066             dd_sendrecv_rvec(dd, d, dddirForward,
1067                              extr_s+d, dd->ndim-d-1,
1068                              extr_r+d, dd->ndim-d-1);
1069
1070             if (bUse)
1071             {
1072                 for(d1=d; d1<dd->ndim-1; d1++)
1073                 {
1074                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1075                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1076                 }
1077             }
1078         }
1079
1080         buf_size = pos;
1081         for(p=0; p<npulse; p++)
1082         {
1083             /* Communicate all the zone information backward */
1084             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1085
1086             dd_sendrecv_ddzone(dd, d, dddirBackward,
1087                                buf_s, buf_size,
1088                                buf_r, buf_size);
1089
1090             clear_rvec(dh);
1091             if (p > 0)
1092             {
1093                 for(d1=d+1; d1<dd->ndim; d1++)
1094                 {
1095                     /* Determine the decrease of maximum required
1096                      * communication height along d1 due to the distance along d;
1097                      * this avoids a lot of useless atom communication.
1098                      */
1099                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1100
1101                     if (ddbox->tric_dir[dim])
1102                     {
1103                         /* c is the off-diagonal coupling between the cell planes
1104                          * along directions d and d1.
1105                          */
1106                         c = ddbox->v[dim][dd->dim[d1]][dim];
1107                     }
1108                     else
1109                     {
1110                         c = 0;
1111                     }
1112                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1113                     if (det > 0)
1114                     {
1115                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1116                     }
1117                     else
1118                     {
1119                         /* A negative value signals out of range */
1120                         dh[d1] = -1;
1121                     }
1122                 }
1123             }
1124
1125             /* Accumulate the extremes over all pulses */
1126             for(i=0; i<buf_size; i++)
1127             {
1128                 if (p == 0)
1129                 {
1130                     buf_e[i] = buf_r[i];
1131                 }
1132                 else
1133                 {
1134                     if (bUse)
1135                     {
1136                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1137                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1138                     }
1139
1140                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1141                     {
1142                         d1 = 1;
1143                     }
1144                     else
1145                     {
1146                         d1 = d + 1;
1147                     }
1148                     if (bUse && dh[d1] >= 0)
1149                     {
1150                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1151                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1152                     }
1153                 }
1154                 /* Copy the received buffer to the send buffer,
1155                  * to pass the data through with the next pulse.
1156                  */
1157                 buf_s[i] = buf_r[i];
1158             }
1159             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1160                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1161             {
1162                 /* Store the extremes */ 
1163                 pos = 0;
1164
1165                 for(d1=d; d1<dd->ndim-1; d1++)
1166                 {
1167                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1168                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1169                     pos++;
1170                 }
1171
1172                 if (d == 1 || (d == 0 && dd->ndim == 3))
1173                 {
1174                     for(i=d; i<2; i++)
1175                     {
1176                         comm->zone_d2[1-d][i] = buf_e[pos];
1177                         pos++;
1178                     }
1179                 }
1180                 if (d == 0)
1181                 {
1182                     comm->zone_d1[1] = buf_e[pos];
1183                     pos++;
1184                 }
1185             }
1186         }
1187     }
1188     
1189     if (dd->ndim >= 2)
1190     {
1191         dim = dd->dim[1];
1192         for(i=0; i<2; i++)
1193         {
1194             if (debug)
1195             {
1196                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1197             }
1198             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1199             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1200         }
1201     }
1202     if (dd->ndim >= 3)
1203     {
1204         dim = dd->dim[2];
1205         for(i=0; i<2; i++)
1206         {
1207             for(j=0; j<2; j++)
1208             {
1209                 if (debug)
1210                 {
1211                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1212                 }
1213                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1214                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1215             }
1216         }
1217     }
1218     for(d=1; d<dd->ndim; d++)
1219     {
1220         comm->cell_f_max0[d] = extr_s[d-1][0];
1221         comm->cell_f_min1[d] = extr_s[d-1][1];
1222         if (debug)
1223         {
1224             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1225                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1226         }
1227     }
1228 }
1229
1230 static void dd_collect_cg(gmx_domdec_t *dd,
1231                           t_state *state_local)
1232 {
1233     gmx_domdec_master_t *ma=NULL;
1234     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1235     t_block *cgs_gl;
1236
1237     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1238     {
1239         /* The master has the correct distribution */
1240         return;
1241     }
1242     
1243     if (state_local->ddp_count == dd->ddp_count)
1244     {
1245         ncg_home = dd->ncg_home;
1246         cg       = dd->index_gl;
1247         nat_home = dd->nat_home;
1248     } 
1249     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1250     {
1251         cgs_gl = &dd->comm->cgs_gl;
1252
1253         ncg_home = state_local->ncg_gl;
1254         cg       = state_local->cg_gl;
1255         nat_home = 0;
1256         for(i=0; i<ncg_home; i++)
1257         {
1258             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1259         }
1260     }
1261     else
1262     {
1263         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1264     }
1265     
1266     buf2[0] = dd->ncg_home;
1267     buf2[1] = dd->nat_home;
1268     if (DDMASTER(dd))
1269     {
1270         ma = dd->ma;
1271         ibuf = ma->ibuf;
1272     }
1273     else
1274     {
1275         ibuf = NULL;
1276     }
1277     /* Collect the charge group and atom counts on the master */
1278     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1279     
1280     if (DDMASTER(dd))
1281     {
1282         ma->index[0] = 0;
1283         for(i=0; i<dd->nnodes; i++)
1284         {
1285             ma->ncg[i] = ma->ibuf[2*i];
1286             ma->nat[i] = ma->ibuf[2*i+1];
1287             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1288             
1289         }
1290         /* Make byte counts and indices */
1291         for(i=0; i<dd->nnodes; i++)
1292         {
1293             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1294             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1295         }
1296         if (debug)
1297         {
1298             fprintf(debug,"Initial charge group distribution: ");
1299             for(i=0; i<dd->nnodes; i++)
1300                 fprintf(debug," %d",ma->ncg[i]);
1301             fprintf(debug,"\n");
1302         }
1303     }
1304     
1305     /* Collect the charge group indices on the master */
1306     dd_gatherv(dd,
1307                dd->ncg_home*sizeof(int),dd->index_gl,
1308                DDMASTER(dd) ? ma->ibuf : NULL,
1309                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1310                DDMASTER(dd) ? ma->cg : NULL);
1311     
1312     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1313 }
1314
1315 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1316                                     rvec *lv,rvec *v)
1317 {
1318     gmx_domdec_master_t *ma;
1319     int  n,i,c,a,nalloc=0;
1320     rvec *buf=NULL;
1321     t_block *cgs_gl;
1322
1323     ma = dd->ma;
1324     
1325     if (!DDMASTER(dd))
1326     {
1327 #ifdef GMX_MPI
1328         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1329                  dd->rank,dd->mpi_comm_all);
1330 #endif
1331     } else {
1332         /* Copy the master coordinates to the global array */
1333         cgs_gl = &dd->comm->cgs_gl;
1334
1335         n = DDMASTERRANK(dd);
1336         a = 0;
1337         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1338         {
1339             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1340             {
1341                 copy_rvec(lv[a++],v[c]);
1342             }
1343         }
1344         
1345         for(n=0; n<dd->nnodes; n++)
1346         {
1347             if (n != dd->rank)
1348             {
1349                 if (ma->nat[n] > nalloc)
1350                 {
1351                     nalloc = over_alloc_dd(ma->nat[n]);
1352                     srenew(buf,nalloc);
1353                 }
1354 #ifdef GMX_MPI
1355                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1356                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1357 #endif
1358                 a = 0;
1359                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1360                 {
1361                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1362                     {
1363                         copy_rvec(buf[a++],v[c]);
1364                     }
1365                 }
1366             }
1367         }
1368         sfree(buf);
1369     }
1370 }
1371
1372 static void get_commbuffer_counts(gmx_domdec_t *dd,
1373                                   int **counts,int **disps)
1374 {
1375     gmx_domdec_master_t *ma;
1376     int n;
1377
1378     ma = dd->ma;
1379     
1380     /* Make the rvec count and displacement arrays */
1381     *counts  = ma->ibuf;
1382     *disps   = ma->ibuf + dd->nnodes;
1383     for(n=0; n<dd->nnodes; n++)
1384     {
1385         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1386         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1387     }
1388 }
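/* Example of the resulting arrays (sketch): for 3 nodes with
 * ma->nat = {10, 12, 8} the gatherv parameters become
 *
 *   counts = { 10*sizeof(rvec), 12*sizeof(rvec),  8*sizeof(rvec) }
 *   disps  = {  0,              10*sizeof(rvec), 22*sizeof(rvec) }
 *
 * i.e. the coordinates of node n land at byte offset disps[n] of ma->vbuf.
 */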
1389
1390 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1391                                    rvec *lv,rvec *v)
1392 {
1393     gmx_domdec_master_t *ma;
1394     int  *rcounts=NULL,*disps=NULL;
1395     int  n,i,c,a;
1396     rvec *buf=NULL;
1397     t_block *cgs_gl;
1398     
1399     ma = dd->ma;
1400     
1401     if (DDMASTER(dd))
1402     {
1403         get_commbuffer_counts(dd,&rcounts,&disps);
1404
1405         buf = ma->vbuf;
1406     }
1407     
1408     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1409
1410     if (DDMASTER(dd))
1411     {
1412         cgs_gl = &dd->comm->cgs_gl;
1413
1414         a = 0;
1415         for(n=0; n<dd->nnodes; n++)
1416         {
1417             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1418             {
1419                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1420                 {
1421                     copy_rvec(buf[a++],v[c]);
1422                 }
1423             }
1424         }
1425     }
1426 }
1427
1428 void dd_collect_vec(gmx_domdec_t *dd,
1429                     t_state *state_local,rvec *lv,rvec *v)
1430 {
1431     gmx_domdec_master_t *ma;
1432     int  n,i,c,a,nalloc=0;
1433     rvec *buf=NULL;
1434     
1435     dd_collect_cg(dd,state_local);
1436
1437     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1438     {
1439         dd_collect_vec_sendrecv(dd,lv,v);
1440     }
1441     else
1442     {
1443         dd_collect_vec_gatherv(dd,lv,v);
1444     }
1445 }
1446
1447
1448 void dd_collect_state(gmx_domdec_t *dd,
1449                       t_state *state_local,t_state *state)
1450 {
1451     int est,i,j,nh;
1452
1453     nh = state->nhchainlength;
1454
1455     if (DDMASTER(dd))
1456     {
1457         state->lambda = state_local->lambda;
1458         state->veta = state_local->veta;
1459         state->vol0 = state_local->vol0;
1460         copy_mat(state_local->box,state->box);
1461         copy_mat(state_local->boxv,state->boxv);
1462         copy_mat(state_local->svir_prev,state->svir_prev);
1463         copy_mat(state_local->fvir_prev,state->fvir_prev);
1464         copy_mat(state_local->pres_prev,state->pres_prev);
1465
1466
1467         for(i=0; i<state_local->ngtc; i++)
1468         {
1469             for(j=0; j<nh; j++) {
1470                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1471                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1472             }
1473             state->therm_integral[i] = state_local->therm_integral[i];            
1474         }
1475         for(i=0; i<state_local->nnhpres; i++) 
1476         {
1477             for(j=0; j<nh; j++) {
1478                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1479                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1480             }
1481         }
1482     }
1483     for(est=0; est<estNR; est++)
1484     {
1485         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1486         {
1487             switch (est) {
1488             case estX:
1489                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1490                 break;
1491             case estV:
1492                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1493                 break;
1494             case estSDX:
1495                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1496                 break;
1497             case estCGP:
1498                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1499                 break;
1500             case estLD_RNG:
1501                 if (state->nrngi == 1)
1502                 {
1503                     if (DDMASTER(dd))
1504                     {
1505                         for(i=0; i<state_local->nrng; i++)
1506                         {
1507                             state->ld_rng[i] = state_local->ld_rng[i];
1508                         }
1509                     }
1510                 }
1511                 else
1512                 {
1513                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1514                               state_local->ld_rng,state->ld_rng);
1515                 }
1516                 break;
1517             case estLD_RNGI:
1518                 if (state->nrngi == 1)
1519                 {
1520                    if (DDMASTER(dd))
1521                     {
1522                         state->ld_rngi[0] = state_local->ld_rngi[0];
1523                     } 
1524                 }
1525                 else
1526                 {
1527                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1528                               state_local->ld_rngi,state->ld_rngi);
1529                 }
1530                 break;
1531             case estDISRE_INITF:
1532             case estDISRE_RM3TAV:
1533             case estORIRE_INITF:
1534             case estORIRE_DTAV:
1535                 break;
1536             default:
1537                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1538             }
1539         }
1540     }
1541 }
1542
1543 static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
1544 {
1545     if (debug)
1546     {
1547         fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1548     }
1549     fr->cg_nalloc = over_alloc_dd(nalloc);
1550     srenew(fr->cg_cm,fr->cg_nalloc);
1551     srenew(fr->cginfo,fr->cg_nalloc);
1552 }
1553
1554 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1555 {
1556     int est;
1557
1558     if (debug)
1559     {
1560         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1561     }
1562
1563     state->nalloc = over_alloc_dd(nalloc);
1564     
1565     for(est=0; est<estNR; est++)
1566     {
1567         if (EST_DISTR(est) && (state->flags & (1<<est)))
1568         {
1569             switch(est) {
1570             case estX:
1571                 srenew(state->x,state->nalloc);
1572                 break;
1573             case estV:
1574                 srenew(state->v,state->nalloc);
1575                 break;
1576             case estSDX:
1577                 srenew(state->sd_X,state->nalloc);
1578                 break;
1579             case estCGP:
1580                 srenew(state->cg_p,state->nalloc);
1581                 break;
1582             case estLD_RNG:
1583             case estLD_RNGI:
1584             case estDISRE_INITF:
1585             case estDISRE_RM3TAV:
1586             case estORIRE_INITF:
1587             case estORIRE_DTAV:
1588                 /* No reallocation required */
1589                 break;
1590             default:
1591                 gmx_incons("Unknown state entry encountered in dd_realloc_state");            
1592             }
1593         }
1594     }
1595     
1596     if (f != NULL)
1597     {
1598         srenew(*f,state->nalloc);
1599     }
1600 }
1601
1602 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1603                                        rvec *v,rvec *lv)
1604 {
1605     gmx_domdec_master_t *ma;
1606     int  n,i,c,a,nalloc=0;
1607     rvec *buf=NULL;
1608     
1609     if (DDMASTER(dd))
1610     {
1611         ma  = dd->ma;
1612         
1613         for(n=0; n<dd->nnodes; n++)
1614         {
1615             if (n != dd->rank)
1616             {
1617                 if (ma->nat[n] > nalloc)
1618                 {
1619                     nalloc = over_alloc_dd(ma->nat[n]);
1620                     srenew(buf,nalloc);
1621                 }
1622                 /* Pack this node's data into a temporary send buffer */
1623                 a = 0;
1624                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1625                 {
1626                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1627                     {
1628                         copy_rvec(v[c],buf[a++]);
1629                     }
1630                 }
1631                 if (a != ma->nat[n])
1632                 {
1633                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1634                               a,ma->nat[n]);
1635                 }
1636                 
1637 #ifdef GMX_MPI
1638                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1639                          DDRANK(dd,n),n,dd->mpi_comm_all);
1640 #endif
1641             }
1642         }
1643         sfree(buf);
1644         n = DDMASTERRANK(dd);
1645         a = 0;
1646         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1647         {
1648             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1649             {
1650                 copy_rvec(v[c],lv[a++]);
1651             }
1652         }
1653     }
1654     else
1655     {
1656 #ifdef GMX_MPI
1657         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1658                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1659 #endif
1660     }
1661 }
1662
1663 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1664                                        rvec *v,rvec *lv)
1665 {
1666     gmx_domdec_master_t *ma;
1667     int  *scounts=NULL,*disps=NULL;
1668     int  n,i,c,a,nalloc=0;
1669     rvec *buf=NULL;
1670     
1671     if (DDMASTER(dd))
1672     {
1673         ma  = dd->ma;
1674      
1675         get_commbuffer_counts(dd,&scounts,&disps);
1676
1677         buf = ma->vbuf;
1678         a = 0;
1679         for(n=0; n<dd->nnodes; n++)
1680         {
1681             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1682             {
1683                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1684                 {
1685                     copy_rvec(v[c],buf[a++]);
1686                 }
1687             }
1688         }
1689     }
1690
1691     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1692 }
1693
1694 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1695 {
1696     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1697     {
1698         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1699     }
1700     else
1701     {
1702         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1703     }
1704 }
1705
1706 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1707                                 t_state *state,t_state *state_local,
1708                                 rvec **f)
1709 {
1710     int  i,j,ngtch,ngtcp,nh;
1711
1712     nh = state->nhchainlength;
1713
1714     if (DDMASTER(dd))
1715     {
1716         state_local->lambda = state->lambda;
1717         state_local->veta   = state->veta;
1718         state_local->vol0   = state->vol0;
1719         copy_mat(state->box,state_local->box);
1720         copy_mat(state->box_rel,state_local->box_rel);
1721         copy_mat(state->boxv,state_local->boxv);
1722         copy_mat(state->svir_prev,state_local->svir_prev);
1723         copy_mat(state->fvir_prev,state_local->fvir_prev);
1724         for(i=0; i<state_local->ngtc; i++)
1725         {
1726             for(j=0; j<nh; j++) {
1727                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1728                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1729             }
1730             state_local->therm_integral[i] = state->therm_integral[i];
1731         }
1732         for(i=0; i<state_local->nnhpres; i++)
1733         {
1734             for(j=0; j<nh; j++) {
1735                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1736                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1737             }
1738         }
1739     }
1740     dd_bcast(dd,sizeof(real),&state_local->lambda);
1741     dd_bcast(dd,sizeof(real),&state_local->veta);
1742     dd_bcast(dd,sizeof(real),&state_local->vol0);
1743     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1744     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1745     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1746     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1747     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1748     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1749     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1750     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1751     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1752     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1753
1754     if (dd->nat_home > state_local->nalloc)
1755     {
1756         dd_realloc_state(state_local,f,dd->nat_home);
1757     }
1758     for(i=0; i<estNR; i++)
1759     {
1760         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1761         {
1762             switch (i) {
1763             case estX:
1764                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1765                 break;
1766             case estV:
1767                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1768                 break;
1769             case estSDX:
1770                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1771                 break;
1772             case estCGP:
1773                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1774                 break;
1775             case estLD_RNG:
1776                 if (state->nrngi == 1)
1777                 {
1778                     dd_bcastc(dd,
1779                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1780                               state->ld_rng,state_local->ld_rng);
1781                 }
1782                 else
1783                 {
1784                     dd_scatter(dd,
1785                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1786                                state->ld_rng,state_local->ld_rng);
1787                 }
1788                 break;
1789             case estLD_RNGI:
1790                 if (state->nrngi == 1)
1791                 {
1792                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1793                               state->ld_rngi,state_local->ld_rngi);
1794                 }
1795                 else
1796                 {
1797                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1798                                state->ld_rngi,state_local->ld_rngi);
1799                 }   
1800                 break;
1801             case estDISRE_INITF:
1802             case estDISRE_RM3TAV:
1803             case estORIRE_INITF:
1804             case estORIRE_DTAV:
1805                 /* Not implemented yet */
1806                 break;
1807             default:
1808                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1809             }
1810         }
1811     }
1812 }
1813
1814 static char dim2char(int dim)
1815 {
1816     char c='?';
1817     
1818     switch (dim)
1819     {
1820     case XX: c = 'X'; break;
1821     case YY: c = 'Y'; break;
1822     case ZZ: c = 'Z'; break;
1823     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1824     }
1825     
1826     return c;
1827 }
1828
1829 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1830                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1831 {
1832     rvec grid_s[2],*grid_r=NULL,cx,r;
1833     char fname[STRLEN],format[STRLEN],buf[22];
1834     FILE *out;
1835     int  a,i,d,z,y,x;
1836     matrix tric;
1837     real vol;
1838
1839     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1840     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1841     
1842     if (DDMASTER(dd))
1843     {
1844         snew(grid_r,2*dd->nnodes);
1845     }
1846     
1847     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1848     
1849     if (DDMASTER(dd))
1850     {
1851         for(d=0; d<DIM; d++)
1852         {
1853             for(i=0; i<DIM; i++)
1854             {
1855                 if (d == i)
1856                 {
1857                     tric[d][i] = 1;
1858                 }
1859                 else
1860                 {
1861                     if (dd->nc[d] > 1 && d < ddbox->npbcdim)
1862                     {
1863                         tric[d][i] = box[i][d]/box[i][i];
1864                     }
1865                     else
1866                     {
1867                         tric[d][i] = 0;
1868                     }
1869                 }
1870             }
1871         }
1872         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1873         sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1874         out = gmx_fio_fopen(fname,"w");
1875         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1876         a = 1;
1877         for(i=0; i<dd->nnodes; i++)
1878         {
1879             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1880             for(d=0; d<DIM; d++)
1881             {
1882                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1883             }
1884             for(z=0; z<2; z++)
1885             {
1886                 for(y=0; y<2; y++)
1887                 {
1888                     for(x=0; x<2; x++)
1889                     {
1890                         cx[XX] = grid_r[i*2+x][XX];
1891                         cx[YY] = grid_r[i*2+y][YY];
1892                         cx[ZZ] = grid_r[i*2+z][ZZ];
1893                         mvmul(tric,cx,r);
1894                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1895                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1896                     }
1897                 }
1898             }
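            /* Editor's note on the corner bookkeeping below: the loops above
             * write the 8 corners of cell i in z,y,x order, so corner (x,y,z)
             * has PDB serial 1 + i*8 + 4*z + 2*y + x. For each dimension d,
             * the four corners with a zero coordinate along d are enumerated
             * by the x loop below and connected to the corner offset by
             * (1<<d), which draws the 12 edges of the cell.
             */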
1899             for(d=0; d<DIM; d++)
1900             {
1901                 for(x=0; x<4; x++)
1902                 {
1903                     switch(d)
1904                     {
1905                     case 0: y = 1 + i*8 + 2*x; break;
1906                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1907                     case 2: y = 1 + i*8 + x; break;
1908                     }
1909                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1910                 }
1911             }
1912         }
1913         gmx_fio_fclose(out);
1914         sfree(grid_r);
1915     }
1916 }
1917
1918 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1919                   gmx_mtop_t *mtop,t_commrec *cr,
1920                   int natoms,rvec x[],matrix box)
1921 {
1922     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1923     FILE *out;
1924     int  i,ii,resnr,c;
1925     char *atomname,*resname;
1926     real b;
1927     gmx_domdec_t *dd;
1928     
1929     dd = cr->dd;
1930     if (natoms == -1)
1931     {
1932         natoms = dd->comm->nat[ddnatVSITE];
1933     }
1934     
1935     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
1936     
1937     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1938     sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");
1939     
1940     out = gmx_fio_fopen(fname,"w");
1941     
1942     fprintf(out,"TITLE     %s\n",title);
1943     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1944     for(i=0; i<natoms; i++)
1945     {
1946         ii = dd->gatindex[i];
1947         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
1948         if (i < dd->comm->nat[ddnatZONE])
1949         {
1950             c = 0;
1951             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1952             {
1953                 c++;
1954             }
1955             b = c;
1956         }
1957         else if (i < dd->comm->nat[ddnatVSITE])
1958         {
1959             b = dd->comm->zones.n;
1960         }
1961         else
1962         {
1963             b = dd->comm->zones.n + 1;
1964         }
1965         fprintf(out,strlen(atomname)<4 ? format : format4,
1966                 "ATOM",(ii+1)%100000,
1967                 atomname,resname,' ',resnr%10000,' ',
1968                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
1969     }
1970     fprintf(out,"TER\n");
1971     
1972     gmx_fio_fclose(out);
1973 }
1974
1975 real dd_cutoff_mbody(gmx_domdec_t *dd)
1976 {
1977     gmx_domdec_comm_t *comm;
1978     int  di;
1979     real r;
1980
1981     comm = dd->comm;
1982
1983     r = -1;
1984     if (comm->bInterCGBondeds)
1985     {
1986         if (comm->cutoff_mbody > 0)
1987         {
1988             r = comm->cutoff_mbody;
1989         }
1990         else
1991         {
1992             /* cutoff_mbody=0 means we do not have DLB */
1993             r = comm->cellsize_min[dd->dim[0]];
1994             for(di=1; di<dd->ndim; di++)
1995             {
1996                 r = min(r,comm->cellsize_min[dd->dim[di]]);
1997             }
1998             if (comm->bBondComm)
1999             {
2000                 r = max(r,comm->cutoff_mbody);
2001             }
2002             else
2003             {
2004                 r = min(r,comm->cutoff);
2005             }
2006         }
2007     }
2008
2009     return r;
2010 }
2011
2012 real dd_cutoff_twobody(gmx_domdec_t *dd)
2013 {
2014     real r_mb;
2015
2016     r_mb = dd_cutoff_mbody(dd);
2017
2018     return max(dd->comm->cutoff,r_mb);
2019 }
2020
2021
2022 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2023 {
2024     int nc,ntot;
2025     
2026     nc   = dd->nc[dd->comm->cartpmedim];
2027     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2028     copy_ivec(coord,coord_pme);
2029     coord_pme[dd->comm->cartpmedim] =
2030         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2031 }
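/* Editor's note, an illustrative example (numbers not from the source):
 * with nc = 4 PP cells and ntot = 6 total cells along cartpmedim (i.e. 2 PME
 * ranks in that dimension), PP coordinates 0,1 map to PME coordinate 4 and
 * PP coordinates 2,3 map to PME coordinate 5.
 */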
2032
2033 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2034 {
2035     /* Here we assign a PME node to communicate with this DD node
2036      * by assuming that the major index of both is x.
2037      * We add cr->npmenodes/2 to obtain an even distribution.
2038      */
2039     return (ddindex*npme + npme/2)/ndd;
2040 }
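/* Editor's note, an illustrative example (numbers not from the source):
 * with ndd = 8 and npme = 4, (ddindex*4 + 2)/8 maps DD indices 0..7 to
 * PME indices 0,0,1,1,2,2,3,3, i.e. each PME node serves two DD nodes.
 */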
2041
2042 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2043 {
2044     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2045 }
2046
2047 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2048 {
2049     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2050 }
2051
2052 static int *dd_pmenodes(t_commrec *cr)
2053 {
2054     int *pmenodes;
2055     int n,i,p0,p1;
2056     
2057     snew(pmenodes,cr->npmenodes);
2058     n = 0;
2059     for(i=0; i<cr->dd->nnodes; i++) {
2060         p0 = cr_ddindex2pmeindex(cr,i);
2061         p1 = cr_ddindex2pmeindex(cr,i+1);
2062         if (i+1 == cr->dd->nnodes || p1 > p0) {
2063             if (debug)
2064                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2065             pmenodes[n] = i + 1 + n;
2066             n++;
2067         }
2068     }
2069
2070     return pmenodes;
2071 }
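/* Editor's note, continuing the illustrative 8 PP / 4 PME example above
 * (numbers not from the source): the loop yields pmenodes = {2,5,8,11},
 * i.e. one PME rank is interleaved after every two PP ranks in the
 * simulation communicator.
 */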
2072
2073 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2074 {
2075     gmx_domdec_t *dd;
2076     ivec coords,coords_pme,nc;
2077     int  slab;
2078     
2079     dd = cr->dd;
2080     /*
2081       if (dd->comm->bCartesian) {
2082       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2083       dd_coords2pmecoords(dd,coords,coords_pme);
2084       copy_ivec(dd->ntot,nc);
2085       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2086       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2087       
2088       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2089       } else {
2090       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2091       }
2092     */
2093     coords[XX] = x;
2094     coords[YY] = y;
2095     coords[ZZ] = z;
2096     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2097     
2098     return slab;
2099 }
2100
2101 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2102 {
2103     gmx_domdec_comm_t *comm;
2104     ivec coords;
2105     int  ddindex,nodeid=-1;
2106     
2107     comm = cr->dd->comm;
2108     
2109     coords[XX] = x;
2110     coords[YY] = y;
2111     coords[ZZ] = z;
2112     if (comm->bCartesianPP_PME)
2113     {
2114 #ifdef GMX_MPI
2115         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2116 #endif
2117     }
2118     else
2119     {
2120         ddindex = dd_index(cr->dd->nc,coords);
2121         if (comm->bCartesianPP)
2122         {
2123             nodeid = comm->ddindex2simnodeid[ddindex];
2124         }
2125         else
2126         {
2127             if (comm->pmenodes)
2128             {
2129                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2130             }
2131             else
2132             {
2133                 nodeid = ddindex;
2134             }
2135         }
2136     }
2137   
2138     return nodeid;
2139 }
2140
2141 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2142 {
2143     gmx_domdec_t *dd;
2144     gmx_domdec_comm_t *comm;
2145     ivec coord,coord_pme;
2146     int  i;
2147     int  pmenode=-1;
2148     
2149     dd = cr->dd;
2150     comm = dd->comm;
2151     
2152     /* This assumes a uniform x domain decomposition grid cell size */
2153     if (comm->bCartesianPP_PME)
2154     {
2155 #ifdef GMX_MPI
2156         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2157         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2158         {
2159             /* This is a PP node */
2160             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2161             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2162         }
2163 #endif
2164     }
2165     else if (comm->bCartesianPP)
2166     {
2167         if (sim_nodeid < dd->nnodes)
2168         {
2169             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2170         }
2171     }
2172     else
2173     {
2174         /* This assumes DD cells with identical x coordinates
2175          * are numbered sequentially.
2176          */
2177         if (dd->comm->pmenodes == NULL)
2178         {
2179             if (sim_nodeid < dd->nnodes)
2180             {
2181                 /* The DD index equals the nodeid */
2182                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2183             }
2184         }
2185         else
2186         {
2187             i = 0;
2188             while (sim_nodeid > dd->comm->pmenodes[i])
2189             {
2190                 i++;
2191             }
2192             if (sim_nodeid < dd->comm->pmenodes[i])
2193             {
2194                 pmenode = dd->comm->pmenodes[i];
2195             }
2196         }
2197     }
2198     
2199     return pmenode;
2200 }
2201
2202 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2203 {
2204     gmx_bool bPMEOnlyNode;
2205     
2206     if (DOMAINDECOMP(cr))
2207     {
2208         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2209     }
2210     else
2211     {
2212         bPMEOnlyNode = FALSE;
2213     }
2214     
2215     return bPMEOnlyNode;
2216 }
2217
2218 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2219                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2220 {
2221     gmx_domdec_t *dd;
2222     int x,y,z;
2223     ivec coord,coord_pme;
2224     
2225     dd = cr->dd;
2226     
2227     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2228     
2229     *nmy_ddnodes = 0;
2230     for(x=0; x<dd->nc[XX]; x++)
2231     {
2232         for(y=0; y<dd->nc[YY]; y++)
2233         {
2234             for(z=0; z<dd->nc[ZZ]; z++)
2235             {
2236                 if (dd->comm->bCartesianPP_PME)
2237                 {
2238                     coord[XX] = x;
2239                     coord[YY] = y;
2240                     coord[ZZ] = z;
2241                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2242                     if (dd->ci[XX] == coord_pme[XX] &&
2243                         dd->ci[YY] == coord_pme[YY] &&
2244                         dd->ci[ZZ] == coord_pme[ZZ])
2245                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2246                 }
2247                 else
2248                 {
2249                     /* The slab corresponds to the nodeid in the PME group */
2250                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2251                     {
2252                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2253                     }
2254                 }
2255             }
2256         }
2257     }
2258     
2259     /* The last PP-only node is the peer node */
2260     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2261     
2262     if (debug)
2263     {
2264         fprintf(debug,"Receive coordinates from PP nodes:");
2265         for(x=0; x<*nmy_ddnodes; x++)
2266         {
2267             fprintf(debug," %d",(*my_ddnodes)[x]);
2268         }
2269         fprintf(debug,"\n");
2270     }
2271 }
2272
2273 static gmx_bool receive_vir_ener(t_commrec *cr)
2274 {
2275     gmx_domdec_comm_t *comm;
2276     int  pmenode,coords[DIM],rank;
2277     gmx_bool bReceive;
2278     
2279     bReceive = TRUE;
2280     if (cr->npmenodes < cr->dd->nnodes)
2281     {
2282         comm = cr->dd->comm;
2283         if (comm->bCartesianPP_PME)
2284         {
2285             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2286 #ifdef GMX_MPI
2287             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2288             coords[comm->cartpmedim]++;
2289             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2290             {
2291                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2292                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2293                 {
2294                     /* This is not the last PP node for pmenode */
2295                     bReceive = FALSE;
2296                 }
2297             }
2298 #endif  
2299         }
2300         else
2301         {
2302             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2303             if (cr->sim_nodeid+1 < cr->nnodes &&
2304                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2305             {
2306                 /* This is not the last PP node for pmenode */
2307                 bReceive = FALSE;
2308             }
2309         }
2310     }
2311     
2312     return bReceive;
2313 }
2314
2315 static void set_zones_ncg_home(gmx_domdec_t *dd)
2316 {
2317     gmx_domdec_zones_t *zones;
2318     int i;
2319
2320     zones = &dd->comm->zones;
2321
2322     zones->cg_range[0] = 0;
2323     for(i=1; i<zones->n+1; i++)
2324     {
2325         zones->cg_range[i] = dd->ncg_home;
2326     }
2327 }
2328
2329 static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
2330 {
2331     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2332     
2333     ind = state->cg_gl;
2334     dd_cg_gl = dd->index_gl;
2335     cgindex  = dd->cgindex;
2336     nat = 0;
2337     cgindex[0] = nat;
2338     for(i=0; i<state->ncg_gl; i++)
2339     {
2340         cgindex[i] = nat;
2341         cg_gl = ind[i];
2342         dd_cg_gl[i] = cg_gl;
2343         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2344     }
2345     cgindex[i] = nat;
2346     
2347     dd->ncg_home = state->ncg_gl;
2348     dd->nat_home = nat;
2349
2350     set_zones_ncg_home(dd);
2351 }
2352
2353 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2354 {
2355     while (cg >= cginfo_mb->cg_end)
2356     {
2357         cginfo_mb++;
2358     }
2359
2360     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2361 }
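/* Editor's note: as far as can be inferred from the fields used here, each
 * cginfo_mb entry covers the global charge groups [cg_start,cg_end) of one
 * molecule block and stores cginfo for only cg_mod charge groups, repeated
 * periodically, hence the modulo in the lookup above.
 */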
2362
2363 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2364                           t_forcerec *fr,char *bLocalCG)
2365 {
2366     cginfo_mb_t *cginfo_mb;
2367     int *cginfo;
2368     int cg;
2369
2370     if (fr != NULL)
2371     {
2372         cginfo_mb = fr->cginfo_mb;
2373         cginfo    = fr->cginfo;
2374
2375         for(cg=cg0; cg<cg1; cg++)
2376         {
2377             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2378         }
2379     }
2380
2381     if (bLocalCG != NULL)
2382     {
2383         for(cg=cg0; cg<cg1; cg++)
2384         {
2385             bLocalCG[index_gl[cg]] = TRUE;
2386         }
2387     }
2388 }
2389
2390 static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
2391 {
2392     int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
2393     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2394     gmx_ga2la_t *ga2la;
2395     char *bLocalCG;
2396
2397     bLocalCG = dd->comm->bLocalCG;
2398
2399     if (dd->nat_tot > dd->gatindex_nalloc)
2400     {
2401         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2402         srenew(dd->gatindex,dd->gatindex_nalloc);
2403     }
2404
2405     nzone      = dd->comm->zones.n;
2406     zone2cg    = dd->comm->zones.cg_range;
2407     zone_ncg1  = dd->comm->zone_ncg1;
2408     index_gl   = dd->index_gl;
2409     gatindex   = dd->gatindex;
2410
2411     if (zone2cg[1] != dd->ncg_home)
2412     {
2413         gmx_incons("dd->ncg_zone is not up to date");
2414     }
2415     
2416     /* Make the local to global and global to local atom index */
2417     a = dd->cgindex[cg_start];
2418     for(zone=0; zone<nzone; zone++)
2419     {
2420         if (zone == 0)
2421         {
2422             cg0 = cg_start;
2423         }
2424         else
2425         {
2426             cg0 = zone2cg[zone];
2427         }
2428         for(cg=cg0; cg<zone2cg[zone+1]; cg++)
2429         {
2430             zone1 = zone;
2431             if (cg - cg0 >= zone_ncg1[zone])
2432             {
2433                 /* Signal that this cg is from more than one zone away */
2434                 zone1 += nzone;
2435             }
2436             cg_gl = index_gl[cg];
2437             for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2438             {
2439                 gatindex[a] = a_gl;
2440                 ga2la_set(dd->ga2la,a_gl,a,zone1);
2441                 a++;
2442             }
2443         }
2444     }
2445 }
2446
2447 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2448                           const char *where)
2449 {
2450     int ncg,i,ngl,nerr;
2451
2452     nerr = 0;
2453     if (bLocalCG == NULL)
2454     {
2455         return nerr;
2456     }
2457     for(i=0; i<dd->ncg_tot; i++)
2458     {
2459         if (!bLocalCG[dd->index_gl[i]])
2460         {
2461             fprintf(stderr,
2462                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2463             nerr++;
2464         }
2465     }
2466     ngl = 0;
2467     for(i=0; i<ncg_sys; i++)
2468     {
2469         if (bLocalCG[i])
2470         {
2471             ngl++;
2472         }
2473     }
2474     if (ngl != dd->ncg_tot)
2475     {
2476         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2477         nerr++;
2478     }
2479
2480     return nerr;
2481 }
2482
2483 static void check_index_consistency(gmx_domdec_t *dd,
2484                                     int natoms_sys,int ncg_sys,
2485                                     const char *where)
2486 {
2487     int  nerr,ngl,i,a,cell;
2488     int  *have;
2489
2490     nerr = 0;
2491
2492     if (dd->comm->DD_debug > 1)
2493     {
2494         snew(have,natoms_sys);
2495         for(a=0; a<dd->nat_tot; a++)
2496         {
2497             if (have[dd->gatindex[a]] > 0)
2498             {
2499                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2500             }
2501             else
2502             {
2503                 have[dd->gatindex[a]] = a + 1;
2504             }
2505         }
2506         sfree(have);
2507     }
2508
2509     snew(have,dd->nat_tot);
2510
2511     ngl  = 0;
2512     for(i=0; i<natoms_sys; i++)
2513     {
2514         if (ga2la_get(dd->ga2la,i,&a,&cell))
2515         {
2516             if (a >= dd->nat_tot)
2517             {
2518                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2519                 nerr++;
2520             }
2521             else
2522             {
2523                 have[a] = 1;
2524                 if (dd->gatindex[a] != i)
2525                 {
2526                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2527                     nerr++;
2528                 }
2529             }
2530             ngl++;
2531         }
2532     }
2533     if (ngl != dd->nat_tot)
2534     {
2535         fprintf(stderr,
2536                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2537                 dd->rank,where,ngl,dd->nat_tot);
2538     }
2539     for(a=0; a<dd->nat_tot; a++)
2540     {
2541         if (have[a] == 0)
2542         {
2543             fprintf(stderr,
2544                     "DD node %d, %s: local atom %d, global %d is missing from the global to local index\n",
2545                     dd->rank,where,a+1,dd->gatindex[a]+1);
2546         }
2547     }
2548     sfree(have);
2549
2550     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2551
2552     if (nerr > 0) {
2553         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2554                   dd->rank,where,nerr);
2555     }
2556 }
2557
2558 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2559 {
2560     int  i;
2561     char *bLocalCG;
2562
2563     if (a_start == 0)
2564     {
2565         /* Clear the whole list without searching */
2566         ga2la_clear(dd->ga2la);
2567     }
2568     else
2569     {
2570         for(i=a_start; i<dd->nat_tot; i++)
2571         {
2572             ga2la_del(dd->ga2la,dd->gatindex[i]);
2573         }
2574     }
2575
2576     bLocalCG = dd->comm->bLocalCG;
2577     if (bLocalCG)
2578     {
2579         for(i=cg_start; i<dd->ncg_tot; i++)
2580         {
2581             bLocalCG[dd->index_gl[i]] = FALSE;
2582         }
2583     }
2584
2585     dd_clear_local_vsite_indices(dd);
2586     
2587     if (dd->constraints)
2588     {
2589         dd_clear_local_constraint_indices(dd);
2590     }
2591 }
2592
2593 static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
2594 {
2595     real grid_jump_limit;
2596
2597     /* The distance between the boundaries of cells at offsets
2598      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2599      * and by the requirement that cells should not be shifted by more than
2600      * half their size, such that charge groups only shift by one cell
2601      * at redecomposition.
2602      */
2603     grid_jump_limit = comm->cellsize_limit;
2604     if (!comm->bVacDLBNoLimit)
2605     {
2606         grid_jump_limit = max(grid_jump_limit,
2607                               comm->cutoff/comm->cd[dim_ind].np);
2608     }
2609
2610     return grid_jump_limit;
2611 }
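/* Editor's note, an illustrative example (numbers not from the source):
 * with cellsize_limit = 0.9 nm and cutoff = 1.2 nm, a single pulse (np = 1)
 * gives a jump limit of 1.2 nm, while two pulses give max(0.9, 0.6) = 0.9 nm.
 */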
2612
2613 static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2614 {
2615     gmx_domdec_comm_t *comm;
2616     int  d,dim;
2617     real limit,bfac;
2618     
2619     comm = dd->comm;
2620     
2621     for(d=1; d<dd->ndim; d++)
2622     {
2623         dim = dd->dim[d];
2624         limit = grid_jump_limit(comm,d);
2625         bfac = ddbox->box_size[dim];
2626         if (ddbox->tric_dir[dim])
2627         {
2628             bfac *= ddbox->skew_fac[dim];
2629         }
2630         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2631             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2632         {
2633             char buf[22];
2634             gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
2635                       gmx_step_str(step,buf),
2636                       dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2637         }
2638     }
2639 }
2640
2641 static int dd_load_count(gmx_domdec_comm_t *comm)
2642 {
2643     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2644 }
2645
2646 static float dd_force_load(gmx_domdec_comm_t *comm)
2647 {
2648     float load;
2649     
2650     if (comm->eFlop)
2651     {
2652         load = comm->flop;
2653         if (comm->eFlop > 1)
2654         {
2655             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2656         }
2657     } 
2658     else
2659     {
2660         load = comm->cycl[ddCyclF];
2661         if (comm->cycl_n[ddCyclF] > 1)
2662         {
2663             /* Subtract the maximum of the last n cycle counts
2664              * to get rid of possible high counts due to other sources,
2665              * for instance system activity, that would otherwise
2666              * affect the dynamic load balancing.
2667              */
2668             load -= comm->cycl_max[ddCyclF];
2669         }
2670     }
2671     
2672     return load;
2673 }
2674
2675 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2676 {
2677     gmx_domdec_comm_t *comm;
2678     int i;
2679     
2680     comm = dd->comm;
2681     
2682     snew(*dim_f,dd->nc[dim]+1);
2683     (*dim_f)[0] = 0;
2684     for(i=1; i<dd->nc[dim]; i++)
2685     {
2686         if (comm->slb_frac[dim])
2687         {
2688             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2689         }
2690         else
2691         {
2692             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2693         }
2694     }
2695     (*dim_f)[dd->nc[dim]] = 1;
2696 }
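/* Editor's note, an illustrative example (numbers not from the source):
 * for nc = 4 without slb_frac the boundaries become 0, 0.25, 0.5, 0.75, 1;
 * with slb_frac = {0.4, 0.3, 0.2, 0.1} they become 0, 0.4, 0.7, 0.9, 1.
 */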
2697
2698 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2699 {
2700     int  pmeindex,slab,nso,i;
2701     ivec xyz;
2702     
2703     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2704     {
2705         ddpme->dim = YY;
2706     }
2707     else
2708     {
2709         ddpme->dim = dimind;
2710     }
2711     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2712     
2713     ddpme->nslab = (ddpme->dim == 0 ?
2714                     dd->comm->npmenodes_x :
2715                     dd->comm->npmenodes_y);
2716
2717     if (ddpme->nslab <= 1)
2718     {
2719         return;
2720     }
2721
2722     nso = dd->comm->npmenodes/ddpme->nslab;
2723     /* Determine for each PME slab the PP location range for dimension dim */
2724     snew(ddpme->pp_min,ddpme->nslab);
2725     snew(ddpme->pp_max,ddpme->nslab);
2726     for(slab=0; slab<ddpme->nslab; slab++) {
2727         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2728         ddpme->pp_max[slab] = 0;
2729     }
2730     for(i=0; i<dd->nnodes; i++) {
2731         ddindex2xyz(dd->nc,i,xyz);
2732         /* For y only use our y/z slab.
2733          * This assumes that the PME x grid size matches the DD grid size.
2734          */
2735         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2736             pmeindex = ddindex2pmeindex(dd,i);
2737             if (dimind == 0) {
2738                 slab = pmeindex/nso;
2739             } else {
2740                 slab = pmeindex % ddpme->nslab;
2741             }
2742             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2743             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2744         }
2745     }
2746
2747     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2748 }
2749
2750 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2751 {
2752     if (dd->comm->ddpme[0].dim == XX)
2753     {
2754         return dd->comm->ddpme[0].maxshift;
2755     }
2756     else
2757     {
2758         return 0;
2759     }
2760 }
2761
2762 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2763 {
2764     if (dd->comm->ddpme[0].dim == YY)
2765     {
2766         return dd->comm->ddpme[0].maxshift;
2767     }
2768     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2769     {
2770         return dd->comm->ddpme[1].maxshift;
2771     }
2772     else
2773     {
2774         return 0;
2775     }
2776 }
2777
2778 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2779                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2780 {
2781     gmx_domdec_comm_t *comm;
2782     int  nc,ns,s;
2783     int  *xmin,*xmax;
2784     real range,pme_boundary;
2785     int  sh;
2786     
2787     comm = dd->comm;
2788     nc  = dd->nc[ddpme->dim];
2789     ns  = ddpme->nslab;
2790     
2791     if (!ddpme->dim_match)
2792     {
2793         /* PP decomposition is not along dim: the worst situation */
2794         sh = ns/2;
2795     }
2796     else if (ns <= 3 || (bUniform && ns == nc))
2797     {
2798         /* The optimal situation */
2799         sh = 1;
2800     }
2801     else
2802     {
2803         /* We need to check, for all PME nodes, which PP nodes they
2804          * could possibly need to communicate with.
2805          */
2806         xmin = ddpme->pp_min;
2807         xmax = ddpme->pp_max;
2808         /* Allow for atoms to be maximally 2/3 times the cut-off
2809          * out of their DD cell. This is a reasonable balance between
2810          * performance and support for most charge-group/cut-off
2811          * combinations.
2812          */
2813         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2814         /* Avoid extra communication when we are exactly at a boundary */
2815         range *= 0.999;
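        /* Editor's note, an illustrative example (numbers not from the
         * source): with a 1.2 nm cut-off and a 6 nm box along this
         * dimension, range is about 2/3*1.2/6*0.999 ~= 0.13 box fractions.
         */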
2816         
2817         sh = 1;
2818         for(s=0; s<ns; s++)
2819         {
2820             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2821             pme_boundary = (real)s/ns;
2822             while (sh+1 < ns &&
2823                    ((s-(sh+1) >= 0 &&
2824                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2825                     (s-(sh+1) <  0 &&
2826                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2827             {
2828                 sh++;
2829             }
2830             pme_boundary = (real)(s+1)/ns;
2831             while (sh+1 < ns &&
2832                    ((s+(sh+1) <  ns &&
2833                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2834                     (s+(sh+1) >= ns &&
2835                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2836             {
2837                 sh++;
2838             }
2839         }
2840     }
2841     
2842     ddpme->maxshift = sh;
2843     
2844     if (debug)
2845     {
2846         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2847                 ddpme->dim,ddpme->maxshift);
2848     }
2849 }
2850
2851 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2852 {
2853     int d,dim;
2854     
2855     for(d=0; d<dd->ndim; d++)
2856     {
2857         dim = dd->dim[d];
2858         if (dim < ddbox->nboundeddim &&
2859             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2860             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2861         {
2862             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2863                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2864                       dd->nc[dim],dd->comm->cellsize_limit);
2865         }
2866     }
2867 }
2868
2869 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2870                                   gmx_bool bMaster,ivec npulse)
2871 {
2872     gmx_domdec_comm_t *comm;
2873     int  d,j;
2874     rvec cellsize_min;
2875     real *cell_x,cell_dx,cellsize;
2876     
2877     comm = dd->comm;
2878     
2879     for(d=0; d<DIM; d++)
2880     {
2881         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2882         npulse[d] = 1;
2883         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2884         {
2885             /* Uniform grid */
2886             cell_dx = ddbox->box_size[d]/dd->nc[d];
2887             if (bMaster)
2888             {
2889                 for(j=0; j<dd->nc[d]+1; j++)
2890                 {
2891                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2892                 }
2893             }
2894             else
2895             {
2896                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
2897                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2898             }
2899             cellsize = cell_dx*ddbox->skew_fac[d];
2900             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
2901             {
2902                 npulse[d]++;
2903             }
2904             cellsize_min[d] = cellsize;
2905         }
2906         else
2907         {
2908             /* Statically load balanced grid */
2909             /* Even when we are not doing a master distribution, we determine
2910              * all cell borders in a loop to obtain values identical to
2911              * the master distribution case and to determine npulse.
2912              */
2913             if (bMaster)
2914             {
2915                 cell_x = dd->ma->cell_x[d];
2916             }
2917             else
2918             {
2919                 snew(cell_x,dd->nc[d]+1);
2920             }
2921             cell_x[0] = ddbox->box0[d];
2922             for(j=0; j<dd->nc[d]; j++)
2923             {
2924                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2925                 cell_x[j+1] = cell_x[j] + cell_dx;
2926                 cellsize = cell_dx*ddbox->skew_fac[d];
2927                 while (cellsize*npulse[d] < comm->cutoff &&
2928                        npulse[d] < dd->nc[d]-1)
2929                 {
2930                     npulse[d]++;
2931                 }
2932                 cellsize_min[d] = min(cellsize_min[d],cellsize);
2933             }
2934             if (!bMaster)
2935             {
2936                 comm->cell_x0[d] = cell_x[dd->ci[d]];
2937                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2938                 sfree(cell_x);
2939             }
2940         }
2941         /* The following limitation prevents a cell from receiving
2942          * some of its own home charge groups back over the periodic boundary.
2943          * Duplicated charge groups cause trouble with the global indices.
2944          */
2945         if (d < ddbox->npbcdim &&
2946             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2947         {
2948             gmx_fatal_collective(FARGS,NULL,dd,
2949                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2950                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
2951                                  comm->cutoff,
2952                                  dd->nc[d],dd->nc[d],
2953                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
2954         }
2955     }
2956     
2957     if (!comm->bDynLoadBal)
2958     {
2959         copy_rvec(cellsize_min,comm->cellsize_min);
2960     }
2961    
2962     for(d=0; d<comm->npmedecompdim; d++)
2963     {
2964         set_pme_maxshift(dd,&comm->ddpme[d],
2965                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
2966                          comm->ddpme[d].slb_dim_f);
2967     }
2968 }
2969
2970
2971 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2972                                        int d,int dim,gmx_domdec_root_t *root,
2973                                        gmx_ddbox_t *ddbox,
2974                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
2975 {
2976     gmx_domdec_comm_t *comm;
2977     int  ncd,i,j,nmin,nmin_old;
2978     gmx_bool bLimLo,bLimHi;
2979     real *cell_size;
2980     real fac,halfway,cellsize_limit_f_i,region_size;
2981     gmx_bool bPBC,bLastHi=FALSE;
2982     int nrange[]={range[0],range[1]};
2983
2984     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
2985
2986     comm = dd->comm;
2987
2988     ncd = dd->nc[dim];
2989
2990     bPBC = (dim < ddbox->npbcdim);
2991
2992     cell_size = root->buf_ncd;
2993
2994     if (debug) 
2995     {
2996         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
2997     }
2998
2999     /* First we need to check if the scaling does not make cells
3000      * smaller than the smallest allowed size.
3001      * We need to do this iteratively, since if a cell is too small,
3002      * it needs to be enlarged, which makes all the other cells smaller,
3003      * which could in turn make another cell smaller than allowed.
3004      */
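    /* Editor's note, an illustrative example (numbers not from the source):
     * with region_size = 1, cellsize_limit_f = 0.25 and initial relative
     * sizes 0.6, 0.3, 0.1, the first pass pins cell 2 at 0.25; the second
     * pass rescales the remaining cells by 0.75/0.9, giving 0.5, 0.25, 0.25.
     * A rescaling can push another cell below the limit, hence the loop.
     */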
3005     for(i=range[0]; i<range[1]; i++)
3006     {
3007         root->bCellMin[i] = FALSE;
3008     }
3009     nmin = 0;
3010     do
3011     {
3012         nmin_old = nmin;
3013         /* We need the total for normalization */
3014         fac = 0;
3015         for(i=range[0]; i<range[1]; i++)
3016         {
3017             if (root->bCellMin[i] == FALSE)
3018             {
3019                 fac += cell_size[i];
3020             }
3021         }
3022         fac = (region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
3023         /* Determine the cell boundaries */
3024         for(i=range[0]; i<range[1]; i++)
3025         {
3026             if (root->bCellMin[i] == FALSE)
3027             {
3028                 cell_size[i] *= fac;
3029                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3030                 {
3031                     cellsize_limit_f_i = 0;
3032                 }
3033                 else
3034                 {
3035                     cellsize_limit_f_i = cellsize_limit_f;
3036                 }
3037                 if (cell_size[i] < cellsize_limit_f_i)
3038                 {
3039                     root->bCellMin[i] = TRUE;
3040                     cell_size[i] = cellsize_limit_f_i;
3041                     nmin++;
3042                 }
3043             }
3044             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3045         }
3046     }
3047     while (nmin > nmin_old);
3048     
3049     i=range[1]-1;
3050     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3051     /* For this check we should not use DD_CELL_MARGIN,
3052      * but a slightly smaller factor,
3053      * since rounding could get us below the limit.
3054      */
3055     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3056     {
3057         char buf[22];
3058         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3059                   gmx_step_str(step,buf),
3060                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3061                   ncd,comm->cellsize_min[dim]);
3062     }
3063     
3064     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3065     
3066     if (!bUniform)
3067     {
3068         /* Check that the boundary has not been displaced by more than halfway
3069          * into each of the cells it bounds, as this could cause problems,
3070          * especially when the differences between cell sizes are large.
3071          * If changes are applied, they will not make cells smaller
3072          * than the cut-off, as we check all the boundaries which
3073          * might be affected by a change and if the old state was ok,
3074          * the cells will at most be shrunk back to their old size.
3075          */
3076         for(i=range[0]+1; i<range[1]; i++)
3077         {
3078             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3079             if (root->cell_f[i] < halfway)
3080             {
3081                 root->cell_f[i] = halfway;
3082                 /* Check if the change also causes shifts of the next boundaries */
3083                 for(j=i+1; j<range[1]; j++)
3084                 {
3085                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3086                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3087                 }
3088             }
3089             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3090             if (root->cell_f[i] > halfway)
3091             {
3092                 root->cell_f[i] = halfway;
3093                 /* Check if the change also causes shifts of the next boundaries */
3094                 for(j=i-1; j>=range[0]+1; j--)
3095                 {
3096                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3097                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3098                 }
3099             }
3100         }
3101     }
3102     
3103     /* nrange is defined as the half-open range [lower, upper) for a new call to enforce_limits */
3104     /* Find the highest violation of LimLo (a) and the lowest following violation of LimHi (b),
3105      * then call enforce_limits for (oldb,a) and (a,b). In the next step this becomes (b,nexta).
3106      * oldb and nexta can be the range boundaries; nrange is used to store a and b. */
3107     if (d > 0)
3108     {
3109         /* Take care of the staggering of the cell boundaries */
3110         if (bUniform)
3111         {
3112             for(i=range[0]; i<range[1]; i++)
3113             {
3114                 root->cell_f_max0[i] = root->cell_f[i];
3115                 root->cell_f_min1[i] = root->cell_f[i+1];
3116             }
3117         }
3118         else
3119         {
3120             for(i=range[0]+1; i<range[1]; i++)
3121             {
3122                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3123                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3124                 if (bLimLo && bLimHi)
3125                 {
3126                     /* Both limits violated, try the best we can */
3127                     /* In this case we split the original range (range) in two parts and handle the other limitations in the next iteration. */
3128                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3129                     nrange[0]=range[0];
3130                     nrange[1]=i;
3131                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3132
3133                     nrange[0]=i;
3134                     nrange[1]=range[1];
3135                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3136
3137                     return;
3138                 }
3139                 else if (bLimLo)
3140                 {
3141                     /* root->cell_f[i] = root->bound_min[i]; */
3142                     nrange[1]=i;  /* Only store the violation location; another LimLo violation could follow at a higher index */
3143                     bLastHi=FALSE;
3144                 }
3145                 else if (bLimHi && !bLastHi)
3146                 {
3147                     bLastHi=TRUE;
3148                     if (nrange[1] < range[1])   /* found a LimLo before */
3149                     {
3150                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3151                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3152                         nrange[0]=nrange[1];
3153                     }
3154                     root->cell_f[i] = root->bound_max[i];
3155                     nrange[1]=i; 
3156                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3157                     nrange[0]=i;
3158                     nrange[1]=range[1];
3159                 }
3160             }
3161             if (nrange[1] < range[1])   /* a LimLo was found last */
3162             {
3163                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3164                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3165                 nrange[0]=nrange[1];
3166                 nrange[1]=range[1];
3167                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3168             } 
3169             else if (nrange[0] > range[0]) /* found at least one LimHi */
3170             {
3171                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3172             }
3173         }
3174     }
3175 }
3176
3177
3178 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3179                                        int d,int dim,gmx_domdec_root_t *root,
3180                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3181                                        gmx_bool bUniform,gmx_large_int_t step)
3182 {
3183     gmx_domdec_comm_t *comm;
3184     int  ncd,d1,i,j,pos;
3185     real *cell_size;
3186     real load_aver,load_i,imbalance,change,change_max,sc;
3187     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3188     real change_limit;
3189     real relax = 0.5;
3190     gmx_bool bPBC;
3191     int range[] = { 0, 0 };
3192
3193     comm = dd->comm;
3194
3195     /* Convert the maximum change from the input percentage to a fraction */
3196     change_limit = comm->dlb_scale_lim*0.01;
3197
3198     ncd = dd->nc[dim];
3199
3200     bPBC = (dim < ddbox->npbcdim);
3201
3202     cell_size = root->buf_ncd;
3203
3204     /* Store the original boundaries */
3205     for(i=0; i<ncd+1; i++)
3206     {
3207         root->old_cell_f[i] = root->cell_f[i];
3208     }
3209     if (bUniform) {
3210         for(i=0; i<ncd; i++)
3211         {
3212             cell_size[i] = 1.0/ncd;
3213         }
3214     }
3215     else if (dd_load_count(comm))
3216     {
3217         load_aver = comm->load[d].sum_m/ncd;
3218         change_max = 0;
3219         for(i=0; i<ncd; i++)
3220         {
3221             /* Determine the relative imbalance of cell i */
3222             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3223             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3224             /* Determine the change of the cell size using underrelaxation */
3225             change = -relax*imbalance;
3226             change_max = max(change_max,max(change,-change));
3227         }
3228         /* Limit the amount of scaling.
3229          * We need to use the same rescaling for all cells in one row,
3230          * otherwise the load balancing might not converge.
3231          */
3232         sc = relax;
3233         if (change_max > change_limit)
3234         {
3235             sc *= change_limit/change_max;
3236         }
3237         for(i=0; i<ncd; i++)
3238         {
3239             /* Determine the relative imbalance of cell i */
3240             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3241             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3242             /* Determine the change of the cell size using underrelaxation */
3243             change = -sc*imbalance;
3244             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3245         }
3246     }
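    /* Editor's note, an illustrative example (numbers not from the source):
     * if a cell's load is 20% above the row average, imbalance = 0.2 and,
     * with relax = 0.5, the cell is shrunk by 10% (less if change_max
     * exceeds change_limit, since sc is then scaled down for the whole row).
     */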
3247     
3248     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3249     cellsize_limit_f *= DD_CELL_MARGIN;
3250     dist_min_f_hard        = grid_jump_limit(comm,d)/ddbox->box_size[dim];
3251     dist_min_f       = dist_min_f_hard * DD_CELL_MARGIN;
3252     if (ddbox->tric_dir[dim])
3253     {
3254         cellsize_limit_f /= ddbox->skew_fac[dim];
3255         dist_min_f       /= ddbox->skew_fac[dim];
3256     }
3257     if (bDynamicBox && d > 0)
3258     {
3259         dist_min_f *= DD_PRES_SCALE_MARGIN;
3260     }
3261     if (d > 0 && !bUniform)
3262     {
3263         /* Make sure that the grid is not shifted too much */
3264         for(i=1; i<ncd; i++) {
3265             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
3266             {
3267                 gmx_incons("Inconsistent DD boundary staggering limits!");
3268             }
3269             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3270             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3271             if (space > 0) {
3272                 root->bound_min[i] += 0.5*space;
3273             }
3274             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3275             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3276             if (space < 0) {
3277                 root->bound_max[i] += 0.5*space;
3278             }
3279             if (debug)
3280             {
3281                 fprintf(debug,
3282                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3283                         d,i,
3284                         root->cell_f_max0[i-1] + dist_min_f,
3285                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3286                         root->cell_f_min1[i] - dist_min_f);
3287             }
3288         }
3289     }
3290     range[1]=ncd;
3291     root->cell_f[0] = 0;
3292     root->cell_f[ncd] = 1;
3293     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3294
3295
3296     /* After the checks above, the cells should obey the cut-off
3297      * restrictions, but it does not hurt to check.
3298      */
3299     for(i=0; i<ncd; i++)
3300     {
3301         if (debug)
3302         {
3303             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3304                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3305         }
3306
3307         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3308             root->cell_f[i+1] - root->cell_f[i] <
3309             cellsize_limit_f/DD_CELL_MARGIN)
3310         {
3311             char buf[22];
3312             fprintf(stderr,
3313                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3314                     gmx_step_str(step,buf),dim2char(dim),i,
3315                     (root->cell_f[i+1] - root->cell_f[i])
3316                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3317         }
3318     }
3319     
3320     pos = ncd + 1;
3321     /* Store the cell boundaries of the lower dimensions at the end */
3322     for(d1=0; d1<d; d1++)
3323     {
3324         root->cell_f[pos++] = comm->cell_f0[d1];
3325         root->cell_f[pos++] = comm->cell_f1[d1];
3326     }
3327     
3328     if (d < comm->npmedecompdim)
3329     {
3330         /* The master determines the maximum shift for
3331          * the coordinate communication between separate PME nodes.
3332          */
3333         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3334     }
3335     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3336     if (d >= 1)
3337     {
3338         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3339     }
3340 }    
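/* Editor's note on the cell_f buffer layout used above and decoded in
 * distribute_dd_cell_sizes_dlb below: entries 0..ncd hold the relative cell
 * boundaries of this dimension, followed by the (cell_f0,cell_f1) pair of
 * each lower dimension, then ddpme[0].maxshift and, for d >= 1,
 * ddpme[1].maxshift.
 */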
3341
3342 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3343                                              gmx_ddbox_t *ddbox,int dimind)
3344 {
3345     gmx_domdec_comm_t *comm;
3346     int dim;
3347
3348     comm = dd->comm;
3349
3350     /* Set the cell dimensions */
3351     dim = dd->dim[dimind];
3352     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3353     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3354     if (dim >= ddbox->nboundeddim)
3355     {
3356         comm->cell_x0[dim] += ddbox->box0[dim];
3357         comm->cell_x1[dim] += ddbox->box0[dim];
3358     }
3359 }
3360
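     /* Broadcast the cell fractions computed by the row root over the
      * load-balancing communicator of dimension index d, extract the
      * boundaries of this rank's cell (and of the lower dimensions, stored
      * at the end of the row buffer together with the PME maximum shifts)
      * and convert them to absolute coordinates.
      */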
3361 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3362                                          int d,int dim,real *cell_f_row,
3363                                          gmx_ddbox_t *ddbox)
3364 {
3365     gmx_domdec_comm_t *comm;
3366     int d1,dim1,pos;
3367
3368     comm = dd->comm;
3369
3370 #ifdef GMX_MPI
3371     /* Each node would only need to know two fractions,
3372      * but it is probably cheaper to broadcast the whole array.
3373      */
3374     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3375               0,comm->mpi_comm_load[d]);
3376 #endif
3377     /* Copy the fractions for this dimension from the buffer */
3378     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3379     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3380     /* The whole array was communicated, so set the buffer position */
3381     pos = dd->nc[dim] + 1;
3382     for(d1=0; d1<=d; d1++)
3383     {
3384         if (d1 < d)
3385         {
3386             /* Copy the cell fractions of the lower dimensions */
3387             comm->cell_f0[d1] = cell_f_row[pos++];
3388             comm->cell_f1[d1] = cell_f_row[pos++];
3389         }
3390         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3391     }
3392     /* Convert the communicated shift from float to int */
3393     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3394     if (d >= 1)
3395     {
3396         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3397     }
3398 }
3399
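     /* Recompute the dynamic load balancing cell boundaries for all
      * decomposition dimensions: for each dimension only the ranks in the
      * corresponding load-balancing row take part, the row root determines
      * the new cell fractions and these are then distributed along the row.
      */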
3400 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3401                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3402                                          gmx_bool bUniform,gmx_large_int_t step)
3403 {
3404     gmx_domdec_comm_t *comm;
3405     int d,dim,d1;
3406     gmx_bool bRowMember,bRowRoot;
3407     real *cell_f_row;
3408     
3409     comm = dd->comm;
3410
3411     for(d=0; d<dd->ndim; d++)
3412     {
3413         dim = dd->dim[d];
3414         bRowMember = TRUE;
3415         bRowRoot = TRUE;
3416         for(d1=d; d1<dd->ndim; d1++)
3417         {
3418             if (dd->ci[dd->dim[d1]] > 0)
3419             {
3420                 if (d1 > d)
3421                 {
3422                     bRowMember = FALSE;
3423                 }
3424                 bRowRoot = FALSE;
3425             }
3426         }
3427         if (bRowMember)
3428         {
3429             if (bRowRoot)
3430             {
3431                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3432                                            ddbox,bDynamicBox,bUniform,step);
3433                 cell_f_row = comm->root[d]->cell_f;
3434             }
3435             else
3436             {
3437                 cell_f_row = comm->cell_f_row;
3438             }
3439             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3440         }
3441     }
3442 }    
3443
3444 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3445 {
3446     int d;
3447
3448     /* The relative DLB cell fractions are unchanged here; we only
3449      * convert them to absolute cell boundaries, which is needed when
3450      * the box has changed since the last call to dd_partition_system.
3451      */
3452     for(d=0; d<dd->ndim; d++)
3453     {
3454         relative_to_absolute_cell_bounds(dd,ddbox,d); 
3455     }
3456 }
3457
3458
3459
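     /* Set the DLB cell sizes for this step: recompute the boundaries when
      * load balancing is actually performed, otherwise only rescale them to
      * a possibly changed box, and fill in the dimensions along which no
      * decomposition is done.
      */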
3460 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3461                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3462                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3463                                   gmx_wallcycle_t wcycle)
3464 {
3465     gmx_domdec_comm_t *comm;
3466     int dim;
3467
3468     comm = dd->comm;
3469     
3470     if (bDoDLB)
3471     {
3472         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3473         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3474         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3475     }
3476     else if (bDynamicBox)
3477     {
3478         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3479     }
3480     
3481     /* Set the dimensions for which no DD is used */
3482     for(dim=0; dim<DIM; dim++) {
3483         if (dd->nc[dim] == 1) {
3484             comm->cell_x0[dim] = 0;
3485             comm->cell_x1[dim] = ddbox->box_size[dim];
3486             if (dim >= ddbox->nboundeddim)
3487             {
3488                 comm->cell_x0[dim] += ddbox->box0[dim];
3489                 comm->cell_x1[dim] += ddbox->box0[dim];
3490             }
3491         }
3492     }
3493 }
3494
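     /* Ensure that the communication setup of each decomposition dimension
      * can hold npulse[dim] pulses, (re)allocating the index arrays when the
      * number of pulses has grown.
      */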
3495 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3496 {
3497     int d,np,i;
3498     gmx_domdec_comm_dim_t *cd;
3499     
3500     for(d=0; d<dd->ndim; d++)
3501     {
3502         cd = &dd->comm->cd[d];
3503         np = npulse[dd->dim[d]];
3504         if (np > cd->np_nalloc)
3505         {
3506             if (debug)
3507             {
3508                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3509                         dim2char(dd->dim[d]),np);
3510             }
3511             if (DDMASTER(dd) && cd->np_nalloc > 0)
3512             {
3513                 fprintf(stderr,"\nIncreasing the number of cells to communicate in dimension %c to %d\n",dim2char(dd->dim[d]),np);
3514             }
3515             srenew(cd->ind,np);
3516             for(i=cd->np_nalloc; i<np; i++)
3517             {
3518                 cd->ind[i].index  = NULL;
3519                 cd->ind[i].nalloc = 0;
3520             }
3521             cd->np_nalloc = np;
3522         }
3523         cd->np = np;
3524     }
3525 }
3526
3527
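     /* Set the DD cell boundaries for this step: the old boundaries are
      * saved for the charge group displacement check, then either dynamic
      * load balancing or static (slb) cell sizes are used.
      */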
3528 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3529                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3530                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3531                               gmx_wallcycle_t wcycle)
3532 {
3533     gmx_domdec_comm_t *comm;
3534     int  d;
3535     ivec npulse;
3536     
3537     comm = dd->comm;
3538
3539     /* Copy the old cell boundaries for the cg displacement check */
3540     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3541     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3542     
3543     if (comm->bDynLoadBal)
3544     {
3545         if (DDMASTER(dd))
3546         {
3547             check_box_size(dd,ddbox);
3548         }
3549         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3550     }
3551     else
3552     {
3553         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3554         realloc_comm_ind(dd,npulse);
3555     }
3556     
3557     if (debug)
3558     {
3559         for(d=0; d<DIM; d++)
3560         {
3561             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3562                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3563         }
3564     }
3565 }
3566
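     /* Check that no cell has shrunk below the minimum allowed size and,
      * for staggered grids or unbounded dimensions, communicate the zone
      * boundaries needed for neighbor searching.
      */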
3567 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3568                                   gmx_ddbox_t *ddbox,
3569                                   rvec cell_ns_x0,rvec cell_ns_x1,
3570                                   gmx_large_int_t step)
3571 {
3572     gmx_domdec_comm_t *comm;
3573     int dim_ind,dim;
3574     
3575     comm = dd->comm;
3576
3577     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3578     {
3579         dim = dd->dim[dim_ind];
3580         
3581         /* Without PBC we don't have restrictions on the outer cells */
3582         if (!(dim >= ddbox->npbcdim && 
3583               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3584             comm->bDynLoadBal &&
3585             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3586             comm->cellsize_min[dim])
3587         {
3588             char buf[22];
3589             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3590                       gmx_step_str(step,buf),dim2char(dim),
3591                       comm->cell_x1[dim] - comm->cell_x0[dim],
3592                       ddbox->skew_fac[dim],
3593                       dd->comm->cellsize_min[dim],
3594                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3595         }
3596     }
3597     
3598     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3599     {
3600         /* Communicate the boundaries and update cell_ns_x0/1 */
3601         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3602         if (dd->bGridJump && dd->ndim > 1)
3603         {
3604             check_grid_jump(step,dd,ddbox);
3605         }
3606     }
3607 }
3608
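     /* Set up the triclinic correction matrix tcm: adding cm[j]*tcm[j][d]
      * for j > d to a Cartesian coordinate removes the contributions of the
      * higher box vectors, so the result can be compared directly with the
      * cell boundaries along dimension d.
      */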
3609 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3610 {
3611     if (YY < npbcdim)
3612     {
3613         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3614     }
3615     else
3616     {
3617         tcm[YY][XX] = 0;
3618     }
3619     if (ZZ < npbcdim)
3620     {
3621         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3622         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3623     }
3624     else
3625     {
3626         tcm[ZZ][XX] = 0;
3627         tcm[ZZ][YY] = 0;
3628     }
3629 }
3630
3631 static void check_screw_box(matrix box)
3632 {
3633     /* Mathematical limitation */
3634     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3635     {
3636         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3637     }
3638     
3639     /* Limitation due to the asymmetry of the eighth shell method */
3640     if (box[ZZ][YY] != 0)
3641     {
3642         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3643     }
3644 }
3645
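     /* Master only: assign each charge group to a domain decomposition cell
      * based on its center of geometry, after putting it in the box, and
      * store the resulting charge group and atom counts per node in dd->ma.
      */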
3646 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3647                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3648                           gmx_domdec_t *dd)
3649 {
3650     gmx_domdec_master_t *ma;
3651     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3652     int  i,icg,j,k,k0,k1,d,npbcdim;
3653     matrix tcm;
3654     rvec box_size,cg_cm;
3655     ivec ind;
3656     real nrcg,inv_ncg,pos_d;
3657     atom_id *cgindex;
3658     gmx_bool bUnbounded,bScrew;
3659
3660     ma = dd->ma;
3661     
3662     if (tmp_ind == NULL)
3663     {
3664         snew(tmp_nalloc,dd->nnodes);
3665         snew(tmp_ind,dd->nnodes);
3666         for(i=0; i<dd->nnodes; i++)
3667         {
3668             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3669             snew(tmp_ind[i],tmp_nalloc[i]);
3670         }
3671     }
3672     
3673     /* Clear the count */
3674     for(i=0; i<dd->nnodes; i++)
3675     {
3676         ma->ncg[i] = 0;
3677         ma->nat[i] = 0;
3678     }
3679     
3680     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3681     
3682     cgindex = cgs->index;
3683     
3684     /* Compute the center of geometry for all charge groups */
3685     for(icg=0; icg<cgs->nr; icg++)
3686     {
3687         k0      = cgindex[icg];
3688         k1      = cgindex[icg+1];
3689         nrcg    = k1 - k0;
3690         if (nrcg == 1)
3691         {
3692             copy_rvec(pos[k0],cg_cm);
3693         }
3694         else
3695         {
3696             inv_ncg = 1.0/nrcg;
3697             
3698             clear_rvec(cg_cm);
3699             for(k=k0; (k<k1); k++)
3700             {
3701                 rvec_inc(cg_cm,pos[k]);
3702             }
3703             for(d=0; (d<DIM); d++)
3704             {
3705                 cg_cm[d] *= inv_ncg;
3706             }
3707         }
3708         /* Put the charge group in the box and determine the cell index */
3709         for(d=DIM-1; d>=0; d--) {
3710             pos_d = cg_cm[d];
3711             if (d < dd->npbcdim)
3712             {
3713                 bScrew = (dd->bScrewPBC && d == XX);
3714                 if (tric_dir[d] && dd->nc[d] > 1)
3715                 {
3716                     /* Use triclinic coordinates for this dimension */
3717                     for(j=d+1; j<DIM; j++)
3718                     {
3719                         pos_d += cg_cm[j]*tcm[j][d];
3720                     }
3721                 }
3722                 while(pos_d >= box[d][d])
3723                 {
3724                     pos_d -= box[d][d];
3725                     rvec_dec(cg_cm,box[d]);
3726                     if (bScrew)
3727                     {
3728                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3729                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3730                     }
3731                     for(k=k0; (k<k1); k++)
3732                     {
3733                         rvec_dec(pos[k],box[d]);
3734                         if (bScrew)
3735                         {
3736                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3737                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3738                         }
3739                     }
3740                 }
3741                 while(pos_d < 0)
3742                 {
3743                     pos_d += box[d][d];
3744                     rvec_inc(cg_cm,box[d]);
3745                     if (bScrew)
3746                     {
3747                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3748                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3749                     }
3750                     for(k=k0; (k<k1); k++)
3751                     {
3752                         rvec_inc(pos[k],box[d]);
3753                         if (bScrew) {
3754                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3755                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3756                         }
3757                     }
3758                 }
3759             }
3760             /* This could be done more efficiently */
3761             ind[d] = 0;
3762             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3763             {
3764                 ind[d]++;
3765             }
3766         }
3767         i = dd_index(dd->nc,ind);
3768         if (ma->ncg[i] == tmp_nalloc[i])
3769         {
3770             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3771             srenew(tmp_ind[i],tmp_nalloc[i]);
3772         }
3773         tmp_ind[i][ma->ncg[i]] = icg;
3774         ma->ncg[i]++;
3775         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3776     }
3777     
3778     k1 = 0;
3779     for(i=0; i<dd->nnodes; i++)
3780     {
3781         ma->index[i] = k1;
3782         for(k=0; k<ma->ncg[i]; k++)
3783         {
3784             ma->cg[k1++] = tmp_ind[i][k];
3785         }
3786     }
3787     ma->index[dd->nnodes] = k1;
3788     
3789     for(i=0; i<dd->nnodes; i++)
3790     {
3791         sfree(tmp_ind[i]);
3792     }
3793     sfree(tmp_ind);
3794     sfree(tmp_nalloc);
3795     
3796     if (fplog)
3797     {
3798         char buf[22];
3799         fprintf(fplog,"Charge group distribution at step %s:",
3800                 gmx_step_str(step,buf));
3801         for(i=0; i<dd->nnodes; i++)
3802         {
3803             fprintf(fplog," %d",ma->ncg[i]);
3804         }
3805         fprintf(fplog,"\n");
3806     }
3807 }
3808
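     /* Compute the initial charge group distribution on the master node and
      * scatter the per-node counts and global charge group indices, from
      * which each node sets up its local cgindex array.
      */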
3809 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3810                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3811                                 rvec pos[])
3812 {
3813     gmx_domdec_master_t *ma=NULL;
3814     ivec npulse;
3815     int  i,cg_gl;
3816     int  *ibuf,buf2[2] = { 0, 0 };
3817     
3818     if (DDMASTER(dd))
3819     {
3820         ma = dd->ma;
3821         
3822         if (dd->bScrewPBC)
3823         {
3824             check_screw_box(box);
3825         }
3826     
3827         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3828     
3829         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3830         for(i=0; i<dd->nnodes; i++)
3831         {
3832             ma->ibuf[2*i]   = ma->ncg[i];
3833             ma->ibuf[2*i+1] = ma->nat[i];
3834         }
3835         ibuf = ma->ibuf;
3836     }
3837     else
3838     {
3839         ibuf = NULL;
3840     }
3841     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3842     
3843     dd->ncg_home = buf2[0];
3844     dd->nat_home = buf2[1];
3845     dd->ncg_tot  = dd->ncg_home;
3846     dd->nat_tot  = dd->nat_home;
3847     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3848     {
3849         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3850         srenew(dd->index_gl,dd->cg_nalloc);
3851         srenew(dd->cgindex,dd->cg_nalloc+1);
3852     }
3853     if (DDMASTER(dd))
3854     {
3855         for(i=0; i<dd->nnodes; i++)
3856         {
3857             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3858             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3859         }
3860     }
3861     
3862     dd_scatterv(dd,
3863                 DDMASTER(dd) ? ma->ibuf : NULL,
3864                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3865                 DDMASTER(dd) ? ma->cg : NULL,
3866                 dd->ncg_home*sizeof(int),dd->index_gl);
3867     
3868     /* Determine the home charge group sizes */
3869     dd->cgindex[0] = 0;
3870     for(i=0; i<dd->ncg_home; i++)
3871     {
3872         cg_gl = dd->index_gl[i];
3873         dd->cgindex[i+1] =
3874             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3875     }
3876     
3877     if (debug)
3878     {
3879         fprintf(debug,"Home charge groups:\n");
3880         for(i=0; i<dd->ncg_home; i++)
3881         {
3882             fprintf(debug," %d",dd->index_gl[i]);
3883             if (i % 10 == 9) 
3884                 fprintf(debug,"\n");
3885         }
3886         fprintf(debug,"\n");
3887     }
3888 }
3889
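     /* Copy per-atom vector number vec (of nvec interleaved vectors) of all
      * moving charge groups into the communication buffer of their target
      * direction; with bCompact the vectors of the remaining home charge
      * groups are compacted in place.
      */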
3890 static int compact_and_copy_vec_at(int ncg,int *move,
3891                                    int *cgindex,
3892                                    int nvec,int vec,
3893                                    rvec *src,gmx_domdec_comm_t *comm,
3894                                    gmx_bool bCompact)
3895 {
3896     int m,icg,i,i0,i1,nrcg;
3897     int home_pos;
3898     int pos_vec[DIM*2];
3899     
3900     home_pos = 0;
3901
3902     for(m=0; m<DIM*2; m++)
3903     {
3904         pos_vec[m] = 0;
3905     }
3906     
3907     i0 = 0;
3908     for(icg=0; icg<ncg; icg++)
3909     {
3910         i1 = cgindex[icg+1];
3911         m = move[icg];
3912         if (m == -1)
3913         {
3914             if (bCompact)
3915             {
3916                 /* Compact the home array in place */
3917                 for(i=i0; i<i1; i++)
3918                 {
3919                     copy_rvec(src[i],src[home_pos++]);
3920                 }
3921             }
3922         }
3923         else
3924         {
3925             /* Copy to the communication buffer */
3926             nrcg = i1 - i0;
3927             pos_vec[m] += 1 + vec*nrcg;
3928             for(i=i0; i<i1; i++)
3929             {
3930                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
3931             }
3932             pos_vec[m] += (nvec - vec - 1)*nrcg;
3933         }
3934         if (!bCompact)
3935         {
3936             home_pos += i1 - i0;
3937         }
3938         i0 = i1;
3939     }
3940     
3941     return home_pos;
3942 }
3943
3944 static int compact_and_copy_vec_cg(int ncg,int *move,
3945                                    int *cgindex,
3946                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
3947                                    gmx_bool bCompact)
3948 {
3949     int m,icg,i0,i1,nrcg;
3950     int home_pos;
3951     int pos_vec[DIM*2];
3952     
3953     home_pos = 0;
3954     
3955     for(m=0; m<DIM*2; m++)
3956     {
3957         pos_vec[m] = 0;
3958     }
3959     
3960     i0 = 0;
3961     for(icg=0; icg<ncg; icg++)
3962     {
3963         i1 = cgindex[icg+1];
3964         m = move[icg];
3965         if (m == -1)
3966         {
3967             if (bCompact)
3968             {
3969                 /* Compact the home array in place */
3970                 copy_rvec(src[icg],src[home_pos++]);
3971             }
3972         }
3973         else
3974         {
3975             nrcg = i1 - i0;
3976             /* Copy to the communication buffer */
3977             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
3978             pos_vec[m] += 1 + nrcg*nvec;
3979         }
3980         i0 = i1;
3981     }
3982     if (!bCompact)
3983     {
3984         home_pos = ncg;
3985     }
3986     
3987     return home_pos;
3988 }
3989
3990 static int compact_ind(int ncg,int *move,
3991                        int *index_gl,int *cgindex,
3992                        int *gatindex,
3993                        gmx_ga2la_t ga2la,char *bLocalCG,
3994                        int *cginfo)
3995 {
3996     int cg,nat,a0,a1,a,a_gl;
3997     int home_pos;
3998
3999     home_pos = 0;
4000     nat = 0;
4001     for(cg=0; cg<ncg; cg++)
4002     {
4003         a0 = cgindex[cg];
4004         a1 = cgindex[cg+1];
4005         if (move[cg] == -1)
4006         {
4007             /* Compact the home arrays in place.
4008              * Anything that can be done here avoids access to global arrays.
4009              */
4010             cgindex[home_pos] = nat;
4011             for(a=a0; a<a1; a++)
4012             {
4013                 a_gl = gatindex[a];
4014                 gatindex[nat] = a_gl;
4015                 /* The cell number stays 0, so we don't need to set it */
4016                 ga2la_change_la(ga2la,a_gl,nat);
4017                 nat++;
4018             }
4019             index_gl[home_pos] = index_gl[cg];
4020             cginfo[home_pos]   = cginfo[cg];
4021             /* The charge group remains local, so bLocalCG does not change */
4022             home_pos++;
4023         }
4024         else
4025         {
4026             /* Clear the global indices */
4027             for(a=a0; a<a1; a++)
4028             {
4029                 ga2la_del(ga2la,gatindex[a]);
4030             }
4031             if (bLocalCG)
4032             {
4033                 bLocalCG[index_gl[cg]] = FALSE;
4034             }
4035         }
4036     }
4037     cgindex[home_pos] = nat;
4038     
4039     return home_pos;
4040 }
4041
4042 static void clear_and_mark_ind(int ncg,int *move,
4043                                int *index_gl,int *cgindex,int *gatindex,
4044                                gmx_ga2la_t ga2la,char *bLocalCG,
4045                                int *cell_index)
4046 {
4047     int cg,a0,a1,a;
4048     
4049     for(cg=0; cg<ncg; cg++)
4050     {
4051         if (move[cg] >= 0)
4052         {
4053             a0 = cgindex[cg];
4054             a1 = cgindex[cg+1];
4055             /* Clear the global indices */
4056             for(a=a0; a<a1; a++)
4057             {
4058                 ga2la_del(ga2la,gatindex[a]);
4059             }
4060             if (bLocalCG)
4061             {
4062                 bLocalCG[index_gl[cg]] = FALSE;
4063             }
4064             /* Signal that this cg has moved using the ns cell index.
4065              * Here we set it to -1.
4066              * fill_grid will change it from -1 to 4*grid->ncells.
4067              */
4068             cell_index[cg] = -1;
4069         }
4070     }
4071 }
4072
4073 static void print_cg_move(FILE *fplog,
4074                           gmx_domdec_t *dd,
4075                           gmx_large_int_t step,int cg,int dim,int dir,
4076                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4077                           rvec cm_old,rvec cm_new,real pos_d)
4078 {
4079     gmx_domdec_comm_t *comm;
4080     char buf[22];
4081
4082     comm = dd->comm;
4083
4084     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4085     if (bHaveLimitdAndCMOld)
4086     {
4087         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4088                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4089     }
4090     else
4091     {
4092         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4093                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4094     }
4095     fprintf(fplog,"distance out of cell %f\n",
4096             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4097     if (bHaveLimitdAndCMOld)
4098     {
4099         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4100                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4101     }
4102     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4103             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4104     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4105             dim2char(dim),
4106             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4107     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4108             dim2char(dim),
4109             comm->cell_x0[dim],comm->cell_x1[dim]);
4110 }
4111
4112 static void cg_move_error(FILE *fplog,
4113                           gmx_domdec_t *dd,
4114                           gmx_large_int_t step,int cg,int dim,int dir,
4115                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4116                           rvec cm_old,rvec cm_new,real pos_d)
4117 {
4118     if (fplog)
4119     {
4120         print_cg_move(fplog, dd,step,cg,dim,dir,
4121                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4122     }
4123     print_cg_move(stderr,dd,step,cg,dim,dir,
4124                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4125     gmx_fatal(FARGS,
4126               "A charge group moved too far between two domain decomposition steps\n"
4127               "This usually means that your system is not well equilibrated");
4128 }
4129
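     /* Apply the screw pbc rotation to atom a: the coordinates are mirrored
      * in y and z with respect to the box, while the y and z components of
      * the other distributed state vectors simply change sign.
      */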
4130 static void rotate_state_atom(t_state *state,int a)
4131 {
4132     int est;
4133
4134     for(est=0; est<estNR; est++)
4135     {
4136         if (EST_DISTR(est) && (state->flags & (1<<est))) {
4137             switch (est) {
4138             case estX:
4139                 /* Rotate the complete state; for a rectangular box only */
4140                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4141                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4142                 break;
4143             case estV:
4144                 state->v[a][YY] = -state->v[a][YY];
4145                 state->v[a][ZZ] = -state->v[a][ZZ];
4146                 break;
4147             case estSDX:
4148                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4149                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4150                 break;
4151             case estCGP:
4152                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4153                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4154                 break;
4155             case estDISRE_INITF:
4156             case estDISRE_RM3TAV:
4157             case estORIRE_INITF:
4158             case estORIRE_DTAV:
4159                 /* These are distances, so not affected by rotation */
4160                 break;
4161             default:
4162                 gmx_incons("Unknown state entry encountered in rotate_state_atom");            
4163             }
4164         }
4165     }
4166 }
4167
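     /* Move the charge groups that have left the local cell to the
      * neighboring cells: one send/receive per direction per decomposition
      * dimension, forwarding charge groups that still have to move further
      * in a later dimension. Returns the number of charge groups that
      * remain on this node.
      */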
4168 static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4169                               gmx_domdec_t *dd,ivec tric_dir,
4170                               t_state *state,rvec **f,
4171                               t_forcerec *fr,t_mdatoms *md,
4172                               gmx_bool bCompact,
4173                               t_nrnb *nrnb)
4174 {
4175     int  *move;
4176     int  npbcdim;
4177     int  ncg[DIM*2],nat[DIM*2];
4178     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4179     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4180     int  sbuf[2],rbuf[2];
4181     int  home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
4182     int  flag;
4183     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4184     gmx_bool bScrew;
4185     ivec dev;
4186     real inv_ncg,pos_d;
4187     matrix tcm;
4188     rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4189     atom_id *cgindex;
4190     cginfo_mb_t *cginfo_mb;
4191     gmx_domdec_comm_t *comm;
4192     
4193     if (dd->bScrewPBC)
4194     {
4195         check_screw_box(state->box);
4196     }
4197     
4198     comm  = dd->comm;
4199     cg_cm = fr->cg_cm;
4200     
4201     for(i=0; i<estNR; i++)
4202     {
4203         if (EST_DISTR(i))
4204         {
4205             switch (i)
4206             {
4207             case estX:   /* Always present */            break;
4208             case estV:   bV   = (state->flags & (1<<i)); break;
4209             case estSDX: bSDX = (state->flags & (1<<i)); break;
4210             case estCGP: bCGP = (state->flags & (1<<i)); break;
4211             case estLD_RNG:
4212             case estLD_RNGI:
4213             case estDISRE_INITF:
4214             case estDISRE_RM3TAV:
4215             case estORIRE_INITF:
4216             case estORIRE_DTAV:
4217                 /* No processing required */
4218                 break;
4219             default:
4220             gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4221             }
4222         }
4223     }
4224     
4225     if (dd->ncg_tot > comm->nalloc_int)
4226     {
4227         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4228         srenew(comm->buf_int,comm->nalloc_int);
4229     }
4230     move = comm->buf_int;
4231     
4232     /* Clear the count */
4233     for(c=0; c<dd->ndim*2; c++)
4234     {
4235         ncg[c] = 0;
4236         nat[c] = 0;
4237     }
4238
4239     npbcdim = dd->npbcdim;
4240
4241     for(d=0; (d<DIM); d++)
4242     {
4243         limitd[d] = dd->comm->cellsize_min[d];
4244         if (d >= npbcdim && dd->ci[d] == 0)
4245         {
4246             cell_x0[d] = -GMX_FLOAT_MAX;
4247         }
4248         else
4249         {
4250             cell_x0[d] = comm->cell_x0[d];
4251         }
4252         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4253         {
4254             cell_x1[d] = GMX_FLOAT_MAX;
4255         }
4256         else
4257         {
4258             cell_x1[d] = comm->cell_x1[d];
4259         }
4260         if (d < npbcdim)
4261         {
4262             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4263             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4264         }
4265         else
4266         {
4267             /* We check after communication if a charge group moved
4268              * more than one cell. Set the pre-comm check limit to float_max.
4269              */
4270             limit0[d] = -GMX_FLOAT_MAX;
4271             limit1[d] =  GMX_FLOAT_MAX;
4272         }
4273     }
4274     
4275     make_tric_corr_matrix(npbcdim,state->box,tcm);
4276     
4277     cgindex = dd->cgindex;
4278     
4279     /* Compute the center of geometry for all home charge groups,
4280      * put them in the box and determine where they should go.
4281      */
4282     for(cg=0; cg<dd->ncg_home; cg++)
4283     {
4284         k0   = cgindex[cg];
4285         k1   = cgindex[cg+1];
4286         nrcg = k1 - k0;
4287         if (nrcg == 1)
4288         {
4289             copy_rvec(state->x[k0],cm_new);
4290         }
4291         else
4292         {
4293             inv_ncg = 1.0/nrcg;
4294             
4295             clear_rvec(cm_new);
4296             for(k=k0; (k<k1); k++)
4297             {
4298                 rvec_inc(cm_new,state->x[k]);
4299             }
4300             for(d=0; (d<DIM); d++)
4301             {
4302                 cm_new[d] = inv_ncg*cm_new[d];
4303             }
4304         }
4305         
4306         clear_ivec(dev);
4307         /* Do pbc and check DD cell boundary crossings */
4308         for(d=DIM-1; d>=0; d--)
4309         {
4310             if (dd->nc[d] > 1)
4311             {
4312                 bScrew = (dd->bScrewPBC && d == XX);
4313                 /* Determine the location of this cg in lattice coordinates */
4314                 pos_d = cm_new[d];
4315                 if (tric_dir[d])
4316                 {
4317                     for(d2=d+1; d2<DIM; d2++)
4318                     {
4319                         pos_d += cm_new[d2]*tcm[d2][d];
4320                     }
4321                 }
4322                 /* Put the charge group in the triclinic unit-cell */
4323                 if (pos_d >= cell_x1[d])
4324                 {
4325                     if (pos_d >= limit1[d])
4326                     {
4327                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4328                                       cg_cm[cg],cm_new,pos_d);
4329                     }
4330                     dev[d] = 1;
4331                     if (dd->ci[d] == dd->nc[d] - 1)
4332                     {
4333                         rvec_dec(cm_new,state->box[d]);
4334                         if (bScrew)
4335                         {
4336                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4337                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4338                         }
4339                         for(k=k0; (k<k1); k++)
4340                         {
4341                             rvec_dec(state->x[k],state->box[d]);
4342                             if (bScrew)
4343                             {
4344                                 rotate_state_atom(state,k);
4345                             }
4346                         }
4347                     }
4348                 }
4349                 else if (pos_d < cell_x0[d])
4350                 {
4351                     if (pos_d < limit0[d])
4352                     {
4353                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4354                                       cg_cm[cg],cm_new,pos_d);
4355                     }
4356                     dev[d] = -1;
4357                     if (dd->ci[d] == 0)
4358                     {
4359                         rvec_inc(cm_new,state->box[d]);
4360                         if (bScrew)
4361                         {
4362                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4363                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4364                         }
4365                         for(k=k0; (k<k1); k++)
4366                         {
4367                             rvec_inc(state->x[k],state->box[d]);
4368                             if (bScrew)
4369                             {
4370                                 rotate_state_atom(state,k);
4371                             }
4372                         }
4373                     }
4374                 }
4375             }
4376             else if (d < npbcdim)
4377             {
4378                 /* Put the charge group in the rectangular unit-cell */
4379                 while (cm_new[d] >= state->box[d][d])
4380                 {
4381                     rvec_dec(cm_new,state->box[d]);
4382                     for(k=k0; (k<k1); k++)
4383                     {
4384                         rvec_dec(state->x[k],state->box[d]);
4385                     }
4386                 }
4387                 while (cm_new[d] < 0)
4388                 {
4389                     rvec_inc(cm_new,state->box[d]);
4390                     for(k=k0; (k<k1); k++)
4391                     {
4392                         rvec_inc(state->x[k],state->box[d]);
4393                     }
4394                 }
4395             }
4396         }
4397     
4398         copy_rvec(cm_new,cg_cm[cg]);
4399         
4400         /* Determine where this cg should go */
4401         flag = 0;
4402         mc = -1;
4403         for(d=0; d<dd->ndim; d++)
4404         {
4405             dim = dd->dim[d];
4406             if (dev[dim] == 1)
4407             {
4408                 flag |= DD_FLAG_FW(d);
4409                 if (mc == -1)
4410                 {
4411                     mc = d*2;
4412                 }
4413             }
4414             else if (dev[dim] == -1)
4415             {
4416                 flag |= DD_FLAG_BW(d);
4417                 if (mc == -1) {
4418                     if (dd->nc[dim] > 2)
4419                     {
4420                         mc = d*2 + 1;
4421                     }
4422                     else
4423                     {
4424                         mc = d*2;
4425                     }
4426                 }
4427             }
4428         }
4429         move[cg] = mc;
4430         if (mc >= 0)
4431         {
4432             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4433             {
4434                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4435                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4436             }
4437             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4438             /* We store the cg size in the lower 16 bits
4439              * and the place where the charge group should go
4440              * in the next 6 bits. This saves some communication volume.
4441              */
4442             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4443             ncg[mc] += 1;
4444             nat[mc] += nrcg;
4445         }
4446     }
4447     
4448     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4449     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4450     
4451     nvec = 1;
4452     if (bV)
4453     {
4454         nvec++;
4455     }
4456     if (bSDX)
4457     {
4458         nvec++;
4459     }
4460     if (bCGP)
4461     {
4462         nvec++;
4463     }
4464     
4465     /* Make sure the communication buffers are large enough */
4466     for(mc=0; mc<dd->ndim*2; mc++)
4467     {
4468         nvr = ncg[mc] + nat[mc]*nvec;
4469         if (nvr > comm->cgcm_state_nalloc[mc])
4470         {
4471             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4472             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4473         }
4474     }
4475     
4476     /* Recalculating cg_cm might be cheaper than communicating,
4477      * but that could give rise to rounding issues.
4478      */
4479     home_pos_cg =
4480         compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4481                                 nvec,cg_cm,comm,bCompact);
4482     
4483     vec = 0;
4484     home_pos_at =
4485         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4486                                 nvec,vec++,state->x,comm,bCompact);
4487     if (bV)
4488     {
4489         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4490                                 nvec,vec++,state->v,comm,bCompact);
4491     }
4492     if (bSDX)
4493     {
4494         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4495                                 nvec,vec++,state->sd_X,comm,bCompact);
4496     }
4497     if (bCGP)
4498     {
4499         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4500                                 nvec,vec++,state->cg_p,comm,bCompact);
4501     }
4502     
4503     if (bCompact)
4504     {
4505         compact_ind(dd->ncg_home,move,
4506                     dd->index_gl,dd->cgindex,dd->gatindex,
4507                     dd->ga2la,comm->bLocalCG,
4508                     fr->cginfo);
4509     }
4510     else
4511     {
4512         clear_and_mark_ind(dd->ncg_home,move,
4513                            dd->index_gl,dd->cgindex,dd->gatindex,
4514                            dd->ga2la,comm->bLocalCG,
4515                            fr->ns.grid->cell_index);
4516     }
4517     
4518     cginfo_mb = fr->cginfo_mb;
4519
4520     ncg_stay_home = home_pos_cg;
4521     for(d=0; d<dd->ndim; d++)
4522     {
4523         dim = dd->dim[d];
4524         ncg_recv = 0;
4525         nat_recv = 0;
4526         nvr      = 0;
4527         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4528         {
4529             cdd = d*2 + dir;
4530             /* Communicate the cg and atom counts */
4531             sbuf[0] = ncg[cdd];
4532             sbuf[1] = nat[cdd];
4533             if (debug)
4534             {
4535                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4536                         d,dir,sbuf[0],sbuf[1]);
4537             }
4538             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4539             
4540             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4541             {
4542                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4543                 srenew(comm->buf_int,comm->nalloc_int);
4544             }
4545             
4546             /* Communicate the charge group indices, sizes and flags */
4547             dd_sendrecv_int(dd, d, dir,
4548                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4549                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4550             
4551             nvs = ncg[cdd] + nat[cdd]*nvec;
4552             i   = rbuf[0]  + rbuf[1] *nvec;
4553             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4554             
4555             /* Communicate cgcm and state */
4556             dd_sendrecv_rvec(dd, d, dir,
4557                              comm->cgcm_state[cdd], nvs,
4558                              comm->vbuf.v+nvr, i);
4559             ncg_recv += rbuf[0];
4560             nat_recv += rbuf[1];
4561             nvr      += i;
4562         }
4563         
4564         /* Process the received charge groups */
4565         buf_pos = 0;
4566         for(cg=0; cg<ncg_recv; cg++)
4567         {
4568             flag = comm->buf_int[cg*DD_CGIBS+1];
4569
4570             if (dim >= npbcdim && dd->nc[dim] > 2)
4571             {
4572                 /* No pbc in this dim and more than one domain boundary.
4573                  * We do a separate check that a charge group did not move too far.
4574                  */
4575                 if (((flag & DD_FLAG_FW(d)) &&
4576                      comm->vbuf.v[buf_pos][d] > cell_x1[dim]) ||
4577                     ((flag & DD_FLAG_BW(d)) &&
4578                      comm->vbuf.v[buf_pos][d] < cell_x0[dim]))
4579                 {
4580                     cg_move_error(fplog,dd,step,cg,d,
4581                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4582                                    FALSE,0,
4583                                    comm->vbuf.v[buf_pos],
4584                                    comm->vbuf.v[buf_pos],
4585                                    comm->vbuf.v[buf_pos][d]);
4586                 }
4587             }
4588
4589             mc = -1;
4590             if (d < dd->ndim-1)
4591             {
4592                 /* Check which direction this cg should go */
4593                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4594                 {
4595                     if (dd->bGridJump)
4596                     {
4597                         /* The cell boundaries for dimension d2 are not equal
4598                          * for each cell row of the lower dimension(s),
4599                          * therefore we might need to redetermine where
4600                          * this cg should go.
4601                          */
4602                         dim2 = dd->dim[d2];
4603                         /* If this cg crosses the box boundary in dimension d2
4604                          * we can use the communicated flag, so we do not
4605                          * have to worry about pbc.
4606                          */
4607                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4608                                (flag & DD_FLAG_FW(d2))) ||
4609                               (dd->ci[dim2] == 0 &&
4610                                (flag & DD_FLAG_BW(d2)))))
4611                         {
4612                             /* Clear the two flags for this dimension */
4613                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4614                             /* Determine the location of this cg
4615                              * in lattice coordinates
4616                              */
4617                             pos_d = comm->vbuf.v[buf_pos][dim2];
4618                             if (tric_dir[dim2])
4619                             {
4620                                 for(d3=dim2+1; d3<DIM; d3++)
4621                                 {
4622                                     pos_d +=
4623                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4624                                 }
4625                             }
4626                             /* Check that we are not at the box edge.
4627                              * pbc is only handled in the first step above,
4628                              * but this check could move over pbc while
4629                              * the first step did not due to different rounding.
4630                              */
4631                             if (pos_d >= cell_x1[dim2] &&
4632                                 dd->ci[dim2] != dd->nc[dim2]-1)
4633                             {
4634                                 flag |= DD_FLAG_FW(d2);
4635                             }
4636                             else if (pos_d < cell_x0[dim2] &&
4637                                      dd->ci[dim2] != 0)
4638                             {
4639                                 flag |= DD_FLAG_BW(d2);
4640                             }
4641                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4642                         }
4643                     }
4644                     /* Set to which neighboring cell this cg should go */
4645                     if (flag & DD_FLAG_FW(d2))
4646                     {
4647                         mc = d2*2;
4648                     }
4649                     else if (flag & DD_FLAG_BW(d2))
4650                     {
4651                         if (dd->nc[dd->dim[d2]] > 2)
4652                         {
4653                             mc = d2*2+1;
4654                         }
4655                         else
4656                         {
4657                             mc = d2*2;
4658                         }
4659                     }
4660                 }
4661             }
4662             
4663             nrcg = flag & DD_FLAG_NRCG;
4664             if (mc == -1)
4665             {
4666                 if (home_pos_cg+1 > dd->cg_nalloc)
4667                 {
4668                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4669                     srenew(dd->index_gl,dd->cg_nalloc);
4670                     srenew(dd->cgindex,dd->cg_nalloc+1);
4671                 }
4672                 /* Set the global charge group index and size */
4673                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4674                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4675                 /* Copy the state from the buffer */
4676                 if (home_pos_cg >= fr->cg_nalloc)
4677                 {
4678                     dd_realloc_fr_cg(fr,home_pos_cg+1);
4679                     cg_cm = fr->cg_cm;
4680                 }
4681                 copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
4682                 /* Set the cginfo */
4683                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4684                                                    dd->index_gl[home_pos_cg]);
4685                 if (comm->bLocalCG)
4686                 {
4687                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4688                 }
4689
4690                 if (home_pos_at+nrcg > state->nalloc)
4691                 {
4692                     dd_realloc_state(state,f,home_pos_at+nrcg);
4693                 }
4694                 for(i=0; i<nrcg; i++)
4695                 {
4696                     copy_rvec(comm->vbuf.v[buf_pos++],
4697                               state->x[home_pos_at+i]);
4698                 }
4699                 if (bV)
4700                 {
4701                     for(i=0; i<nrcg; i++)
4702                     {
4703                         copy_rvec(comm->vbuf.v[buf_pos++],
4704                                   state->v[home_pos_at+i]);
4705                     }
4706                 }
4707                 if (bSDX)
4708                 {
4709                     for(i=0; i<nrcg; i++)
4710                     {
4711                         copy_rvec(comm->vbuf.v[buf_pos++],
4712                                   state->sd_X[home_pos_at+i]);
4713                     }
4714                 }
4715                 if (bCGP)
4716                 {
4717                     for(i=0; i<nrcg; i++)
4718                     {
4719                         copy_rvec(comm->vbuf.v[buf_pos++],
4720                                   state->cg_p[home_pos_at+i]);
4721                     }
4722                 }
4723                 home_pos_cg += 1;
4724                 home_pos_at += nrcg;
4725             }
4726             else
4727             {
4728                 /* Reallocate the buffers if necessary  */
4729                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4730                 {
4731                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4732                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4733                 }
4734                 nvr = ncg[mc] + nat[mc]*nvec;
4735                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4736                 {
4737                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4738                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4739                 }
4740                 /* Copy from the receive to the send buffers */
4741                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4742                        comm->buf_int + cg*DD_CGIBS,
4743                        DD_CGIBS*sizeof(int));
4744                 memcpy(comm->cgcm_state[mc][nvr],
4745                        comm->vbuf.v[buf_pos],
4746                        (1+nrcg*nvec)*sizeof(rvec));
4747                 buf_pos += 1 + nrcg*nvec;
4748                 ncg[mc] += 1;
4749                 nat[mc] += nrcg;
4750             }
4751         }
4752     }
4753     
4754     /* With sorting (!bCompact) the indices are now only partially up to date
4755      * and ncg_home and nat_home are not the real count, since there are
4756      * "holes" in the arrays for the charge groups that moved to neighbors.
4757      */
4758     dd->ncg_home = home_pos_cg;
4759     dd->nat_home = home_pos_at;
4760
4761     if (debug)
4762     {
4763         fprintf(debug,"Finished repartitioning\n");
4764     }
4765
4766     return ncg_stay_home;
4767 }
4768
4769 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4770 {
4771     dd->comm->cycl[ddCycl] += cycles;
4772     dd->comm->cycl_n[ddCycl]++;
4773     if (cycles > dd->comm->cycl_max[ddCycl])
4774     {
4775         dd->comm->cycl_max[ddCycl] = cycles;
4776     }
4777 }
4778
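     /* Estimate the cost of the force calculation from the flop counters;
      * the nonbonded kernel counts are scaled down to correspond more
      * closely to the measured timings of the water and generic loops.
      */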
4779 static double force_flop_count(t_nrnb *nrnb)
4780 {
4781     int i;
4782     double sum;
4783     const char *name;
4784
4785     sum = 0;
4786     for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
4787     {
4788         /* To get closer to the real timings, we halve the count
4789          * for the normal loops and halve it again for the water loops.
4790          */
4791         name = nrnb_str(i);
4792         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4793         {
4794             sum += nrnb->n[i]*0.25*cost_nrnb(i);
4795         }
4796         else
4797         {
4798             sum += nrnb->n[i]*0.50*cost_nrnb(i);
4799         }
4800     }
4801     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
4802     {
4803         name = nrnb_str(i);
4804         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4805             sum += nrnb->n[i]*cost_nrnb(i);
4806     }
4807     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
4808     {
4809         sum += nrnb->n[i]*cost_nrnb(i);
4810     }
4811
4812     return sum;
4813 }
4814
4815 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
4816 {
4817     if (dd->comm->eFlop)
4818     {
4819         dd->comm->flop -= force_flop_count(nrnb);
4820     }
4821 }
4822 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
4823 {
4824     if (dd->comm->eFlop)
4825     {
4826         dd->comm->flop += force_flop_count(nrnb);
4827         dd->comm->flop_n++;
4828     }
4829 }  
4830
4831 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4832 {
4833     int i;
4834     
4835     for(i=0; i<ddCyclNr; i++)
4836     {
4837         dd->comm->cycl[i] = 0;
4838         dd->comm->cycl_n[i] = 0;
4839         dd->comm->cycl_max[i] = 0;
4840     }
4841     dd->comm->flop = 0;
4842     dd->comm->flop_n = 0;
4843 }
4844
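     /* Collect the load measurements of all cells: for each decomposition
      * dimension the loads are gathered on the root of the load communicator,
      * which reduces them and passes the result on to the next lower
      * dimension, so that finally the DD master has the total load.
      */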
4845 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
4846 {
4847     gmx_domdec_comm_t *comm;
4848     gmx_domdec_load_t *load;
4849     gmx_domdec_root_t *root=NULL;
4850     int  d,dim,cid,i,pos;
4851     float cell_frac=0,sbuf[DD_NLOAD_MAX];
4852     gmx_bool bSepPME;
4853     
4854     if (debug)
4855     {
4856         fprintf(debug,"get_load_distribution start\n");
4857     }
4858
4859     wallcycle_start(wcycle,ewcDDCOMMLOAD);
4860     
4861     comm = dd->comm;
4862     
4863     bSepPME = (dd->pme_nodeid >= 0);
4864     
4865     for(d=dd->ndim-1; d>=0; d--)
4866     {
4867         dim = dd->dim[d];
4868         /* Check if we participate in the communication in this dimension */
4869         if (d == dd->ndim-1 || 
4870             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
4871         {
4872             load = &comm->load[d];
4873             if (dd->bGridJump)
4874             {
4875                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4876             }
4877             pos = 0;
4878             if (d == dd->ndim-1)
4879             {
4880                 sbuf[pos++] = dd_force_load(comm);
4881                 sbuf[pos++] = sbuf[0];
4882                 if (dd->bGridJump)
4883                 {
4884                     sbuf[pos++] = sbuf[0];
4885                     sbuf[pos++] = cell_frac;
4886                     if (d > 0)
4887                     {
4888                         sbuf[pos++] = comm->cell_f_max0[d];
4889                         sbuf[pos++] = comm->cell_f_min1[d];
4890                     }
4891                 }
4892                 if (bSepPME)
4893                 {
4894                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4895                     sbuf[pos++] = comm->cycl[ddCyclPME];
4896                 }
4897             }
4898             else
4899             {
4900                 sbuf[pos++] = comm->load[d+1].sum;
4901                 sbuf[pos++] = comm->load[d+1].max;
4902                 if (dd->bGridJump)
4903                 {
4904                     sbuf[pos++] = comm->load[d+1].sum_m;
4905                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4906                     sbuf[pos++] = comm->load[d+1].flags;
4907                     if (d > 0)
4908                     {
4909                         sbuf[pos++] = comm->cell_f_max0[d];
4910                         sbuf[pos++] = comm->cell_f_min1[d];
4911                     }
4912                 }
4913                 if (bSepPME)
4914                 {
4915                     sbuf[pos++] = comm->load[d+1].mdf;
4916                     sbuf[pos++] = comm->load[d+1].pme;
4917                 }
4918             }
4919             load->nload = pos;
4920             /* Communicate a row in DD direction d.
4921              * The communicators are set up such that the root always has rank 0.
4922              */
4923 #ifdef GMX_MPI
4924             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
4925                        load->load,load->nload*sizeof(float),MPI_BYTE,
4926                        0,comm->mpi_comm_load[d]);
4927 #endif
4928             if (dd->ci[dim] == dd->master_ci[dim])
4929             {
4930                 /* We are the root, process this row */
4931                 if (comm->bDynLoadBal)
4932                 {
4933                     root = comm->root[d];
4934                 }
4935                 load->sum = 0;
4936                 load->max = 0;
4937                 load->sum_m = 0;
4938                 load->cvol_min = 1;
4939                 load->flags = 0;
4940                 load->mdf = 0;
4941                 load->pme = 0;
4942                 pos = 0;
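                /* Unpack the data gathered from this row in the same order
                 * in which it was packed into sbuf above, reducing it into
                 * sums and maxima for this row.
                 */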
4943                 for(i=0; i<dd->nc[dim]; i++)
4944                 {
4945                     load->sum += load->load[pos++];
4946                     load->max = max(load->max,load->load[pos]);
4947                     pos++;
4948                     if (dd->bGridJump)
4949                     {
4950                         if (root->bLimited)
4951                         {
4952                             /* This direction could not be load balanced properly,
4953                              * therefore we need to use the maximum instead of the average load.
4954                              */
4955                             load->sum_m = max(load->sum_m,load->load[pos]);
4956                         }
4957                         else
4958                         {
4959                             load->sum_m += load->load[pos];
4960                         }
4961                         pos++;
4962                         load->cvol_min = min(load->cvol_min,load->load[pos]);
4963                         pos++;
4964                         if (d < dd->ndim-1)
4965                         {
4966                             load->flags = (int)(load->load[pos++] + 0.5);
4967                         }
4968                         if (d > 0)
4969                         {
4970                             root->cell_f_max0[i] = load->load[pos++];
4971                             root->cell_f_min1[i] = load->load[pos++];
4972                         }
4973                     }
4974                     if (bSepPME)
4975                     {
4976                         load->mdf = max(load->mdf,load->load[pos]);
4977                         pos++;
4978                         load->pme = max(load->pme,load->load[pos]);
4979                         pos++;
4980                     }
4981                 }
4982                 if (comm->bDynLoadBal && root->bLimited)
4983                 {
4984                     load->sum_m *= dd->nc[dim];
4985                     load->flags |= (1<<d);
4986                 }
4987             }
4988         }
4989     }
4990
4991     if (DDMASTER(dd))
4992     {
4993         comm->nload      += dd_load_count(comm);
4994         comm->load_step  += comm->cycl[ddCyclStep];
4995         comm->load_sum   += comm->load[0].sum;
4996         comm->load_max   += comm->load[0].max;
4997         if (comm->bDynLoadBal)
4998         {
4999             for(d=0; d<dd->ndim; d++)
5000             {
5001                 if (comm->load[0].flags & (1<<d))
5002                 {
5003                     comm->load_lim[d]++;
5004                 }
5005             }
5006         }
5007         if (bSepPME)
5008         {
5009             comm->load_mdf += comm->load[0].mdf;
5010             comm->load_pme += comm->load[0].pme;
5011         }
5012     }
5013
5014     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5015     
5016     if (debug)
5017     {
5018         fprintf(debug,"get_load_distribution finished\n");
5019     }
5020 }
5021
5022 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5023 {
5024     /* Return the relative performance loss on the total run time
5025      * due to the force calculation load imbalance.
5026      */
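    /* load_max - load_sum/nnodes is the accumulated (max - average) force
     * load, i.e. the average time spent waiting for the most loaded rank;
     * dividing by the accumulated step cycle count gives the relative loss.
     */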
5027     if (dd->comm->nload > 0)
5028     {
5029         return
5030             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5031             (dd->comm->load_step*dd->nnodes);
5032     }
5033     else
5034     {
5035         return 0;
5036     }
5037 }
5038
5039 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5040 {
5041     char  buf[STRLEN];
5042     int   npp,npme,nnodes,d,limp;
5043     float imbal,pme_f_ratio,lossf,lossp=0;
5044     gmx_bool  bLim;
5045     gmx_domdec_comm_t *comm;
5046
5047     comm = dd->comm;
5048     if (DDMASTER(dd) && comm->nload > 0)
5049     {
5050         npp    = dd->nnodes;
5051         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5052         nnodes = npp + npme;
5053         imbal = comm->load_max*npp/comm->load_sum - 1;
5054         lossf = dd_force_imb_perf_loss(dd);
5055         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5056         fprintf(fplog,"%s",buf);
5057         fprintf(stderr,"\n");
5058         fprintf(stderr,"%s",buf);
5059         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5060         fprintf(fplog,"%s",buf);
5061         fprintf(stderr,"%s",buf);
5062         bLim = FALSE;
5063         if (comm->bDynLoadBal)
5064         {
5065             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5066             for(d=0; d<dd->ndim; d++)
5067             {
5068                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5069                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5070                 if (limp >= 50)
5071                 {
5072                     bLim = TRUE;
5073                 }
5074             }
5075             sprintf(buf+strlen(buf),"\n");
5076             fprintf(fplog,"%s",buf);
5077             fprintf(stderr,"%s",buf);
5078         }
5079         if (npme > 0)
5080         {
5081             pme_f_ratio = comm->load_pme/comm->load_mdf;
5082             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5083             if (lossp <= 0)
5084             {
5085                 lossp *= (float)npme/(float)nnodes;
5086             }
5087             else
5088             {
5089                 lossp *= (float)npp/(float)nnodes;
5090             }
5091             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5092             fprintf(fplog,"%s",buf);
5093             fprintf(stderr,"%s",buf);
5094             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5095             fprintf(fplog,"%s",buf);
5096             fprintf(stderr,"%s",buf);
5097         }
5098         fprintf(fplog,"\n");
5099         fprintf(stderr,"\n");
5100         
5101         if (lossf >= DD_PERF_LOSS)
5102         {
5103             sprintf(buf,
5104                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
5105                     "      in the domain decomposition.\n",lossf*100);
5106             if (!comm->bDynLoadBal)
5107             {
5108                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb).\n");
5109             }
5110             else if (bLim)
5111             {
5112                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5113             }
5114             fprintf(fplog,"%s\n",buf);
5115             fprintf(stderr,"%s\n",buf);
5116         }
5117         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5118         {
5119             sprintf(buf,
5120                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5121                     "      had %s work to do than the PP nodes.\n"
5122                     "      You might want to %s the number of PME nodes\n"
5123                     "      or %s the cut-off and the grid spacing.\n",
5124                     fabs(lossp*100),
5125                     (lossp < 0) ? "less"     : "more",
5126                     (lossp < 0) ? "decrease" : "increase",
5127                     (lossp < 0) ? "decrease" : "increase");
5128             fprintf(fplog,"%s\n",buf);
5129             fprintf(stderr,"%s\n",buf);
5130         }
5131     }
5132 }
5133
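/* Returns the minimum cell volume, relative to the average cell volume */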
5134 static float dd_vol_min(gmx_domdec_t *dd)
5135 {
5136     return dd->comm->load[0].cvol_min*dd->nnodes;
5137 }
5138
5139 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5140 {
5141     return dd->comm->load[0].flags;
5142 }
5143
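/* Returns the relative force load imbalance: maximum over average minus one */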
5144 static float dd_f_imbal(gmx_domdec_t *dd)
5145 {
5146     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5147 }
5148
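/* Returns the ratio of the PME mesh load to the PP force load */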
5149 static float dd_pme_f_ratio(gmx_domdec_t *dd)
5150 {
5151     return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5152 }
5153
5154 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5155 {
5156     int flags,d;
5157     char buf[22];
5158     
5159     flags = dd_load_flags(dd);
5160     if (flags)
5161     {
5162         fprintf(fplog,
5163                 "DD  load balancing is limited by minimum cell size in dimension");
5164         for(d=0; d<dd->ndim; d++)
5165         {
5166             if (flags & (1<<d))
5167             {
5168                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5169             }
5170         }
5171         fprintf(fplog,"\n");
5172     }
5173     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5174     if (dd->comm->bDynLoadBal)
5175     {
5176         fprintf(fplog,"  vol min/aver %5.3f%c",
5177                 dd_vol_min(dd),flags ? '!' : ' ');
5178     }
5179     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5180     if (dd->comm->cycl_n[ddCyclPME])
5181     {
5182         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5183     }
5184     fprintf(fplog,"\n\n");
5185 }
5186
5187 static void dd_print_load_verbose(gmx_domdec_t *dd)
5188 {
5189     if (dd->comm->bDynLoadBal)
5190     {
5191         fprintf(stderr,"vol %4.2f%c ",
5192                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5193     }
5194     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5195     if (dd->comm->cycl_n[ddCyclPME])
5196     {
5197         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5198     }
5199 }
5200
5201 #ifdef GMX_MPI
5202 static void make_load_communicator(gmx_domdec_t *dd,MPI_Group g_all,
5203                                    int dim_ind,ivec loc)
5204 {
5205     MPI_Group g_row = MPI_GROUP_EMPTY;
5206     MPI_Comm  c_row;
5207     int  dim,i,*rank;
5208     ivec loc_c;
5209     gmx_domdec_root_t *root;
5210     gmx_bool bPartOfGroup = FALSE;
5211     
5212     dim = dd->dim[dim_ind];
5213     copy_ivec(loc,loc_c);
5214     snew(rank,dd->nc[dim]);
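    /* Collect the ranks of all DD cells along dimension dim that share
     * our coordinates in the other dimensions: one row of the DD grid.
     */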
5215     for(i=0; i<dd->nc[dim]; i++)
5216     {
5217         loc_c[dim] = i;
5218         rank[i] = dd_index(dd->nc,loc_c);
5219         if (rank[i] == dd->rank)
5220         {
5221             /* This process is part of the group */
5222             bPartOfGroup = TRUE;
5223         }
5224     }
5225     if (bPartOfGroup)
5226     {
5227         MPI_Group_incl(g_all,dd->nc[dim],rank,&g_row);
5228     }
5229     MPI_Comm_create(dd->mpi_comm_all,g_row,&c_row);
5230     if (bPartOfGroup)
5231     {
5232         dd->comm->mpi_comm_load[dim_ind] = c_row;
5233         if (dd->comm->eDLB != edlbNO)
5234         {
5235             if (dd->ci[dim] == dd->master_ci[dim])
5236             {
5237                 /* This is the root process of this row */
5238                 snew(dd->comm->root[dim_ind],1);
5239                 root = dd->comm->root[dim_ind];
5240                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5241                 snew(root->old_cell_f,dd->nc[dim]+1);
5242                 snew(root->bCellMin,dd->nc[dim]);
5243                 if (dim_ind > 0)
5244                 {
5245                     snew(root->cell_f_max0,dd->nc[dim]);
5246                     snew(root->cell_f_min1,dd->nc[dim]);
5247                     snew(root->bound_min,dd->nc[dim]);
5248                     snew(root->bound_max,dd->nc[dim]);
5249                 }
5250                 snew(root->buf_ncd,dd->nc[dim]);
5251             }
5252             else
5253             {
5254                 /* This is not a root process, we only need to receive cell_f */
5255                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5256             }
5257         }
5258         if (dd->ci[dim] == dd->master_ci[dim])
5259         {
5260             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5261         }
5262     }
5263     sfree(rank);
5264 }
5265 #endif
5266
5267 static void make_load_communicators(gmx_domdec_t *dd)
5268 {
5269 #ifdef GMX_MPI
5270     MPI_Group g_all;
5271     int  dim0,dim1,i,j;
5272     ivec loc;
5273
5274     if (debug)
5275         fprintf(debug,"Making load communicators\n");
5276
5277     MPI_Comm_group(dd->mpi_comm_all,&g_all);
5278
5279     snew(dd->comm->load,dd->ndim);
5280     snew(dd->comm->mpi_comm_load,dd->ndim);
5281
5282     clear_ivec(loc);
5283     make_load_communicator(dd,g_all,0,loc);
5284     if (dd->ndim > 1) {
5285         dim0 = dd->dim[0];
5286         for(i=0; i<dd->nc[dim0]; i++) {
5287             loc[dim0] = i;
5288             make_load_communicator(dd,g_all,1,loc);
5289         }
5290     }
5291     if (dd->ndim > 2) {
5292         dim0 = dd->dim[0];
5293         for(i=0; i<dd->nc[dim0]; i++) {
5294             loc[dim0] = i;
5295             dim1 = dd->dim[1];
5296             for(j=0; j<dd->nc[dim1]; j++) {
5297                 loc[dim1] = j;
5298                 make_load_communicator(dd,g_all,2,loc);
5299             }
5300         }
5301     }
5302
5303     MPI_Group_free(&g_all);
5304
5305     if (debug)
5306         fprintf(debug,"Finished making load communicators\n");
5307 #endif
5308 }
5309
5310 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5311 {
5312     gmx_bool bZYX;
5313     int  d,dim,i,j,m;
5314     ivec tmp,s;
5315     int  nzone,nzonep;
5316     ivec dd_zp[DD_MAXIZONE];
5317     gmx_domdec_zones_t *zones;
5318     gmx_domdec_ns_ranges_t *izone;
5319     
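    /* Determine the forward and backward neighbor ranks in each
     * decomposition dimension, with periodic wrapping of the grid.
     */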
5320     for(d=0; d<dd->ndim; d++)
5321     {
5322         dim = dd->dim[d];
5323         copy_ivec(dd->ci,tmp);
5324         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5325         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5326         copy_ivec(dd->ci,tmp);
5327         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5328         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5329         if (debug)
5330         {
5331             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5332                     dd->rank,dim,
5333                     dd->neighbor[d][0],
5334                     dd->neighbor[d][1]);
5335         }
5336     }
5337     
5338     if (DDMASTER(dd))
5339     {
5340         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5341             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5342     }
5343     if (fplog)
5344     {
5345         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5346                 dd->ndim,
5347                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5348                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5349     }
5350     switch (dd->ndim)
5351     {
5352     case 3:
5353         nzone  = dd_z3n;
5354         nzonep = dd_zp3n;
5355         for(i=0; i<nzonep; i++)
5356         {
5357             copy_ivec(dd_zp3[i],dd_zp[i]);
5358         }
5359         break;
5360     case 2:
5361         nzone  = dd_z2n;
5362         nzonep = dd_zp2n;
5363         for(i=0; i<nzonep; i++)
5364         {
5365             copy_ivec(dd_zp2[i],dd_zp[i]);
5366         }
5367         break;
5368     case 1:
5369         nzone  = dd_z1n;
5370         nzonep = dd_zp1n;
5371         for(i=0; i<nzonep; i++)
5372         {
5373             copy_ivec(dd_zp1[i],dd_zp[i]);
5374         }
5375         break;
5376     default:
5377         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5378         nzone = 0;
5379         nzonep = 0;
5380     }
5381
5382     zones = &dd->comm->zones;
5383
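    /* Set the zone shifts for the dimensions that are decomposed;
     * the other dimensions keep a zero shift.
     */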
5384     for(i=0; i<nzone; i++)
5385     {
5386         m = 0;
5387         clear_ivec(zones->shift[i]);
5388         for(d=0; d<dd->ndim; d++)
5389         {
5390             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5391         }
5392     }
5393     
5394     zones->n = nzone;
5395     for(i=0; i<nzone; i++)
5396     {
5397         for(d=0; d<DIM; d++)
5398         {
5399             s[d] = dd->ci[d] - zones->shift[i][d];
5400             if (s[d] < 0)
5401             {
5402                 s[d] += dd->nc[d];
5403             }
5404             else if (s[d] >= dd->nc[d])
5405             {
5406                 s[d] -= dd->nc[d];
5407             }
5408         }
5409     }
5410     zones->nizone = nzonep;
5411     for(i=0; i<zones->nizone; i++)
5412     {
5413         if (dd_zp[i][0] != i)
5414         {
5415             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5416         }
5417         izone = &zones->izone[i];
5418         izone->j0 = dd_zp[i][1];
5419         izone->j1 = dd_zp[i][2];
5420         for(dim=0; dim<DIM; dim++)
5421         {
5422             if (dd->nc[dim] == 1)
5423             {
5424                 /* All shifts should be allowed */
5425                 izone->shift0[dim] = -1;
5426                 izone->shift1[dim] = 1;
5427             }
5428             else
5429             {
5430                 /*
5431                   izone->shift0[d] = 0;
5432                   izone->shift1[d] = 0;
5433                   for(j=izone->j0; j<izone->j1; j++) {
5434                   if (dd->shift[j][d] > dd->shift[i][d])
5435                   izone->shift0[d] = -1;
5436                   if (dd->shift[j][d] < dd->shift[i][d])
5437                   izone->shift1[d] = 1;
5438                   }
5439                 */
5440                 
5441                 int shift_diff;
5442                 
5443                 /* Assume the shifts are not more than 1 cell */
5444                 izone->shift0[dim] = 1;
5445                 izone->shift1[dim] = -1;
5446                 for(j=izone->j0; j<izone->j1; j++)
5447                 {
5448                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5449                     if (shift_diff < izone->shift0[dim])
5450                     {
5451                         izone->shift0[dim] = shift_diff;
5452                     }
5453                     if (shift_diff > izone->shift1[dim])
5454                     {
5455                         izone->shift1[dim] = shift_diff;
5456                     }
5457                 }
5458             }
5459         }
5460     }
5461     
5462     if (dd->comm->eDLB != edlbNO)
5463     {
5464         snew(dd->comm->root,dd->ndim);
5465     }
5466     
5467     if (dd->comm->bRecordLoad)
5468     {
5469         make_load_communicators(dd);
5470     }
5471 }
5472
5473 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5474 {
5475     gmx_domdec_t *dd;
5476     gmx_domdec_comm_t *comm;
5477     int  i,rank,*buf;
5478     ivec periods;
5479 #ifdef GMX_MPI
5480     MPI_Comm comm_cart;
5481 #endif
5482     
5483     dd = cr->dd;
5484     comm = dd->comm;
5485     
5486 #ifdef GMX_MPI
5487     if (comm->bCartesianPP)
5488     {
5489         /* Set up cartesian communication for the particle-particle part */
5490         if (fplog)
5491         {
5492             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5493                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5494         }
5495         
5496         for(i=0; i<DIM; i++)
5497         {
5498             periods[i] = TRUE;
5499         }
5500         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5501                         &comm_cart);
5502         /* We overwrite the old communicator with the new cartesian one */
5503         cr->mpi_comm_mygroup = comm_cart;
5504     }
5505     
5506     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5507     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5508     
5509     if (comm->bCartesianPP_PME)
5510     {
5511         /* Since we want to use the original Cartesian setup for the simulation,
5512          * and not the one after the split, we need to make an index.
5513          */
5514         snew(comm->ddindex2ddnodeid,dd->nnodes);
5515         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5516         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5517         /* Get the rank of the DD master,
5518          * above we made sure that the master node is a PP node.
5519          */
5520         if (MASTER(cr))
5521         {
5522             rank = dd->rank;
5523         }
5524         else
5525         {
5526             rank = 0;
5527         }
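        /* Only the master contributes its rank, all other ranks contribute
         * zero, so the sum received by every rank is the DD master rank.
         */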
5528         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5529     }
5530     else if (comm->bCartesianPP)
5531     {
5532         if (cr->npmenodes == 0)
5533         {
5534             /* The PP communicator is also
5535              * the communicator for this simulation
5536              */
5537             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5538         }
5539         cr->nodeid = dd->rank;
5540         
5541         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5542         
5543         /* We need to make an index to go from the coordinates
5544          * to the nodeid of this simulation.
5545          */
5546         snew(comm->ddindex2simnodeid,dd->nnodes);
5547         snew(buf,dd->nnodes);
5548         if (cr->duty & DUTY_PP)
5549         {
5550             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5551         }
5552         /* Communicate the ddindex to simulation nodeid index */
5553         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5554                       cr->mpi_comm_mysim);
5555         sfree(buf);
5556         
5557         /* Determine the master coordinates and rank.
5558          * The DD master should be the same node as the master of this sim.
5559          */
5560         for(i=0; i<dd->nnodes; i++)
5561         {
5562             if (comm->ddindex2simnodeid[i] == 0)
5563             {
5564                 ddindex2xyz(dd->nc,i,dd->master_ci);
5565                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5566             }
5567         }
5568         if (debug)
5569         {
5570             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5571         }
5572     }
5573     else
5574     {
5575         /* No Cartesian communicators */
5576         /* We use the rank in dd->mpi_comm_all as the DD index */
5577         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5578         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5579         dd->masterrank = 0;
5580         clear_ivec(dd->master_ci);
5581     }
5582 #endif
5583   
5584     if (fplog)
5585     {
5586         fprintf(fplog,
5587                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5588                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5589     }
5590     if (debug)
5591     {
5592         fprintf(debug,
5593                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5594                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5595     }
5596 }
5597
5598 static void receive_ddindex2simnodeid(t_commrec *cr)
5599 {
5600     gmx_domdec_t *dd;
5601     
5602     gmx_domdec_comm_t *comm;
5603     int  *buf;
5604     
5605     dd = cr->dd;
5606     comm = dd->comm;
5607     
5608 #ifdef GMX_MPI
5609     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5610     {
5611         snew(comm->ddindex2simnodeid,dd->nnodes);
5612         snew(buf,dd->nnodes);
5613         if (cr->duty & DUTY_PP)
5614         {
5615             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5616         }
5617 #ifdef GMX_MPI
5618         /* Communicate the ddindex to simulation nodeid index */
5619         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5620                       cr->mpi_comm_mysim);
5621 #endif
5622         sfree(buf);
5623     }
5624 #endif
5625 }
5626
5627 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5628                                                      int ncg,int natoms)
5629 {
5630     gmx_domdec_master_t *ma;
5631     int i;
5632
5633     snew(ma,1);
5634     
5635     snew(ma->ncg,dd->nnodes);
5636     snew(ma->index,dd->nnodes+1);
5637     snew(ma->cg,ncg);
5638     snew(ma->nat,dd->nnodes);
5639     snew(ma->ibuf,dd->nnodes*2);
5640     snew(ma->cell_x,DIM);
5641     for(i=0; i<DIM; i++)
5642     {
5643         snew(ma->cell_x[i],dd->nc[i]+1);
5644     }
5645
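    /* With few nodes the state is communicated per node with send/receive
     * calls; otherwise allocate a packed buffer, one entry per atom,
     * for scattering and gathering the state vectors.
     */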
5646     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5647     {
5648         ma->vbuf = NULL;
5649     }
5650     else
5651     {
5652         snew(ma->vbuf,natoms);
5653     }
5654
5655     return ma;
5656 }
5657
5658 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5659                                int reorder)
5660 {
5661     gmx_domdec_t *dd;
5662     gmx_domdec_comm_t *comm;
5663     int  i,rank;
5664     gmx_bool bDiv[DIM];
5665     ivec periods;
5666 #ifdef GMX_MPI
5667     MPI_Comm comm_cart;
5668 #endif
5669     
5670     dd = cr->dd;
5671     comm = dd->comm;
5672     
5673     if (comm->bCartesianPP)
5674     {
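        /* Check in which dimensions the PME-only nodes can be stacked as
         * complete extra planes of the PP grid, i.e. whether the number of
         * PME nodes is a multiple of the number of cells in a perpendicular
         * plane.
         */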
5675         for(i=1; i<DIM; i++)
5676         {
5677             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5678         }
5679         if (bDiv[YY] || bDiv[ZZ])
5680         {
5681             comm->bCartesianPP_PME = TRUE;
5682             /* If we have 2D PME decomposition, which is always in x+y,
5683              * we stack the PME only nodes in z.
5684              * Otherwise we choose the direction that provides the thinnest slab
5685              * of PME only nodes as this will have the least effect
5686              * on the PP communication.
5687              * But for the PME communication the opposite might be better.
5688              */
5689             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5690                              !bDiv[YY] ||
5691                              dd->nc[YY] > dd->nc[ZZ]))
5692             {
5693                 comm->cartpmedim = ZZ;
5694             }
5695             else
5696             {
5697                 comm->cartpmedim = YY;
5698             }
5699             comm->ntot[comm->cartpmedim]
5700                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5701         }
5702         else if (fplog)
5703         {
5704             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5705             fprintf(fplog,
5706                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5707         }
5708     }
5709     
5710 #ifdef GMX_MPI
5711     if (comm->bCartesianPP_PME)
5712     {
5713         if (fplog)
5714         {
5715             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5716         }
5717         
5718         for(i=0; i<DIM; i++)
5719         {
5720             periods[i] = TRUE;
5721         }
5722         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5723                         &comm_cart);
5724         
5725         MPI_Comm_rank(comm_cart,&rank);
5726         if (MASTERNODE(cr) && rank != 0)
5727         {
5728             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5729         }
5730         
5731         /* With this assignment we lose the link to the original communicator,
5732          * which will usually be MPI_COMM_WORLD, unless we are running a multi-simulation.
5733          */
5734         cr->mpi_comm_mysim = comm_cart;
5735         cr->sim_nodeid = rank;
5736         
5737         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5738         
5739         if (fplog)
5740         {
5741             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5742                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5743         }
5744         
5745         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5746         {
5747             cr->duty = DUTY_PP;
5748         }
5749         if (cr->npmenodes == 0 ||
5750             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5751         {
5752             cr->duty = DUTY_PME;
5753         }
5754         
5755         /* Split the sim communicator into PP and PME only nodes */
5756         MPI_Comm_split(cr->mpi_comm_mysim,
5757                        cr->duty,
5758                        dd_index(comm->ntot,dd->ci),
5759                        &cr->mpi_comm_mygroup);
5760     }
5761     else
5762     {
5763         switch (dd_node_order)
5764         {
5765         case ddnoPP_PME:
5766             if (fplog)
5767             {
5768                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5769             }
5770             break;
5771         case ddnoINTERLEAVE:
5772             /* Interleave the PP-only and PME-only nodes,
5773              * as on clusters with dual-core machines this will double
5774              * the communication bandwidth of the PME processes
5775              * and thus speed up the PP <-> PME and inter PME communication.
5776              */
5777             if (fplog)
5778             {
5779                 fprintf(fplog,"Interleaving PP and PME nodes\n");
5780             }
5781             comm->pmenodes = dd_pmenodes(cr);
5782             break;
5783         case ddnoCARTESIAN:
5784             break;
5785         default:
5786             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
5787         }
5788     
5789         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
5790         {
5791             cr->duty = DUTY_PME;
5792         }
5793         else
5794         {
5795             cr->duty = DUTY_PP;
5796         }
5797         
5798         /* Split the sim communicator into PP and PME only nodes */
5799         MPI_Comm_split(cr->mpi_comm_mysim,
5800                        cr->duty,
5801                        cr->nodeid,
5802                        &cr->mpi_comm_mygroup);
5803         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
5804     }
5805 #endif
5806
5807     if (fplog)
5808     {
5809         fprintf(fplog,"This is a %s only node\n\n",
5810                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5811     }
5812 }
5813
5814 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
5815 {
5816     gmx_domdec_t *dd;
5817     gmx_domdec_comm_t *comm;
5818     int CartReorder;
5819     
5820     dd = cr->dd;
5821     comm = dd->comm;
5822     
5823     copy_ivec(dd->nc,comm->ntot);
5824     
5825     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
5826     comm->bCartesianPP_PME = FALSE;
5827     
5828     /* Reorder the nodes by default. This might change the MPI ranks.
5829      * Real reordering is only supported on very few architectures;
5830      * Blue Gene is one of them.
5831      */
5832     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
5833     
5834     if (cr->npmenodes > 0)
5835     {
5836         /* Split the communicator into a PP and PME part */
5837         split_communicator(fplog,cr,dd_node_order,CartReorder);
5838         if (comm->bCartesianPP_PME)
5839         {
5840             /* We (possibly) reordered the nodes in split_communicator,
5841              * so it is no longer required in make_pp_communicator.
5842              */
5843             CartReorder = FALSE;
5844         }
5845     }
5846     else
5847     {
5848         /* All nodes do PP and PME */
5849 #ifdef GMX_MPI    
5850         /* We do not require separate communicators */
5851         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5852 #endif
5853     }
5854     
5855     if (cr->duty & DUTY_PP)
5856     {
5857         /* Copy or make a new PP communicator */
5858         make_pp_communicator(fplog,cr,CartReorder);
5859     }
5860     else
5861     {
5862         receive_ddindex2simnodeid(cr);
5863     }
5864     
5865     if (!(cr->duty & DUTY_PME))
5866     {
5867         /* Set up the communication to our PME node */
5868         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
5869         dd->pme_receive_vir_ener = receive_vir_ener(cr);
5870         if (debug)
5871         {
5872             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
5873                     dd->pme_nodeid,dd->pme_receive_vir_ener);
5874         }
5875     }
5876     else
5877     {
5878         dd->pme_nodeid = -1;
5879     }
5880
5881     if (DDMASTER(dd))
5882     {
5883         dd->ma = init_gmx_domdec_master_t(dd,
5884                                           comm->cgs_gl.nr,
5885                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
5886     }
5887 }
5888
5889 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
5890 {
5891     real *slb_frac,tot;
5892     int  i,n;
5893     double dbl;
5894     
5895     slb_frac = NULL;
5896     if (nc > 1 && size_string != NULL)
5897     {
5898         if (fplog)
5899         {
5900             fprintf(fplog,"Using static load balancing for the %s direction\n",
5901                     dir);
5902         }
5903         snew(slb_frac,nc);
5904         tot = 0;
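        /* Read nc relative cell size entries from the size string */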
5905         for (i=0; i<nc; i++)
5906         {
5907             dbl = 0;
5908             sscanf(size_string,"%lf%n",&dbl,&n);
5909             if (dbl == 0)
5910             {
5911                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
5912             }
5913             slb_frac[i] = dbl;
5914             size_string += n;
5915             tot += slb_frac[i];
5916         }
5917         /* Normalize */
5918         if (fplog)
5919         {
5920             fprintf(fplog,"Relative cell sizes:");
5921         }
5922         for (i=0; i<nc; i++)
5923         {
5924             slb_frac[i] /= tot;
5925             if (fplog)
5926             {
5927                 fprintf(fplog," %5.3f",slb_frac[i]);
5928             }
5929         }
5930         if (fplog)
5931         {
5932             fprintf(fplog,"\n");
5933         }
5934     }
5935     
5936     return slb_frac;
5937 }
5938
5939 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
5940 {
5941     int n,nmol,ftype;
5942     gmx_mtop_ilistloop_t iloop;
5943     t_ilist *il;
5944     
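    /* Count the bonded interactions that involve more than two atoms,
     * summed over all molecules in the system.
     */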
5945     n = 0;
5946     iloop = gmx_mtop_ilistloop_init(mtop);
5947     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
5948     {
5949         for(ftype=0; ftype<F_NRE; ftype++)
5950         {
5951             if ((interaction_function[ftype].flags & IF_BOND) &&
5952                 NRAL(ftype) >  2)
5953             {
5954                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
5955             }
5956         }
5957   }
5958
5959   return n;
5960 }
5961
5962 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
5963 {
5964     char *val;
5965     int  nst;
5966     
5967     nst = def;
5968     val = getenv(env_var);
5969     if (val)
5970     {
5971         if (sscanf(val,"%d",&nst) <= 0)
5972         {
5973             nst = 1;
5974         }
5975         if (fplog)
5976         {
5977             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
5978                     env_var,val,nst);
5979         }
5980     }
5981     
5982     return nst;
5983 }
5984
5985 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
5986 {
5987     if (MASTER(cr))
5988     {
5989         fprintf(stderr,"\n%s\n",warn_string);
5990     }
5991     if (fplog)
5992     {
5993         fprintf(fplog,"\n%s\n",warn_string);
5994     }
5995 }
5996
5997 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
5998                                   t_inputrec *ir,FILE *fplog)
5999 {
6000     if (ir->ePBC == epbcSCREW &&
6001         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6002     {
6003         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6004     }
6005
6006     if (ir->ns_type == ensSIMPLE)
6007     {
6008         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6009     }
6010
6011     if (ir->nstlist == 0)
6012     {
6013         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6014     }
6015
6016     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6017     {
6018         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6019     }
6020 }
6021
6022 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6023 {
6024     int  di,d;
6025     real r;
6026
6027     r = ddbox->box_size[XX];
6028     for(di=0; di<dd->ndim; di++)
6029     {
6030         d = dd->dim[di];
6031         /* Check using the initial average cell size */
6032         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6033     }
6034
6035     return r;
6036 }
6037
6038 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6039                              const char *dlb_opt,gmx_bool bRecordLoad,
6040                              unsigned long Flags,t_inputrec *ir)
6041 {
6042     gmx_domdec_t *dd;
6043     int  eDLB=-1;
6044     char buf[STRLEN];
6045
6046     switch (dlb_opt[0])
6047     {
6048     case 'a': eDLB = edlbAUTO; break;
6049     case 'n': eDLB = edlbNO;   break;
6050     case 'y': eDLB = edlbYES;  break;
6051     default: gmx_incons("Unknown dlb_opt");
6052     }
6053
6054     if (Flags & MD_RERUN)
6055     {
6056         return edlbNO;
6057     }
6058
6059     if (!EI_DYNAMICS(ir->eI))
6060     {
6061         if (eDLB == edlbYES)
6062         {
6063             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6064             dd_warning(cr,fplog,buf);
6065         }
6066             
6067         return edlbNO;
6068     }
6069
6070     if (!bRecordLoad)
6071     {
6072         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6073
6074         return edlbNO;
6075     }
6076
6077     if (Flags & MD_REPRODUCIBLE)
6078     {
6079         switch (eDLB)
6080         {
6081         case edlbNO:
6082             break;
6083         case edlbAUTO:
6084             dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6085             eDLB = edlbNO;
6086             break;
6087         case edlbYES:
6088             dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6089             break;
6090         default:
6091             gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6092             break;
6093         }
6094     }
6095
6096     return eDLB;
6097 }
6098
6099 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6100 {
6101     int dim;
6102
6103     dd->ndim = 0;
6104     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6105     {
6106         /* Decomposition order z,y,x */
6107         if (fplog)
6108         {
6109             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6110         }
6111         for(dim=DIM-1; dim>=0; dim--)
6112         {
6113             if (dd->nc[dim] > 1)
6114             {
6115                 dd->dim[dd->ndim++] = dim;
6116             }
6117         }
6118     }
6119     else
6120     {
6121         /* Decomposition order x,y,z */
6122         for(dim=0; dim<DIM; dim++)
6123         {
6124             if (dd->nc[dim] > 1)
6125             {
6126                 dd->dim[dd->ndim++] = dim;
6127             }
6128         }
6129     }
6130 }
6131
6132 static gmx_domdec_comm_t *init_dd_comm()
6133 {
6134     gmx_domdec_comm_t *comm;
6135     int  i;
6136
6137     snew(comm,1);
6138     snew(comm->cggl_flag,DIM*2);
6139     snew(comm->cgcm_state,DIM*2);
6140     for(i=0; i<DIM*2; i++)
6141     {
6142         comm->cggl_flag_nalloc[i]  = 0;
6143         comm->cgcm_state_nalloc[i] = 0;
6144     }
6145     
6146     comm->nalloc_int = 0;
6147     comm->buf_int    = NULL;
6148
6149     vec_rvec_init(&comm->vbuf);
6150
6151     comm->n_load_have    = 0;
6152     comm->n_load_collect = 0;
6153
6154     for(i=0; i<ddnatNR-ddnatZONE; i++)
6155     {
6156         comm->sum_nat[i] = 0;
6157     }
6158     comm->ndecomp = 0;
6159     comm->nload   = 0;
6160     comm->load_step = 0;
6161     comm->load_sum  = 0;
6162     comm->load_max  = 0;
6163     clear_ivec(comm->load_lim);
6164     comm->load_mdf  = 0;
6165     comm->load_pme  = 0;
6166
6167     return comm;
6168 }
6169
6170 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6171                                         unsigned long Flags,
6172                                         ivec nc,
6173                                         real comm_distance_min,real rconstr,
6174                                         const char *dlb_opt,real dlb_scale,
6175                                         const char *sizex,const char *sizey,const char *sizez,
6176                                         gmx_mtop_t *mtop,t_inputrec *ir,
6177                                         matrix box,rvec *x,
6178                                         gmx_ddbox_t *ddbox,
6179                                         int *npme_x,int *npme_y)
6180 {
6181     gmx_domdec_t *dd;
6182     gmx_domdec_comm_t *comm;
6183     int  recload;
6184     int  d,i,j;
6185     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6186     gmx_bool bC;
6187     char buf[STRLEN];
6188     
6189     if (fplog)
6190     {
6191         fprintf(fplog,
6192                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6193     }
6194     
6195     snew(dd,1);
6196
6197     dd->comm = init_dd_comm();
6198     comm = dd->comm;
6201
6202     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6203     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6204     
6205     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6206     comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6207     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6208     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6209     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6210     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6211     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6212     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6213
6214     dd->pme_recv_f_alloc = 0;
6215     dd->pme_recv_f_buf = NULL;
6216
6217     if (dd->bSendRecv2 && fplog)
6218     {
6219         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6220     }
6221     if (comm->eFlop)
6222     {
6223         if (fplog)
6224         {
6225             fprintf(fplog,"Will load balance based on FLOP count\n");
6226         }
6227         if (comm->eFlop > 1)
6228         {
6229             srand(1+cr->nodeid);
6230         }
6231         comm->bRecordLoad = TRUE;
6232     }
6233     else
6234     {
6235         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6236                              
6237     }
6238     
6239     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6240     
6241     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6242     if (fplog)
6243     {
6244         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6245     }
6246     dd->bGridJump = comm->bDynLoadBal;
6247     
6248     if (comm->nstSortCG)
6249     {
6250         if (fplog)
6251         {
6252             if (comm->nstSortCG == 1)
6253             {
6254                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6255             }
6256             else
6257             {
6258                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6259                         comm->nstSortCG);
6260             }
6261         }
6262         snew(comm->sort,1);
6263     }
6264     else
6265     {
6266         if (fplog)
6267         {
6268             fprintf(fplog,"Will not sort the charge groups\n");
6269         }
6270     }
6271     
6272     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6273     if (comm->bInterCGBondeds)
6274     {
6275         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6276     }
6277     else
6278     {
6279         comm->bInterCGMultiBody = FALSE;
6280     }
6281     
6282     dd->bInterCGcons = inter_charge_group_constraints(mtop);
6283
6284     if (ir->rlistlong == 0)
6285     {
6286         /* Set the cut-off to some very large value,
6287          * so we don't need if statements everywhere in the code.
6288          * We use sqrt, since the cut-off is squared in some places.
6289          */
6290         comm->cutoff   = GMX_CUTOFF_INF;
6291     }
6292     else
6293     {
6294         comm->cutoff   = ir->rlistlong;
6295     }
6296     comm->cutoff_mbody = 0;
6297     
6298     comm->cellsize_limit = 0;
6299     comm->bBondComm = FALSE;
6300
6301     if (comm->bInterCGBondeds)
6302     {
6303         if (comm_distance_min > 0)
6304         {
6305             comm->cutoff_mbody = comm_distance_min;
6306             if (Flags & MD_DDBONDCOMM)
6307             {
6308                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6309             }
6310             else
6311             {
6312                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6313             }
6314             r_bonded_limit = comm->cutoff_mbody;
6315         }
6316         else if (ir->bPeriodicMols)
6317         {
6318             /* Cannot easily determine the required cut-off */
6319             dd_warning(cr,fplog,"NOTE: Periodic molecules: cannot easily determine the required minimum bonded cut-off, using half the non-bonded cut-off\n");
6320             comm->cutoff_mbody = comm->cutoff/2;
6321             r_bonded_limit = comm->cutoff_mbody;
6322         }
6323         else
6324         {
6325             if (MASTER(cr))
6326             {
6327                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6328                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6329             }
6330             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6331             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6332
6333             /* We use an initial margin of 10% for the minimum cell size,
6334              * except when we are just below the non-bonded cut-off.
6335              */
6336             if (Flags & MD_DDBONDCOMM)
6337             {
6338                 if (max(r_2b,r_mb) > comm->cutoff)
6339                 {
6340                     r_bonded       = max(r_2b,r_mb);
6341                     r_bonded_limit = 1.1*r_bonded;
6342                     comm->bBondComm = TRUE;
6343                 }
6344                 else
6345                 {
6346                     r_bonded       = r_mb;
6347                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6348                 }
6349                 /* We determine cutoff_mbody later */
6350             }
6351             else
6352             {
6353                 /* No special bonded communication,
6354                  * simply increase the DD cut-off.
6355                  */
6356                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6357                 comm->cutoff_mbody = r_bonded_limit;
6358                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6359             }
6360         }
6361         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6362         if (fplog)
6363         {
6364             fprintf(fplog,
6365                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6366                     comm->cellsize_limit);
6367         }
6368     }
6369
6370     if (dd->bInterCGcons && rconstr <= 0)
6371     {
6372         /* There is a cell size limit due to the constraints (P-LINCS) */
6373         rconstr = constr_r_max(fplog,mtop,ir);
6374         if (fplog)
6375         {
6376             fprintf(fplog,
6377                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6378                     rconstr);
6379             if (rconstr > comm->cellsize_limit)
6380             {
6381                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6382             }
6383         }
6384     }
6385     else if (rconstr > 0 && fplog)
6386     {
6387         /* Here we do not check for dd->bInterCGcons,
6388          * because one can also set a cell size limit for virtual sites only
6389          * and at this point we don't know yet if there are intercg v-sites.
6390          */
6391         fprintf(fplog,
6392                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6393                 rconstr);
6394     }
6395     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6396
6397     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6398
6399     if (nc[XX] > 0)
6400     {
6401         copy_ivec(nc,dd->nc);
6402         set_dd_dim(fplog,dd);
6403         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6404
6405         if (cr->npmenodes == -1)
6406         {
6407             cr->npmenodes = 0;
6408         }
6409         acs = average_cellsize_min(dd,ddbox);
6410         if (acs < comm->cellsize_limit)
6411         {
6412             if (fplog)
6413             {
6414                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6415             }
6416             gmx_fatal_collective(FARGS,cr,NULL,
6417                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6418                                  acs,comm->cellsize_limit);
6419         }
6420     }
6421     else
6422     {
6423         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6424
6425         /* We need to choose the optimal DD grid and possibly PME nodes */
6426         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6427                                comm->eDLB!=edlbNO,dlb_scale,
6428                                comm->cellsize_limit,comm->cutoff,
6429                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6430         
6431         if (dd->nc[XX] == 0)
6432         {
6433             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6434             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6435                     !bC ? "-rdd" : "-rcon",
6436                     comm->eDLB!=edlbNO ? " or -dds" : "",
6437                     bC ? " or your LINCS settings" : "");
6438
6439             gmx_fatal_collective(FARGS,cr,NULL,
6440                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6441                                  "%s\n"
6442                                  "Look in the log file for details on the domain decomposition",
6443                                  cr->nnodes-cr->npmenodes,limit,buf);
6444         }
6445         set_dd_dim(fplog,dd);
6446     }
6447
6448     if (fplog)
6449     {
6450         fprintf(fplog,
6451                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6452                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6453     }
6454     
6455     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6456     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6457     {
6458         gmx_fatal_collective(FARGS,cr,NULL,
6459                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6460                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6461     }
6462     if (cr->npmenodes > dd->nnodes)
6463     {
6464         gmx_fatal_collective(FARGS,cr,NULL,
6465                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6466     }
6467     if (cr->npmenodes > 0)
6468     {
6469         comm->npmenodes = cr->npmenodes;
6470     }
6471     else
6472     {
6473         comm->npmenodes = dd->nnodes;
6474     }
6475
6476     if (EEL_PME(ir->coulombtype))
6477     {
6478         /* The following choices should match those
6479          * in comm_cost_est in domdec_setup.c.
6480          * Note that here the checks have to take into account
6481          * that the decomposition might occur in a different order than xyz
6482          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6483          * in which case they will not match those in comm_cost_est,
6484          * but since that is mainly for testing purposes that's fine.
6485          */
6486         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6487             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6488             getenv("GMX_PMEONEDD") == NULL)
6489         {
6490             comm->npmedecompdim = 2;
6491             comm->npmenodes_x   = dd->nc[XX];
6492             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6493         }
6494         else
6495         {
6496             /* In case nc is 1 in both x and y we could still choose to
6497              * decompose pme in y instead of x, but we use x for simplicity.
6498              */
6499             comm->npmedecompdim = 1;
6500             if (dd->dim[0] == YY)
6501             {
6502                 comm->npmenodes_x = 1;
6503                 comm->npmenodes_y = comm->npmenodes;
6504             }
6505             else
6506             {
6507                 comm->npmenodes_x = comm->npmenodes;
6508                 comm->npmenodes_y = 1;
6509             }
6510         }    
6511         if (fplog)
6512         {
6513             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6514                     comm->npmenodes_x,comm->npmenodes_y,1);
6515         }
6516     }
6517     else
6518     {
6519         comm->npmedecompdim = 0;
6520         comm->npmenodes_x   = 0;
6521         comm->npmenodes_y   = 0;
6522     }
6523     
6524     /* Technically we don't need both of these,
6525      * but it simplifies the code by not having to recalculate them.
6526      */
6527     *npme_x = comm->npmenodes_x;
6528     *npme_y = comm->npmenodes_y;
6529         
6530     snew(comm->slb_frac,DIM);
6531     if (comm->eDLB == edlbNO)
6532     {
6533         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6534         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6535         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6536     }
6537
6538     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6539     {
6540         if (comm->bBondComm || comm->eDLB != edlbNO)
6541         {
6542             /* Set the bonded communication distance to halfway between
6543              * the required bonded cut-off and the average minimum cell size,
6544              * since the extra communication cost is nearly zero.
6545              */
6546             acs = average_cellsize_min(dd,ddbox);
6547             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6548             if (comm->eDLB != edlbNO)
6549             {
6550                 /* Check if this does not limit the scaling */
6551                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6552             }
6553             if (!comm->bBondComm)
6554             {
6555                 /* Without bBondComm do not go beyond the n.b. cut-off */
6556                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6557                 if (comm->cellsize_limit >= comm->cutoff)
6558                 {
6559                     /* We don't lose a lot of efficiency
6560                      * when increasing it to the n.b. cut-off.
6561                      * It can even be slightly faster, because we need
6562                      * fewer checks for the communication setup.
6563                      */
6564                     comm->cutoff_mbody = comm->cutoff;
6565                 }
6566             }
6567             /* Check if we did not end up below our original limit */
6568             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6569
6570             if (comm->cutoff_mbody > comm->cellsize_limit)
6571             {
6572                 comm->cellsize_limit = comm->cutoff_mbody;
6573             }
6574         }
6575         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6576     }
6577
6578     if (debug)
6579     {
6580         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6581                 "cellsize limit %f\n",
6582                 comm->bBondComm,comm->cellsize_limit);
6583     }
6584     
6585     if (MASTER(cr))
6586     {
6587         check_dd_restrictions(cr,dd,ir,fplog);
6588     }
6589
6590     comm->globalcomm_step = INT_MIN;
6591     dd->ddp_count = 0;
6592
6593     clear_dd_cycle_counts(dd);
6594
6595     return dd;
6596 }
6597
6598 static void set_dlb_limits(gmx_domdec_t *dd)
6599 {
6601     int d;
6602
6603     for(d=0; d<dd->ndim; d++)
6604     {
6605         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6606         dd->comm->cellsize_min[dd->dim[d]] =
6607             dd->comm->cellsize_min_dlb[dd->dim[d]];
6608     }
6609 }
6610
6611
6612 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6613 {
6614     gmx_domdec_t *dd;
6615     gmx_domdec_comm_t *comm;
6616     real cellsize_min;
6617     int  d,nc,i;
6618     char buf[STRLEN];
6619     
6620     dd = cr->dd;
6621     comm = dd->comm;
6622     
6623     if (fplog)
6624     {
6625         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6626     }
6627
6628     cellsize_min = comm->cellsize_min[dd->dim[0]];
6629     for(d=1; d<dd->ndim; d++)
6630     {
6631         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6632     }
6633
6634     if (cellsize_min < comm->cellsize_limit*1.05)
6635     {
6636         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6637
6638         /* Change DLB from "auto" to "no". */
6639         comm->eDLB = edlbNO;
6640
6641         return;
6642     }
6643
6644     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6645     comm->bDynLoadBal = TRUE;
6646     dd->bGridJump = TRUE;
6647     
6648     set_dlb_limits(dd);
6649
6650     /* We can set the required cell size info here,
6651      * so we do not need to communicate this.
6652      * The grid is completely uniform.
6653      */
6654     for(d=0; d<dd->ndim; d++)
6655     {
6656         if (comm->root[d])
6657         {
6658             comm->load[d].sum_m = comm->load[d].sum;
6659
6660             nc = dd->nc[dd->dim[d]];
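                 /* With a completely uniform grid, boundary i simply sits at
                  * fraction i/nc of the box and the last boundary at 1.
                  */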
6661             for(i=0; i<nc; i++)
6662             {
6663                 comm->root[d]->cell_f[i]    = i/(real)nc;
6664                 if (d > 0)
6665                 {
6666                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6667                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6668                 }
6669             }
6670             comm->root[d]->cell_f[nc] = 1.0;
6671         }
6672     }
6673 }
6674
6675 static char *init_bLocalCG(gmx_mtop_t *mtop)
6676 {
6677     int  ncg,cg;
6678     char *bLocalCG;
6679     
6680     ncg = ncg_mtop(mtop);
6681     snew(bLocalCG,ncg);
6682     for(cg=0; cg<ncg; cg++)
6683     {
6684         bLocalCG[cg] = FALSE;
6685     }
6686
6687     return bLocalCG;
6688 }
6689
6690 void dd_init_bondeds(FILE *fplog,
6691                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6692                      gmx_vsite_t *vsite,gmx_constr_t constr,
6693                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6694 {
6695     gmx_domdec_comm_t *comm;
6696     gmx_bool bBondComm;
6697     int  d;
6698
6699     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6700
6701     comm = dd->comm;
6702
6703     if (comm->bBondComm)
6704     {
6705         /* Communicate atoms beyond the cut-off for bonded interactions */
6708         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6709
6710         comm->bLocalCG = init_bLocalCG(mtop);
6711     }
6712     else
6713     {
6714         /* Only communicate atoms based on cut-off */
6715         comm->cglink   = NULL;
6716         comm->bLocalCG = NULL;
6717     }
6718 }
6719
6720 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6721                               t_inputrec *ir,
6722                               gmx_bool bDynLoadBal,real dlb_scale,
6723                               gmx_ddbox_t *ddbox)
6724 {
6725     gmx_domdec_comm_t *comm;
6726     int  d;
6727     ivec np;
6728     real limit,shrink;
6729     char buf[64];
6730
6731     if (fplog == NULL)
6732     {
6733         return;
6734     }
6735
6736     comm = dd->comm;
6737
6738     if (bDynLoadBal)
6739     {
6740         fprintf(fplog,"The maximum number of communication pulses is:");
6741         for(d=0; d<dd->ndim; d++)
6742         {
6743             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6744         }
6745         fprintf(fplog,"\n");
6746         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6747         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6748         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6749         for(d=0; d<DIM; d++)
6750         {
6751             if (dd->nc[d] > 1)
6752             {
6753                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6754                 {
6755                     shrink = 0;
6756                 }
6757                 else
6758                 {
6759                     shrink =
6760                         comm->cellsize_min_dlb[d]/
6761                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6762                 }
6763                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6764             }
6765         }
6766         fprintf(fplog,"\n");
6767     }
6768     else
6769     {
6770         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6771         fprintf(fplog,"The initial number of communication pulses is:");
6772         for(d=0; d<dd->ndim; d++)
6773         {
6774             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
6775         }
6776         fprintf(fplog,"\n");
6777         fprintf(fplog,"The initial domain decomposition cell size is:");
6778         for(d=0; d<DIM; d++) {
6779             if (dd->nc[d] > 1)
6780             {
6781                 fprintf(fplog," %c %.2f nm",
6782                         dim2char(d),dd->comm->cellsize_min[d]);
6783             }
6784         }
6785         fprintf(fplog,"\n\n");
6786     }
6787     
6788     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
6789     {
6790         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
6791         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6792                 "non-bonded interactions","",comm->cutoff);
6793
6794         if (bDynLoadBal)
6795         {
6796             limit = dd->comm->cellsize_limit;
6797         }
6798         else
6799         {
6800             if (dynamic_dd_box(ddbox,ir))
6801             {
6802                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
6803             }
6804             limit = dd->comm->cellsize_min[XX];
6805             for(d=1; d<DIM; d++)
6806             {
6807                 limit = min(limit,dd->comm->cellsize_min[d]);
6808             }
6809         }
6810
6811         if (comm->bInterCGBondeds)
6812         {
6813             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6814                     "two-body bonded interactions","(-rdd)",
6815                     max(comm->cutoff,comm->cutoff_mbody));
6816             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6817                     "multi-body bonded interactions","(-rdd)",
6818                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
6819         }
6820         if (dd->vsite_comm)
6821         {
6822             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6823                     "virtual site constructions","(-rcon)",limit);
6824         }
6825         if (dd->constraint_comm)
6826         {
6827             sprintf(buf,"atoms separated by up to %d constraints",
6828                     1+ir->nProjOrder);
6829             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6830                     buf,"(-rcon)",limit);
6831         }
6832         fprintf(fplog,"\n");
6833     }
6834     
6835     fflush(fplog);
6836 }
6837
6838 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
6839                        t_inputrec *ir,t_forcerec *fr,
6840                        gmx_ddbox_t *ddbox)
6841 {
6842     gmx_domdec_comm_t *comm;
6843     int  d,dim,npulse,npulse_d_max,npulse_d;
6844     gmx_bool bNoCutOff;
6845     int  natoms_tot;
6846     real vol_frac;
6847
6848     comm = dd->comm;
6849
6850     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6851
6852     if (EEL_PME(ir->coulombtype))
6853     {
6854         init_ddpme(dd,&comm->ddpme[0],0);
6855         if (comm->npmedecompdim >= 2)
6856         {
6857             init_ddpme(dd,&comm->ddpme[1],1);
6858         }
6859     }
6860     else
6861     {
6862         comm->npmenodes = 0;
6863         if (dd->pme_nodeid >= 0)
6864         {
6865             gmx_fatal_collective(FARGS,NULL,dd,
6866                                  "Can not have separate PME nodes without PME electrostatics");
6867         }
6868     }
6869     
6870     /* If each molecule is a single charge group
6871      * or we use domain decomposition for each periodic dimension,
6872      * we do not need to take pbc into account for the bonded interactions.
6873      */
6874     if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
6875         (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY)))
6876     {
6877         fr->bMolPBC = FALSE;
6878     }
6879     else
6880     {
6881         fr->bMolPBC = TRUE;
6882     }
6883         
6884     if (debug)
6885     {
6886         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
6887     }
6888     if (comm->eDLB != edlbNO)
6889     {
6890         /* Determine the maximum number of comm. pulses in one dimension */
6891         
6892         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6893         
6894         /* Determine the maximum required number of grid pulses */
6895         if (comm->cellsize_limit >= comm->cutoff)
6896         {
6897             /* Only a single pulse is required */
6898             npulse = 1;
6899         }
6900         else if (!bNoCutOff && comm->cellsize_limit > 0)
6901         {
6902             /* We round down slightly here to avoid overhead due to the latency
6903              * of extra communication calls when the cut-off
6904              * would be only slightly longer than the cell size.
6905              * Later cellsize_limit is redetermined,
6906              * so we cannot miss interactions due to this rounding.
6907              */
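                 /* Illustration (hypothetical numbers): with cutoff = 1.02 nm and
                  * cellsize_limit = 1.00 nm this gives (int)(0.96 + 1.02) = 1 pulse,
                  * where a strict ceiling would require 2.
                  */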
6908             npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
6909         }
6910         else
6911         {
6912             /* There is no cell size limit */
6913             npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
6914         }
6915
6916         if (!bNoCutOff && npulse > 1)
6917         {
6918             /* See if we can do with fewer pulses, based on dlb_scale */
6919             npulse_d_max = 0;
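                 /* For each dimension, if DLB may shrink the cells to dlb_scale
                  * times the average cell size (box_size*skew_fac/nc), roughly
                  * cutoff/(shrunken cell size) pulses are needed; the leading 1
                  * and the truncation round this up.
                  */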
6920             for(d=0; d<dd->ndim; d++)
6921             {
6922                 dim = dd->dim[d];
6923                 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6924                                  /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6925                 npulse_d_max = max(npulse_d_max,npulse_d);
6926             }
6927             npulse = min(npulse,npulse_d_max);
6928         }
6929         
6930         /* This env var can override npulse */
6931         d = dd_nst_env(fplog,"GMX_DD_NPULSE",0);
6932         if (d > 0)
6933         {
6934             npulse = d;
6935         }
6936
6937         comm->maxpulse = 1;
6938         comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
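             /* In vacuum (no pbc), when every dimension can use the maximum
              * number of pulses (nc-1), each cell reaches all cells along that
              * dimension and the cut-off imposes no cell size limit; the loop
              * below clears the flag when any dimension falls short of that.
              */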
6939         for(d=0; d<dd->ndim; d++)
6940         {
6941             comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
6942             comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6943             snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
6944             comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
6945             if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6946             {
6947                 comm->bVacDLBNoLimit = FALSE;
6948             }
6949         }
6950         
6951         /* cellsize_limit is set for LINCS in init_domain_decomposition */
6952         if (!comm->bVacDLBNoLimit)
6953         {
6954             comm->cellsize_limit = max(comm->cellsize_limit,
6955                                        comm->cutoff/comm->maxpulse);
6956         }
6957         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6958         /* Set the minimum cell size for each DD dimension */
6959         for(d=0; d<dd->ndim; d++)
6960         {
6961             if (comm->bVacDLBNoLimit ||
6962                 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
6963             {
6964                 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
6965             }
6966             else
6967             {
6968                 comm->cellsize_min_dlb[dd->dim[d]] =
6969                     comm->cutoff/comm->cd[d].np_dlb;
6970             }
6971         }
6972         if (comm->cutoff_mbody <= 0)
6973         {
6974             comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
6975         }
6976         if (comm->bDynLoadBal)
6977         {
6978             set_dlb_limits(dd);
6979         }
6980     }
6981     
6982     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
6983     if (comm->eDLB == edlbAUTO)
6984     {
6985         if (fplog)
6986         {
6987             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
6988         }
6989         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
6990     }
6991
6992     if (ir->ePBC == epbcNONE)
6993     {
6994         vol_frac = 1 - 1/(double)dd->nnodes;
6995     }
6996     else
6997     {
6998         vol_frac =
6999             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7000     }
7001     if (debug)
7002     {
7003         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7004     }
7005     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7006    
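         /* Size the global-to-local atom lookup for the expected number of
          * locally present atoms: the total atom count scaled by the zone
          * volume fraction estimated above.
          */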
7007     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7008 }
7009
7010 static void merge_cg_buffers(int ncell,
7011                              gmx_domdec_comm_dim_t *cd, int pulse,
7012                              int  *ncg_cell,
7013                              int  *index_gl, int  *recv_i,
7014                              rvec *cg_cm,    rvec *recv_vr,
7015                              int *cgindex,
7016                              cginfo_mb_t *cginfo_mb,int *cginfo)
7017 {
7018     gmx_domdec_ind_t *ind,*ind_p;
7019     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7020     int shift,shift_at;
7021     
7022     ind = &cd->ind[pulse];
7023     
7024     /* First correct the already stored data */
7025     shift = ind->nrecv[ncell];
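     /* Working backwards over the zones: the cgs already stored for a zone are
      * moved up by the number of newly received cgs that belong to earlier
      * zones, since those will be inserted in front of this zone's block.
      */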
7026     for(cell=ncell-1; cell>=0; cell--)
7027     {
7028         shift -= ind->nrecv[cell];
7029         if (shift > 0)
7030         {
7031             /* Move the cgs present from previous grid pulses */
7032             cg0 = ncg_cell[ncell+cell];
7033             cg1 = ncg_cell[ncell+cell+1];
7034             cgindex[cg1+shift] = cgindex[cg1];
7035             for(cg=cg1-1; cg>=cg0; cg--)
7036             {
7037                 index_gl[cg+shift] = index_gl[cg];
7038                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7039                 cgindex[cg+shift] = cgindex[cg];
7040                 cginfo[cg+shift] = cginfo[cg];
7041             }
7042             /* Correct the already stored send indices for the shift */
7043             for(p=1; p<=pulse; p++)
7044             {
7045                 ind_p = &cd->ind[p];
7046                 cg0 = 0;
7047                 for(c=0; c<cell; c++)
7048                 {
7049                     cg0 += ind_p->nsend[c];
7050                 }
7051                 cg1 = cg0 + ind_p->nsend[cell];
7052                 for(cg=cg0; cg<cg1; cg++)
7053                 {
7054                     ind_p->index[cg] += shift;
7055                 }
7056             }
7057         }
7058     }
7059
7060     /* Merge in the communicated buffers */
7061     shift = 0;
7062     shift_at = 0;
7063     cg0 = 0;
7064     for(cell=0; cell<ncell; cell++)
7065     {
7066         cg1 = ncg_cell[ncell+cell+1] + shift;
7067         if (shift_at > 0)
7068         {
7069             /* Correct the old cg indices */
7070             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7071             {
7072                 cgindex[cg+1] += shift_at;
7073             }
7074         }
7075         for(cg=0; cg<ind->nrecv[cell]; cg++)
7076         {
7077             /* Copy this charge group from the buffer */
7078             index_gl[cg1] = recv_i[cg0];
7079             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7080             /* Add it to the cgindex */
7081             cg_gl = index_gl[cg1];
7082             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7083             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7084             cgindex[cg1+1] = cgindex[cg1] + nat;
7085             cg0++;
7086             cg1++;
7087             shift_at += nat;
7088         }
7089         shift += ind->nrecv[cell];
7090         ncg_cell[ncell+cell+1] = cg1;
7091     }
7092 }
7093
7094 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7095                                int nzone,int cg0,const int *cgindex)
7096 {
7097     int cg,zone,p;
7098     
7099     /* Store the atom block boundaries for easy copying of communication buffers
7100      */
7101     cg = cg0;
7102     for(zone=0; zone<nzone; zone++)
7103     {
7104         for(p=0; p<cd->np; p++) {
7105             cd->ind[p].cell2at0[zone] = cgindex[cg];
7106             cg += cd->ind[p].nrecv[zone];
7107             cd->ind[p].cell2at1[zone] = cgindex[cg];
7108         }
7109     }
7110 }
7111
7112 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7113 {
7114     int  i;
7115     gmx_bool bMiss;
7116
7117     bMiss = FALSE;
7118     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7119     {
7120         if (!bLocalCG[link->a[i]])
7121         {
7122             bMiss = TRUE;
7123         }
7124     }
7125
7126     return bMiss;
7127 }
7128
7129 static void setup_dd_communication(gmx_domdec_t *dd,
7130                                    matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr)
7131 {
7132     int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot;
7133     int nzone,nzone_send,zone,zonei,cg0,cg1;
7134     int c,i,j,cg,cg_gl,nrcg;
7135     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7136     gmx_domdec_comm_t *comm;
7137     gmx_domdec_zones_t *zones;
7138     gmx_domdec_comm_dim_t *cd;
7139     gmx_domdec_ind_t *ind;
7140     cginfo_mb_t *cginfo_mb;
7141     gmx_bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew;
7142     real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh;
7143     rvec rb,rn;
7144     real corner[DIM][4],corner_round_0=0,corner_round_1[4];
7145     real bcorner[DIM],bcorner_round_1=0;
7146     ivec tric_dist;
7147     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7148     real skew_fac2_d,skew_fac_01;
7149     rvec sf2_round;
7150     int  nsend,nat;
7151     
7152     if (debug)
7153     {
7154         fprintf(debug,"Setting up DD communication\n");
7155     }
7156     
7157     comm  = dd->comm;
7158     cg_cm = fr->cg_cm;
7159
7160     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7161     {
7162         dim = dd->dim[dim_ind];
7163
7164         /* Check if we need to use triclinic distances */
7165         tric_dist[dim_ind] = 0;
7166         for(i=0; i<=dim_ind; i++)
7167         {
7168             if (ddbox->tric_dir[dd->dim[i]])
7169             {
7170                 tric_dist[dim_ind] = 1;
7171             }
7172         }
7173     }
7174
7175     bBondComm = comm->bBondComm;
7176
7177     /* Do we need to determine extra distances for multi-body bondeds? */
7178     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7179     
7180     /* Do we need to determine extra distances for only two-body bondeds? */
7181     bDist2B = (bBondComm && !bDistMB);
7182
7183     r_comm2  = sqr(comm->cutoff);
7184     r_bcomm2 = sqr(comm->cutoff_mbody);
7185
7186     if (debug)
7187     {
7188         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7189     }
7190
7191     zones = &comm->zones;
7192     
7193     dim0 = dd->dim[0];
7194     /* The first dimension is equal for all cells */
7195     corner[0][0] = comm->cell_x0[dim0];
7196     if (bDistMB)
7197     {
7198         bcorner[0] = corner[0][0];
7199     }
7200     if (dd->ndim >= 2)
7201     {
7202         dim1 = dd->dim[1];
7203         /* This cell row is only seen from the first row */
7204         corner[1][0] = comm->cell_x0[dim1];
7205         /* All rows can see this row */
7206         corner[1][1] = comm->cell_x0[dim1];
7207         if (dd->bGridJump)
7208         {
7209             corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7210             if (bDistMB)
7211             {
7212                 /* For the multi-body distance we need the maximum */
7213                 bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7214             }
7215         }
7216         /* Set the upper-right corner for rounding */
7217         corner_round_0 = comm->cell_x1[dim0];
7218         
7219         if (dd->ndim >= 3)
7220         {
7221             dim2 = dd->dim[2];
7222             for(j=0; j<4; j++)
7223             {
7224                 corner[2][j] = comm->cell_x0[dim2];
7225             }
7226             if (dd->bGridJump)
7227             {
7228                 /* Use the maximum of the i-cells that see a j-cell */
7229                 for(i=0; i<zones->nizone; i++)
7230                 {
7231                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7232                     {
7233                         if (j >= 4)
7234                         {
7235                             corner[2][j-4] =
7236                                 max(corner[2][j-4],
7237                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7238                         }
7239                     }
7240                 }
7241                 if (bDistMB)
7242                 {
7243                     /* For the multi-body distance we need the maximum */
7244                     bcorner[2] = comm->cell_x0[dim2];
7245                     for(i=0; i<2; i++)
7246                     {
7247                         for(j=0; j<2; j++)
7248                         {
7249                             bcorner[2] = max(bcorner[2],
7250                                              comm->zone_d2[i][j].p1_0);
7251                         }
7252                     }
7253                 }
7254             }
7255             
7256             /* Set the upper-right corner for rounding */
7257             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7258              * Only cell (0,0,0) can see cell 7 (1,1,1)
7259              */
7260             corner_round_1[0] = comm->cell_x1[dim1];
7261             corner_round_1[3] = comm->cell_x1[dim1];
7262             if (dd->bGridJump)
7263             {
7264                 corner_round_1[0] = max(comm->cell_x1[dim1],
7265                                         comm->zone_d1[1].mch1);
7266                 if (bDistMB)
7267                 {
7268                     /* For the multi-body distance we need the maximum */
7269                     bcorner_round_1 = max(comm->cell_x1[dim1],
7270                                           comm->zone_d1[1].p1_1);
7271                 }
7272             }
7273         }
7274     }
7275     
7276     /* Triclinic stuff */
7277     normal = ddbox->normal;
7278     skew_fac_01 = 0;
7279     if (dd->ndim >= 2)
7280     {
7281         v_0 = ddbox->v[dim0];
7282         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7283         {
7284             /* Determine the coupling coefficient for the distances
7285              * to the cell planes along dim0 and dim1 through dim2.
7286              * This is required for correct rounding.
7287              */
7288             skew_fac_01 =
7289                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7290             if (debug)
7291             {
7292                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7293             }
7294         }
7295     }
7296     if (dd->ndim >= 3)
7297     {
7298         v_1 = ddbox->v[dim1];
7299     }
7300     
7301     zone_cg_range = zones->cg_range;
7302     index_gl = dd->index_gl;
7303     cgindex  = dd->cgindex;
7304     cginfo_mb = fr->cginfo_mb;
7305     
7306     zone_cg_range[0]   = 0;
7307     zone_cg_range[1]   = dd->ncg_home;
7308     comm->zone_ncg1[0] = dd->ncg_home;
7309     pos_cg             = dd->ncg_home;
7310     
7311     nat_tot = dd->nat_home;
7312     nzone = 1;
7313     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7314     {
7315         dim = dd->dim[dim_ind];
7316         cd = &comm->cd[dim_ind];
7317         
7318         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7319         {
7320             /* No pbc in this dimension, the first node should not comm. */
7321             nzone_send = 0;
7322         }
7323         else
7324         {
7325             nzone_send = nzone;
7326         }
7327
7328         bScrew = (dd->bScrewPBC && dim == XX);
7329         
7330         v_d = ddbox->v[dim];
7331         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7332
7333         cd->bInPlace = TRUE;
7334         for(p=0; p<cd->np; p++)
7335         {
7336             /* Only atoms communicated in the first pulse are used
7337              * for multi-body bonded interactions or for bBondComm.
7338              */
7339             bDistBonded   = ((bDistMB || bDist2B) && p == 0);
7340             bDistMB_pulse = (bDistMB && bDistBonded);
7341
7342             ind = &cd->ind[p];
7343             nsend = 0;
7344             nat = 0;
7345             for(zone=0; zone<nzone_send; zone++)
7346             {
7347                 if (tric_dist[dim_ind] && dim_ind > 0)
7348                 {
7349                     /* Determine slightly more optimized skew_fac's
7350                      * for rounding.
7351                      * This reduces the number of communicated atoms
7352                      * by about 10% for 3D DD of rhombic dodecahedra.
7353                      */
7354                     for(dimd=0; dimd<dim; dimd++)
7355                     {
7356                         sf2_round[dimd] = 1;
7357                         if (ddbox->tric_dir[dimd])
7358                         {
7359                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7360                             {
7361                                 /* If we are shifted in dimension i
7362                                  * and the cell plane is tilted forward
7363                                  * in dimension i, skip this coupling.
7364                                  */
7365                                 if (!(zones->shift[nzone+zone][i] &&
7366                                       ddbox->v[dimd][i][dimd] >= 0))
7367                                 {
7368                                     sf2_round[dimd] +=
7369                                         sqr(ddbox->v[dimd][i][dimd]);
7370                                 }
7371                             }
7372                             sf2_round[dimd] = 1/sf2_round[dimd];
7373                         }
7374                     }
7375                 }
7376
7377                 zonei = zone_perm[dim_ind][zone];
7378                 if (p == 0)
7379                 {
7380                     /* Here we permute the zones to obtain a convenient order
7381                      * for neighbor searching.
7382                      */
7383                     cg0 = zone_cg_range[zonei];
7384                     cg1 = zone_cg_range[zonei+1];
7385                 }
7386                 else
7387                 {
7388                     /* Look only at the cgs received in the previous grid pulse
7389                      */
7390                     cg1 = zone_cg_range[nzone+zone+1];
7391                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
7392                 }
7393                 ind->nsend[zone] = 0;
7394                 for(cg=cg0; cg<cg1; cg++)
7395                 {
7396                     r2  = 0;
7397                     rb2 = 0;
7398                     if (tric_dist[dim_ind] == 0)
7399                     {
7400                         /* Rectangular direction, easy */
7401                         r = cg_cm[cg][dim] - corner[dim_ind][zone];
7402                         if (r > 0)
7403                         {
7404                             r2 += r*r;
7405                         }
7406                         if (bDistMB_pulse)
7407                         {
7408                             r = cg_cm[cg][dim] - bcorner[dim_ind];
7409                             if (r > 0)
7410                             {
7411                                 rb2 += r*r;
7412                             }
7413                         }
7414                         /* Rounding gives at most a 16% reduction
7415                          * in communicated atoms
7416                          */
7417                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7418                         {
7419                             r = cg_cm[cg][dim0] - corner_round_0;
7420                             /* This is the first dimension, so always r >= 0 */
7421                             r2 += r*r;
7422                             if (bDistMB_pulse)
7423                             {
7424                                 rb2 += r*r;
7425                             }
7426                         }
7427                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7428                         {
7429                             r = cg_cm[cg][dim1] - corner_round_1[zone];
7430                             if (r > 0)
7431                             {
7432                                 r2 += r*r;
7433                             }
7434                             if (bDistMB_pulse)
7435                             {
7436                                 r = cg_cm[cg][dim1] - bcorner_round_1;
7437                                 if (r > 0)
7438                                 {
7439                                     rb2 += r*r;
7440                                 }
7441                             }
7442                         }
7443                     }
7444                     else
7445                     {
7446                         /* Triclinic direction, more complicated */
7447                         clear_rvec(rn);
7448                         clear_rvec(rb);
7449                         /* Rounding, conservative as the skew_fac multiplication
7450                          * will slightly underestimate the distance.
7451                          */
7452                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7453                         {
7454                             rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
7455                             for(i=dim0+1; i<DIM; i++)
7456                             {
7457                                 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7458                             }
7459                             r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7460                             if (bDistMB_pulse)
7461                             {
7462                                 rb[dim0] = rn[dim0];
7463                                 rb2 = r2;
7464                             }
7465                             /* Take care that the cell planes along dim0 might not
7466                              * be orthogonal to those along dim1 and dim2.
7467                              */
7468                             for(i=1; i<=dim_ind; i++)
7469                             {
7470                                 dimd = dd->dim[i];
7471                                 if (normal[dim0][dimd] > 0)
7472                                 {
7473                                     rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7474                                     if (bDistMB_pulse)
7475                                     {
7476                                         rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7477                                     }
7478                                 }
7479                             }
7480                         }
7481                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7482                         {
7483                             rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
7484                             tric_sh = 0;
7485                             for(i=dim1+1; i<DIM; i++)
7486                             {
7487                                 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7488                             }
7489                             rn[dim1] += tric_sh;
7490                             if (rn[dim1] > 0)
7491                             {
7492                                 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7493                                 /* Take care of coupling of the distances
7494                                  * to the planes along dim0 and dim1 through dim2.
7495                                  */
7496                                 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7497                                 /* Take care that the cell planes along dim1
7498                                  * might not be orthogonal to that along dim2.
7499                                  */
7500                                 if (normal[dim1][dim2] > 0)
7501                                 {
7502                                     rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7503                                 }
7504                             }
7505                             if (bDistMB_pulse)
7506                             {
7507                                 rb[dim1] +=
7508                                     cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
7509                                 if (rb[dim1] > 0)
7510                                 {
7511                                     rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7512                                     /* Take care of coupling of the distances
7513                                      * to the planes along dim0 and dim1 through dim2.
7514                                      */
7515                                     rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7516                                     /* Take care that the cell planes along dim1
7517                                      * might not be orthogonal to that along dim2.
7518                                      */
7519                                     if (normal[dim1][dim2] > 0)
7520                                     {
7521                                         rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7522                                     }
7523                                 }
7524                             }
7525                         }
7526                         /* The distance along the communication direction */
7527                         rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
7528                         tric_sh = 0;
7529                         for(i=dim+1; i<DIM; i++)
7530                         {
7531                             tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7532                         }
7533                         rn[dim] += tric_sh;
7534                         if (rn[dim] > 0)
7535                         {
7536                             r2 += rn[dim]*rn[dim]*skew_fac2_d;
7537                             /* Take care of coupling of the distances
7538                              * to the planes along dim0 and dim1 through dim2.
7539                              */
7540                             if (dim_ind == 1 && zonei == 1)
7541                             {
7542                                 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7543                             }
7544                         }
7545                         if (bDistMB_pulse)
7546                         {
7547                             clear_rvec(rb);
7548                             rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
7549                             if (rb[dim] > 0)
7550                             {
7551                                 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7552                                 /* Take care of coupling of the distances
7553                                  * to the planes along dim0 and dim1 through dim2.
7554                                  */
7555                                 if (dim_ind == 1 && zonei == 1)
7556                                 {
7557                                     rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7558                                 }
7559                             }
7560                         }
7561                     }
7562                     
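                         /* Send this cg when it is within the non-bonded cut-off
                          * of the receiving zone, or, on a bonded-distance pulse,
                          * within the bonded cut-off for the multi-body or two-body
                          * case; with bBondComm the bonded criterion also requires
                          * that the cg has bonded interactions linking it to a cg
                          * that is not yet present locally.
                          */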
7563                     if (r2 < r_comm2 ||
7564                         (bDistBonded &&
7565                          ((bDistMB && rb2 < r_bcomm2) ||
7566                           (bDist2B && r2  < r_bcomm2)) &&
7567                          (!bBondComm ||
7568                           (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
7569                            missing_link(comm->cglink,index_gl[cg],
7570                                         comm->bLocalCG)))))
7571                     {
7572                         /* Make an index to the local charge groups */
7573                         if (nsend+1 > ind->nalloc)
7574                         {
7575                             ind->nalloc = over_alloc_large(nsend+1);
7576                             srenew(ind->index,ind->nalloc);
7577                         }
7578                         if (nsend+1 > comm->nalloc_int)
7579                         {
7580                             comm->nalloc_int = over_alloc_large(nsend+1);
7581                             srenew(comm->buf_int,comm->nalloc_int);
7582                         }
7583                         ind->index[nsend] = cg;
7584                         comm->buf_int[nsend] = index_gl[cg];
7585                         ind->nsend[zone]++;
7586                         vec_rvec_check_alloc(&comm->vbuf,nsend+1);
7587
7588                         if (dd->ci[dim] == 0)
7589                         {
7590                             /* Correct cg_cm for pbc */
7591                             rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]);
7592                             if (bScrew)
7593                             {
7594                                 comm->vbuf.v[nsend][YY] =
7595                                     box[YY][YY]-comm->vbuf.v[nsend][YY];
7596                                 comm->vbuf.v[nsend][ZZ] =
7597                                     box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
7598                             }
7599                         }
7600                         else
7601                         {
7602                             copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]);
7603                         }
7604                         nsend++;
7605                         nat += cgindex[cg+1] - cgindex[cg];
7606                     }
7607                 }
7608             }
7609             /* Clear the counts in case we do not have pbc */
7610             for(zone=nzone_send; zone<nzone; zone++)
7611             {
7612                 ind->nsend[zone] = 0;
7613             }
7614             ind->nsend[nzone]   = nsend;
7615             ind->nsend[nzone+1] = nat;
7616             /* Communicate the number of cgs and atoms to receive */
7617             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7618                             ind->nsend, nzone+2,
7619                             ind->nrecv, nzone+2);
7620             
7621             /* The rvec buffer is also required for atom buffers of size nsend
7622              * in dd_move_x and dd_move_f.
7623              */
7624             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
7625
7626             if (p > 0)
7627             {
7628                 /* We can receive in place if only the last zone is not empty */
7629                 for(zone=0; zone<nzone-1; zone++)
7630                 {
7631                     if (ind->nrecv[zone] > 0)
7632                     {
7633                         cd->bInPlace = FALSE;
7634                     }
7635                 }
7636                 if (!cd->bInPlace)
7637                 {
7638                     /* The int buffer is only required here for the cg indices */
7639                     if (ind->nrecv[nzone] > comm->nalloc_int2)
7640                     {
7641                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
7642                         srenew(comm->buf_int2,comm->nalloc_int2);
7643                     }
7644                     /* The rvec buffer is also required for atom buffers
7645                      * of size nrecv in dd_move_x and dd_move_f.
7646                      */
7647                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
7648                     vec_rvec_check_alloc(&comm->vbuf2,i);
7649                 }
7650             }
7651             
7652             /* Make space for the global cg indices */
7653             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
7654                 || dd->cg_nalloc == 0)
7655             {
7656                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
7657                 srenew(index_gl,dd->cg_nalloc);
7658                 srenew(cgindex,dd->cg_nalloc+1);
7659             }
7660             /* Communicate the global cg indices */
7661             if (cd->bInPlace)
7662             {
7663                 recv_i = index_gl + pos_cg;
7664             }
7665             else
7666             {
7667                 recv_i = comm->buf_int2;
7668             }
7669             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7670                             comm->buf_int, nsend,
7671                             recv_i,        ind->nrecv[nzone]);
7672
7673             /* Make space for cg_cm */
7674             if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc)
7675             {
7676                 dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]);
7677                 cg_cm = fr->cg_cm;
7678             }
7679             /* Communicate cg_cm */
7680             if (cd->bInPlace)
7681             {
7682                 recv_vr = cg_cm + pos_cg;
7683             }
7684             else
7685             {
7686                 recv_vr = comm->vbuf2.v;
7687             }
7688             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
7689                              comm->vbuf.v, nsend,
7690                              recv_vr,      ind->nrecv[nzone]);
7691             
7692             /* Make the charge group index */
7693             if (cd->bInPlace)
7694             {
7695                 zone = (p == 0 ? 0 : nzone - 1);
7696                 while (zone < nzone)
7697                 {
7698                     for(cg=0; cg<ind->nrecv[zone]; cg++)
7699                     {
7700                         cg_gl = index_gl[pos_cg];
7701                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
7702                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
7703                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
7704                         if (bBondComm)
7705                         {
7706                             /* Update the charge group presence,
7707                              * so we can use it in the next pass of the loop.
7708                              */
7709                             comm->bLocalCG[cg_gl] = TRUE;
7710                         }
7711                         pos_cg++;
7712                     }
7713                     if (p == 0)
7714                     {
7715                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
7716                     }
7717                     zone++;
7718                     zone_cg_range[nzone+zone] = pos_cg;
7719                 }
7720             }
7721             else
7722             {
7723                 /* This part of the code is never executed with bBondComm. */
7724                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
7725                                  index_gl,recv_i,cg_cm,recv_vr,
7726                                  cgindex,fr->cginfo_mb,fr->cginfo);
7727                 pos_cg += ind->nrecv[nzone];
7728             }
7729             nat_tot += ind->nrecv[nzone+1];
7730         }
7731         if (!cd->bInPlace)
7732         {
7733             /* Store the atom block for easy copying of communication buffers */
7734             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
7735         }
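             /* Each DD dimension processed doubles the number of zones: 1 -> 2 -> 4 -> 8 */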
7736         nzone += nzone;
7737     }
7738     dd->index_gl = index_gl;
7739     dd->cgindex  = cgindex;
7740     
7741     dd->ncg_tot = zone_cg_range[zones->n];
7742     dd->nat_tot = nat_tot;
7743     comm->nat[ddnatHOME] = dd->nat_home;
7744     for(i=ddnatZONE; i<ddnatNR; i++)
7745     {
7746         comm->nat[i] = dd->nat_tot;
7747     }
7748
7749     if (!bBondComm)
7750     {
7751         /* We don't need to update cginfo, since that was already done above.
7752          * So we pass NULL for the forcerec.
7753          */
7754         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
7755                       NULL,comm->bLocalCG);
7756     }
7757
7758     if (debug)
7759     {
7760         fprintf(debug,"Finished setting up DD communication, zones:");
7761         for(c=0; c<zones->n; c++)
7762         {
7763             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
7764         }
7765         fprintf(debug,"\n");
7766     }
7767 }
7768
7769 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
7770 {
7771     int c;
7772     
7773     for(c=0; c<zones->nizone; c++)
7774     {
7775         zones->izone[c].cg1  = zones->cg_range[c+1];
7776         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
7777         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
7778     }
7779 }
7780
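     /* qsort comparator for charge groups: the primary key is the ns grid cell
      * index; ties are broken on the global charge group index so that the
      * ordering is unique and reproducible.
      */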
7781 static int comp_cgsort(const void *a,const void *b)
7782 {
7783     int comp;
7784     
7785     gmx_cgsort_t *cga,*cgb;
7786     cga = (gmx_cgsort_t *)a;
7787     cgb = (gmx_cgsort_t *)b;
7788     
7789     comp = cga->nsc - cgb->nsc;
7790     if (comp == 0)
7791     {
7792         comp = cga->ind_gl - cgb->ind_gl;
7793     }
7794     
7795     return comp;
7796 }
7797
7798 static void order_int_cg(int n,gmx_cgsort_t *sort,
7799                          int *a,int *buf)
7800 {
7801     int i;
7802     
7803     /* Order the data */
7804     for(i=0; i<n; i++)
7805     {
7806         buf[i] = a[sort[i].ind];
7807     }
7808     
7809     /* Copy back to the original array */
7810     for(i=0; i<n; i++)
7811     {
7812         a[i] = buf[i];
7813     }
7814 }
7815
7816 static void order_vec_cg(int n,gmx_cgsort_t *sort,
7817                          rvec *v,rvec *buf)
7818 {
7819     int i;
7820     
7821     /* Order the data */
7822     for(i=0; i<n; i++)
7823     {
7824         copy_rvec(v[sort[i].ind],buf[i]);
7825     }
7826     
7827     /* Copy back to the original array */
7828     for(i=0; i<n; i++)
7829     {
7830         copy_rvec(buf[i],v[i]);
7831     }
7832 }
7833
7834 static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort,
7835                            rvec *v,rvec *buf)
7836 {
7837     int a,atot,cg,cg0,cg1,i;
7838     
7839     /* Order the data */
7840     a = 0;
7841     for(cg=0; cg<ncg; cg++)
7842     {
7843         cg0 = cgindex[sort[cg].ind];
7844         cg1 = cgindex[sort[cg].ind+1];
7845         for(i=cg0; i<cg1; i++)
7846         {
7847             copy_rvec(v[i],buf[a]);
7848             a++;
7849         }
7850     }
7851     atot = a;
7852     
7853     /* Copy back to the original array */
7854     for(a=0; a<atot; a++)
7855     {
7856         copy_rvec(buf[a],v[a]);
7857     }
7858 }
7859
7860 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
7861                          int nsort_new,gmx_cgsort_t *sort_new,
7862                          gmx_cgsort_t *sort1)
7863 {
7864     int i1,i2,i_new;
7865     
7866     /* The new entries are generally not ordered, so we qsort them */
7867     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
7868     
7869     /* sort2 is already ordered, so now we can merge the two arrays */
7870     i1 = 0;
7871     i2 = 0;
7872     i_new = 0;
7873     while(i2 < nsort2 || i_new < nsort_new)
7874     {
7875         if (i2 == nsort2)
7876         {
7877             sort1[i1++] = sort_new[i_new++];
7878         }
7879         else if (i_new == nsort_new)
7880         {
7881             sort1[i1++] = sort2[i2++];
7882         }
7883         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
7884                  (sort2[i2].nsc == sort_new[i_new].nsc &&
7885                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
7886         {
7887             sort1[i1++] = sort2[i2++];
7888         }
7889         else
7890         {
7891             sort1[i1++] = sort_new[i_new++];
7892         }
7893     }
7894 }
7895
7896 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
7897                           rvec *cgcm,t_forcerec *fr,t_state *state,
7898                           int ncg_home_old)
7899 {
7900     gmx_domdec_sort_t *sort;
7901     gmx_cgsort_t *cgsort,*sort_i;
7902     int  ncg_new,nsort2,nsort_new,i,cell_index,*ibuf,cgsize;
7903     rvec *vbuf;
7904     
7905     sort = dd->comm->sort;
7906     
7907     if (dd->ncg_home > sort->sort_nalloc)
7908     {
7909         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
7910         srenew(sort->sort1,sort->sort_nalloc);
7911         srenew(sort->sort2,sort->sort_nalloc);
7912     }
7913     
7914     if (ncg_home_old >= 0)
7915     {
7916         /* The charge groups that remained in the same ns grid cell
7917          * are already ordered, so we can sort efficiently by sorting only
7918          * the charge groups that moved and merging them into the stationary list.
7919          */
7920         ncg_new = 0;
7921         nsort2 = 0;
7922         nsort_new = 0;
7923         for(i=0; i<dd->ncg_home; i++)
7924         {
7925             /* Check if this cg did not move to another node */
7926             cell_index = fr->ns.grid->cell_index[i];
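                 /* A cell index of 4*ncells is used here as the marker for cgs
                  * that have moved to another node; such cgs are dropped from
                  * the home list.
                  */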
7927             if (cell_index !=  4*fr->ns.grid->ncells)
7928             {
7929                 if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
7930                 {
7931                     /* This cg is new on this node or moved to another ns grid cell */
7932                     if (nsort_new >= sort->sort_new_nalloc)
7933                     {
7934                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
7935                         srenew(sort->sort_new,sort->sort_new_nalloc);
7936                     }
7937                     sort_i = &(sort->sort_new[nsort_new++]);
7938                 }
7939                 else
7940                 {
7941                     /* This cg did not move */
7942                     sort_i = &(sort->sort2[nsort2++]);
7943                 }
7944                 /* Sort on the ns grid cell indices
7945                  * and the global topology index
7946                  */
7947                 sort_i->nsc    = cell_index;
7948                 sort_i->ind_gl = dd->index_gl[i];
7949                 sort_i->ind    = i;
7950                 ncg_new++;
7951             }
7952         }
7953         if (debug)
7954         {
7955             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
7956                     nsort2,nsort_new);
7957         }
7958         /* Sort efficiently */
7959         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1);
7960     }
7961     else
7962     {
7963         cgsort = sort->sort1;
7964         ncg_new = 0;
7965         for(i=0; i<dd->ncg_home; i++)
7966         {
7967             /* Sort on the ns grid cell indices
7968              * and the global topology index
7969              */
7970             cgsort[i].nsc    = fr->ns.grid->cell_index[i];
7971             cgsort[i].ind_gl = dd->index_gl[i];
7972             cgsort[i].ind    = i;
7973             if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
7974             {
7975                 ncg_new++;
7976             }
7977         }
7978         if (debug)
7979         {
7980             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
7981         }
7982         /* Determine the order of the charge groups using qsort */
7983         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
7984     }
7985     cgsort = sort->sort1;
7986     
7987     /* We alloc with the old size, since cgindex is still old */
7988     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
7989     vbuf = dd->comm->vbuf.v;
7990     
7991     /* Remove the charge groups which are no longer at home here */
7992     dd->ncg_home = ncg_new;
7993     
7994     /* Reorder the state */
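     /* Only state entries that are distributed over the atoms and actually
      * present according to state->flags need to be reordered.
      */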
7995     for(i=0; i<estNR; i++)
7996     {
7997         if (EST_DISTR(i) && (state->flags & (1<<i)))
7998         {
7999             switch (i)
8000             {
8001             case estX:
8002                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->x,vbuf);
8003                 break;
8004             case estV:
8005                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf);
8006                 break;
8007             case estSDX:
8008                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf);
8009                 break;
8010             case estCGP:
8011                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf);
8012                 break;
8013             case estLD_RNG:
8014             case estLD_RNGI:
8015             case estDISRE_INITF:
8016             case estDISRE_RM3TAV:
8017             case estORIRE_INITF:
8018             case estORIRE_DTAV:
8019                 /* No ordering required */
8020                 break;
8021             default:
8022                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8023                 break;
8024             }
8025         }
8026     }
8027     /* Reorder cgcm */
8028     order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8029     
8030     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8031     {
8032         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8033         srenew(sort->ibuf,sort->ibuf_nalloc);
8034     }
8035     ibuf = sort->ibuf;
8036     /* Reorder the global cg index */
8037     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8038     /* Reorder the cginfo */
8039     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8040     /* Rebuild the local cg index */
8041     ibuf[0] = 0;
8042     for(i=0; i<dd->ncg_home; i++)
8043     {
8044         cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8045         ibuf[i+1] = ibuf[i] + cgsize;
8046     }
8047     for(i=0; i<dd->ncg_home+1; i++)
8048     {
8049         dd->cgindex[i] = ibuf[i];
8050     }
8051     /* Set the home atom number */
8052     dd->nat_home = dd->cgindex[dd->ncg_home];
8053     
8054     /* Copy the sorted ns cell indices back to the ns grid struct */
8055     for(i=0; i<dd->ncg_home; i++)
8056     {
8057         fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8058     }
8059     fr->ns.grid->nr = dd->ncg_home;
8060 }
8061
8062 static void add_dd_statistics(gmx_domdec_t *dd)
8063 {
8064     gmx_domdec_comm_t *comm;
8065     int ddnat;
8066     
8067     comm = dd->comm;
8068     
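         /* comm->nat[] holds cumulative atom counts per communication stage
          * (filled in dd_partition_system), so the difference below gives
          * the number of atoms communicated for each category.
          */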
8069     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8070     {
8071         comm->sum_nat[ddnat-ddnatZONE] +=
8072             comm->nat[ddnat] - comm->nat[ddnat-1];
8073     }
8074     comm->ndecomp++;
8075 }
8076
8077 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8078 {
8079     gmx_domdec_comm_t *comm;
8080     int ddnat;
8081     
8082     comm = dd->comm;
8083
8084     /* Reset all the statistics and counters for total run counting */
8085     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8086     {
8087         comm->sum_nat[ddnat-ddnatZONE] = 0;
8088     }
8089     comm->ndecomp = 0;
8090     comm->nload = 0;
8091     comm->load_step = 0;
8092     comm->load_sum = 0;
8093     comm->load_max = 0;
8094     clear_ivec(comm->load_lim);
8095     comm->load_mdf = 0;
8096     comm->load_pme = 0;
8097 }
8098
8099 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8100 {
8101     gmx_domdec_comm_t *comm;
8102     int ddnat;
8103     double av;
8104    
8105     comm = cr->dd->comm;
8106     
8107     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8108     
8109     if (fplog == NULL)
8110     {
8111         return;
8112     }
8113     
8114     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8115             
8116     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8117     {
8118         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8119         switch(ddnat)
8120         {
8121         case ddnatZONE:
8122             fprintf(fplog,
8123                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8124                     2,av);
8125             break;
8126         case ddnatVSITE:
8127             if (cr->dd->vsite_comm)
8128             {
8129                 fprintf(fplog,
8130                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8131                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8132                         av);
8133             }
8134             break;
8135         case ddnatCON:
8136             if (cr->dd->constraint_comm)
8137             {
8138                 fprintf(fplog,
8139                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
8140                         1 + ir->nLincsIter,av);
8141             }
8142             break;
8143         default:
8144             gmx_incons(" Unknown type for DD statistics");
8145         }
8146     }
8147     fprintf(fplog,"\n");
8148     
8149     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8150     {
8151         print_dd_load_av(fplog,cr->dd);
8152     }
8153 }
8154
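     /* Added overview (descriptive note, not part of the original source):
      * dd_partition_system() repartitions the whole system over the DD grid.
      * Roughly it: decides whether to redistribute, sort and load balance;
      * (re)builds the home charge group distribution and local indices;
      * sets up the halo communication and extracts the local topology;
      * sets up vsite/constraint communication, updates mdatoms, PME charges,
      * pull and rotation groups; and finally updates counters and statistics.
      */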
8155 void dd_partition_system(FILE            *fplog,
8156                          gmx_large_int_t      step,
8157                          t_commrec       *cr,
8158                          gmx_bool            bMasterState,
8159                          int             nstglobalcomm,
8160                          t_state         *state_global,
8161                          gmx_mtop_t      *top_global,
8162                          t_inputrec      *ir,
8163                          t_state         *state_local,
8164                          rvec            **f,
8165                          t_mdatoms       *mdatoms,
8166                          gmx_localtop_t  *top_local,
8167                          t_forcerec      *fr,
8168                          gmx_vsite_t     *vsite,
8169                          gmx_shellfc_t   shellfc,
8170                          gmx_constr_t    constr,
8171                          t_nrnb          *nrnb,
8172                          gmx_wallcycle_t wcycle,
8173                          gmx_bool            bVerbose)
8174 {
8175     gmx_domdec_t *dd;
8176     gmx_domdec_comm_t *comm;
8177     gmx_ddbox_t ddbox={0};
8178     t_block *cgs_gl;
8179     gmx_large_int_t step_pcoupl;
8180     rvec cell_ns_x0,cell_ns_x1;
8181     int  i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
8182     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8183     gmx_bool bRedist,bSortCG,bResortAll;
8184     ivec ncells_old,np;
8185     real grid_density;
8186     char sbuf[22];
8187         
8188     dd = cr->dd;
8189     comm = dd->comm;
8190
8191     bBoxChanged = (bMasterState || DEFORM(*ir));
8192     if (ir->epc != epcNO)
8193     {
8194         /* With nstpcouple > 1, pressure coupling happens
8195          * one step after calculating the pressure.
8196          * Box scaling happens at the end of the MD step,
8197          * after the DD partitioning.
8198          * We therefore have to do DLB in the first partitioning
8199          * after an MD step where P-coupling occurred.
8200          * We need to determine the last step in which p-coupling occurred.
8201          * MRS -- need to validate this for vv?
8202          */
8203         n = ir->nstpcouple;
8204         if (n == 1)
8205         {
8206             step_pcoupl = step - 1;
8207         }
8208         else
8209         {
8210             step_pcoupl = ((step - 1)/n)*n + 1;
8211         }
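             /* Added illustration: with n = ir->nstpcouple = 5 and step = 13,
              * step_pcoupl = ((13-1)/5)*5 + 1 = 11.
              */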
8212         if (step_pcoupl >= comm->globalcomm_step)
8213         {
8214             bBoxChanged = TRUE;
8215         }
8216     }
8217
8218     bNStGlobalComm = (step >= comm->globalcomm_step + nstglobalcomm);
8219
8220     if (!comm->bDynLoadBal)
8221     {
8222         bDoDLB = FALSE;
8223     }
8224     else
8225     {
8226         /* Should we do dynamic load balancing this step?
8227          * Since it requires (possibly expensive) global communication,
8228          * we might want to do DLB less frequently.
8229          */
8230         if (bBoxChanged || ir->epc != epcNO)
8231         {
8232             bDoDLB = bBoxChanged;
8233         }
8234         else
8235         {
8236             bDoDLB = bNStGlobalComm;
8237         }
8238     }
8239
8240     /* Check if we have recorded loads on the nodes */
8241     if (comm->bRecordLoad && dd_load_count(comm))
8242     {
8243         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
8244         {
8245             /* Check if we should use DLB at the second partitioning
8246              * and every 100 partitionings,
8247              * so the extra communication cost is negligible.
8248              */
8249             n = max(100,nstglobalcomm);
8250             bCheckDLB = (comm->n_load_collect == 0 ||
8251                          comm->n_load_have % n == n-1);
8252         }
8253         else
8254         {
8255             bCheckDLB = FALSE;
8256         }
8257         
8258         /* Print the load every nstlog steps and at the first and last step to the log file */
8259         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
8260                     comm->n_load_collect == 0 ||
8261                     (ir->nsteps >= 0 &&
8262                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
8263
8264         /* Avoid extra communication due to verbose screen output
8265          * when nstglobalcomm is set.
8266          */
8267         if (bDoDLB || bLogLoad || bCheckDLB ||
8268             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
8269         {
8270             get_load_distribution(dd,wcycle);
8271             if (DDMASTER(dd))
8272             {
8273                 if (bLogLoad)
8274                 {
8275                     dd_print_load(fplog,dd,step-1);
8276                 }
8277                 if (bVerbose)
8278                 {
8279                     dd_print_load_verbose(dd);
8280                 }
8281             }
8282             comm->n_load_collect++;
8283
8284             if (bCheckDLB) {
8285                 /* Since the timings are node dependent, the master decides */
8286                 if (DDMASTER(dd))
8287                 {
8288                     bTurnOnDLB =
8289                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
8290                     if (debug)
8291                     {
8292                         fprintf(debug,"step %s, imb loss %f\n",
8293                                 gmx_step_str(step,sbuf),
8294                                 dd_force_imb_perf_loss(dd));
8295                     }
8296                 }
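                     /* Broadcast the master's decision so that all ranks
                      * switch DLB on (or not) at the same step.
                      */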
8297                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
8298                 if (bTurnOnDLB)
8299                 {
8300                     turn_on_dlb(fplog,cr,step);
8301                     bDoDLB = TRUE;
8302                 }
8303             }
8304         }
8305         comm->n_load_have++;
8306     }
8307
8308     cgs_gl = &comm->cgs_gl;
8309
8310     bRedist = FALSE;
8311     if (bMasterState)
8312     {
8313         /* Clear the old state */
8314         clear_dd_indices(dd,0,0);
8315
8316         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
8317                   TRUE,cgs_gl,state_global->x,&ddbox);
8318     
8319         get_cg_distribution(fplog,step,dd,cgs_gl,
8320                             state_global->box,&ddbox,state_global->x);
8321         
8322         dd_distribute_state(dd,cgs_gl,
8323                             state_global,state_local,f);
8324         
8325         dd_make_local_cgs(dd,&top_local->cgs);
8326         
8327         if (dd->ncg_home > fr->cg_nalloc)
8328         {
8329             dd_realloc_fr_cg(fr,dd->ncg_home);
8330         }
8331         calc_cgcm(fplog,0,dd->ncg_home,
8332                   &top_local->cgs,state_local->x,fr->cg_cm);
8333         
8334         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8335         
8336         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8337
8338         cg0 = 0;
8339     }
8340     else if (state_local->ddp_count != dd->ddp_count)
8341     {
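             /* The local state carries a different partitioning count than
              * dd, e.g. after reading a checkpoint; rebuild the local
              * indices from the global cg indices stored in the state.
              */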
8342         if (state_local->ddp_count > dd->ddp_count)
8343         {
8344             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
8345         }
8346         
8347         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
8348         {
8349             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
8350         }
8351         
8352         /* Clear the old state */
8353         clear_dd_indices(dd,0,0);
8354         
8355         /* Build the new indices */
8356         rebuild_cgindex(dd,cgs_gl->index,state_local);
8357         make_dd_indices(dd,cgs_gl->index,0);
8358         
8359         /* Redetermine the cg COMs */
8360         calc_cgcm(fplog,0,dd->ncg_home,
8361                   &top_local->cgs,state_local->x,fr->cg_cm);
8362         
8363         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8364
8365         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8366
8367         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8368                   TRUE,&top_local->cgs,state_local->x,&ddbox);
8369
8370         bRedist = comm->bDynLoadBal;
8371     }
8372     else
8373     {
8374         /* We have the full state, only redistribute the cgs */
8375
8376         /* Clear the non-home indices */
8377         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
8378
8379         /* Avoid global communication for dims without pbc and -gcom */
8380         if (!bNStGlobalComm)
8381         {
8382             copy_rvec(comm->box0    ,ddbox.box0    );
8383             copy_rvec(comm->box_size,ddbox.box_size);
8384         }
8385         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8386                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
8387
8388         bBoxChanged = TRUE;
8389         bRedist = TRUE;
8390     }
8391     /* For dims without pbc and -gcom */
8392     copy_rvec(ddbox.box0    ,comm->box0    );
8393     copy_rvec(ddbox.box_size,comm->box_size);
8394     
8395     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
8396                       step,wcycle);
8397     
8398     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
8399     {
8400         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
8401     }
8402     
8403     /* Check if we should sort the charge groups */
8404     if (comm->nstSortCG > 0)
8405     {
8406         bSortCG = (bMasterState ||
8407                    (bRedist && (step % comm->nstSortCG == 0)));
8408     }
8409     else
8410     {
8411         bSortCG = FALSE;
8412     }
8413
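         /* Remember the old number of home charge groups; when the ns grid
          * dimensions do not change, dd_sort_state() can use the old order
          * to sort more efficiently (see the bResortAll check below).
          */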
8414     ncg_home_old = dd->ncg_home;
8415
8416     if (bRedist)
8417     {
8418         cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
8419                                  state_local,f,fr,mdatoms,
8420                                  !bSortCG,nrnb);
8421     }
8422     
8423     get_nsgrid_boundaries(fr->ns.grid,dd,
8424                           state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
8425                           dd->ncg_home,fr->cg_cm,
8426                           cell_ns_x0,cell_ns_x1,&grid_density);
8427
8428     if (bBoxChanged)
8429     {
8430         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
8431     }
8432
8433     copy_ivec(fr->ns.grid->n,ncells_old);
8434     grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
8435                state_local->box,cell_ns_x0,cell_ns_x1,
8436                fr->rlistlong,grid_density);
8437     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
8438     copy_ivec(ddbox.tric_dir,comm->tric_dir);
8439
8440     if (bSortCG)
8441     {
8442         /* Sort the state on charge group position.
8443          * This enables exact restarts from this step.
8444          * It also improves performance by about 15% with larger numbers
8445          * of atoms per node.
8446          */
8447         
8448         /* Fill the ns grid with the home cell,
8449          * so we can sort with the indices.
8450          */
8451         set_zones_ncg_home(dd);
8452         fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
8453                   0,dd->ncg_home,fr->cg_cm);
8454         
8455         /* Check if we can use the old order and ns grid cell indices
8456          * of the charge groups to sort the charge groups efficiently.
8457          */
8458         bResortAll = (bMasterState ||
8459                       fr->ns.grid->n[XX] != ncells_old[XX] ||
8460                       fr->ns.grid->n[YY] != ncells_old[YY] ||
8461                       fr->ns.grid->n[ZZ] != ncells_old[ZZ]);
8462
8463         if (debug)
8464         {
8465             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
8466                     gmx_step_str(step,sbuf),dd->ncg_home);
8467         }
8468         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
8469                       bResortAll ? -1 : ncg_home_old);
8470         /* Rebuild all the indices */
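             /* With cg0 = 0, make_dd_indices() below refills the
              * global-to-local lookup (ga2la) from scratch.
              */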
8471         cg0 = 0;
8472         ga2la_clear(dd->ga2la);
8473     }
8474     
8475     /* Set up the communication and communicate the coordinates */
8476     setup_dd_communication(dd,state_local->box,&ddbox,fr);
8477     
8478     /* Set the indices */
8479     make_dd_indices(dd,cgs_gl->index,cg0);
8480
8481     /* Set the charge group boundaries for neighbor searching */
8482     set_cg_boundaries(&comm->zones);
8483     
8484     /*
8485     write_dd_pdb("dd_home",step,"dump",top_global,cr,
8486                  -1,state_local->x,state_local->box);
8487     */
8488     
8489     /* Extract a local topology from the global topology */
8490     for(i=0; i<dd->ndim; i++)
8491     {
8492         np[dd->dim[i]] = comm->cd[i].np;
8493     }
8494     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
8495                       comm->cellsize_min,np,
8496                       fr,vsite,top_global,top_local);
8497     
8498     /* Set up the special atom communication */
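         /* n accumulates over the special-atom categories, so comm->nat[]
          * ends up holding cumulative atom counts: zone atoms, then atoms
          * added for vsite communication, then for constraint communication.
          */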
8499     n = comm->nat[ddnatZONE];
8500     for(i=ddnatZONE+1; i<ddnatNR; i++)
8501     {
8502         switch(i)
8503         {
8504         case ddnatVSITE:
8505             if (vsite && vsite->n_intercg_vsite)
8506             {
8507                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
8508             }
8509             break;
8510         case ddnatCON:
8511             if (dd->bInterCGcons)
8512             {
8513                 /* Only for inter-cg constraints we need special code */
8514                 n = dd_make_local_constraints(dd,n,top_global,
8515                                               constr,ir->nProjOrder,
8516                                               &top_local->idef.il[F_CONSTR]);
8517             }
8518             break;
8519         default:
8520             gmx_incons("Unknown special atom type setup");
8521         }
8522         comm->nat[i] = n;
8523     }
8524     
8525     /* Make space for the extra coordinates for virtual site
8526      * or constraint communication.
8527      */
8528     state_local->natoms = comm->nat[ddnatNR-1];
8529     if (state_local->natoms > state_local->nalloc)
8530     {
8531         dd_realloc_state(state_local,f,state_local->natoms);
8532     }
8533
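         /* Determine for how many atoms we need entries in the force array
          * that is kept out of the virial summation (f_novirsum); this
          * depends on inter-cg vsites and full electrostatics, see below.
          */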
8534     if (fr->bF_NoVirSum)
8535     {
8536         if (vsite && vsite->n_intercg_vsite)
8537         {
8538             nat_f_novirsum = comm->nat[ddnatVSITE];
8539         }
8540         else
8541         {
8542             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
8543             {
8544                 nat_f_novirsum = dd->nat_tot;
8545             }
8546             else
8547             {
8548                 nat_f_novirsum = dd->nat_home;
8549             }
8550         }
8551     }
8552     else
8553     {
8554         nat_f_novirsum = 0;
8555     }
8556
8557     /* Set the number of atoms required for the force calculation.
8558      * Forces need to be constrained when using a twin-range setup
8559      * or with energy minimization. For simple simulations we could
8560      * avoid some allocation, zeroing and copying, but this is
8561      * probably not worth the complications and checking.
8562      */
8563     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
8564                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
8565
8566     /* We make all the mdatoms up to nat_tot_con.
8567      * We could save some work by only setting invmass
8568      * between nat_tot and nat_tot_con.
8569      */
8570     /* This call also sets the new number of home particles to dd->nat_home */
8571     atoms2md(top_global,ir,
8572              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
8573
8574     /* Now we have the charges we can sort the FE interactions */
8575     dd_sort_local_top(dd,mdatoms,top_local);
8576
8577     if (shellfc)
8578     {
8579         /* Make the local shell stuff, currently no communication is done */
8580         make_local_shells(cr,mdatoms,shellfc);
8581     }
8582     
8583     if (ir->implicit_solvent)
8584     {
8585         make_local_gb(cr,fr->born,ir->gb_algorithm);
8586     }
8587         
8588     if (!(cr->duty & DUTY_PME))
8589     {
8590         /* Send the charges to our PME-only node */
8591         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
8592                        mdatoms->chargeA,mdatoms->chargeB,
8593                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
8594     }
8595     
8596     if (constr)
8597     {
8598         set_constraints(constr,top_local,ir,mdatoms,cr);
8599     }
8600     
8601     if (ir->ePull != epullNO)
8602     {
8603         /* Update the local pull groups */
8604         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
8605     }
8606     
8607     if (ir->bRot)
8608     {
8609         /* Update the local rotation groups */
8610         dd_make_local_rotation_groups(dd,ir->rot);
8611     }
8612
8613
8614     add_dd_statistics(dd);
8615     
8616     /* Make sure we only count the cycles for this DD partitioning */
8617     clear_dd_cycle_counts(dd);
8618     
8619     /* Because the order of the atoms might have changed since
8620      * the last vsite construction, we need to communicate the constructing
8621      * atom coordinates again (for spreading the forces this MD step).
8622      */
8623     dd_move_x_vsites(dd,state_local->box,state_local->x);
8624     
8625     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
8626     {
8627         dd_move_x(dd,state_local->box,state_local->x);
8628         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
8629                      -1,state_local->x,state_local->box);
8630     }
8631
8632     if (bNStGlobalComm)
8633     {
8634         /* Store the global communication step */
8635         comm->globalcomm_step = step;
8636     }
8637     
8638     /* Increase the DD partitioning counter */
8639     dd->ddp_count++;
8640     /* The state currently matches this DD partitioning count, store it */
8641     state_local->ddp_count = dd->ddp_count;
8642     if (bMasterState)
8643     {
8644         /* The DD master node knows the complete cg distribution,
8645          * store the count so we can possibly skip the cg info communication.
8646          */
8647         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
8648     }
8649
8650     if (comm->DD_debug > 0)
8651     {
8652         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
8653         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
8654                                 "after partitioning");
8655     }
8656 }