Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / mdlib / domdec.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  * 
4  * This file is part of Gromacs        Copyright (c) 1991-2008
5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * To help us fund GROMACS development, we humbly ask that you cite
13  * the research papers on the package. Check out http://www.gromacs.org
14  * 
15  * And Hey:
16  * Gnomes, ROck Monsters And Chili Sauce
17  */
18
19 #ifdef HAVE_CONFIG_H
20 #include <config.h>
21 #endif
22
23 #include <stdio.h>
24 #include <time.h>
25 #include <math.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include "typedefs.h"
29 #include "smalloc.h"
30 #include "vec.h"
31 #include "domdec.h"
32 #include "domdec_network.h"
33 #include "nrnb.h"
34 #include "pbc.h"
35 #include "chargegroup.h"
36 #include "constr.h"
37 #include "mdatoms.h"
38 #include "names.h"
39 #include "pdbio.h"
40 #include "futil.h"
41 #include "force.h"
42 #include "pme.h"
43 #include "pull.h"
44 #include "pull_rotation.h"
45 #include "gmx_wallcycle.h"
46 #include "mdrun.h"
47 #include "nsgrid.h"
48 #include "shellfc.h"
49 #include "mtop_util.h"
50 #include "gmxfio.h"
51 #include "gmx_ga2la.h"
52 #include "gmx_sort.h"
53 #include "macros.h"
54
55 #ifdef GMX_LIB_MPI
56 #include <mpi.h>
57 #endif
58 #ifdef GMX_THREAD_MPI
59 #include "tmpi.h"
60 #endif
61
62 #define DDRANK(dd,rank)    (rank)
63 #define DDMASTERRANK(dd)   (dd->masterrank)
64
65 typedef struct gmx_domdec_master
66 {
67     /* The cell boundaries */
68     real **cell_x;
69     /* The global charge group division */
70     int  *ncg;     /* Number of home charge groups for each node */
71     int  *index;   /* Index of nnodes+1 into cg */
72     int  *cg;      /* Global charge group index */
73     int  *nat;     /* Number of home atoms for each node. */
74     int  *ibuf;    /* Buffer for communication */
75     rvec *vbuf;    /* Buffer for state scattering and gathering */
76 } gmx_domdec_master_t;
77
78 typedef struct
79 {
80     /* The numbers of charge groups to send and receive for each cell
81      * that requires communication, the last entry contains the total
82      * number of atoms that needs to be communicated.
83      */
84     int nsend[DD_MAXIZONE+2];
85     int nrecv[DD_MAXIZONE+2];
86     /* The charge groups to send */
87     int *index;
88     int nalloc;
89     /* The atom range for non-in-place communication */
90     int cell2at0[DD_MAXIZONE];
91     int cell2at1[DD_MAXIZONE];
92 } gmx_domdec_ind_t;
93
94 typedef struct
95 {
96     int  np;                   /* Number of grid pulses in this dimension */
97     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
98     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
99     int  np_nalloc;
100     gmx_bool bInPlace;             /* Can we communicate in place?            */
101 } gmx_domdec_comm_dim_t;
102
103 typedef struct
104 {
105     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
106     real *cell_f;      /* State var.: cell boundaries, box relative      */
107     real *old_cell_f;  /* Temp. var.: old cell size                      */
108     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
109     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
110     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
111     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
112     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
113     real *buf_ncd;     /* Temp. var.                                     */
114 } gmx_domdec_root_t;
115
116 #define DD_NLOAD_MAX 9
117
118 /* Here floats are accurate enough, since these variables
119  * only influence the load balancing, not the actual MD results.
120  */
121 typedef struct
122 {
123     int  nload;
124     float *load;
125     float sum;
126     float max;
127     float sum_m;
128     float cvol_min;
129     float mdf;
130     float pme;
131     int   flags;
132 } gmx_domdec_load_t;
133
134 typedef struct
135 {
136     int  nsc;
137     int  ind_gl;
138     int  ind;
139 } gmx_cgsort_t;
140
141 typedef struct
142 {
143     gmx_cgsort_t *sort1,*sort2;
144     int  sort_nalloc;
145     gmx_cgsort_t *sort_new;
146     int  sort_new_nalloc;
147     int  *ibuf;
148     int  ibuf_nalloc;
149 } gmx_domdec_sort_t;
150
151 typedef struct
152 {
153     rvec *v;
154     int  nalloc;
155 } vec_rvec_t;
156
157 /* This enum determines the order of the coordinates.
158  * ddnatHOME and ddnatZONE should be first and second,
159  * the others can be ordered as wanted.
160  */
161 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
162
163 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
164 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
165
166 typedef struct
167 {
168     int  dim;      /* The dimension                                          */
169     gmx_bool dim_match;/* Tells if DD and PME dims match                         */
170     int  nslab;    /* The number of PME slabs in this dimension              */
171     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
172     int  *pp_min;  /* The minimum pp node location, size nslab               */
173     int  *pp_max;  /* The maximum pp node location,size nslab                */
174     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
175 } gmx_ddpme_t;
176
177 typedef struct
178 {
179     real min0;    /* The minimum bottom of this zone                        */
180     real max1;    /* The maximum top of this zone                           */
181     real mch0;    /* The maximum bottom communicaton height for this zone   */
182     real mch1;    /* The maximum top communicaton height for this zone      */
183     real p1_0;    /* The bottom value of the first cell in this zone        */
184     real p1_1;    /* The top value of the first cell in this zone           */
185 } gmx_ddzone_t;
186
187 typedef struct gmx_domdec_comm
188 {
189     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
190      * unless stated otherwise.
191      */
192
193     /* The number of decomposition dimensions for PME, 0: no PME */
194     int  npmedecompdim;
195     /* The number of nodes doing PME (PP/PME or only PME) */
196     int  npmenodes;
197     int  npmenodes_x;
198     int  npmenodes_y;
199     /* The communication setup including the PME only nodes */
200     gmx_bool bCartesianPP_PME;
201     ivec ntot;
202     int  cartpmedim;
203     int  *pmenodes;          /* size npmenodes                         */
204     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
205                               * but with bCartesianPP_PME              */
206     gmx_ddpme_t ddpme[2];
207     
208     /* The DD particle-particle nodes only */
209     gmx_bool bCartesianPP;
210     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
211     
212     /* The global charge groups */
213     t_block cgs_gl;
214
215     /* Should we sort the cgs */
216     int  nstSortCG;
217     gmx_domdec_sort_t *sort;
218     
219     /* Are there bonded and multi-body interactions between charge groups? */
220     gmx_bool bInterCGBondeds;
221     gmx_bool bInterCGMultiBody;
222
223     /* Data for the optional bonded interaction atom communication range */
224     gmx_bool bBondComm;
225     t_blocka *cglink;
226     char *bLocalCG;
227
228     /* The DLB option */
229     int  eDLB;
230     /* Are we actually using DLB? */
231     gmx_bool bDynLoadBal;
232
233     /* Cell sizes for static load balancing, first index cartesian */
234     real **slb_frac;
235     
236     /* The width of the communicated boundaries */
237     real cutoff_mbody;
238     real cutoff;
239     /* The minimum cell size (including triclinic correction) */
240     rvec cellsize_min;
241     /* For dlb, for use with edlbAUTO */
242     rvec cellsize_min_dlb;
243     /* The lower limit for the DD cell size with DLB */
244     real cellsize_limit;
245     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
246     gmx_bool bVacDLBNoLimit;
247
248     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
249     ivec tric_dir;
250     /* box0 and box_size are required with dim's without pbc and -gcom */
251     rvec box0;
252     rvec box_size;
253     
254     /* The cell boundaries */
255     rvec cell_x0;
256     rvec cell_x1;
257
258     /* The old location of the cell boundaries, to check cg displacements */
259     rvec old_cell_x0;
260     rvec old_cell_x1;
261
262     /* The communication setup and charge group boundaries for the zones */
263     gmx_domdec_zones_t zones;
264     
265     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
266      * cell boundaries of neighboring cells for dynamic load balancing.
267      */
268     gmx_ddzone_t zone_d1[2];
269     gmx_ddzone_t zone_d2[2][2];
270     
271     /* The coordinate/force communication setup and indices */
272     gmx_domdec_comm_dim_t cd[DIM];
273     /* The maximum number of cells to communicate with in one dimension */
274     int  maxpulse;
275     
276     /* Which cg distribution is stored on the master node */
277     int master_cg_ddp_count;
278     
279     /* The number of cg's received from the direct neighbors */
280     int  zone_ncg1[DD_MAXZONE];
281     
282     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
283     int  nat[ddnatNR];
284     
285     /* Communication buffer for general use */
286     int  *buf_int;
287     int  nalloc_int;
288
289      /* Communication buffer for general use */
290     vec_rvec_t vbuf;
291     
292     /* Communication buffers only used with multiple grid pulses */
293     int  *buf_int2;
294     int  nalloc_int2;
295     vec_rvec_t vbuf2;
296     
297     /* Communication buffers for local redistribution */
298     int  **cggl_flag;
299     int  cggl_flag_nalloc[DIM*2];
300     rvec **cgcm_state;
301     int  cgcm_state_nalloc[DIM*2];
302     
303     /* Cell sizes for dynamic load balancing */
304     gmx_domdec_root_t **root;
305     real *cell_f_row;
306     real cell_f0[DIM];
307     real cell_f1[DIM];
308     real cell_f_max0[DIM];
309     real cell_f_min1[DIM];
310     
311     /* Stuff for load communication */
312     gmx_bool bRecordLoad;
313     gmx_domdec_load_t *load;
314 #ifdef GMX_MPI
315     MPI_Comm *mpi_comm_load;
316 #endif
317
318     /* Maximum DLB scaling per load balancing step in percent */
319     int dlb_scale_lim;
320
321     /* Cycle counters */
322     float cycl[ddCyclNr];
323     int   cycl_n[ddCyclNr];
324     float cycl_max[ddCyclNr];
325     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
326     int eFlop;
327     double flop;
328     int    flop_n;
329     /* Have often have did we have load measurements */
330     int    n_load_have;
331     /* Have often have we collected the load measurements */
332     int    n_load_collect;
333     
334     /* Statistics */
335     double sum_nat[ddnatNR-ddnatZONE];
336     int    ndecomp;
337     int    nload;
338     double load_step;
339     double load_sum;
340     double load_max;
341     ivec   load_lim;
342     double load_mdf;
343     double load_pme;
344
345     /* The last partition step */
346     gmx_large_int_t globalcomm_step;
347
348     /* Debugging */
349     int  nstDDDump;
350     int  nstDDDumpGrid;
351     int  DD_debug;
352 } gmx_domdec_comm_t;
353
354 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
355 #define DD_CGIBS 2
356
357 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
358 #define DD_FLAG_NRCG  65535
359 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
360 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
361
362 /* Zone permutation required to obtain consecutive charge groups
363  * for neighbor searching.
364  */
365 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
366
367 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
368  * components see only j zones with that component 0.
369  */
370
371 /* The DD zone order */
372 static const ivec dd_zo[DD_MAXZONE] =
373   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
374
375 /* The 3D setup */
376 #define dd_z3n  8
377 #define dd_zp3n 4
378 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
379
380 /* The 2D setup */
381 #define dd_z2n  4
382 #define dd_zp2n 2
383 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
384
385 /* The 1D setup */
386 #define dd_z1n  2
387 #define dd_zp1n 1
388 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
389
390 /* Factors used to avoid problems due to rounding issues */
391 #define DD_CELL_MARGIN       1.0001
392 #define DD_CELL_MARGIN2      1.00005
393 /* Factor to account for pressure scaling during nstlist steps */
394 #define DD_PRES_SCALE_MARGIN 1.02
395
396 /* Allowed performance loss before we DLB or warn */
397 #define DD_PERF_LOSS 0.05
398
399 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
400
401 /* Use separate MPI send and receive commands
402  * when nnodes <= GMX_DD_NNODES_SENDRECV.
403  * This saves memory (and some copying for small nnodes).
404  * For high parallelization scatter and gather calls are used.
405  */
406 #define GMX_DD_NNODES_SENDRECV 4
407
408
409 /*
410 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
411
412 static void index2xyz(ivec nc,int ind,ivec xyz)
413 {
414   xyz[XX] = ind % nc[XX];
415   xyz[YY] = (ind / nc[XX]) % nc[YY];
416   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
417 }
418 */
419
420 /* This order is required to minimize the coordinate communication in PME
421  * which uses decomposition in the x direction.
422  */
423 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
424
425 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
426 {
427     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
428     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
429     xyz[ZZ] = ind % nc[ZZ];
430 }
431
432 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
433 {
434     int ddindex;
435     int ddnodeid=-1;
436     
437     ddindex = dd_index(dd->nc,c);
438     if (dd->comm->bCartesianPP_PME)
439     {
440         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
441     }
442     else if (dd->comm->bCartesianPP)
443     {
444 #ifdef GMX_MPI
445         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
446 #endif
447     }
448     else
449     {
450         ddnodeid = ddindex;
451     }
452     
453     return ddnodeid;
454 }
455
456 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
457 {
458     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
459 }
460
461 int ddglatnr(gmx_domdec_t *dd,int i)
462 {
463     int atnr;
464     
465     if (dd == NULL)
466     {
467         atnr = i + 1;
468     }
469     else
470     {
471         if (i >= dd->comm->nat[ddnatNR-1])
472         {
473             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
474         }
475         atnr = dd->gatindex[i] + 1;
476     }
477     
478     return atnr;
479 }
480
481 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
482 {
483     return &dd->comm->cgs_gl;
484 }
485
486 static void vec_rvec_init(vec_rvec_t *v)
487 {
488     v->nalloc = 0;
489     v->v      = NULL;
490 }
491
492 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
493 {
494     if (n > v->nalloc)
495     {
496         v->nalloc = over_alloc_dd(n);
497         srenew(v->v,v->nalloc);
498     }
499 }
500
501 void dd_store_state(gmx_domdec_t *dd,t_state *state)
502 {
503     int i;
504     
505     if (state->ddp_count != dd->ddp_count)
506     {
507         gmx_incons("The state does not the domain decomposition state");
508     }
509     
510     state->ncg_gl = dd->ncg_home;
511     if (state->ncg_gl > state->cg_gl_nalloc)
512     {
513         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
514         srenew(state->cg_gl,state->cg_gl_nalloc);
515     }
516     for(i=0; i<state->ncg_gl; i++)
517     {
518         state->cg_gl[i] = dd->index_gl[i];
519     }
520     
521     state->ddp_count_cg_gl = dd->ddp_count;
522 }
523
524 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
525 {
526     return &dd->comm->zones;
527 }
528
529 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
530                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
531 {
532     gmx_domdec_zones_t *zones;
533     int izone,d,dim;
534
535     zones = &dd->comm->zones;
536
537     izone = 0;
538     while (icg >= zones->izone[izone].cg1)
539     {
540         izone++;
541     }
542     
543     if (izone == 0)
544     {
545         *jcg0 = icg;
546     }
547     else if (izone < zones->nizone)
548     {
549         *jcg0 = zones->izone[izone].jcg0;
550     }
551     else
552     {
553         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
554                   icg,izone,zones->nizone);
555     }
556         
557     *jcg1 = zones->izone[izone].jcg1;
558     
559     for(d=0; d<dd->ndim; d++)
560     {
561         dim = dd->dim[d];
562         shift0[dim] = zones->izone[izone].shift0[dim];
563         shift1[dim] = zones->izone[izone].shift1[dim];
564         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
565         {
566             /* A conservative approach, this can be optimized */
567             shift0[dim] -= 1;
568             shift1[dim] += 1;
569         }
570     }
571 }
572
573 int dd_natoms_vsite(gmx_domdec_t *dd)
574 {
575     return dd->comm->nat[ddnatVSITE];
576 }
577
578 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
579 {
580     *at_start = dd->comm->nat[ddnatCON-1];
581     *at_end   = dd->comm->nat[ddnatCON];
582 }
583
584 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
585 {
586     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
587     int  *index,*cgindex;
588     gmx_domdec_comm_t *comm;
589     gmx_domdec_comm_dim_t *cd;
590     gmx_domdec_ind_t *ind;
591     rvec shift={0,0,0},*buf,*rbuf;
592     gmx_bool bPBC,bScrew;
593     
594     comm = dd->comm;
595     
596     cgindex = dd->cgindex;
597     
598     buf = comm->vbuf.v;
599
600     nzone = 1;
601     nat_tot = dd->nat_home;
602     for(d=0; d<dd->ndim; d++)
603     {
604         bPBC   = (dd->ci[dd->dim[d]] == 0);
605         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
606         if (bPBC)
607         {
608             copy_rvec(box[dd->dim[d]],shift);
609         }
610         cd = &comm->cd[d];
611         for(p=0; p<cd->np; p++)
612         {
613             ind = &cd->ind[p];
614             index = ind->index;
615             n = 0;
616             if (!bPBC)
617             {
618                 for(i=0; i<ind->nsend[nzone]; i++)
619                 {
620                     at0 = cgindex[index[i]];
621                     at1 = cgindex[index[i]+1];
622                     for(j=at0; j<at1; j++)
623                     {
624                         copy_rvec(x[j],buf[n]);
625                         n++;
626                     }
627                 }
628             }
629             else if (!bScrew)
630             {
631                 for(i=0; i<ind->nsend[nzone]; i++)
632                 {
633                     at0 = cgindex[index[i]];
634                     at1 = cgindex[index[i]+1];
635                     for(j=at0; j<at1; j++)
636                     {
637                         /* We need to shift the coordinates */
638                         rvec_add(x[j],shift,buf[n]);
639                         n++;
640                     }
641                 }
642             }
643             else
644             {
645                 for(i=0; i<ind->nsend[nzone]; i++)
646                 {
647                     at0 = cgindex[index[i]];
648                     at1 = cgindex[index[i]+1];
649                     for(j=at0; j<at1; j++)
650                     {
651                         /* Shift x */
652                         buf[n][XX] = x[j][XX] + shift[XX];
653                         /* Rotate y and z.
654                          * This operation requires a special shift force
655                          * treatment, which is performed in calc_vir.
656                          */
657                         buf[n][YY] = box[YY][YY] - x[j][YY];
658                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
659                         n++;
660                     }
661                 }
662             }
663             
664             if (cd->bInPlace)
665             {
666                 rbuf = x + nat_tot;
667             }
668             else
669             {
670                 rbuf = comm->vbuf2.v;
671             }
672             /* Send and receive the coordinates */
673             dd_sendrecv_rvec(dd, d, dddirBackward,
674                              buf,  ind->nsend[nzone+1],
675                              rbuf, ind->nrecv[nzone+1]);
676             if (!cd->bInPlace)
677             {
678                 j = 0;
679                 for(zone=0; zone<nzone; zone++)
680                 {
681                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
682                     {
683                         copy_rvec(rbuf[j],x[i]);
684                         j++;
685                     }
686                 }
687             }
688             nat_tot += ind->nrecv[nzone+1];
689         }
690         nzone += nzone;
691     }
692 }
693
694 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
695 {
696     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
697     int  *index,*cgindex;
698     gmx_domdec_comm_t *comm;
699     gmx_domdec_comm_dim_t *cd;
700     gmx_domdec_ind_t *ind;
701     rvec *buf,*sbuf;
702     ivec vis;
703     int  is;
704     gmx_bool bPBC,bScrew;
705     
706     comm = dd->comm;
707     
708     cgindex = dd->cgindex;
709
710     buf = comm->vbuf.v;
711
712     n = 0;
713     nzone = comm->zones.n/2;
714     nat_tot = dd->nat_tot;
715     for(d=dd->ndim-1; d>=0; d--)
716     {
717         bPBC   = (dd->ci[dd->dim[d]] == 0);
718         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
719         if (fshift == NULL && !bScrew)
720         {
721             bPBC = FALSE;
722         }
723         /* Determine which shift vector we need */
724         clear_ivec(vis);
725         vis[dd->dim[d]] = 1;
726         is = IVEC2IS(vis);
727         
728         cd = &comm->cd[d];
729         for(p=cd->np-1; p>=0; p--) {
730             ind = &cd->ind[p];
731             nat_tot -= ind->nrecv[nzone+1];
732             if (cd->bInPlace)
733             {
734                 sbuf = f + nat_tot;
735             }
736             else
737             {
738                 sbuf = comm->vbuf2.v;
739                 j = 0;
740                 for(zone=0; zone<nzone; zone++)
741                 {
742                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
743                     {
744                         copy_rvec(f[i],sbuf[j]);
745                         j++;
746                     }
747                 }
748             }
749             /* Communicate the forces */
750             dd_sendrecv_rvec(dd, d, dddirForward,
751                              sbuf, ind->nrecv[nzone+1],
752                              buf,  ind->nsend[nzone+1]);
753             index = ind->index;
754             /* Add the received forces */
755             n = 0;
756             if (!bPBC)
757             {
758                 for(i=0; i<ind->nsend[nzone]; i++)
759                 {
760                     at0 = cgindex[index[i]];
761                     at1 = cgindex[index[i]+1];
762                     for(j=at0; j<at1; j++)
763                     {
764                         rvec_inc(f[j],buf[n]);
765                         n++;
766                     }
767                 } 
768             }
769             else if (!bScrew)
770             {
771                 for(i=0; i<ind->nsend[nzone]; i++)
772                 {
773                     at0 = cgindex[index[i]];
774                     at1 = cgindex[index[i]+1];
775                     for(j=at0; j<at1; j++)
776                     {
777                         rvec_inc(f[j],buf[n]);
778                         /* Add this force to the shift force */
779                         rvec_inc(fshift[is],buf[n]);
780                         n++;
781                     }
782                 }
783             }
784             else
785             {
786                 for(i=0; i<ind->nsend[nzone]; i++)
787                 {
788                     at0 = cgindex[index[i]];
789                     at1 = cgindex[index[i]+1];
790                     for(j=at0; j<at1; j++)
791                     {
792                         /* Rotate the force */
793                         f[j][XX] += buf[n][XX];
794                         f[j][YY] -= buf[n][YY];
795                         f[j][ZZ] -= buf[n][ZZ];
796                         if (fshift)
797                         {
798                             /* Add this force to the shift force */
799                             rvec_inc(fshift[is],buf[n]);
800                         }
801                         n++;
802                     }
803                 }
804             }
805         }
806         nzone /= 2;
807     }
808 }
809
810 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
811 {
812     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
813     int  *index,*cgindex;
814     gmx_domdec_comm_t *comm;
815     gmx_domdec_comm_dim_t *cd;
816     gmx_domdec_ind_t *ind;
817     real *buf,*rbuf;
818     
819     comm = dd->comm;
820     
821     cgindex = dd->cgindex;
822     
823     buf = &comm->vbuf.v[0][0];
824
825     nzone = 1;
826     nat_tot = dd->nat_home;
827     for(d=0; d<dd->ndim; d++)
828     {
829         cd = &comm->cd[d];
830         for(p=0; p<cd->np; p++)
831         {
832             ind = &cd->ind[p];
833             index = ind->index;
834             n = 0;
835             for(i=0; i<ind->nsend[nzone]; i++)
836             {
837                 at0 = cgindex[index[i]];
838                 at1 = cgindex[index[i]+1];
839                 for(j=at0; j<at1; j++)
840                 {
841                     buf[n] = v[j];
842                     n++;
843                 }
844             }
845             
846             if (cd->bInPlace)
847             {
848                 rbuf = v + nat_tot;
849             }
850             else
851             {
852                 rbuf = &comm->vbuf2.v[0][0];
853             }
854             /* Send and receive the coordinates */
855             dd_sendrecv_real(dd, d, dddirBackward,
856                              buf,  ind->nsend[nzone+1],
857                              rbuf, ind->nrecv[nzone+1]);
858             if (!cd->bInPlace)
859             {
860                 j = 0;
861                 for(zone=0; zone<nzone; zone++)
862                 {
863                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
864                     {
865                         v[i] = rbuf[j];
866                         j++;
867                     }
868                 }
869             }
870             nat_tot += ind->nrecv[nzone+1];
871         }
872         nzone += nzone;
873     }
874 }
875
876 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
877 {
878     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
879     int  *index,*cgindex;
880     gmx_domdec_comm_t *comm;
881     gmx_domdec_comm_dim_t *cd;
882     gmx_domdec_ind_t *ind;
883     real *buf,*sbuf;
884     
885     comm = dd->comm;
886     
887     cgindex = dd->cgindex;
888
889     buf = &comm->vbuf.v[0][0];
890
891     n = 0;
892     nzone = comm->zones.n/2;
893     nat_tot = dd->nat_tot;
894     for(d=dd->ndim-1; d>=0; d--)
895     {
896         cd = &comm->cd[d];
897         for(p=cd->np-1; p>=0; p--) {
898             ind = &cd->ind[p];
899             nat_tot -= ind->nrecv[nzone+1];
900             if (cd->bInPlace)
901             {
902                 sbuf = v + nat_tot;
903             }
904             else
905             {
906                 sbuf = &comm->vbuf2.v[0][0];
907                 j = 0;
908                 for(zone=0; zone<nzone; zone++)
909                 {
910                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
911                     {
912                         sbuf[j] = v[i];
913                         j++;
914                     }
915                 }
916             }
917             /* Communicate the forces */
918             dd_sendrecv_real(dd, d, dddirForward,
919                              sbuf, ind->nrecv[nzone+1],
920                              buf,  ind->nsend[nzone+1]);
921             index = ind->index;
922             /* Add the received forces */
923             n = 0;
924             for(i=0; i<ind->nsend[nzone]; i++)
925             {
926                 at0 = cgindex[index[i]];
927                 at1 = cgindex[index[i]+1];
928                 for(j=at0; j<at1; j++)
929                 {
930                     v[j] += buf[n];
931                     n++;
932                 }
933             } 
934         }
935         nzone /= 2;
936     }
937 }
938
939 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
940 {
941     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
942             d,i,j,
943             zone->min0,zone->max1,
944             zone->mch0,zone->mch0,
945             zone->p1_0,zone->p1_1);
946 }
947
948 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
949                                int ddimind,int direction,
950                                gmx_ddzone_t *buf_s,int n_s,
951                                gmx_ddzone_t *buf_r,int n_r)
952 {
953     rvec vbuf_s[5*2],vbuf_r[5*2];
954     int i;
955
956     for(i=0; i<n_s; i++)
957     {
958         vbuf_s[i*2  ][0] = buf_s[i].min0;
959         vbuf_s[i*2  ][1] = buf_s[i].max1;
960         vbuf_s[i*2  ][2] = buf_s[i].mch0;
961         vbuf_s[i*2+1][0] = buf_s[i].mch1;
962         vbuf_s[i*2+1][1] = buf_s[i].p1_0;
963         vbuf_s[i*2+1][2] = buf_s[i].p1_1;
964     }
965
966     dd_sendrecv_rvec(dd, ddimind, direction,
967                      vbuf_s, n_s*2,
968                      vbuf_r, n_r*2);
969
970     for(i=0; i<n_r; i++)
971     {
972         buf_r[i].min0 = vbuf_r[i*2  ][0];
973         buf_r[i].max1 = vbuf_r[i*2  ][1];
974         buf_r[i].mch0 = vbuf_r[i*2  ][2];
975         buf_r[i].mch1 = vbuf_r[i*2+1][0];
976         buf_r[i].p1_0 = vbuf_r[i*2+1][1];
977         buf_r[i].p1_1 = vbuf_r[i*2+1][2];
978     }
979 }
980
981 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
982                           rvec cell_ns_x0,rvec cell_ns_x1)
983 {
984     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
985     gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
986     rvec extr_s[2],extr_r[2];
987     rvec dh;
988     real dist_d,c=0,det;
989     gmx_domdec_comm_t *comm;
990     gmx_bool bPBC,bUse;
991
992     comm = dd->comm;
993
994     for(d=1; d<dd->ndim; d++)
995     {
996         dim = dd->dim[d];
997         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
998         zp->min0 = cell_ns_x0[dim];
999         zp->max1 = cell_ns_x1[dim];
1000         zp->mch0 = cell_ns_x0[dim];
1001         zp->mch1 = cell_ns_x1[dim];
1002         zp->p1_0 = cell_ns_x0[dim];
1003         zp->p1_1 = cell_ns_x1[dim];
1004     }
1005     
1006     for(d=dd->ndim-2; d>=0; d--)
1007     {
1008         dim  = dd->dim[d];
1009         bPBC = (dim < ddbox->npbcdim);
1010
1011         /* Use an rvec to store two reals */
1012         extr_s[d][0] = comm->cell_f0[d+1];
1013         extr_s[d][1] = comm->cell_f1[d+1];
1014         extr_s[d][2] = 0;
1015
1016         pos = 0;
1017         /* Store the extremes in the backward sending buffer,
1018          * so the get updated separately from the forward communication.
1019          */
1020         for(d1=d; d1<dd->ndim-1; d1++)
1021         {
1022             /* We invert the order to be able to use the same loop for buf_e */
1023             buf_s[pos].min0 = extr_s[d1][1];
1024             buf_s[pos].max1 = extr_s[d1][0];
1025             buf_s[pos].mch0 = 0;
1026             buf_s[pos].mch1 = 0;
1027             /* Store the cell corner of the dimension we communicate along */
1028             buf_s[pos].p1_0 = comm->cell_x0[dim];
1029             buf_s[pos].p1_1 = 0;
1030             pos++;
1031         }
1032
1033         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1034         pos++;
1035
1036         if (dd->ndim == 3 && d == 0)
1037         {
1038             buf_s[pos] = comm->zone_d2[0][1];
1039             pos++;
1040             buf_s[pos] = comm->zone_d1[0];
1041             pos++;
1042         }
1043
1044         /* We only need to communicate the extremes
1045          * in the forward direction
1046          */
1047         npulse = comm->cd[d].np;
1048         if (bPBC)
1049         {
1050             /* Take the minimum to avoid double communication */
1051             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1052         }
1053         else
1054         {
1055             /* Without PBC we should really not communicate over
1056              * the boundaries, but implementing that complicates
1057              * the communication setup and therefore we simply
1058              * do all communication, but ignore some data.
1059              */
1060             npulse_min = npulse;
1061         }
1062         for(p=0; p<npulse_min; p++)
1063         {
1064             /* Communicate the extremes forward */
1065             bUse = (bPBC || dd->ci[dim] > 0);
1066
1067             dd_sendrecv_rvec(dd, d, dddirForward,
1068                              extr_s+d, dd->ndim-d-1,
1069                              extr_r+d, dd->ndim-d-1);
1070
1071             if (bUse)
1072             {
1073                 for(d1=d; d1<dd->ndim-1; d1++)
1074                 {
1075                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1076                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1077                 }
1078             }
1079         }
1080
1081         buf_size = pos;
1082         for(p=0; p<npulse; p++)
1083         {
1084             /* Communicate all the zone information backward */
1085             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1086
1087             dd_sendrecv_ddzone(dd, d, dddirBackward,
1088                                buf_s, buf_size,
1089                                buf_r, buf_size);
1090
1091             clear_rvec(dh);
1092             if (p > 0)
1093             {
1094                 for(d1=d+1; d1<dd->ndim; d1++)
1095                 {
1096                     /* Determine the decrease of maximum required
1097                      * communication height along d1 due to the distance along d,
1098                      * this avoids a lot of useless atom communication.
1099                      */
1100                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1101
1102                     if (ddbox->tric_dir[dim])
1103                     {
1104                         /* c is the off-diagonal coupling between the cell planes
1105                          * along directions d and d1.
1106                          */
1107                         c = ddbox->v[dim][dd->dim[d1]][dim];
1108                     }
1109                     else
1110                     {
1111                         c = 0;
1112                     }
1113                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1114                     if (det > 0)
1115                     {
1116                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1117                     }
1118                     else
1119                     {
1120                         /* A negative value signals out of range */
1121                         dh[d1] = -1;
1122                     }
1123                 }
1124             }
1125
1126             /* Accumulate the extremes over all pulses */
1127             for(i=0; i<buf_size; i++)
1128             {
1129                 if (p == 0)
1130                 {
1131                     buf_e[i] = buf_r[i];
1132                 }
1133                 else
1134                 {
1135                     if (bUse)
1136                     {
1137                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1138                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1139                     }
1140
1141                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1142                     {
1143                         d1 = 1;
1144                     }
1145                     else
1146                     {
1147                         d1 = d + 1;
1148                     }
1149                     if (bUse && dh[d1] >= 0)
1150                     {
1151                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1152                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1153                     }
1154                 }
1155                 /* Copy the received buffer to the send buffer,
1156                  * to pass the data through with the next pulse.
1157                  */
1158                 buf_s[i] = buf_r[i];
1159             }
1160             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1161                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1162             {
1163                 /* Store the extremes */ 
1164                 pos = 0;
1165
1166                 for(d1=d; d1<dd->ndim-1; d1++)
1167                 {
1168                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1169                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1170                     pos++;
1171                 }
1172
1173                 if (d == 1 || (d == 0 && dd->ndim == 3))
1174                 {
1175                     for(i=d; i<2; i++)
1176                     {
1177                         comm->zone_d2[1-d][i] = buf_e[pos];
1178                         pos++;
1179                     }
1180                 }
1181                 if (d == 0)
1182                 {
1183                     comm->zone_d1[1] = buf_e[pos];
1184                     pos++;
1185                 }
1186             }
1187         }
1188     }
1189     
1190     if (dd->ndim >= 2)
1191     {
1192         dim = dd->dim[1];
1193         for(i=0; i<2; i++)
1194         {
1195             if (debug)
1196             {
1197                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1198             }
1199             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1200             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1201         }
1202     }
1203     if (dd->ndim >= 3)
1204     {
1205         dim = dd->dim[2];
1206         for(i=0; i<2; i++)
1207         {
1208             for(j=0; j<2; j++)
1209             {
1210                 if (debug)
1211                 {
1212                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1213                 }
1214                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1215                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1216             }
1217         }
1218     }
1219     for(d=1; d<dd->ndim; d++)
1220     {
1221         comm->cell_f_max0[d] = extr_s[d-1][0];
1222         comm->cell_f_min1[d] = extr_s[d-1][1];
1223         if (debug)
1224         {
1225             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1226                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1227         }
1228     }
1229 }
1230
1231 static void dd_collect_cg(gmx_domdec_t *dd,
1232                           t_state *state_local)
1233 {
1234     gmx_domdec_master_t *ma=NULL;
1235     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1236     t_block *cgs_gl;
1237
1238     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1239     {
1240         /* The master has the correct distribution */
1241         return;
1242     }
1243     
1244     if (state_local->ddp_count == dd->ddp_count)
1245     {
1246         ncg_home = dd->ncg_home;
1247         cg       = dd->index_gl;
1248         nat_home = dd->nat_home;
1249     } 
1250     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1251     {
1252         cgs_gl = &dd->comm->cgs_gl;
1253
1254         ncg_home = state_local->ncg_gl;
1255         cg       = state_local->cg_gl;
1256         nat_home = 0;
1257         for(i=0; i<ncg_home; i++)
1258         {
1259             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1260         }
1261     }
1262     else
1263     {
1264         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1265     }
1266     
1267     buf2[0] = dd->ncg_home;
1268     buf2[1] = dd->nat_home;
1269     if (DDMASTER(dd))
1270     {
1271         ma = dd->ma;
1272         ibuf = ma->ibuf;
1273     }
1274     else
1275     {
1276         ibuf = NULL;
1277     }
1278     /* Collect the charge group and atom counts on the master */
1279     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1280     
1281     if (DDMASTER(dd))
1282     {
1283         ma->index[0] = 0;
1284         for(i=0; i<dd->nnodes; i++)
1285         {
1286             ma->ncg[i] = ma->ibuf[2*i];
1287             ma->nat[i] = ma->ibuf[2*i+1];
1288             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1289             
1290         }
1291         /* Make byte counts and indices */
1292         for(i=0; i<dd->nnodes; i++)
1293         {
1294             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1295             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1296         }
1297         if (debug)
1298         {
1299             fprintf(debug,"Initial charge group distribution: ");
1300             for(i=0; i<dd->nnodes; i++)
1301                 fprintf(debug," %d",ma->ncg[i]);
1302             fprintf(debug,"\n");
1303         }
1304     }
1305     
1306     /* Collect the charge group indices on the master */
1307     dd_gatherv(dd,
1308                dd->ncg_home*sizeof(int),dd->index_gl,
1309                DDMASTER(dd) ? ma->ibuf : NULL,
1310                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1311                DDMASTER(dd) ? ma->cg : NULL);
1312     
1313     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1314 }
1315
1316 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1317                                     rvec *lv,rvec *v)
1318 {
1319     gmx_domdec_master_t *ma;
1320     int  n,i,c,a,nalloc=0;
1321     rvec *buf=NULL;
1322     t_block *cgs_gl;
1323
1324     ma = dd->ma;
1325     
1326     if (!DDMASTER(dd))
1327     {
1328 #ifdef GMX_MPI
1329         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1330                  dd->rank,dd->mpi_comm_all);
1331 #endif
1332     } else {
1333         /* Copy the master coordinates to the global array */
1334         cgs_gl = &dd->comm->cgs_gl;
1335
1336         n = DDMASTERRANK(dd);
1337         a = 0;
1338         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1339         {
1340             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1341             {
1342                 copy_rvec(lv[a++],v[c]);
1343             }
1344         }
1345         
1346         for(n=0; n<dd->nnodes; n++)
1347         {
1348             if (n != dd->rank)
1349             {
1350                 if (ma->nat[n] > nalloc)
1351                 {
1352                     nalloc = over_alloc_dd(ma->nat[n]);
1353                     srenew(buf,nalloc);
1354                 }
1355 #ifdef GMX_MPI
1356                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1357                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1358 #endif
1359                 a = 0;
1360                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1361                 {
1362                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1363                     {
1364                         copy_rvec(buf[a++],v[c]);
1365                     }
1366                 }
1367             }
1368         }
1369         sfree(buf);
1370     }
1371 }
1372
1373 static void get_commbuffer_counts(gmx_domdec_t *dd,
1374                                   int **counts,int **disps)
1375 {
1376     gmx_domdec_master_t *ma;
1377     int n;
1378
1379     ma = dd->ma;
1380     
1381     /* Make the rvec count and displacment arrays */
1382     *counts  = ma->ibuf;
1383     *disps   = ma->ibuf + dd->nnodes;
1384     for(n=0; n<dd->nnodes; n++)
1385     {
1386         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1387         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1388     }
1389 }
1390
1391 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1392                                    rvec *lv,rvec *v)
1393 {
1394     gmx_domdec_master_t *ma;
1395     int  *rcounts=NULL,*disps=NULL;
1396     int  n,i,c,a;
1397     rvec *buf=NULL;
1398     t_block *cgs_gl;
1399     
1400     ma = dd->ma;
1401     
1402     if (DDMASTER(dd))
1403     {
1404         get_commbuffer_counts(dd,&rcounts,&disps);
1405
1406         buf = ma->vbuf;
1407     }
1408     
1409     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1410
1411     if (DDMASTER(dd))
1412     {
1413         cgs_gl = &dd->comm->cgs_gl;
1414
1415         a = 0;
1416         for(n=0; n<dd->nnodes; n++)
1417         {
1418             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1419             {
1420                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1421                 {
1422                     copy_rvec(buf[a++],v[c]);
1423                 }
1424             }
1425         }
1426     }
1427 }
1428
1429 void dd_collect_vec(gmx_domdec_t *dd,
1430                     t_state *state_local,rvec *lv,rvec *v)
1431 {
1432     gmx_domdec_master_t *ma;
1433     int  n,i,c,a,nalloc=0;
1434     rvec *buf=NULL;
1435     
1436     dd_collect_cg(dd,state_local);
1437
1438     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1439     {
1440         dd_collect_vec_sendrecv(dd,lv,v);
1441     }
1442     else
1443     {
1444         dd_collect_vec_gatherv(dd,lv,v);
1445     }
1446 }
1447
1448
1449 void dd_collect_state(gmx_domdec_t *dd,
1450                       t_state *state_local,t_state *state)
1451 {
1452     int est,i,j,nh;
1453
1454     nh = state->nhchainlength;
1455
1456     if (DDMASTER(dd))
1457     {
1458         state->lambda = state_local->lambda;
1459         state->veta = state_local->veta;
1460         state->vol0 = state_local->vol0;
1461         copy_mat(state_local->box,state->box);
1462         copy_mat(state_local->boxv,state->boxv);
1463         copy_mat(state_local->svir_prev,state->svir_prev);
1464         copy_mat(state_local->fvir_prev,state->fvir_prev);
1465         copy_mat(state_local->pres_prev,state->pres_prev);
1466
1467
1468         for(i=0; i<state_local->ngtc; i++)
1469         {
1470             for(j=0; j<nh; j++) {
1471                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1472                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1473             }
1474             state->therm_integral[i] = state_local->therm_integral[i];            
1475         }
1476         for(i=0; i<state_local->nnhpres; i++) 
1477         {
1478             for(j=0; j<nh; j++) {
1479                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1480                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1481             }
1482         }
1483     }
1484     for(est=0; est<estNR; est++)
1485     {
1486         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1487         {
1488             switch (est) {
1489             case estX:
1490                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1491                 break;
1492             case estV:
1493                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1494                 break;
1495             case estSDX:
1496                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1497                 break;
1498             case estCGP:
1499                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1500                 break;
1501             case estLD_RNG:
1502                 if (state->nrngi == 1)
1503                 {
1504                     if (DDMASTER(dd))
1505                     {
1506                         for(i=0; i<state_local->nrng; i++)
1507                         {
1508                             state->ld_rng[i] = state_local->ld_rng[i];
1509                         }
1510                     }
1511                 }
1512                 else
1513                 {
1514                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1515                               state_local->ld_rng,state->ld_rng);
1516                 }
1517                 break;
1518             case estLD_RNGI:
1519                 if (state->nrngi == 1)
1520                 {
1521                    if (DDMASTER(dd))
1522                     {
1523                         state->ld_rngi[0] = state_local->ld_rngi[0];
1524                     } 
1525                 }
1526                 else
1527                 {
1528                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1529                               state_local->ld_rngi,state->ld_rngi);
1530                 }
1531                 break;
1532             case estDISRE_INITF:
1533             case estDISRE_RM3TAV:
1534             case estORIRE_INITF:
1535             case estORIRE_DTAV:
1536                 break;
1537             default:
1538                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1539             }
1540         }
1541     }
1542 }
1543
1544 static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
1545 {
1546     if (debug)
1547     {
1548         fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1549     }
1550     fr->cg_nalloc = over_alloc_dd(nalloc);
1551     srenew(fr->cg_cm,fr->cg_nalloc);
1552     srenew(fr->cginfo,fr->cg_nalloc);
1553 }
1554
1555 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1556 {
1557     int est;
1558
1559     if (debug)
1560     {
1561         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1562     }
1563
1564     state->nalloc = over_alloc_dd(nalloc);
1565     
1566     for(est=0; est<estNR; est++)
1567     {
1568         if (EST_DISTR(est) && (state->flags & (1<<est)))
1569         {
1570             switch(est) {
1571             case estX:
1572                 srenew(state->x,state->nalloc);
1573                 break;
1574             case estV:
1575                 srenew(state->v,state->nalloc);
1576                 break;
1577             case estSDX:
1578                 srenew(state->sd_X,state->nalloc);
1579                 break;
1580             case estCGP:
1581                 srenew(state->cg_p,state->nalloc);
1582                 break;
1583             case estLD_RNG:
1584             case estLD_RNGI:
1585             case estDISRE_INITF:
1586             case estDISRE_RM3TAV:
1587             case estORIRE_INITF:
1588             case estORIRE_DTAV:
1589                 /* No reallocation required */
1590                 break;
1591             default:
1592                 gmx_incons("Unknown state entry encountered in dd_realloc_state");            
1593             }
1594         }
1595     }
1596     
1597     if (f != NULL)
1598     {
1599         srenew(*f,state->nalloc);
1600     }
1601 }
1602
1603 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1604                                        rvec *v,rvec *lv)
1605 {
1606     gmx_domdec_master_t *ma;
1607     int  n,i,c,a,nalloc=0;
1608     rvec *buf=NULL;
1609     
1610     if (DDMASTER(dd))
1611     {
1612         ma  = dd->ma;
1613         
1614         for(n=0; n<dd->nnodes; n++)
1615         {
1616             if (n != dd->rank)
1617             {
1618                 if (ma->nat[n] > nalloc)
1619                 {
1620                     nalloc = over_alloc_dd(ma->nat[n]);
1621                     srenew(buf,nalloc);
1622                 }
1623                 /* Use lv as a temporary buffer */
1624                 a = 0;
1625                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1626                 {
1627                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1628                     {
1629                         copy_rvec(v[c],buf[a++]);
1630                     }
1631                 }
1632                 if (a != ma->nat[n])
1633                 {
1634                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1635                               a,ma->nat[n]);
1636                 }
1637                 
1638 #ifdef GMX_MPI
1639                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1640                          DDRANK(dd,n),n,dd->mpi_comm_all);
1641 #endif
1642             }
1643         }
1644         sfree(buf);
1645         n = DDMASTERRANK(dd);
1646         a = 0;
1647         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1648         {
1649             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1650             {
1651                 copy_rvec(v[c],lv[a++]);
1652             }
1653         }
1654     }
1655     else
1656     {
1657 #ifdef GMX_MPI
1658         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1659                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1660 #endif
1661     }
1662 }
1663
1664 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1665                                        rvec *v,rvec *lv)
1666 {
1667     gmx_domdec_master_t *ma;
1668     int  *scounts=NULL,*disps=NULL;
1669     int  n,i,c,a,nalloc=0;
1670     rvec *buf=NULL;
1671     
1672     if (DDMASTER(dd))
1673     {
1674         ma  = dd->ma;
1675      
1676         get_commbuffer_counts(dd,&scounts,&disps);
1677
1678         buf = ma->vbuf;
1679         a = 0;
1680         for(n=0; n<dd->nnodes; n++)
1681         {
1682             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1683             {
1684                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1685                 {
1686                     copy_rvec(v[c],buf[a++]);
1687                 }
1688             }
1689         }
1690     }
1691
1692     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1693 }
1694
1695 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1696 {
1697     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1698     {
1699         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1700     }
1701     else
1702     {
1703         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1704     }
1705 }
1706
1707 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1708                                 t_state *state,t_state *state_local,
1709                                 rvec **f)
1710 {
1711     int  i,j,ngtch,ngtcp,nh;
1712
1713     nh = state->nhchainlength;
1714
1715     if (DDMASTER(dd))
1716     {
1717         state_local->lambda = state->lambda;
1718         state_local->veta   = state->veta;
1719         state_local->vol0   = state->vol0;
1720         copy_mat(state->box,state_local->box);
1721         copy_mat(state->box_rel,state_local->box_rel);
1722         copy_mat(state->boxv,state_local->boxv);
1723         copy_mat(state->svir_prev,state_local->svir_prev);
1724         copy_mat(state->fvir_prev,state_local->fvir_prev);
1725         for(i=0; i<state_local->ngtc; i++)
1726         {
1727             for(j=0; j<nh; j++) {
1728                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1729                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1730             }
1731             state_local->therm_integral[i] = state->therm_integral[i];
1732         }
1733         for(i=0; i<state_local->nnhpres; i++)
1734         {
1735             for(j=0; j<nh; j++) {
1736                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1737                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1738             }
1739         }
1740     }
1741     dd_bcast(dd,sizeof(real),&state_local->lambda);
1742     dd_bcast(dd,sizeof(real),&state_local->veta);
1743     dd_bcast(dd,sizeof(real),&state_local->vol0);
1744     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1745     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1746     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1747     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1748     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1749     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1750     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1751     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1752     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1753     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1754
1755     if (dd->nat_home > state_local->nalloc)
1756     {
1757         dd_realloc_state(state_local,f,dd->nat_home);
1758     }
1759     for(i=0; i<estNR; i++)
1760     {
1761         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1762         {
1763             switch (i) {
1764             case estX:
1765                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1766                 break;
1767             case estV:
1768                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1769                 break;
1770             case estSDX:
1771                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1772                 break;
1773             case estCGP:
1774                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1775                 break;
1776             case estLD_RNG:
1777                 if (state->nrngi == 1)
1778                 {
1779                     dd_bcastc(dd,
1780                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1781                               state->ld_rng,state_local->ld_rng);
1782                 }
1783                 else
1784                 {
1785                     dd_scatter(dd,
1786                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1787                                state->ld_rng,state_local->ld_rng);
1788                 }
1789                 break;
1790             case estLD_RNGI:
1791                 if (state->nrngi == 1)
1792                 {
1793                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1794                               state->ld_rngi,state_local->ld_rngi);
1795                 }
1796                 else
1797                 {
1798                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1799                                state->ld_rngi,state_local->ld_rngi);
1800                 }   
1801                 break;
1802             case estDISRE_INITF:
1803             case estDISRE_RM3TAV:
1804             case estORIRE_INITF:
1805             case estORIRE_DTAV:
1806                 /* Not implemented yet */
1807                 break;
1808             default:
1809                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1810             }
1811         }
1812     }
1813 }
1814
1815 static char dim2char(int dim)
1816 {
1817     char c='?';
1818     
1819     switch (dim)
1820     {
1821     case XX: c = 'X'; break;
1822     case YY: c = 'Y'; break;
1823     case ZZ: c = 'Z'; break;
1824     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1825     }
1826     
1827     return c;
1828 }
1829
1830 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1831                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1832 {
1833     rvec grid_s[2],*grid_r=NULL,cx,r;
1834     char fname[STRLEN],format[STRLEN],buf[22];
1835     FILE *out;
1836     int  a,i,d,z,y,x;
1837     matrix tric;
1838     real vol;
1839
1840     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1841     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1842     
1843     if (DDMASTER(dd))
1844     {
1845         snew(grid_r,2*dd->nnodes);
1846     }
1847     
1848     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1849     
1850     if (DDMASTER(dd))
1851     {
1852         for(d=0; d<DIM; d++)
1853         {
1854             for(i=0; i<DIM; i++)
1855             {
1856                 if (d == i)
1857                 {
1858                     tric[d][i] = 1;
1859                 }
1860                 else
1861                 {
1862                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1863                     {
1864                         tric[d][i] = box[i][d]/box[i][i];
1865                     }
1866                     else
1867                     {
1868                         tric[d][i] = 0;
1869                     }
1870                 }
1871             }
1872         }
1873         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1874         sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
1875         out = gmx_fio_fopen(fname,"w");
1876         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1877         a = 1;
1878         for(i=0; i<dd->nnodes; i++)
1879         {
1880             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1881             for(d=0; d<DIM; d++)
1882             {
1883                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1884             }
1885             for(z=0; z<2; z++)
1886             {
1887                 for(y=0; y<2; y++)
1888                 {
1889                     for(x=0; x<2; x++)
1890                     {
1891                         cx[XX] = grid_r[i*2+x][XX];
1892                         cx[YY] = grid_r[i*2+y][YY];
1893                         cx[ZZ] = grid_r[i*2+z][ZZ];
1894                         mvmul(tric,cx,r);
1895                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1896                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1897                     }
1898                 }
1899             }
1900             for(d=0; d<DIM; d++)
1901             {
1902                 for(x=0; x<4; x++)
1903                 {
1904                     switch(d)
1905                     {
1906                     case 0: y = 1 + i*8 + 2*x; break;
1907                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1908                     case 2: y = 1 + i*8 + x; break;
1909                     }
1910                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1911                 }
1912             }
1913         }
1914         gmx_fio_fclose(out);
1915         sfree(grid_r);
1916     }
1917 }
1918
1919 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1920                   gmx_mtop_t *mtop,t_commrec *cr,
1921                   int natoms,rvec x[],matrix box)
1922 {
1923     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1924     FILE *out;
1925     int  i,ii,resnr,c;
1926     char *atomname,*resname;
1927     real b;
1928     gmx_domdec_t *dd;
1929     
1930     dd = cr->dd;
1931     if (natoms == -1)
1932     {
1933         natoms = dd->comm->nat[ddnatVSITE];
1934     }
1935     
1936     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
1937     
1938     sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
1939     sprintf(format4,"%s%s\n",get_pdbformat4(),"%6.2f%6.2f");
1940     
1941     out = gmx_fio_fopen(fname,"w");
1942     
1943     fprintf(out,"TITLE     %s\n",title);
1944     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1945     for(i=0; i<natoms; i++)
1946     {
1947         ii = dd->gatindex[i];
1948         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
1949         if (i < dd->comm->nat[ddnatZONE])
1950         {
1951             c = 0;
1952             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1953             {
1954                 c++;
1955             }
1956             b = c;
1957         }
1958         else if (i < dd->comm->nat[ddnatVSITE])
1959         {
1960             b = dd->comm->zones.n;
1961         }
1962         else
1963         {
1964             b = dd->comm->zones.n + 1;
1965         }
1966         fprintf(out,strlen(atomname)<4 ? format : format4,
1967                 "ATOM",(ii+1)%100000,
1968                 atomname,resname,' ',resnr%10000,' ',
1969                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
1970     }
1971     fprintf(out,"TER\n");
1972     
1973     gmx_fio_fclose(out);
1974 }
1975
1976 real dd_cutoff_mbody(gmx_domdec_t *dd)
1977 {
1978     gmx_domdec_comm_t *comm;
1979     int  di;
1980     real r;
1981
1982     comm = dd->comm;
1983
1984     r = -1;
1985     if (comm->bInterCGBondeds)
1986     {
1987         if (comm->cutoff_mbody > 0)
1988         {
1989             r = comm->cutoff_mbody;
1990         }
1991         else
1992         {
1993             /* cutoff_mbody=0 means we do not have DLB */
1994             r = comm->cellsize_min[dd->dim[0]];
1995             for(di=1; di<dd->ndim; di++)
1996             {
1997                 r = min(r,comm->cellsize_min[dd->dim[di]]);
1998             }
1999             if (comm->bBondComm)
2000             {
2001                 r = max(r,comm->cutoff_mbody);
2002             }
2003             else
2004             {
2005                 r = min(r,comm->cutoff);
2006             }
2007         }
2008     }
2009
2010     return r;
2011 }
2012
2013 real dd_cutoff_twobody(gmx_domdec_t *dd)
2014 {
2015     real r_mb;
2016
2017     r_mb = dd_cutoff_mbody(dd);
2018
2019     return max(dd->comm->cutoff,r_mb);
2020 }
2021
2022
2023 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2024 {
2025     int nc,ntot;
2026     
2027     nc   = dd->nc[dd->comm->cartpmedim];
2028     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2029     copy_ivec(coord,coord_pme);
2030     coord_pme[dd->comm->cartpmedim] =
2031         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2032 }
2033
2034 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2035 {
2036     /* Here we assign a PME node to communicate with this DD node
2037      * by assuming that the major index of both is x.
2038      * We add cr->npmenodes/2 to obtain an even distribution.
2039      */
2040     return (ddindex*npme + npme/2)/ndd;
2041 }
2042
2043 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2044 {
2045     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2046 }
2047
2048 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2049 {
2050     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2051 }
2052
2053 static int *dd_pmenodes(t_commrec *cr)
2054 {
2055     int *pmenodes;
2056     int n,i,p0,p1;
2057     
2058     snew(pmenodes,cr->npmenodes);
2059     n = 0;
2060     for(i=0; i<cr->dd->nnodes; i++) {
2061         p0 = cr_ddindex2pmeindex(cr,i);
2062         p1 = cr_ddindex2pmeindex(cr,i+1);
2063         if (i+1 == cr->dd->nnodes || p1 > p0) {
2064             if (debug)
2065                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2066             pmenodes[n] = i + 1 + n;
2067             n++;
2068         }
2069     }
2070
2071     return pmenodes;
2072 }
2073
2074 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2075 {
2076     gmx_domdec_t *dd;
2077     ivec coords,coords_pme,nc;
2078     int  slab;
2079     
2080     dd = cr->dd;
2081     /*
2082       if (dd->comm->bCartesian) {
2083       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2084       dd_coords2pmecoords(dd,coords,coords_pme);
2085       copy_ivec(dd->ntot,nc);
2086       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2087       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2088       
2089       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2090       } else {
2091       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2092       }
2093     */
2094     coords[XX] = x;
2095     coords[YY] = y;
2096     coords[ZZ] = z;
2097     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2098     
2099     return slab;
2100 }
2101
2102 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2103 {
2104     gmx_domdec_comm_t *comm;
2105     ivec coords;
2106     int  ddindex,nodeid=-1;
2107     
2108     comm = cr->dd->comm;
2109     
2110     coords[XX] = x;
2111     coords[YY] = y;
2112     coords[ZZ] = z;
2113     if (comm->bCartesianPP_PME)
2114     {
2115 #ifdef GMX_MPI
2116         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2117 #endif
2118     }
2119     else
2120     {
2121         ddindex = dd_index(cr->dd->nc,coords);
2122         if (comm->bCartesianPP)
2123         {
2124             nodeid = comm->ddindex2simnodeid[ddindex];
2125         }
2126         else
2127         {
2128             if (comm->pmenodes)
2129             {
2130                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2131             }
2132             else
2133             {
2134                 nodeid = ddindex;
2135             }
2136         }
2137     }
2138   
2139     return nodeid;
2140 }
2141
2142 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2143 {
2144     gmx_domdec_t *dd;
2145     gmx_domdec_comm_t *comm;
2146     ivec coord,coord_pme;
2147     int  i;
2148     int  pmenode=-1;
2149     
2150     dd = cr->dd;
2151     comm = dd->comm;
2152     
2153     /* This assumes a uniform x domain decomposition grid cell size */
2154     if (comm->bCartesianPP_PME)
2155     {
2156 #ifdef GMX_MPI
2157         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2158         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2159         {
2160             /* This is a PP node */
2161             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2162             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2163         }
2164 #endif
2165     }
2166     else if (comm->bCartesianPP)
2167     {
2168         if (sim_nodeid < dd->nnodes)
2169         {
2170             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2171         }
2172     }
2173     else
2174     {
2175         /* This assumes DD cells with identical x coordinates
2176          * are numbered sequentially.
2177          */
2178         if (dd->comm->pmenodes == NULL)
2179         {
2180             if (sim_nodeid < dd->nnodes)
2181             {
2182                 /* The DD index equals the nodeid */
2183                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2184             }
2185         }
2186         else
2187         {
2188             i = 0;
2189             while (sim_nodeid > dd->comm->pmenodes[i])
2190             {
2191                 i++;
2192             }
2193             if (sim_nodeid < dd->comm->pmenodes[i])
2194             {
2195                 pmenode = dd->comm->pmenodes[i];
2196             }
2197         }
2198     }
2199     
2200     return pmenode;
2201 }
2202
2203 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2204 {
2205     gmx_bool bPMEOnlyNode;
2206     
2207     if (DOMAINDECOMP(cr))
2208     {
2209         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2210     }
2211     else
2212     {
2213         bPMEOnlyNode = FALSE;
2214     }
2215     
2216     return bPMEOnlyNode;
2217 }
2218
2219 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2220                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2221 {
2222     gmx_domdec_t *dd;
2223     int x,y,z;
2224     ivec coord,coord_pme;
2225     
2226     dd = cr->dd;
2227     
2228     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2229     
2230     *nmy_ddnodes = 0;
2231     for(x=0; x<dd->nc[XX]; x++)
2232     {
2233         for(y=0; y<dd->nc[YY]; y++)
2234         {
2235             for(z=0; z<dd->nc[ZZ]; z++)
2236             {
2237                 if (dd->comm->bCartesianPP_PME)
2238                 {
2239                     coord[XX] = x;
2240                     coord[YY] = y;
2241                     coord[ZZ] = z;
2242                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2243                     if (dd->ci[XX] == coord_pme[XX] &&
2244                         dd->ci[YY] == coord_pme[YY] &&
2245                         dd->ci[ZZ] == coord_pme[ZZ])
2246                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2247                 }
2248                 else
2249                 {
2250                     /* The slab corresponds to the nodeid in the PME group */
2251                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2252                     {
2253                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2254                     }
2255                 }
2256             }
2257         }
2258     }
2259     
2260     /* The last PP-only node is the peer node */
2261     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2262     
2263     if (debug)
2264     {
2265         fprintf(debug,"Receive coordinates from PP nodes:");
2266         for(x=0; x<*nmy_ddnodes; x++)
2267         {
2268             fprintf(debug," %d",(*my_ddnodes)[x]);
2269         }
2270         fprintf(debug,"\n");
2271     }
2272 }
2273
2274 static gmx_bool receive_vir_ener(t_commrec *cr)
2275 {
2276     gmx_domdec_comm_t *comm;
2277     int  pmenode,coords[DIM],rank;
2278     gmx_bool bReceive;
2279     
2280     bReceive = TRUE;
2281     if (cr->npmenodes < cr->dd->nnodes)
2282     {
2283         comm = cr->dd->comm;
2284         if (comm->bCartesianPP_PME)
2285         {
2286             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2287 #ifdef GMX_MPI
2288             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2289             coords[comm->cartpmedim]++;
2290             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2291             {
2292                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2293                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2294                 {
2295                     /* This is not the last PP node for pmenode */
2296                     bReceive = FALSE;
2297                 }
2298             }
2299 #endif  
2300         }
2301         else
2302         {
2303             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2304             if (cr->sim_nodeid+1 < cr->nnodes &&
2305                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2306             {
2307                 /* This is not the last PP node for pmenode */
2308                 bReceive = FALSE;
2309             }
2310         }
2311     }
2312     
2313     return bReceive;
2314 }
2315
2316 static void set_zones_ncg_home(gmx_domdec_t *dd)
2317 {
2318     gmx_domdec_zones_t *zones;
2319     int i;
2320
2321     zones = &dd->comm->zones;
2322
2323     zones->cg_range[0] = 0;
2324     for(i=1; i<zones->n+1; i++)
2325     {
2326         zones->cg_range[i] = dd->ncg_home;
2327     }
2328 }
2329
2330 static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
2331 {
2332     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2333     
2334     ind = state->cg_gl;
2335     dd_cg_gl = dd->index_gl;
2336     cgindex  = dd->cgindex;
2337     nat = 0;
2338     cgindex[0] = nat;
2339     for(i=0; i<state->ncg_gl; i++)
2340     {
2341         cgindex[i] = nat;
2342         cg_gl = ind[i];
2343         dd_cg_gl[i] = cg_gl;
2344         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2345     }
2346     cgindex[i] = nat;
2347     
2348     dd->ncg_home = state->ncg_gl;
2349     dd->nat_home = nat;
2350
2351     set_zones_ncg_home(dd);
2352 }
2353
2354 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2355 {
2356     while (cg >= cginfo_mb->cg_end)
2357     {
2358         cginfo_mb++;
2359     }
2360
2361     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2362 }
2363
2364 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2365                           t_forcerec *fr,char *bLocalCG)
2366 {
2367     cginfo_mb_t *cginfo_mb;
2368     int *cginfo;
2369     int cg;
2370
2371     if (fr != NULL)
2372     {
2373         cginfo_mb = fr->cginfo_mb;
2374         cginfo    = fr->cginfo;
2375
2376         for(cg=cg0; cg<cg1; cg++)
2377         {
2378             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2379         }
2380     }
2381
2382     if (bLocalCG != NULL)
2383     {
2384         for(cg=cg0; cg<cg1; cg++)
2385         {
2386             bLocalCG[index_gl[cg]] = TRUE;
2387         }
2388     }
2389 }
2390
2391 static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
2392 {
2393     int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
2394     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2395     gmx_ga2la_t *ga2la;
2396     char *bLocalCG;
2397
2398     bLocalCG = dd->comm->bLocalCG;
2399
2400     if (dd->nat_tot > dd->gatindex_nalloc)
2401     {
2402         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2403         srenew(dd->gatindex,dd->gatindex_nalloc);
2404     }
2405
2406     nzone      = dd->comm->zones.n;
2407     zone2cg    = dd->comm->zones.cg_range;
2408     zone_ncg1  = dd->comm->zone_ncg1;
2409     index_gl   = dd->index_gl;
2410     gatindex   = dd->gatindex;
2411
2412     if (zone2cg[1] != dd->ncg_home)
2413     {
2414         gmx_incons("dd->ncg_zone is not up to date");
2415     }
2416     
2417     /* Make the local to global and global to local atom index */
2418     a = dd->cgindex[cg_start];
2419     for(zone=0; zone<nzone; zone++)
2420     {
2421         if (zone == 0)
2422         {
2423             cg0 = cg_start;
2424         }
2425         else
2426         {
2427             cg0 = zone2cg[zone];
2428         }
2429         for(cg=cg0; cg<zone2cg[zone+1]; cg++)
2430         {
2431             zone1 = zone;
2432             if (cg - cg0 >= zone_ncg1[zone])
2433             {
2434                 /* Signal that this cg is from more than one zone away */
2435                 zone1 += nzone;
2436             }
2437             cg_gl = index_gl[cg];
2438             for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2439             {
2440                 gatindex[a] = a_gl;
2441                 ga2la_set(dd->ga2la,a_gl,a,zone1);
2442                 a++;
2443             }
2444         }
2445     }
2446 }
2447
2448 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2449                           const char *where)
2450 {
2451     int ncg,i,ngl,nerr;
2452
2453     nerr = 0;
2454     if (bLocalCG == NULL)
2455     {
2456         return nerr;
2457     }
2458     for(i=0; i<dd->ncg_tot; i++)
2459     {
2460         if (!bLocalCG[dd->index_gl[i]])
2461         {
2462             fprintf(stderr,
2463                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2464             nerr++;
2465         }
2466     }
2467     ngl = 0;
2468     for(i=0; i<ncg_sys; i++)
2469     {
2470         if (bLocalCG[i])
2471         {
2472             ngl++;
2473         }
2474     }
2475     if (ngl != dd->ncg_tot)
2476     {
2477         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2478         nerr++;
2479     }
2480
2481     return nerr;
2482 }
2483
2484 static void check_index_consistency(gmx_domdec_t *dd,
2485                                     int natoms_sys,int ncg_sys,
2486                                     const char *where)
2487 {
2488     int  nerr,ngl,i,a,cell;
2489     int  *have;
2490
2491     nerr = 0;
2492
2493     if (dd->comm->DD_debug > 1)
2494     {
2495         snew(have,natoms_sys);
2496         for(a=0; a<dd->nat_tot; a++)
2497         {
2498             if (have[dd->gatindex[a]] > 0)
2499             {
2500                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2501             }
2502             else
2503             {
2504                 have[dd->gatindex[a]] = a + 1;
2505             }
2506         }
2507         sfree(have);
2508     }
2509
2510     snew(have,dd->nat_tot);
2511
2512     ngl  = 0;
2513     for(i=0; i<natoms_sys; i++)
2514     {
2515         if (ga2la_get(dd->ga2la,i,&a,&cell))
2516         {
2517             if (a >= dd->nat_tot)
2518             {
2519                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2520                 nerr++;
2521             }
2522             else
2523             {
2524                 have[a] = 1;
2525                 if (dd->gatindex[a] != i)
2526                 {
2527                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2528                     nerr++;
2529                 }
2530             }
2531             ngl++;
2532         }
2533     }
2534     if (ngl != dd->nat_tot)
2535     {
2536         fprintf(stderr,
2537                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2538                 dd->rank,where,ngl,dd->nat_tot);
2539     }
2540     for(a=0; a<dd->nat_tot; a++)
2541     {
2542         if (have[a] == 0)
2543         {
2544             fprintf(stderr,
2545                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2546                     dd->rank,where,a+1,dd->gatindex[a]+1);
2547         }
2548     }
2549     sfree(have);
2550
2551     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2552
2553     if (nerr > 0) {
2554         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2555                   dd->rank,where,nerr);
2556     }
2557 }
2558
2559 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2560 {
2561     int  i;
2562     char *bLocalCG;
2563
2564     if (a_start == 0)
2565     {
2566         /* Clear the whole list without searching */
2567         ga2la_clear(dd->ga2la);
2568     }
2569     else
2570     {
2571         for(i=a_start; i<dd->nat_tot; i++)
2572         {
2573             ga2la_del(dd->ga2la,dd->gatindex[i]);
2574         }
2575     }
2576
2577     bLocalCG = dd->comm->bLocalCG;
2578     if (bLocalCG)
2579     {
2580         for(i=cg_start; i<dd->ncg_tot; i++)
2581         {
2582             bLocalCG[dd->index_gl[i]] = FALSE;
2583         }
2584     }
2585
2586     dd_clear_local_vsite_indices(dd);
2587     
2588     if (dd->constraints)
2589     {
2590         dd_clear_local_constraint_indices(dd);
2591     }
2592 }
2593
2594 static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
2595 {
2596     real grid_jump_limit;
2597
2598     /* The distance between the boundaries of cells at distance
2599      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2600      * and by the fact that cells should not be shifted by more than
2601      * half their size, such that cg's only shift by one cell
2602      * at redecomposition.
2603      */
2604     grid_jump_limit = comm->cellsize_limit;
2605     if (!comm->bVacDLBNoLimit)
2606     {
2607         grid_jump_limit = max(grid_jump_limit,
2608                               comm->cutoff/comm->cd[dim_ind].np);
2609     }
2610
2611     return grid_jump_limit;
2612 }
2613
2614 static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2615 {
2616     gmx_domdec_comm_t *comm;
2617     int  d,dim;
2618     real limit,bfac;
2619     
2620     comm = dd->comm;
2621     
2622     for(d=1; d<dd->ndim; d++)
2623     {
2624         dim = dd->dim[d];
2625         limit = grid_jump_limit(comm,d);
2626         bfac = ddbox->box_size[dim];
2627         if (ddbox->tric_dir[dim])
2628         {
2629             bfac *= ddbox->skew_fac[dim];
2630         }
2631         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2632             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2633         {
2634             char buf[22];
2635             gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
2636                       gmx_step_str(step,buf),
2637                       dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2638         }
2639     }
2640 }
2641
2642 static int dd_load_count(gmx_domdec_comm_t *comm)
2643 {
2644     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2645 }
2646
2647 static float dd_force_load(gmx_domdec_comm_t *comm)
2648 {
2649     float load;
2650     
2651     if (comm->eFlop)
2652     {
2653         load = comm->flop;
2654         if (comm->eFlop > 1)
2655         {
2656             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2657         }
2658     } 
2659     else
2660     {
2661         load = comm->cycl[ddCyclF];
2662         if (comm->cycl_n[ddCyclF] > 1)
2663         {
2664             /* Subtract the maximum of the last n cycle counts
2665              * to get rid of possible high counts due to other soures,
2666              * for instance system activity, that would otherwise
2667              * affect the dynamic load balancing.
2668              */
2669             load -= comm->cycl_max[ddCyclF];
2670         }
2671     }
2672     
2673     return load;
2674 }
2675
2676 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2677 {
2678     gmx_domdec_comm_t *comm;
2679     int i;
2680     
2681     comm = dd->comm;
2682     
2683     snew(*dim_f,dd->nc[dim]+1);
2684     (*dim_f)[0] = 0;
2685     for(i=1; i<dd->nc[dim]; i++)
2686     {
2687         if (comm->slb_frac[dim])
2688         {
2689             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2690         }
2691         else
2692         {
2693             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2694         }
2695     }
2696     (*dim_f)[dd->nc[dim]] = 1;
2697 }
2698
2699 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2700 {
2701     int  pmeindex,slab,nso,i;
2702     ivec xyz;
2703     
2704     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2705     {
2706         ddpme->dim = YY;
2707     }
2708     else
2709     {
2710         ddpme->dim = dimind;
2711     }
2712     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2713     
2714     ddpme->nslab = (ddpme->dim == 0 ?
2715                     dd->comm->npmenodes_x :
2716                     dd->comm->npmenodes_y);
2717
2718     if (ddpme->nslab <= 1)
2719     {
2720         return;
2721     }
2722
2723     nso = dd->comm->npmenodes/ddpme->nslab;
2724     /* Determine for each PME slab the PP location range for dimension dim */
2725     snew(ddpme->pp_min,ddpme->nslab);
2726     snew(ddpme->pp_max,ddpme->nslab);
2727     for(slab=0; slab<ddpme->nslab; slab++) {
2728         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2729         ddpme->pp_max[slab] = 0;
2730     }
2731     for(i=0; i<dd->nnodes; i++) {
2732         ddindex2xyz(dd->nc,i,xyz);
2733         /* For y only use our y/z slab.
2734          * This assumes that the PME x grid size matches the DD grid size.
2735          */
2736         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2737             pmeindex = ddindex2pmeindex(dd,i);
2738             if (dimind == 0) {
2739                 slab = pmeindex/nso;
2740             } else {
2741                 slab = pmeindex % ddpme->nslab;
2742             }
2743             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2744             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2745         }
2746     }
2747
2748     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2749 }
2750
2751 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2752 {
2753     if (dd->comm->ddpme[0].dim == XX)
2754     {
2755         return dd->comm->ddpme[0].maxshift;
2756     }
2757     else
2758     {
2759         return 0;
2760     }
2761 }
2762
2763 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2764 {
2765     if (dd->comm->ddpme[0].dim == YY)
2766     {
2767         return dd->comm->ddpme[0].maxshift;
2768     }
2769     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2770     {
2771         return dd->comm->ddpme[1].maxshift;
2772     }
2773     else
2774     {
2775         return 0;
2776     }
2777 }
2778
2779 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2780                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2781 {
2782     gmx_domdec_comm_t *comm;
2783     int  nc,ns,s;
2784     int  *xmin,*xmax;
2785     real range,pme_boundary;
2786     int  sh;
2787     
2788     comm = dd->comm;
2789     nc  = dd->nc[ddpme->dim];
2790     ns  = ddpme->nslab;
2791     
2792     if (!ddpme->dim_match)
2793     {
2794         /* PP decomposition is not along dim: the worst situation */
2795         sh = ns/2;
2796     }
2797     else if (ns <= 3 || (bUniform && ns == nc))
2798     {
2799         /* The optimal situation */
2800         sh = 1;
2801     }
2802     else
2803     {
2804         /* We need to check for all pme nodes which nodes they
2805          * could possibly need to communicate with.
2806          */
2807         xmin = ddpme->pp_min;
2808         xmax = ddpme->pp_max;
2809         /* Allow for atoms to be maximally 2/3 times the cut-off
2810          * out of their DD cell. This is a reasonable balance between
2811          * between performance and support for most charge-group/cut-off
2812          * combinations.
2813          */
2814         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2815         /* Avoid extra communication when we are exactly at a boundary */
2816         range *= 0.999;
2817         
2818         sh = 1;
2819         for(s=0; s<ns; s++)
2820         {
2821             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2822             pme_boundary = (real)s/ns;
2823             while (sh+1 < ns &&
2824                    ((s-(sh+1) >= 0 &&
2825                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2826                     (s-(sh+1) <  0 &&
2827                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2828             {
2829                 sh++;
2830             }
2831             pme_boundary = (real)(s+1)/ns;
2832             while (sh+1 < ns &&
2833                    ((s+(sh+1) <  ns &&
2834                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2835                     (s+(sh+1) >= ns &&
2836                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2837             {
2838                 sh++;
2839             }
2840         }
2841     }
2842     
2843     ddpme->maxshift = sh;
2844     
2845     if (debug)
2846     {
2847         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2848                 ddpme->dim,ddpme->maxshift);
2849     }
2850 }
2851
2852 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2853 {
2854     int d,dim;
2855     
2856     for(d=0; d<dd->ndim; d++)
2857     {
2858         dim = dd->dim[d];
2859         if (dim < ddbox->nboundeddim &&
2860             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2861             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2862         {
2863             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2864                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2865                       dd->nc[dim],dd->comm->cellsize_limit);
2866         }
2867     }
2868 }
2869
2870 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2871                                   gmx_bool bMaster,ivec npulse)
2872 {
2873     gmx_domdec_comm_t *comm;
2874     int  d,j;
2875     rvec cellsize_min;
2876     real *cell_x,cell_dx,cellsize;
2877     
2878     comm = dd->comm;
2879     
2880     for(d=0; d<DIM; d++)
2881     {
2882         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2883         npulse[d] = 1;
2884         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2885         {
2886             /* Uniform grid */
2887             cell_dx = ddbox->box_size[d]/dd->nc[d];
2888             if (bMaster)
2889             {
2890                 for(j=0; j<dd->nc[d]+1; j++)
2891                 {
2892                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2893                 }
2894             }
2895             else
2896             {
2897                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
2898                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2899             }
2900             cellsize = cell_dx*ddbox->skew_fac[d];
2901             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
2902             {
2903                 npulse[d]++;
2904             }
2905             cellsize_min[d] = cellsize;
2906         }
2907         else
2908         {
2909             /* Statically load balanced grid */
2910             /* Also when we are not doing a master distribution we determine
2911              * all cell borders in a loop to obtain identical values
2912              * to the master distribution case and to determine npulse.
2913              */
2914             if (bMaster)
2915             {
2916                 cell_x = dd->ma->cell_x[d];
2917             }
2918             else
2919             {
2920                 snew(cell_x,dd->nc[d]+1);
2921             }
2922             cell_x[0] = ddbox->box0[d];
2923             for(j=0; j<dd->nc[d]; j++)
2924             {
2925                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2926                 cell_x[j+1] = cell_x[j] + cell_dx;
2927                 cellsize = cell_dx*ddbox->skew_fac[d];
2928                 while (cellsize*npulse[d] < comm->cutoff &&
2929                        npulse[d] < dd->nc[d]-1)
2930                 {
2931                     npulse[d]++;
2932                 }
2933                 cellsize_min[d] = min(cellsize_min[d],cellsize);
2934             }
2935             if (!bMaster)
2936             {
2937                 comm->cell_x0[d] = cell_x[dd->ci[d]];
2938                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2939                 sfree(cell_x);
2940             }
2941         }
2942         /* The following limitation is to avoid that a cell would receive
2943          * some of its own home charge groups back over the periodic boundary.
2944          * Double charge groups cause trouble with the global indices.
2945          */
2946         if (d < ddbox->npbcdim &&
2947             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2948         {
2949             gmx_fatal_collective(FARGS,NULL,dd,
2950                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2951                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
2952                                  comm->cutoff,
2953                                  dd->nc[d],dd->nc[d],
2954                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
2955         }
2956     }
2957     
2958     if (!comm->bDynLoadBal)
2959     {
2960         copy_rvec(cellsize_min,comm->cellsize_min);
2961     }
2962    
2963     for(d=0; d<comm->npmedecompdim; d++)
2964     {
2965         set_pme_maxshift(dd,&comm->ddpme[d],
2966                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
2967                          comm->ddpme[d].slb_dim_f);
2968     }
2969 }
2970
2971
2972 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2973                                        int d,int dim,gmx_domdec_root_t *root,
2974                                        gmx_ddbox_t *ddbox,
2975                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
2976 {
2977     gmx_domdec_comm_t *comm;
2978     int  ncd,i,j,nmin,nmin_old;
2979     gmx_bool bLimLo,bLimHi;
2980     real *cell_size;
2981     real fac,halfway,cellsize_limit_f_i,region_size;
2982     gmx_bool bPBC,bLastHi=FALSE;
2983     int nrange[]={range[0],range[1]};
2984
2985     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
2986
2987     comm = dd->comm;
2988
2989     ncd = dd->nc[dim];
2990
2991     bPBC = (dim < ddbox->npbcdim);
2992
2993     cell_size = root->buf_ncd;
2994
2995     if (debug) 
2996     {
2997         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
2998     }
2999
3000     /* First we need to check if the scaling does not make cells
3001      * smaller than the smallest allowed size.
3002      * We need to do this iteratively, since if a cell is too small,
3003      * it needs to be enlarged, which makes all the other cells smaller,
3004      * which could in turn make another cell smaller than allowed.
3005      */
3006     for(i=range[0]; i<range[1]; i++)
3007     {
3008         root->bCellMin[i] = FALSE;
3009     }
3010     nmin = 0;
3011     do
3012     {
3013         nmin_old = nmin;
3014         /* We need the total for normalization */
3015         fac = 0;
3016         for(i=range[0]; i<range[1]; i++)
3017         {
3018             if (root->bCellMin[i] == FALSE)
3019             {
3020                 fac += cell_size[i];
3021             }
3022         }
3023         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3024         /* Determine the cell boundaries */
3025         for(i=range[0]; i<range[1]; i++)
3026         {
3027             if (root->bCellMin[i] == FALSE)
3028             {
3029                 cell_size[i] *= fac;
3030                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3031                 {
3032                     cellsize_limit_f_i = 0;
3033                 }
3034                 else
3035                 {
3036                     cellsize_limit_f_i = cellsize_limit_f;
3037                 }
3038                 if (cell_size[i] < cellsize_limit_f_i)
3039                 {
3040                     root->bCellMin[i] = TRUE;
3041                     cell_size[i] = cellsize_limit_f_i;
3042                     nmin++;
3043                 }
3044             }
3045             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3046         }
3047     }
3048     while (nmin > nmin_old);
3049     
3050     i=range[1]-1;
3051     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3052     /* For this check we should not use DD_CELL_MARGIN,
3053      * but a slightly smaller factor,
3054      * since rounding could get use below the limit.
3055      */
3056     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3057     {
3058         char buf[22];
3059         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3060                   gmx_step_str(step,buf),
3061                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3062                   ncd,comm->cellsize_min[dim]);
3063     }
3064     
3065     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3066     
3067     if (!bUniform)
3068     {
3069         /* Check if the boundary did not displace more than halfway
3070          * each of the cells it bounds, as this could cause problems,
3071          * especially when the differences between cell sizes are large.
3072          * If changes are applied, they will not make cells smaller
3073          * than the cut-off, as we check all the boundaries which
3074          * might be affected by a change and if the old state was ok,
3075          * the cells will at most be shrunk back to their old size.
3076          */
3077         for(i=range[0]+1; i<range[1]; i++)
3078         {
3079             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3080             if (root->cell_f[i] < halfway)
3081             {
3082                 root->cell_f[i] = halfway;
3083                 /* Check if the change also causes shifts of the next boundaries */
3084                 for(j=i+1; j<range[1]; j++)
3085                 {
3086                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3087                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3088                 }
3089             }
3090             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3091             if (root->cell_f[i] > halfway)
3092             {
3093                 root->cell_f[i] = halfway;
3094                 /* Check if the change also causes shifts of the next boundaries */
3095                 for(j=i-1; j>=range[0]+1; j--)
3096                 {
3097                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3098                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3099                 }
3100             }
3101         }
3102     }
3103     
3104     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3105     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3106      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3107      * for a and b nrange is used */
3108     if (d > 0)
3109     {
3110         /* Take care of the staggering of the cell boundaries */
3111         if (bUniform)
3112         {
3113             for(i=range[0]; i<range[1]; i++)
3114             {
3115                 root->cell_f_max0[i] = root->cell_f[i];
3116                 root->cell_f_min1[i] = root->cell_f[i+1];
3117             }
3118         }
3119         else
3120         {
3121             for(i=range[0]+1; i<range[1]; i++)
3122             {
3123                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3124                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3125                 if (bLimLo && bLimHi)
3126                 {
3127                     /* Both limits violated, try the best we can */
3128                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3129                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3130                     nrange[0]=range[0];
3131                     nrange[1]=i;
3132                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3133
3134                     nrange[0]=i;
3135                     nrange[1]=range[1];
3136                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3137
3138                     return;
3139                 }
3140                 else if (bLimLo)
3141                 {
3142                     /* root->cell_f[i] = root->bound_min[i]; */
3143                     nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3144                     bLastHi=FALSE;
3145                 }
3146                 else if (bLimHi && !bLastHi)
3147                 {
3148                     bLastHi=TRUE;
3149                     if (nrange[1] < range[1])   /* found a LimLo before */
3150                     {
3151                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3152                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3153                         nrange[0]=nrange[1];
3154                     }
3155                     root->cell_f[i] = root->bound_max[i];
3156                     nrange[1]=i; 
3157                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3158                     nrange[0]=i;
3159                     nrange[1]=range[1];
3160                 }
3161             }
3162             if (nrange[1] < range[1])   /* found last a LimLo */
3163             {
3164                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3165                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3166                 nrange[0]=nrange[1];
3167                 nrange[1]=range[1];
3168                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3169             } 
3170             else if (nrange[0] > range[0]) /* found at least one LimHi */
3171             {
3172                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3173             }
3174         }
3175     }
3176 }
3177
3178
3179 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3180                                        int d,int dim,gmx_domdec_root_t *root,
3181                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3182                                        gmx_bool bUniform,gmx_large_int_t step)
3183 {
3184     gmx_domdec_comm_t *comm;
3185     int  ncd,d1,i,j,pos;
3186     real *cell_size;
3187     real load_aver,load_i,imbalance,change,change_max,sc;
3188     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3189     real change_limit;
3190     real relax = 0.5;
3191     gmx_bool bPBC;
3192     int range[] = { 0, 0 };
3193
3194     comm = dd->comm;
3195
3196     /* Convert the maximum change from the input percentage to a fraction */
3197     change_limit = comm->dlb_scale_lim*0.01;
3198
3199     ncd = dd->nc[dim];
3200
3201     bPBC = (dim < ddbox->npbcdim);
3202
3203     cell_size = root->buf_ncd;
3204
3205     /* Store the original boundaries */
3206     for(i=0; i<ncd+1; i++)
3207     {
3208         root->old_cell_f[i] = root->cell_f[i];
3209     }
3210     if (bUniform) {
3211         for(i=0; i<ncd; i++)
3212         {
3213             cell_size[i] = 1.0/ncd;
3214         }
3215     }
3216     else if (dd_load_count(comm))
3217     {
3218         load_aver = comm->load[d].sum_m/ncd;
3219         change_max = 0;
3220         for(i=0; i<ncd; i++)
3221         {
3222             /* Determine the relative imbalance of cell i */
3223             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3224             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3225             /* Determine the change of the cell size using underrelaxation */
3226             change = -relax*imbalance;
3227             change_max = max(change_max,max(change,-change));
3228         }
3229         /* Limit the amount of scaling.
3230          * We need to use the same rescaling for all cells in one row,
3231          * otherwise the load balancing might not converge.
3232          */
3233         sc = relax;
3234         if (change_max > change_limit)
3235         {
3236             sc *= change_limit/change_max;
3237         }
3238         for(i=0; i<ncd; i++)
3239         {
3240             /* Determine the relative imbalance of cell i */
3241             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3242             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3243             /* Determine the change of the cell size using underrelaxation */
3244             change = -sc*imbalance;
3245             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3246         }
3247     }
3248     
3249     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3250     cellsize_limit_f *= DD_CELL_MARGIN;
3251     dist_min_f_hard        = grid_jump_limit(comm,d)/ddbox->box_size[dim];
3252     dist_min_f       = dist_min_f_hard * DD_CELL_MARGIN;
3253     if (ddbox->tric_dir[dim])
3254     {
3255         cellsize_limit_f /= ddbox->skew_fac[dim];
3256         dist_min_f       /= ddbox->skew_fac[dim];
3257     }
3258     if (bDynamicBox && d > 0)
3259     {
3260         dist_min_f *= DD_PRES_SCALE_MARGIN;
3261     }
3262     if (d > 0 && !bUniform)
3263     {
3264         /* Make sure that the grid is not shifted too much */
3265         for(i=1; i<ncd; i++) {
3266             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
3267             {
3268                 gmx_incons("Inconsistent DD boundary staggering limits!");
3269             }
3270             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3271             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3272             if (space > 0) {
3273                 root->bound_min[i] += 0.5*space;
3274             }
3275             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3276             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3277             if (space < 0) {
3278                 root->bound_max[i] += 0.5*space;
3279             }
3280             if (debug)
3281             {
3282                 fprintf(debug,
3283                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3284                         d,i,
3285                         root->cell_f_max0[i-1] + dist_min_f,
3286                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3287                         root->cell_f_min1[i] - dist_min_f);
3288             }
3289         }
3290     }
3291     range[1]=ncd;
3292     root->cell_f[0] = 0;
3293     root->cell_f[ncd] = 1;
3294     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3295
3296
3297     /* After the checks above, the cells should obey the cut-off
3298      * restrictions, but it does not hurt to check.
3299      */
3300     for(i=0; i<ncd; i++)
3301     {
3302         if (debug)
3303         {
3304             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3305                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3306         }
3307
3308         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3309             root->cell_f[i+1] - root->cell_f[i] <
3310             cellsize_limit_f/DD_CELL_MARGIN)
3311         {
3312             char buf[22];
3313             fprintf(stderr,
3314                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3315                     gmx_step_str(step,buf),dim2char(dim),i,
3316                     (root->cell_f[i+1] - root->cell_f[i])
3317                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3318         }
3319     }
3320     
3321     pos = ncd + 1;
3322     /* Store the cell boundaries of the lower dimensions at the end */
3323     for(d1=0; d1<d; d1++)
3324     {
3325         root->cell_f[pos++] = comm->cell_f0[d1];
3326         root->cell_f[pos++] = comm->cell_f1[d1];
3327     }
3328     
3329     if (d < comm->npmedecompdim)
3330     {
3331         /* The master determines the maximum shift for
3332          * the coordinate communication between separate PME nodes.
3333          */
3334         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3335     }
3336     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3337     if (d >= 1)
3338     {
3339         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3340     }
3341 }    
3342
3343 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3344                                              gmx_ddbox_t *ddbox,int dimind)
3345 {
3346     gmx_domdec_comm_t *comm;
3347     int dim;
3348
3349     comm = dd->comm;
3350
3351     /* Set the cell dimensions */
3352     dim = dd->dim[dimind];
3353     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3354     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3355     if (dim >= ddbox->nboundeddim)
3356     {
3357         comm->cell_x0[dim] += ddbox->box0[dim];
3358         comm->cell_x1[dim] += ddbox->box0[dim];
3359     }
3360 }
3361
3362 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3363                                          int d,int dim,real *cell_f_row,
3364                                          gmx_ddbox_t *ddbox)
3365 {
3366     gmx_domdec_comm_t *comm;
3367     int d1,dim1,pos;
3368
3369     comm = dd->comm;
3370
3371 #ifdef GMX_MPI
3372     /* Each node would only need to know two fractions,
3373      * but it is probably cheaper to broadcast the whole array.
3374      */
3375     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3376               0,comm->mpi_comm_load[d]);
3377 #endif
3378     /* Copy the fractions for this dimension from the buffer */
3379     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3380     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3381     /* The whole array was communicated, so set the buffer position */
3382     pos = dd->nc[dim] + 1;
3383     for(d1=0; d1<=d; d1++)
3384     {
3385         if (d1 < d)
3386         {
3387             /* Copy the cell fractions of the lower dimensions */
3388             comm->cell_f0[d1] = cell_f_row[pos++];
3389             comm->cell_f1[d1] = cell_f_row[pos++];
3390         }
3391         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3392     }
3393     /* Convert the communicated shift from float to int */
3394     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3395     if (d >= 1)
3396     {
3397         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3398     }
3399 }
3400
3401 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3402                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3403                                          gmx_bool bUniform,gmx_large_int_t step)
3404 {
3405     gmx_domdec_comm_t *comm;
3406     int d,dim,d1;
3407     gmx_bool bRowMember,bRowRoot;
3408     real *cell_f_row;
3409     
3410     comm = dd->comm;
3411
3412     for(d=0; d<dd->ndim; d++)
3413     {
3414         dim = dd->dim[d];
3415         bRowMember = TRUE;
3416         bRowRoot = TRUE;
3417         for(d1=d; d1<dd->ndim; d1++)
3418         {
3419             if (dd->ci[dd->dim[d1]] > 0)
3420             {
3421                 if (d1 > d)
3422                 {
3423                     bRowMember = FALSE;
3424                 }
3425                 bRowRoot = FALSE;
3426             }
3427         }
3428         if (bRowMember)
3429         {
3430             if (bRowRoot)
3431             {
3432                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3433                                            ddbox,bDynamicBox,bUniform,step);
3434                 cell_f_row = comm->root[d]->cell_f;
3435             }
3436             else
3437             {
3438                 cell_f_row = comm->cell_f_row;
3439             }
3440             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3441         }
3442     }
3443 }    
3444
3445 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3446 {
3447     int d;
3448
3449     /* This function assumes the box is static and should therefore
3450      * not be called when the box has changed since the last
3451      * call to dd_partition_system.
3452      */
3453     for(d=0; d<dd->ndim; d++)
3454     {
3455         relative_to_absolute_cell_bounds(dd,ddbox,d); 
3456     }
3457 }
3458
3459
3460
3461 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3462                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3463                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3464                                   gmx_wallcycle_t wcycle)
3465 {
3466     gmx_domdec_comm_t *comm;
3467     int dim;
3468
3469     comm = dd->comm;
3470     
3471     if (bDoDLB)
3472     {
3473         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3474         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3475         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3476     }
3477     else if (bDynamicBox)
3478     {
3479         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3480     }
3481     
3482     /* Set the dimensions for which no DD is used */
3483     for(dim=0; dim<DIM; dim++) {
3484         if (dd->nc[dim] == 1) {
3485             comm->cell_x0[dim] = 0;
3486             comm->cell_x1[dim] = ddbox->box_size[dim];
3487             if (dim >= ddbox->nboundeddim)
3488             {
3489                 comm->cell_x0[dim] += ddbox->box0[dim];
3490                 comm->cell_x1[dim] += ddbox->box0[dim];
3491             }
3492         }
3493     }
3494 }
3495
3496 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3497 {
3498     int d,np,i;
3499     gmx_domdec_comm_dim_t *cd;
3500     
3501     for(d=0; d<dd->ndim; d++)
3502     {
3503         cd = &dd->comm->cd[d];
3504         np = npulse[dd->dim[d]];
3505         if (np > cd->np_nalloc)
3506         {
3507             if (debug)
3508             {
3509                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3510                         dim2char(dd->dim[d]),np);
3511             }
3512             if (DDMASTER(dd) && cd->np_nalloc > 0)
3513             {
3514                 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3515             }
3516             srenew(cd->ind,np);
3517             for(i=cd->np_nalloc; i<np; i++)
3518             {
3519                 cd->ind[i].index  = NULL;
3520                 cd->ind[i].nalloc = 0;
3521             }
3522             cd->np_nalloc = np;
3523         }
3524         cd->np = np;
3525     }
3526 }
3527
3528
3529 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3530                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3531                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3532                               gmx_wallcycle_t wcycle)
3533 {
3534     gmx_domdec_comm_t *comm;
3535     int  d;
3536     ivec npulse;
3537     
3538     comm = dd->comm;
3539
3540     /* Copy the old cell boundaries for the cg displacement check */
3541     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3542     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3543     
3544     if (comm->bDynLoadBal)
3545     {
3546         if (DDMASTER(dd))
3547         {
3548             check_box_size(dd,ddbox);
3549         }
3550         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3551     }
3552     else
3553     {
3554         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3555         realloc_comm_ind(dd,npulse);
3556     }
3557     
3558     if (debug)
3559     {
3560         for(d=0; d<DIM; d++)
3561         {
3562             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3563                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3564         }
3565     }
3566 }
3567
3568 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3569                                   gmx_ddbox_t *ddbox,
3570                                   rvec cell_ns_x0,rvec cell_ns_x1,
3571                                   gmx_large_int_t step)
3572 {
3573     gmx_domdec_comm_t *comm;
3574     int dim_ind,dim;
3575     
3576     comm = dd->comm;
3577
3578     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3579     {
3580         dim = dd->dim[dim_ind];
3581         
3582         /* Without PBC we don't have restrictions on the outer cells */
3583         if (!(dim >= ddbox->npbcdim && 
3584               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3585             comm->bDynLoadBal &&
3586             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3587             comm->cellsize_min[dim])
3588         {
3589             char buf[22];
3590             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3591                       gmx_step_str(step,buf),dim2char(dim),
3592                       comm->cell_x1[dim] - comm->cell_x0[dim],
3593                       ddbox->skew_fac[dim],
3594                       dd->comm->cellsize_min[dim],
3595                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3596         }
3597     }
3598     
3599     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3600     {
3601         /* Communicate the boundaries and update cell_ns_x0/1 */
3602         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3603         if (dd->bGridJump && dd->ndim > 1)
3604         {
3605             check_grid_jump(step,dd,ddbox);
3606         }
3607     }
3608 }
3609
3610 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3611 {
3612     if (YY < npbcdim)
3613     {
3614         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3615     }
3616     else
3617     {
3618         tcm[YY][XX] = 0;
3619     }
3620     if (ZZ < npbcdim)
3621     {
3622         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3623         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3624     }
3625     else
3626     {
3627         tcm[ZZ][XX] = 0;
3628         tcm[ZZ][YY] = 0;
3629     }
3630 }
3631
3632 static void check_screw_box(matrix box)
3633 {
3634     /* Mathematical limitation */
3635     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3636     {
3637         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3638     }
3639     
3640     /* Limitation due to the asymmetry of the eighth shell method */
3641     if (box[ZZ][YY] != 0)
3642     {
3643         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3644     }
3645 }
3646
3647 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3648                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3649                           gmx_domdec_t *dd)
3650 {
3651     gmx_domdec_master_t *ma;
3652     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3653     int  i,icg,j,k,k0,k1,d,npbcdim;
3654     matrix tcm;
3655     rvec box_size,cg_cm;
3656     ivec ind;
3657     real nrcg,inv_ncg,pos_d;
3658     atom_id *cgindex;
3659     gmx_bool bUnbounded,bScrew;
3660
3661     ma = dd->ma;
3662     
3663     if (tmp_ind == NULL)
3664     {
3665         snew(tmp_nalloc,dd->nnodes);
3666         snew(tmp_ind,dd->nnodes);
3667         for(i=0; i<dd->nnodes; i++)
3668         {
3669             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3670             snew(tmp_ind[i],tmp_nalloc[i]);
3671         }
3672     }
3673     
3674     /* Clear the count */
3675     for(i=0; i<dd->nnodes; i++)
3676     {
3677         ma->ncg[i] = 0;
3678         ma->nat[i] = 0;
3679     }
3680     
3681     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3682     
3683     cgindex = cgs->index;
3684     
3685     /* Compute the center of geometry for all charge groups */
3686     for(icg=0; icg<cgs->nr; icg++)
3687     {
3688         k0      = cgindex[icg];
3689         k1      = cgindex[icg+1];
3690         nrcg    = k1 - k0;
3691         if (nrcg == 1)
3692         {
3693             copy_rvec(pos[k0],cg_cm);
3694         }
3695         else
3696         {
3697             inv_ncg = 1.0/nrcg;
3698             
3699             clear_rvec(cg_cm);
3700             for(k=k0; (k<k1); k++)
3701             {
3702                 rvec_inc(cg_cm,pos[k]);
3703             }
3704             for(d=0; (d<DIM); d++)
3705             {
3706                 cg_cm[d] *= inv_ncg;
3707             }
3708         }
3709         /* Put the charge group in the box and determine the cell index */
3710         for(d=DIM-1; d>=0; d--) {
3711             pos_d = cg_cm[d];
3712             if (d < dd->npbcdim)
3713             {
3714                 bScrew = (dd->bScrewPBC && d == XX);
3715                 if (tric_dir[d] && dd->nc[d] > 1)
3716                 {
3717                     /* Use triclinic coordintates for this dimension */
3718                     for(j=d+1; j<DIM; j++)
3719                     {
3720                         pos_d += cg_cm[j]*tcm[j][d];
3721                     }
3722                 }
3723                 while(pos_d >= box[d][d])
3724                 {
3725                     pos_d -= box[d][d];
3726                     rvec_dec(cg_cm,box[d]);
3727                     if (bScrew)
3728                     {
3729                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3730                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3731                     }
3732                     for(k=k0; (k<k1); k++)
3733                     {
3734                         rvec_dec(pos[k],box[d]);
3735                         if (bScrew)
3736                         {
3737                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3738                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3739                         }
3740                     }
3741                 }
3742                 while(pos_d < 0)
3743                 {
3744                     pos_d += box[d][d];
3745                     rvec_inc(cg_cm,box[d]);
3746                     if (bScrew)
3747                     {
3748                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3749                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3750                     }
3751                     for(k=k0; (k<k1); k++)
3752                     {
3753                         rvec_inc(pos[k],box[d]);
3754                         if (bScrew) {
3755                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3756                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3757                         }
3758                     }
3759                 }
3760             }
3761             /* This could be done more efficiently */
3762             ind[d] = 0;
3763             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3764             {
3765                 ind[d]++;
3766             }
3767         }
3768         i = dd_index(dd->nc,ind);
3769         if (ma->ncg[i] == tmp_nalloc[i])
3770         {
3771             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3772             srenew(tmp_ind[i],tmp_nalloc[i]);
3773         }
3774         tmp_ind[i][ma->ncg[i]] = icg;
3775         ma->ncg[i]++;
3776         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3777     }
3778     
3779     k1 = 0;
3780     for(i=0; i<dd->nnodes; i++)
3781     {
3782         ma->index[i] = k1;
3783         for(k=0; k<ma->ncg[i]; k++)
3784         {
3785             ma->cg[k1++] = tmp_ind[i][k];
3786         }
3787     }
3788     ma->index[dd->nnodes] = k1;
3789     
3790     for(i=0; i<dd->nnodes; i++)
3791     {
3792         sfree(tmp_ind[i]);
3793     }
3794     sfree(tmp_ind);
3795     sfree(tmp_nalloc);
3796     
3797     if (fplog)
3798     {
3799         char buf[22];
3800         fprintf(fplog,"Charge group distribution at step %s:",
3801                 gmx_step_str(step,buf));
3802         for(i=0; i<dd->nnodes; i++)
3803         {
3804             fprintf(fplog," %d",ma->ncg[i]);
3805         }
3806         fprintf(fplog,"\n");
3807     }
3808 }
3809
3810 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3811                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3812                                 rvec pos[])
3813 {
3814     gmx_domdec_master_t *ma=NULL;
3815     ivec npulse;
3816     int  i,cg_gl;
3817     int  *ibuf,buf2[2] = { 0, 0 };
3818     gmx_bool bMaster = DDMASTER(dd);
3819     if (bMaster)
3820     {
3821         ma = dd->ma;
3822         
3823         if (dd->bScrewPBC)
3824         {
3825             check_screw_box(box);
3826         }
3827     
3828         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3829     
3830         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3831         for(i=0; i<dd->nnodes; i++)
3832         {
3833             ma->ibuf[2*i]   = ma->ncg[i];
3834             ma->ibuf[2*i+1] = ma->nat[i];
3835         }
3836         ibuf = ma->ibuf;
3837     }
3838     else
3839     {
3840         ibuf = NULL;
3841     }
3842     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3843     
3844     dd->ncg_home = buf2[0];
3845     dd->nat_home = buf2[1];
3846     dd->ncg_tot  = dd->ncg_home;
3847     dd->nat_tot  = dd->nat_home;
3848     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3849     {
3850         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3851         srenew(dd->index_gl,dd->cg_nalloc);
3852         srenew(dd->cgindex,dd->cg_nalloc+1);
3853     }
3854     if (bMaster)
3855     {
3856         for(i=0; i<dd->nnodes; i++)
3857         {
3858             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3859             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3860         }
3861     }
3862     
3863     dd_scatterv(dd,
3864                 DDMASTER(dd) ? ma->ibuf : NULL,
3865                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3866                 DDMASTER(dd) ? ma->cg : NULL,
3867                 dd->ncg_home*sizeof(int),dd->index_gl);
3868     
3869     /* Determine the home charge group sizes */
3870     dd->cgindex[0] = 0;
3871     for(i=0; i<dd->ncg_home; i++)
3872     {
3873         cg_gl = dd->index_gl[i];
3874         dd->cgindex[i+1] =
3875             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3876     }
3877     
3878     if (debug)
3879     {
3880         fprintf(debug,"Home charge groups:\n");
3881         for(i=0; i<dd->ncg_home; i++)
3882         {
3883             fprintf(debug," %d",dd->index_gl[i]);
3884             if (i % 10 == 9) 
3885                 fprintf(debug,"\n");
3886         }
3887         fprintf(debug,"\n");
3888     }
3889 }
3890
3891 static int compact_and_copy_vec_at(int ncg,int *move,
3892                                    int *cgindex,
3893                                    int nvec,int vec,
3894                                    rvec *src,gmx_domdec_comm_t *comm,
3895                                    gmx_bool bCompact)
3896 {
3897     int m,icg,i,i0,i1,nrcg;
3898     int home_pos;
3899     int pos_vec[DIM*2];
3900     
3901     home_pos = 0;
3902
3903     for(m=0; m<DIM*2; m++)
3904     {
3905         pos_vec[m] = 0;
3906     }
3907     
3908     i0 = 0;
3909     for(icg=0; icg<ncg; icg++)
3910     {
3911         i1 = cgindex[icg+1];
3912         m = move[icg];
3913         if (m == -1)
3914         {
3915             if (bCompact)
3916             {
3917                 /* Compact the home array in place */
3918                 for(i=i0; i<i1; i++)
3919                 {
3920                     copy_rvec(src[i],src[home_pos++]);
3921                 }
3922             }
3923         }
3924         else
3925         {
3926             /* Copy to the communication buffer */
3927             nrcg = i1 - i0;
3928             pos_vec[m] += 1 + vec*nrcg;
3929             for(i=i0; i<i1; i++)
3930             {
3931                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
3932             }
3933             pos_vec[m] += (nvec - vec - 1)*nrcg;
3934         }
3935         if (!bCompact)
3936         {
3937             home_pos += i1 - i0;
3938         }
3939         i0 = i1;
3940     }
3941     
3942     return home_pos;
3943 }
3944
3945 static int compact_and_copy_vec_cg(int ncg,int *move,
3946                                    int *cgindex,
3947                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
3948                                    gmx_bool bCompact)
3949 {
3950     int m,icg,i0,i1,nrcg;
3951     int home_pos;
3952     int pos_vec[DIM*2];
3953     
3954     home_pos = 0;
3955     
3956     for(m=0; m<DIM*2; m++)
3957     {
3958         pos_vec[m] = 0;
3959     }
3960     
3961     i0 = 0;
3962     for(icg=0; icg<ncg; icg++)
3963     {
3964         i1 = cgindex[icg+1];
3965         m = move[icg];
3966         if (m == -1)
3967         {
3968             if (bCompact)
3969             {
3970                 /* Compact the home array in place */
3971                 copy_rvec(src[icg],src[home_pos++]);
3972             }
3973         }
3974         else
3975         {
3976             nrcg = i1 - i0;
3977             /* Copy to the communication buffer */
3978             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
3979             pos_vec[m] += 1 + nrcg*nvec;
3980         }
3981         i0 = i1;
3982     }
3983     if (!bCompact)
3984     {
3985         home_pos = ncg;
3986     }
3987     
3988     return home_pos;
3989 }
3990
3991 static int compact_ind(int ncg,int *move,
3992                        int *index_gl,int *cgindex,
3993                        int *gatindex,
3994                        gmx_ga2la_t ga2la,char *bLocalCG,
3995                        int *cginfo)
3996 {
3997     int cg,nat,a0,a1,a,a_gl;
3998     int home_pos;
3999
4000     home_pos = 0;
4001     nat = 0;
4002     for(cg=0; cg<ncg; cg++)
4003     {
4004         a0 = cgindex[cg];
4005         a1 = cgindex[cg+1];
4006         if (move[cg] == -1)
4007         {
4008             /* Compact the home arrays in place.
4009              * Anything that can be done here avoids access to global arrays.
4010              */
4011             cgindex[home_pos] = nat;
4012             for(a=a0; a<a1; a++)
4013             {
4014                 a_gl = gatindex[a];
4015                 gatindex[nat] = a_gl;
4016                 /* The cell number stays 0, so we don't need to set it */
4017                 ga2la_change_la(ga2la,a_gl,nat);
4018                 nat++;
4019             }
4020             index_gl[home_pos] = index_gl[cg];
4021             cginfo[home_pos]   = cginfo[cg];
4022             /* The charge group remains local, so bLocalCG does not change */
4023             home_pos++;
4024         }
4025         else
4026         {
4027             /* Clear the global indices */
4028             for(a=a0; a<a1; a++)
4029             {
4030                 ga2la_del(ga2la,gatindex[a]);
4031             }
4032             if (bLocalCG)
4033             {
4034                 bLocalCG[index_gl[cg]] = FALSE;
4035             }
4036         }
4037     }
4038     cgindex[home_pos] = nat;
4039     
4040     return home_pos;
4041 }
4042
4043 static void clear_and_mark_ind(int ncg,int *move,
4044                                int *index_gl,int *cgindex,int *gatindex,
4045                                gmx_ga2la_t ga2la,char *bLocalCG,
4046                                int *cell_index)
4047 {
4048     int cg,a0,a1,a;
4049     
4050     for(cg=0; cg<ncg; cg++)
4051     {
4052         if (move[cg] >= 0)
4053         {
4054             a0 = cgindex[cg];
4055             a1 = cgindex[cg+1];
4056             /* Clear the global indices */
4057             for(a=a0; a<a1; a++)
4058             {
4059                 ga2la_del(ga2la,gatindex[a]);
4060             }
4061             if (bLocalCG)
4062             {
4063                 bLocalCG[index_gl[cg]] = FALSE;
4064             }
4065             /* Signal that this cg has moved using the ns cell index.
4066              * Here we set it to -1.
4067              * fill_grid will change it from -1 to 4*grid->ncells.
4068              */
4069             cell_index[cg] = -1;
4070         }
4071     }
4072 }
4073
4074 static void print_cg_move(FILE *fplog,
4075                           gmx_domdec_t *dd,
4076                           gmx_large_int_t step,int cg,int dim,int dir,
4077                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4078                           rvec cm_old,rvec cm_new,real pos_d)
4079 {
4080     gmx_domdec_comm_t *comm;
4081     char buf[22];
4082
4083     comm = dd->comm;
4084
4085     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4086     if (bHaveLimitdAndCMOld)
4087     {
4088         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition (%f) in direction %c\n",
4089                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4090     }
4091     else
4092     {
4093         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4094                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4095     }
4096     fprintf(fplog,"distance out of cell %f\n",
4097             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4098     if (bHaveLimitdAndCMOld)
4099     {
4100         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4101                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4102     }
4103     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4104             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4105     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4106             dim2char(dim),
4107             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4108     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4109             dim2char(dim),
4110             comm->cell_x0[dim],comm->cell_x1[dim]);
4111 }
4112
4113 static void cg_move_error(FILE *fplog,
4114                           gmx_domdec_t *dd,
4115                           gmx_large_int_t step,int cg,int dim,int dir,
4116                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4117                           rvec cm_old,rvec cm_new,real pos_d)
4118 {
4119     if (fplog)
4120     {
4121         print_cg_move(fplog, dd,step,cg,dim,dir,
4122                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4123     }
4124     print_cg_move(stderr,dd,step,cg,dim,dir,
4125                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4126     gmx_fatal(FARGS,
4127               "A charge group moved too far between two domain decomposition steps\n"
4128               "This usually means that your system is not well equilibrated");
4129 }
4130
4131 static void rotate_state_atom(t_state *state,int a)
4132 {
4133     int est;
4134
4135     for(est=0; est<estNR; est++)
4136     {
4137         if (EST_DISTR(est) && (state->flags & (1<<est))) {
4138             switch (est) {
4139             case estX:
4140                 /* Rotate the complete state; for a rectangular box only */
4141                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4142                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4143                 break;
4144             case estV:
4145                 state->v[a][YY] = -state->v[a][YY];
4146                 state->v[a][ZZ] = -state->v[a][ZZ];
4147                 break;
4148             case estSDX:
4149                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4150                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4151                 break;
4152             case estCGP:
4153                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4154                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4155                 break;
4156             case estDISRE_INITF:
4157             case estDISRE_RM3TAV:
4158             case estORIRE_INITF:
4159             case estORIRE_DTAV:
4160                 /* These are distances, so not affected by rotation */
4161                 break;
4162             default:
4163                 gmx_incons("Unknown state entry encountered in rotate_state_atom");            
4164             }
4165         }
4166     }
4167 }
4168
4169 static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4170                               gmx_domdec_t *dd,ivec tric_dir,
4171                               t_state *state,rvec **f,
4172                               t_forcerec *fr,t_mdatoms *md,
4173                               gmx_bool bCompact,
4174                               t_nrnb *nrnb)
4175 {
4176     int  *move;
4177     int  npbcdim;
4178     int  ncg[DIM*2],nat[DIM*2];
4179     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4180     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4181     int  sbuf[2],rbuf[2];
4182     int  home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
4183     int  flag;
4184     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4185     gmx_bool bScrew;
4186     ivec dev;
4187     real inv_ncg,pos_d;
4188     matrix tcm;
4189     rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4190     atom_id *cgindex;
4191     cginfo_mb_t *cginfo_mb;
4192     gmx_domdec_comm_t *comm;
4193     
4194     if (dd->bScrewPBC)
4195     {
4196         check_screw_box(state->box);
4197     }
4198     
4199     comm  = dd->comm;
4200     cg_cm = fr->cg_cm;
4201     
4202     for(i=0; i<estNR; i++)
4203     {
4204         if (EST_DISTR(i))
4205         {
4206             switch (i)
4207             {
4208             case estX:   /* Always present */            break;
4209             case estV:   bV   = (state->flags & (1<<i)); break;
4210             case estSDX: bSDX = (state->flags & (1<<i)); break;
4211             case estCGP: bCGP = (state->flags & (1<<i)); break;
4212             case estLD_RNG:
4213             case estLD_RNGI:
4214             case estDISRE_INITF:
4215             case estDISRE_RM3TAV:
4216             case estORIRE_INITF:
4217             case estORIRE_DTAV:
4218                 /* No processing required */
4219                 break;
4220             default:
4221             gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4222             }
4223         }
4224     }
4225     
4226     if (dd->ncg_tot > comm->nalloc_int)
4227     {
4228         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4229         srenew(comm->buf_int,comm->nalloc_int);
4230     }
4231     move = comm->buf_int;
4232     
4233     /* Clear the count */
4234     for(c=0; c<dd->ndim*2; c++)
4235     {
4236         ncg[c] = 0;
4237         nat[c] = 0;
4238     }
4239
4240     npbcdim = dd->npbcdim;
4241
4242     for(d=0; (d<DIM); d++)
4243     {
4244         limitd[d] = dd->comm->cellsize_min[d];
4245         if (d >= npbcdim && dd->ci[d] == 0)
4246         {
4247             cell_x0[d] = -GMX_FLOAT_MAX;
4248         }
4249         else
4250         {
4251             cell_x0[d] = comm->cell_x0[d];
4252         }
4253         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4254         {
4255             cell_x1[d] = GMX_FLOAT_MAX;
4256         }
4257         else
4258         {
4259             cell_x1[d] = comm->cell_x1[d];
4260         }
4261         if (d < npbcdim)
4262         {
4263             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4264             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4265         }
4266         else
4267         {
4268             /* We check after communication if a charge group moved
4269              * more than one cell. Set the pre-comm check limit to float_max.
4270              */
4271             limit0[d] = -GMX_FLOAT_MAX;
4272             limit1[d] =  GMX_FLOAT_MAX;
4273         }
4274     }
4275     
4276     make_tric_corr_matrix(npbcdim,state->box,tcm);
4277     
4278     cgindex = dd->cgindex;
4279     
4280     /* Compute the center of geometry for all home charge groups
4281      * and put them in the box and determine where they should go.
4282      */
4283     for(cg=0; cg<dd->ncg_home; cg++)
4284     {
4285         k0   = cgindex[cg];
4286         k1   = cgindex[cg+1];
4287         nrcg = k1 - k0;
4288         if (nrcg == 1)
4289         {
4290             copy_rvec(state->x[k0],cm_new);
4291         }
4292         else
4293         {
4294             inv_ncg = 1.0/nrcg;
4295             
4296             clear_rvec(cm_new);
4297             for(k=k0; (k<k1); k++)
4298             {
4299                 rvec_inc(cm_new,state->x[k]);
4300             }
4301             for(d=0; (d<DIM); d++)
4302             {
4303                 cm_new[d] = inv_ncg*cm_new[d];
4304             }
4305         }
4306         
4307         clear_ivec(dev);
4308         /* Do pbc and check DD cell boundary crossings */
4309         for(d=DIM-1; d>=0; d--)
4310         {
4311             if (dd->nc[d] > 1)
4312             {
4313                 bScrew = (dd->bScrewPBC && d == XX);
4314                 /* Determine the location of this cg in lattice coordinates */
4315                 pos_d = cm_new[d];
4316                 if (tric_dir[d])
4317                 {
4318                     for(d2=d+1; d2<DIM; d2++)
4319                     {
4320                         pos_d += cm_new[d2]*tcm[d2][d];
4321                     }
4322                 }
4323                 /* Put the charge group in the triclinic unit-cell */
4324                 if (pos_d >= cell_x1[d])
4325                 {
4326                     if (pos_d >= limit1[d])
4327                     {
4328                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4329                                       cg_cm[cg],cm_new,pos_d);
4330                     }
4331                     dev[d] = 1;
4332                     if (dd->ci[d] == dd->nc[d] - 1)
4333                     {
4334                         rvec_dec(cm_new,state->box[d]);
4335                         if (bScrew)
4336                         {
4337                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4338                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4339                         }
4340                         for(k=k0; (k<k1); k++)
4341                         {
4342                             rvec_dec(state->x[k],state->box[d]);
4343                             if (bScrew)
4344                             {
4345                                 rotate_state_atom(state,k);
4346                             }
4347                         }
4348                     }
4349                 }
4350                 else if (pos_d < cell_x0[d])
4351                 {
4352                     if (pos_d < limit0[d])
4353                     {
4354                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4355                                       cg_cm[cg],cm_new,pos_d);
4356                     }
4357                     dev[d] = -1;
4358                     if (dd->ci[d] == 0)
4359                     {
4360                         rvec_inc(cm_new,state->box[d]);
4361                         if (bScrew)
4362                         {
4363                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4364                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4365                         }
4366                         for(k=k0; (k<k1); k++)
4367                         {
4368                             rvec_inc(state->x[k],state->box[d]);
4369                             if (bScrew)
4370                             {
4371                                 rotate_state_atom(state,k);
4372                             }
4373                         }
4374                     }
4375                 }
4376             }
4377             else if (d < npbcdim)
4378             {
4379                 /* Put the charge group in the rectangular unit-cell */
4380                 while (cm_new[d] >= state->box[d][d])
4381                 {
4382                     rvec_dec(cm_new,state->box[d]);
4383                     for(k=k0; (k<k1); k++)
4384                     {
4385                         rvec_dec(state->x[k],state->box[d]);
4386                     }
4387                 }
4388                 while (cm_new[d] < 0)
4389                 {
4390                     rvec_inc(cm_new,state->box[d]);
4391                     for(k=k0; (k<k1); k++)
4392                     {
4393                         rvec_inc(state->x[k],state->box[d]);
4394                     }
4395                 }
4396             }
4397         }
4398     
4399         copy_rvec(cm_new,cg_cm[cg]);
4400         
4401         /* Determine where this cg should go */
4402         flag = 0;
4403         mc = -1;
4404         for(d=0; d<dd->ndim; d++)
4405         {
4406             dim = dd->dim[d];
4407             if (dev[dim] == 1)
4408             {
4409                 flag |= DD_FLAG_FW(d);
4410                 if (mc == -1)
4411                 {
4412                     mc = d*2;
4413                 }
4414             }
4415             else if (dev[dim] == -1)
4416             {
4417                 flag |= DD_FLAG_BW(d);
4418                 if (mc == -1) {
4419                     if (dd->nc[dim] > 2)
4420                     {
4421                         mc = d*2 + 1;
4422                     }
4423                     else
4424                     {
4425                         mc = d*2;
4426                     }
4427                 }
4428             }
4429         }
4430         move[cg] = mc;
4431         if (mc >= 0)
4432         {
4433             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4434             {
4435                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4436                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4437             }
4438             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4439             /* We store the cg size in the lower 16 bits
4440              * and the place where the charge group should go
4441              * in the next 6 bits. This saves some communication volume.
4442              */
4443             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4444             ncg[mc] += 1;
4445             nat[mc] += nrcg;
4446         }
4447     }
4448     
4449     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4450     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4451     
4452     nvec = 1;
4453     if (bV)
4454     {
4455         nvec++;
4456     }
4457     if (bSDX)
4458     {
4459         nvec++;
4460     }
4461     if (bCGP)
4462     {
4463         nvec++;
4464     }
4465     
4466     /* Make sure the communication buffers are large enough */
4467     for(mc=0; mc<dd->ndim*2; mc++)
4468     {
4469         nvr = ncg[mc] + nat[mc]*nvec;
4470         if (nvr > comm->cgcm_state_nalloc[mc])
4471         {
4472             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4473             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4474         }
4475     }
4476     
4477     /* Recalculating cg_cm might be cheaper than communicating,
4478      * but that could give rise to rounding issues.
4479      */
4480     home_pos_cg =
4481         compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4482                                 nvec,cg_cm,comm,bCompact);
4483     
4484     vec = 0;
4485     home_pos_at =
4486         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4487                                 nvec,vec++,state->x,comm,bCompact);
4488     if (bV)
4489     {
4490         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4491                                 nvec,vec++,state->v,comm,bCompact);
4492     }
4493     if (bSDX)
4494     {
4495         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4496                                 nvec,vec++,state->sd_X,comm,bCompact);
4497     }
4498     if (bCGP)
4499     {
4500         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4501                                 nvec,vec++,state->cg_p,comm,bCompact);
4502     }
4503     
4504     if (bCompact)
4505     {
4506         compact_ind(dd->ncg_home,move,
4507                     dd->index_gl,dd->cgindex,dd->gatindex,
4508                     dd->ga2la,comm->bLocalCG,
4509                     fr->cginfo);
4510     }
4511     else
4512     {
4513         clear_and_mark_ind(dd->ncg_home,move,
4514                            dd->index_gl,dd->cgindex,dd->gatindex,
4515                            dd->ga2la,comm->bLocalCG,
4516                            fr->ns.grid->cell_index);
4517     }
4518     
4519     cginfo_mb = fr->cginfo_mb;
4520
4521     ncg_stay_home = home_pos_cg;
4522     for(d=0; d<dd->ndim; d++)
4523     {
4524         dim = dd->dim[d];
4525         ncg_recv = 0;
4526         nat_recv = 0;
4527         nvr      = 0;
4528         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4529         {
4530             cdd = d*2 + dir;
4531             /* Communicate the cg and atom counts */
4532             sbuf[0] = ncg[cdd];
4533             sbuf[1] = nat[cdd];
4534             if (debug)
4535             {
4536                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4537                         d,dir,sbuf[0],sbuf[1]);
4538             }
4539             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4540             
4541             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4542             {
4543                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4544                 srenew(comm->buf_int,comm->nalloc_int);
4545             }
4546             
4547             /* Communicate the charge group indices, sizes and flags */
4548             dd_sendrecv_int(dd, d, dir,
4549                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4550                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4551             
4552             nvs = ncg[cdd] + nat[cdd]*nvec;
4553             i   = rbuf[0]  + rbuf[1] *nvec;
4554             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4555             
4556             /* Communicate cgcm and state */
4557             dd_sendrecv_rvec(dd, d, dir,
4558                              comm->cgcm_state[cdd], nvs,
4559                              comm->vbuf.v+nvr, i);
4560             ncg_recv += rbuf[0];
4561             nat_recv += rbuf[1];
4562             nvr      += i;
4563         }
4564         
4565         /* Process the received charge groups */
4566         buf_pos = 0;
4567         for(cg=0; cg<ncg_recv; cg++)
4568         {
4569             flag = comm->buf_int[cg*DD_CGIBS+1];
4570
4571             if (dim >= npbcdim && dd->nc[dim] > 2)
4572             {
4573                 /* No pbc in this dim and more than one domain boundary.
4574                  * We to a separate check if a charge did not move too far.
4575                  */
4576                 if (((flag & DD_FLAG_FW(d)) &&
4577                      comm->vbuf.v[buf_pos][d] > cell_x1[dim]) ||
4578                     ((flag & DD_FLAG_BW(d)) &&
4579                      comm->vbuf.v[buf_pos][d] < cell_x0[dim]))
4580                 {
4581                     cg_move_error(fplog,dd,step,cg,d,
4582                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4583                                    FALSE,0,
4584                                    comm->vbuf.v[buf_pos],
4585                                    comm->vbuf.v[buf_pos],
4586                                    comm->vbuf.v[buf_pos][d]);
4587                 }
4588             }
4589
4590             mc = -1;
4591             if (d < dd->ndim-1)
4592             {
4593                 /* Check which direction this cg should go */
4594                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4595                 {
4596                     if (dd->bGridJump)
4597                     {
4598                         /* The cell boundaries for dimension d2 are not equal
4599                          * for each cell row of the lower dimension(s),
4600                          * therefore we might need to redetermine where
4601                          * this cg should go.
4602                          */
4603                         dim2 = dd->dim[d2];
4604                         /* If this cg crosses the box boundary in dimension d2
4605                          * we can use the communicated flag, so we do not
4606                          * have to worry about pbc.
4607                          */
4608                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4609                                (flag & DD_FLAG_FW(d2))) ||
4610                               (dd->ci[dim2] == 0 &&
4611                                (flag & DD_FLAG_BW(d2)))))
4612                         {
4613                             /* Clear the two flags for this dimension */
4614                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4615                             /* Determine the location of this cg
4616                              * in lattice coordinates
4617                              */
4618                             pos_d = comm->vbuf.v[buf_pos][dim2];
4619                             if (tric_dir[dim2])
4620                             {
4621                                 for(d3=dim2+1; d3<DIM; d3++)
4622                                 {
4623                                     pos_d +=
4624                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4625                                 }
4626                             }
4627                             /* Check of we are not at the box edge.
4628                              * pbc is only handled in the first step above,
4629                              * but this check could move over pbc while
4630                              * the first step did not due to different rounding.
4631                              */
4632                             if (pos_d >= cell_x1[dim2] &&
4633                                 dd->ci[dim2] != dd->nc[dim2]-1)
4634                             {
4635                                 flag |= DD_FLAG_FW(d2);
4636                             }
4637                             else if (pos_d < cell_x0[dim2] &&
4638                                      dd->ci[dim2] != 0)
4639                             {
4640                                 flag |= DD_FLAG_BW(d2);
4641                             }
4642                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4643                         }
4644                     }
4645                     /* Set to which neighboring cell this cg should go */
4646                     if (flag & DD_FLAG_FW(d2))
4647                     {
4648                         mc = d2*2;
4649                     }
4650                     else if (flag & DD_FLAG_BW(d2))
4651                     {
4652                         if (dd->nc[dd->dim[d2]] > 2)
4653                         {
4654                             mc = d2*2+1;
4655                         }
4656                         else
4657                         {
4658                             mc = d2*2;
4659                         }
4660                     }
4661                 }
4662             }
4663             
4664             nrcg = flag & DD_FLAG_NRCG;
4665             if (mc == -1)
4666             {
4667                 if (home_pos_cg+1 > dd->cg_nalloc)
4668                 {
4669                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4670                     srenew(dd->index_gl,dd->cg_nalloc);
4671                     srenew(dd->cgindex,dd->cg_nalloc+1);
4672                 }
4673                 /* Set the global charge group index and size */
4674                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4675                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4676                 /* Copy the state from the buffer */
4677                 if (home_pos_cg >= fr->cg_nalloc)
4678                 {
4679                     dd_realloc_fr_cg(fr,home_pos_cg+1);
4680                     cg_cm = fr->cg_cm;
4681                 }
4682                 copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
4683                 /* Set the cginfo */
4684                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4685                                                    dd->index_gl[home_pos_cg]);
4686                 if (comm->bLocalCG)
4687                 {
4688                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4689                 }
4690
4691                 if (home_pos_at+nrcg > state->nalloc)
4692                 {
4693                     dd_realloc_state(state,f,home_pos_at+nrcg);
4694                 }
4695                 for(i=0; i<nrcg; i++)
4696                 {
4697                     copy_rvec(comm->vbuf.v[buf_pos++],
4698                               state->x[home_pos_at+i]);
4699                 }
4700                 if (bV)
4701                 {
4702                     for(i=0; i<nrcg; i++)
4703                     {
4704                         copy_rvec(comm->vbuf.v[buf_pos++],
4705                                   state->v[home_pos_at+i]);
4706                     }
4707                 }
4708                 if (bSDX)
4709                 {
4710                     for(i=0; i<nrcg; i++)
4711                     {
4712                         copy_rvec(comm->vbuf.v[buf_pos++],
4713                                   state->sd_X[home_pos_at+i]);
4714                     }
4715                 }
4716                 if (bCGP)
4717                 {
4718                     for(i=0; i<nrcg; i++)
4719                     {
4720                         copy_rvec(comm->vbuf.v[buf_pos++],
4721                                   state->cg_p[home_pos_at+i]);
4722                     }
4723                 }
4724                 home_pos_cg += 1;
4725                 home_pos_at += nrcg;
4726             }
4727             else
4728             {
4729                 /* Reallocate the buffers if necessary  */
4730                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4731                 {
4732                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4733                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4734                 }
4735                 nvr = ncg[mc] + nat[mc]*nvec;
4736                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4737                 {
4738                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4739                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4740                 }
4741                 /* Copy from the receive to the send buffers */
4742                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4743                        comm->buf_int + cg*DD_CGIBS,
4744                        DD_CGIBS*sizeof(int));
4745                 memcpy(comm->cgcm_state[mc][nvr],
4746                        comm->vbuf.v[buf_pos],
4747                        (1+nrcg*nvec)*sizeof(rvec));
4748                 buf_pos += 1 + nrcg*nvec;
4749                 ncg[mc] += 1;
4750                 nat[mc] += nrcg;
4751             }
4752         }
4753     }
4754     
4755     /* With sorting (!bCompact) the indices are now only partially up to date
4756      * and ncg_home and nat_home are not the real count, since there are
4757      * "holes" in the arrays for the charge groups that moved to neighbors.
4758      */
4759     dd->ncg_home = home_pos_cg;
4760     dd->nat_home = home_pos_at;
4761
4762     if (debug)
4763     {
4764         fprintf(debug,"Finished repartitioning\n");
4765     }
4766
4767     return ncg_stay_home;
4768 }
4769
4770 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4771 {
4772     dd->comm->cycl[ddCycl] += cycles;
4773     dd->comm->cycl_n[ddCycl]++;
4774     if (cycles > dd->comm->cycl_max[ddCycl])
4775     {
4776         dd->comm->cycl_max[ddCycl] = cycles;
4777     }
4778 }
4779
4780 static double force_flop_count(t_nrnb *nrnb)
4781 {
4782     int i;
4783     double sum;
4784     const char *name;
4785
4786     sum = 0;
4787     for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
4788     {
4789         /* To get closer to the real timings, we half the count
4790          * for the normal loops and again half it for water loops.
4791          */
4792         name = nrnb_str(i);
4793         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4794         {
4795             sum += nrnb->n[i]*0.25*cost_nrnb(i);
4796         }
4797         else
4798         {
4799             sum += nrnb->n[i]*0.50*cost_nrnb(i);
4800         }
4801     }
4802     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
4803     {
4804         name = nrnb_str(i);
4805         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4806         sum += nrnb->n[i]*cost_nrnb(i);
4807     }
4808     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
4809     {
4810         sum += nrnb->n[i]*cost_nrnb(i);
4811     }
4812
4813     return sum;
4814 }
4815
4816 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
4817 {
4818     if (dd->comm->eFlop)
4819     {
4820         dd->comm->flop -= force_flop_count(nrnb);
4821     }
4822 }
4823 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
4824 {
4825     if (dd->comm->eFlop)
4826     {
4827         dd->comm->flop += force_flop_count(nrnb);
4828         dd->comm->flop_n++;
4829     }
4830 }  
4831
4832 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4833 {
4834     int i;
4835     
4836     for(i=0; i<ddCyclNr; i++)
4837     {
4838         dd->comm->cycl[i] = 0;
4839         dd->comm->cycl_n[i] = 0;
4840         dd->comm->cycl_max[i] = 0;
4841     }
4842     dd->comm->flop = 0;
4843     dd->comm->flop_n = 0;
4844 }
4845
4846 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
4847 {
4848     gmx_domdec_comm_t *comm;
4849     gmx_domdec_load_t *load;
4850     gmx_domdec_root_t *root=NULL;
4851     int  d,dim,cid,i,pos;
4852     float cell_frac=0,sbuf[DD_NLOAD_MAX];
4853     gmx_bool bSepPME;
4854     
4855     if (debug)
4856     {
4857         fprintf(debug,"get_load_distribution start\n");
4858     }
4859
4860     wallcycle_start(wcycle,ewcDDCOMMLOAD);
4861     
4862     comm = dd->comm;
4863     
4864     bSepPME = (dd->pme_nodeid >= 0);
4865     
4866     for(d=dd->ndim-1; d>=0; d--)
4867     {
4868         dim = dd->dim[d];
4869         /* Check if we participate in the communication in this dimension */
4870         if (d == dd->ndim-1 || 
4871             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
4872         {
4873             load = &comm->load[d];
4874             if (dd->bGridJump)
4875             {
4876                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4877             }
4878             pos = 0;
4879             if (d == dd->ndim-1)
4880             {
4881                 sbuf[pos++] = dd_force_load(comm);
4882                 sbuf[pos++] = sbuf[0];
4883                 if (dd->bGridJump)
4884                 {
4885                     sbuf[pos++] = sbuf[0];
4886                     sbuf[pos++] = cell_frac;
4887                     if (d > 0)
4888                     {
4889                         sbuf[pos++] = comm->cell_f_max0[d];
4890                         sbuf[pos++] = comm->cell_f_min1[d];
4891                     }
4892                 }
4893                 if (bSepPME)
4894                 {
4895                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4896                     sbuf[pos++] = comm->cycl[ddCyclPME];
4897                 }
4898             }
4899             else
4900             {
4901                 sbuf[pos++] = comm->load[d+1].sum;
4902                 sbuf[pos++] = comm->load[d+1].max;
4903                 if (dd->bGridJump)
4904                 {
4905                     sbuf[pos++] = comm->load[d+1].sum_m;
4906                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4907                     sbuf[pos++] = comm->load[d+1].flags;
4908                     if (d > 0)
4909                     {
4910                         sbuf[pos++] = comm->cell_f_max0[d];
4911                         sbuf[pos++] = comm->cell_f_min1[d];
4912                     }
4913                 }
4914                 if (bSepPME)
4915                 {
4916                     sbuf[pos++] = comm->load[d+1].mdf;
4917                     sbuf[pos++] = comm->load[d+1].pme;
4918                 }
4919             }
4920             load->nload = pos;
4921             /* Communicate a row in DD direction d.
4922              * The communicators are setup such that the root always has rank 0.
4923              */
4924 #ifdef GMX_MPI
4925             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
4926                        load->load,load->nload*sizeof(float),MPI_BYTE,
4927                        0,comm->mpi_comm_load[d]);
4928 #endif
4929             if (dd->ci[dim] == dd->master_ci[dim])
4930             {
4931                 /* We are the root, process this row */
4932                 if (comm->bDynLoadBal)
4933                 {
4934                     root = comm->root[d];
4935                 }
4936                 load->sum = 0;
4937                 load->max = 0;
4938                 load->sum_m = 0;
4939                 load->cvol_min = 1;
4940                 load->flags = 0;
4941                 load->mdf = 0;
4942                 load->pme = 0;
4943                 pos = 0;
4944                 for(i=0; i<dd->nc[dim]; i++)
4945                 {
4946                     load->sum += load->load[pos++];
4947                     load->max = max(load->max,load->load[pos]);
4948                     pos++;
4949                     if (dd->bGridJump)
4950                     {
4951                         if (root->bLimited)
4952                         {
4953                             /* This direction could not be load balanced properly,
4954                              * therefore we need to use the maximum iso the average load.
4955                              */
4956                             load->sum_m = max(load->sum_m,load->load[pos]);
4957                         }
4958                         else
4959                         {
4960                             load->sum_m += load->load[pos];
4961                         }
4962                         pos++;
4963                         load->cvol_min = min(load->cvol_min,load->load[pos]);
4964                         pos++;
4965                         if (d < dd->ndim-1)
4966                         {
4967                             load->flags = (int)(load->load[pos++] + 0.5);
4968                         }
4969                         if (d > 0)
4970                         {
4971                             root->cell_f_max0[i] = load->load[pos++];
4972                             root->cell_f_min1[i] = load->load[pos++];
4973                         }
4974                     }
4975                     if (bSepPME)
4976                     {
4977                         load->mdf = max(load->mdf,load->load[pos]);
4978                         pos++;
4979                         load->pme = max(load->pme,load->load[pos]);
4980                         pos++;
4981                     }
4982                 }
4983                 if (comm->bDynLoadBal && root->bLimited)
4984                 {
4985                     load->sum_m *= dd->nc[dim];
4986                     load->flags |= (1<<d);
4987                 }
4988             }
4989         }
4990     }
4991
4992     if (DDMASTER(dd))
4993     {
4994         comm->nload      += dd_load_count(comm);
4995         comm->load_step  += comm->cycl[ddCyclStep];
4996         comm->load_sum   += comm->load[0].sum;
4997         comm->load_max   += comm->load[0].max;
4998         if (comm->bDynLoadBal)
4999         {
5000             for(d=0; d<dd->ndim; d++)
5001             {
5002                 if (comm->load[0].flags & (1<<d))
5003                 {
5004                     comm->load_lim[d]++;
5005                 }
5006             }
5007         }
5008         if (bSepPME)
5009         {
5010             comm->load_mdf += comm->load[0].mdf;
5011             comm->load_pme += comm->load[0].pme;
5012         }
5013     }
5014
5015     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5016     
5017     if (debug)
5018     {
5019         fprintf(debug,"get_load_distribution finished\n");
5020     }
5021 }
5022
5023 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5024 {
5025     /* Return the relative performance loss on the total run time
5026      * due to the force calculation load imbalance.
5027      */
5028     if (dd->comm->nload > 0)
5029     {
5030         return
5031             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5032             (dd->comm->load_step*dd->nnodes);
5033     }
5034     else
5035     {
5036         return 0;
5037     }
5038 }
5039
5040 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5041 {
5042     char  buf[STRLEN];
5043     int   npp,npme,nnodes,d,limp;
5044     float imbal,pme_f_ratio,lossf,lossp=0;
5045     gmx_bool  bLim;
5046     gmx_domdec_comm_t *comm;
5047
5048     comm = dd->comm;
5049     if (DDMASTER(dd) && comm->nload > 0)
5050     {
5051         npp    = dd->nnodes;
5052         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5053         nnodes = npp + npme;
5054         imbal = comm->load_max*npp/comm->load_sum - 1;
5055         lossf = dd_force_imb_perf_loss(dd);
5056         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5057         fprintf(fplog,"%s",buf);
5058         fprintf(stderr,"\n");
5059         fprintf(stderr,"%s",buf);
5060         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5061         fprintf(fplog,"%s",buf);
5062         fprintf(stderr,"%s",buf);
5063         bLim = FALSE;
5064         if (comm->bDynLoadBal)
5065         {
5066             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5067             for(d=0; d<dd->ndim; d++)
5068             {
5069                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5070                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5071                 if (limp >= 50)
5072                 {
5073                     bLim = TRUE;
5074                 }
5075             }
5076             sprintf(buf+strlen(buf),"\n");
5077             fprintf(fplog,"%s",buf);
5078             fprintf(stderr,"%s",buf);
5079         }
5080         if (npme > 0)
5081         {
5082             pme_f_ratio = comm->load_pme/comm->load_mdf;
5083             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5084             if (lossp <= 0)
5085             {
5086                 lossp *= (float)npme/(float)nnodes;
5087             }
5088             else
5089             {
5090                 lossp *= (float)npp/(float)nnodes;
5091             }
5092             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5093             fprintf(fplog,"%s",buf);
5094             fprintf(stderr,"%s",buf);
5095             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5096             fprintf(fplog,"%s",buf);
5097             fprintf(stderr,"%s",buf);
5098         }
5099         fprintf(fplog,"\n");
5100         fprintf(stderr,"\n");
5101         
5102         if (lossf >= DD_PERF_LOSS)
5103         {
5104             sprintf(buf,
5105                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
5106                     "      in the domain decomposition.\n",lossf*100);
5107             if (!comm->bDynLoadBal)
5108             {
5109                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
5110             }
5111             else if (bLim)
5112             {
5113                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5114             }
5115             fprintf(fplog,"%s\n",buf);
5116             fprintf(stderr,"%s\n",buf);
5117         }
5118         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5119         {
5120             sprintf(buf,
5121                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5122                     "      had %s work to do than the PP nodes.\n"
5123                     "      You might want to %s the number of PME nodes\n"
5124                     "      or %s the cut-off and the grid spacing.\n",
5125                     fabs(lossp*100),
5126                     (lossp < 0) ? "less"     : "more",
5127                     (lossp < 0) ? "decrease" : "increase",
5128                     (lossp < 0) ? "decrease" : "increase");
5129             fprintf(fplog,"%s\n",buf);
5130             fprintf(stderr,"%s\n",buf);
5131         }
5132     }
5133 }
5134
5135 static float dd_vol_min(gmx_domdec_t *dd)
5136 {
5137     return dd->comm->load[0].cvol_min*dd->nnodes;
5138 }
5139
5140 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5141 {
5142     return dd->comm->load[0].flags;
5143 }
5144
5145 static float dd_f_imbal(gmx_domdec_t *dd)
5146 {
5147     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5148 }
5149
5150 static float dd_pme_f_ratio(gmx_domdec_t *dd)
5151 {
5152     return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5153 }
5154
5155 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5156 {
5157     int flags,d;
5158     char buf[22];
5159     
5160     flags = dd_load_flags(dd);
5161     if (flags)
5162     {
5163         fprintf(fplog,
5164                 "DD  load balancing is limited by minimum cell size in dimension");
5165         for(d=0; d<dd->ndim; d++)
5166         {
5167             if (flags & (1<<d))
5168             {
5169                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5170             }
5171         }
5172         fprintf(fplog,"\n");
5173     }
5174     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5175     if (dd->comm->bDynLoadBal)
5176     {
5177         fprintf(fplog,"  vol min/aver %5.3f%c",
5178                 dd_vol_min(dd),flags ? '!' : ' ');
5179     }
5180     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5181     if (dd->comm->cycl_n[ddCyclPME])
5182     {
5183         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5184     }
5185     fprintf(fplog,"\n\n");
5186 }
5187
5188 static void dd_print_load_verbose(gmx_domdec_t *dd)
5189 {
5190     if (dd->comm->bDynLoadBal)
5191     {
5192         fprintf(stderr,"vol %4.2f%c ",
5193                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5194     }
5195     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5196     if (dd->comm->cycl_n[ddCyclPME])
5197     {
5198         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5199     }
5200 }
5201
5202 #ifdef GMX_MPI
5203 static void make_load_communicator(gmx_domdec_t *dd,MPI_Group g_all,
5204                                    int dim_ind,ivec loc)
5205 {
5206     MPI_Group g_row = MPI_GROUP_EMPTY;
5207     MPI_Comm  c_row;
5208     int  dim,i,*rank;
5209     ivec loc_c;
5210     gmx_domdec_root_t *root;
5211     gmx_bool bPartOfGroup = FALSE;
5212     
5213     dim = dd->dim[dim_ind];
5214     copy_ivec(loc,loc_c);
5215     snew(rank,dd->nc[dim]);
5216     for(i=0; i<dd->nc[dim]; i++)
5217     {
5218         loc_c[dim] = i;
5219         rank[i] = dd_index(dd->nc,loc_c);
5220         if (rank[i] == dd->rank)
5221         {
5222             /* This process is part of the group */
5223             bPartOfGroup = TRUE;
5224         }
5225     }
5226     if (bPartOfGroup)
5227     {
5228         MPI_Group_incl(g_all,dd->nc[dim],rank,&g_row);
5229     }
5230     MPI_Comm_create(dd->mpi_comm_all,g_row,&c_row);
5231     if (bPartOfGroup)
5232     {
5233         dd->comm->mpi_comm_load[dim_ind] = c_row;
5234         if (dd->comm->eDLB != edlbNO)
5235         {
5236             if (dd->ci[dim] == dd->master_ci[dim])
5237             {
5238                 /* This is the root process of this row */
5239                 snew(dd->comm->root[dim_ind],1);
5240                 root = dd->comm->root[dim_ind];
5241                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5242                 snew(root->old_cell_f,dd->nc[dim]+1);
5243                 snew(root->bCellMin,dd->nc[dim]);
5244                 if (dim_ind > 0)
5245                 {
5246                     snew(root->cell_f_max0,dd->nc[dim]);
5247                     snew(root->cell_f_min1,dd->nc[dim]);
5248                     snew(root->bound_min,dd->nc[dim]);
5249                     snew(root->bound_max,dd->nc[dim]);
5250                 }
5251                 snew(root->buf_ncd,dd->nc[dim]);
5252             }
5253             else
5254             {
5255                 /* This is not a root process, we only need to receive cell_f */
5256                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5257             }
5258         }
5259         if (dd->ci[dim] == dd->master_ci[dim])
5260         {
5261             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5262         }
5263     }
5264     sfree(rank);
5265 }
5266 #endif
5267
5268 static void make_load_communicators(gmx_domdec_t *dd)
5269 {
5270 #ifdef GMX_MPI
5271   MPI_Group g_all;
5272   int  dim0,dim1,i,j;
5273   ivec loc;
5274
5275   if (debug)
5276     fprintf(debug,"Making load communicators\n");
5277
5278   MPI_Comm_group(dd->mpi_comm_all,&g_all);
5279   
5280   snew(dd->comm->load,dd->ndim);
5281   snew(dd->comm->mpi_comm_load,dd->ndim);
5282   
5283   clear_ivec(loc);
5284   make_load_communicator(dd,g_all,0,loc);
5285   if (dd->ndim > 1) {
5286     dim0 = dd->dim[0];
5287     for(i=0; i<dd->nc[dim0]; i++) {
5288       loc[dim0] = i;
5289       make_load_communicator(dd,g_all,1,loc);
5290     }
5291   }
5292   if (dd->ndim > 2) {
5293     dim0 = dd->dim[0];
5294     for(i=0; i<dd->nc[dim0]; i++) {
5295       loc[dim0] = i;
5296       dim1 = dd->dim[1];
5297       for(j=0; j<dd->nc[dim1]; j++) {
5298           loc[dim1] = j;
5299           make_load_communicator(dd,g_all,2,loc);
5300       }
5301     }
5302   }
5303
5304   MPI_Group_free(&g_all);
5305
5306   if (debug)
5307     fprintf(debug,"Finished making load communicators\n");
5308 #endif
5309 }
5310
5311 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5312 {
5313     gmx_bool bZYX;
5314     int  d,dim,i,j,m;
5315     ivec tmp,s;
5316     int  nzone,nzonep;
5317     ivec dd_zp[DD_MAXIZONE];
5318     gmx_domdec_zones_t *zones;
5319     gmx_domdec_ns_ranges_t *izone;
5320     
5321     for(d=0; d<dd->ndim; d++)
5322     {
5323         dim = dd->dim[d];
5324         copy_ivec(dd->ci,tmp);
5325         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5326         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5327         copy_ivec(dd->ci,tmp);
5328         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5329         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5330         if (debug)
5331         {
5332             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5333                     dd->rank,dim,
5334                     dd->neighbor[d][0],
5335                     dd->neighbor[d][1]);
5336         }
5337     }
5338     
5339     if (DDMASTER(dd))
5340     {
5341         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5342             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5343     }
5344     if (fplog)
5345     {
5346         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5347                 dd->ndim,
5348                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5349                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5350     }
5351     switch (dd->ndim)
5352     {
5353     case 3:
5354         nzone  = dd_z3n;
5355         nzonep = dd_zp3n;
5356         for(i=0; i<nzonep; i++)
5357         {
5358             copy_ivec(dd_zp3[i],dd_zp[i]);
5359         }
5360         break;
5361     case 2:
5362         nzone  = dd_z2n;
5363         nzonep = dd_zp2n;
5364         for(i=0; i<nzonep; i++)
5365         {
5366             copy_ivec(dd_zp2[i],dd_zp[i]);
5367         }
5368         break;
5369     case 1:
5370         nzone  = dd_z1n;
5371         nzonep = dd_zp1n;
5372         for(i=0; i<nzonep; i++)
5373         {
5374             copy_ivec(dd_zp1[i],dd_zp[i]);
5375         }
5376         break;
5377     default:
5378         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5379         nzone = 0;
5380         nzonep = 0;
5381     }
5382
5383     zones = &dd->comm->zones;
5384
5385     for(i=0; i<nzone; i++)
5386     {
5387         m = 0;
5388         clear_ivec(zones->shift[i]);
5389         for(d=0; d<dd->ndim; d++)
5390         {
5391             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5392         }
5393     }
5394     
5395     zones->n = nzone;
5396     for(i=0; i<nzone; i++)
5397     {
5398         for(d=0; d<DIM; d++)
5399         {
5400             s[d] = dd->ci[d] - zones->shift[i][d];
5401             if (s[d] < 0)
5402             {
5403                 s[d] += dd->nc[d];
5404             }
5405             else if (s[d] >= dd->nc[d])
5406             {
5407                 s[d] -= dd->nc[d];
5408             }
5409         }
5410     }
5411     zones->nizone = nzonep;
5412     for(i=0; i<zones->nizone; i++)
5413     {
5414         if (dd_zp[i][0] != i)
5415         {
5416             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5417         }
5418         izone = &zones->izone[i];
5419         izone->j0 = dd_zp[i][1];
5420         izone->j1 = dd_zp[i][2];
5421         for(dim=0; dim<DIM; dim++)
5422         {
5423             if (dd->nc[dim] == 1)
5424             {
5425                 /* All shifts should be allowed */
5426                 izone->shift0[dim] = -1;
5427                 izone->shift1[dim] = 1;
5428             }
5429             else
5430             {
5431                 /*
5432                   izone->shift0[d] = 0;
5433                   izone->shift1[d] = 0;
5434                   for(j=izone->j0; j<izone->j1; j++) {
5435                   if (dd->shift[j][d] > dd->shift[i][d])
5436                   izone->shift0[d] = -1;
5437                   if (dd->shift[j][d] < dd->shift[i][d])
5438                   izone->shift1[d] = 1;
5439                   }
5440                 */
5441                 
5442                 int shift_diff;
5443                 
5444                 /* Assume the shift are not more than 1 cell */
5445                 izone->shift0[dim] = 1;
5446                 izone->shift1[dim] = -1;
5447                 for(j=izone->j0; j<izone->j1; j++)
5448                 {
5449                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5450                     if (shift_diff < izone->shift0[dim])
5451                     {
5452                         izone->shift0[dim] = shift_diff;
5453                     }
5454                     if (shift_diff > izone->shift1[dim])
5455                     {
5456                         izone->shift1[dim] = shift_diff;
5457                     }
5458                 }
5459             }
5460         }
5461     }
5462     
5463     if (dd->comm->eDLB != edlbNO)
5464     {
5465         snew(dd->comm->root,dd->ndim);
5466     }
5467     
5468     if (dd->comm->bRecordLoad)
5469     {
5470         make_load_communicators(dd);
5471     }
5472 }
5473
5474 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5475 {
5476     gmx_domdec_t *dd;
5477     gmx_domdec_comm_t *comm;
5478     int  i,rank,*buf;
5479     ivec periods;
5480 #ifdef GMX_MPI
5481     MPI_Comm comm_cart;
5482 #endif
5483     
5484     dd = cr->dd;
5485     comm = dd->comm;
5486     
5487 #ifdef GMX_MPI
5488     if (comm->bCartesianPP)
5489     {
5490         /* Set up cartesian communication for the particle-particle part */
5491         if (fplog)
5492         {
5493             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5494                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5495         }
5496         
5497         for(i=0; i<DIM; i++)
5498         {
5499             periods[i] = TRUE;
5500         }
5501         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5502                         &comm_cart);
5503         /* We overwrite the old communicator with the new cartesian one */
5504         cr->mpi_comm_mygroup = comm_cart;
5505     }
5506     
5507     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5508     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5509     
5510     if (comm->bCartesianPP_PME)
5511     {
5512         /* Since we want to use the original cartesian setup for sim,
5513          * and not the one after split, we need to make an index.
5514          */
5515         snew(comm->ddindex2ddnodeid,dd->nnodes);
5516         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5517         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5518         /* Get the rank of the DD master,
5519          * above we made sure that the master node is a PP node.
5520          */
5521         if (MASTER(cr))
5522         {
5523             rank = dd->rank;
5524         }
5525         else
5526         {
5527             rank = 0;
5528         }
5529         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5530     }
5531     else if (comm->bCartesianPP)
5532     {
5533         if (cr->npmenodes == 0)
5534         {
5535             /* The PP communicator is also
5536              * the communicator for this simulation
5537              */
5538             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5539         }
5540         cr->nodeid = dd->rank;
5541         
5542         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5543         
5544         /* We need to make an index to go from the coordinates
5545          * to the nodeid of this simulation.
5546          */
5547         snew(comm->ddindex2simnodeid,dd->nnodes);
5548         snew(buf,dd->nnodes);
5549         if (cr->duty & DUTY_PP)
5550         {
5551             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5552         }
5553         /* Communicate the ddindex to simulation nodeid index */
5554         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5555                       cr->mpi_comm_mysim);
5556         sfree(buf);
5557         
5558         /* Determine the master coordinates and rank.
5559          * The DD master should be the same node as the master of this sim.
5560          */
5561         for(i=0; i<dd->nnodes; i++)
5562         {
5563             if (comm->ddindex2simnodeid[i] == 0)
5564             {
5565                 ddindex2xyz(dd->nc,i,dd->master_ci);
5566                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5567             }
5568         }
5569         if (debug)
5570         {
5571             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5572         }
5573     }
5574     else
5575     {
5576         /* No Cartesian communicators */
5577         /* We use the rank in dd->comm->all as DD index */
5578         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5579         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5580         dd->masterrank = 0;
5581         clear_ivec(dd->master_ci);
5582     }
5583 #endif
5584   
5585     if (fplog)
5586     {
5587         fprintf(fplog,
5588                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5589                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5590     }
5591     if (debug)
5592     {
5593         fprintf(debug,
5594                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5595                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5596     }
5597 }
5598
5599 static void receive_ddindex2simnodeid(t_commrec *cr)
5600 {
5601     gmx_domdec_t *dd;
5602     
5603     gmx_domdec_comm_t *comm;
5604     int  *buf;
5605     
5606     dd = cr->dd;
5607     comm = dd->comm;
5608     
5609 #ifdef GMX_MPI
5610     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5611     {
5612         snew(comm->ddindex2simnodeid,dd->nnodes);
5613         snew(buf,dd->nnodes);
5614         if (cr->duty & DUTY_PP)
5615         {
5616             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5617         }
5618 #ifdef GMX_MPI
5619         /* Communicate the ddindex to simulation nodeid index */
5620         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5621                       cr->mpi_comm_mysim);
5622 #endif
5623         sfree(buf);
5624     }
5625 #endif
5626 }
5627
5628 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5629                                                      int ncg,int natoms)
5630 {
5631     gmx_domdec_master_t *ma;
5632     int i;
5633
5634     snew(ma,1);
5635     
5636     snew(ma->ncg,dd->nnodes);
5637     snew(ma->index,dd->nnodes+1);
5638     snew(ma->cg,ncg);
5639     snew(ma->nat,dd->nnodes);
5640     snew(ma->ibuf,dd->nnodes*2);
5641     snew(ma->cell_x,DIM);
5642     for(i=0; i<DIM; i++)
5643     {
5644         snew(ma->cell_x[i],dd->nc[i]+1);
5645     }
5646
5647     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5648     {
5649         ma->vbuf = NULL;
5650     }
5651     else
5652     {
5653         snew(ma->vbuf,natoms);
5654     }
5655
5656     return ma;
5657 }
5658
5659 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5660                                int reorder)
5661 {
5662     gmx_domdec_t *dd;
5663     gmx_domdec_comm_t *comm;
5664     int  i,rank;
5665     gmx_bool bDiv[DIM];
5666     ivec periods;
5667 #ifdef GMX_MPI
5668     MPI_Comm comm_cart;
5669 #endif
5670     
5671     dd = cr->dd;
5672     comm = dd->comm;
5673     
5674     if (comm->bCartesianPP)
5675     {
5676         for(i=1; i<DIM; i++)
5677         {
5678             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5679         }
5680         if (bDiv[YY] || bDiv[ZZ])
5681         {
5682             comm->bCartesianPP_PME = TRUE;
5683             /* If we have 2D PME decomposition, which is always in x+y,
5684              * we stack the PME only nodes in z.
5685              * Otherwise we choose the direction that provides the thinnest slab
5686              * of PME only nodes as this will have the least effect
5687              * on the PP communication.
5688              * But for the PME communication the opposite might be better.
5689              */
5690             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5691                              !bDiv[YY] ||
5692                              dd->nc[YY] > dd->nc[ZZ]))
5693             {
5694                 comm->cartpmedim = ZZ;
5695             }
5696             else
5697             {
5698                 comm->cartpmedim = YY;
5699             }
5700             comm->ntot[comm->cartpmedim]
5701                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5702         }
5703         else if (fplog)
5704         {
5705             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5706             fprintf(fplog,
5707                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5708         }
5709     }
5710     
5711 #ifdef GMX_MPI
5712     if (comm->bCartesianPP_PME)
5713     {
5714         if (fplog)
5715         {
5716             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5717         }
5718         
5719         for(i=0; i<DIM; i++)
5720         {
5721             periods[i] = TRUE;
5722         }
5723         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5724                         &comm_cart);
5725         
5726         MPI_Comm_rank(comm_cart,&rank);
5727         if (MASTERNODE(cr) && rank != 0)
5728         {
5729             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5730         }
5731         
5732         /* With this assigment we loose the link to the original communicator
5733          * which will usually be MPI_COMM_WORLD, unless have multisim.
5734          */
5735         cr->mpi_comm_mysim = comm_cart;
5736         cr->sim_nodeid = rank;
5737         
5738         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5739         
5740         if (fplog)
5741         {
5742             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5743                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5744         }
5745         
5746         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5747         {
5748             cr->duty = DUTY_PP;
5749         }
5750         if (cr->npmenodes == 0 ||
5751             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5752         {
5753             cr->duty = DUTY_PME;
5754         }
5755         
5756         /* Split the sim communicator into PP and PME only nodes */
5757         MPI_Comm_split(cr->mpi_comm_mysim,
5758                        cr->duty,
5759                        dd_index(comm->ntot,dd->ci),
5760                        &cr->mpi_comm_mygroup);
5761     }
5762     else
5763     {
5764         switch (dd_node_order)
5765         {
5766         case ddnoPP_PME:
5767             if (fplog)
5768             {
5769                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5770             }
5771             break;
5772         case ddnoINTERLEAVE:
5773             /* Interleave the PP-only and PME-only nodes,
5774              * as on clusters with dual-core machines this will double
5775              * the communication bandwidth of the PME processes
5776              * and thus speed up the PP <-> PME and inter PME communication.
5777              */
5778             if (fplog)
5779             {
5780                 fprintf(fplog,"Interleaving PP and PME nodes\n");
5781             }
5782             comm->pmenodes = dd_pmenodes(cr);
5783             break;
5784         case ddnoCARTESIAN:
5785             break;
5786         default:
5787             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
5788         }
5789     
5790         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
5791         {
5792             cr->duty = DUTY_PME;
5793         }
5794         else
5795         {
5796             cr->duty = DUTY_PP;
5797         }
5798         
5799         /* Split the sim communicator into PP and PME only nodes */
5800         MPI_Comm_split(cr->mpi_comm_mysim,
5801                        cr->duty,
5802                        cr->nodeid,
5803                        &cr->mpi_comm_mygroup);
5804         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
5805     }
5806 #endif
5807
5808     if (fplog)
5809     {
5810         fprintf(fplog,"This is a %s only node\n\n",
5811                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5812     }
5813 }
5814
5815 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
5816 {
5817     gmx_domdec_t *dd;
5818     gmx_domdec_comm_t *comm;
5819     int CartReorder;
5820     
5821     dd = cr->dd;
5822     comm = dd->comm;
5823     
5824     copy_ivec(dd->nc,comm->ntot);
5825     
5826     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
5827     comm->bCartesianPP_PME = FALSE;
5828     
5829     /* Reorder the nodes by default. This might change the MPI ranks.
5830      * Real reordering is only supported on very few architectures,
5831      * Blue Gene is one of them.
5832      */
5833     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
5834     
5835     if (cr->npmenodes > 0)
5836     {
5837         /* Split the communicator into a PP and PME part */
5838         split_communicator(fplog,cr,dd_node_order,CartReorder);
5839         if (comm->bCartesianPP_PME)
5840         {
5841             /* We (possibly) reordered the nodes in split_communicator,
5842              * so it is no longer required in make_pp_communicator.
5843              */
5844             CartReorder = FALSE;
5845         }
5846     }
5847     else
5848     {
5849         /* All nodes do PP and PME */
5850 #ifdef GMX_MPI    
5851         /* We do not require separate communicators */
5852         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5853 #endif
5854     }
5855     
5856     if (cr->duty & DUTY_PP)
5857     {
5858         /* Copy or make a new PP communicator */
5859         make_pp_communicator(fplog,cr,CartReorder);
5860     }
5861     else
5862     {
5863         receive_ddindex2simnodeid(cr);
5864     }
5865     
5866     if (!(cr->duty & DUTY_PME))
5867     {
5868         /* Set up the commnuication to our PME node */
5869         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
5870         dd->pme_receive_vir_ener = receive_vir_ener(cr);
5871         if (debug)
5872         {
5873             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
5874                     dd->pme_nodeid,dd->pme_receive_vir_ener);
5875         }
5876     }
5877     else
5878     {
5879         dd->pme_nodeid = -1;
5880     }
5881
5882     if (DDMASTER(dd))
5883     {
5884         dd->ma = init_gmx_domdec_master_t(dd,
5885                                           comm->cgs_gl.nr,
5886                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
5887     }
5888 }
5889
5890 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
5891 {
5892     real *slb_frac,tot;
5893     int  i,n;
5894     double dbl;
5895     
5896     slb_frac = NULL;
5897     if (nc > 1 && size_string != NULL)
5898     {
5899         if (fplog)
5900         {
5901             fprintf(fplog,"Using static load balancing for the %s direction\n",
5902                     dir);
5903         }
5904         snew(slb_frac,nc);
5905         tot = 0;
5906         for (i=0; i<nc; i++)
5907         {
5908             dbl = 0;
5909             sscanf(size_string,"%lf%n",&dbl,&n);
5910             if (dbl == 0)
5911             {
5912                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
5913             }
5914             slb_frac[i] = dbl;
5915             size_string += n;
5916             tot += slb_frac[i];
5917         }
5918         /* Normalize */
5919         if (fplog)
5920         {
5921             fprintf(fplog,"Relative cell sizes:");
5922         }
5923         for (i=0; i<nc; i++)
5924         {
5925             slb_frac[i] /= tot;
5926             if (fplog)
5927             {
5928                 fprintf(fplog," %5.3f",slb_frac[i]);
5929             }
5930         }
5931         if (fplog)
5932         {
5933             fprintf(fplog,"\n");
5934         }
5935     }
5936     
5937     return slb_frac;
5938 }
5939
5940 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
5941 {
5942     int n,nmol,ftype;
5943     gmx_mtop_ilistloop_t iloop;
5944     t_ilist *il;
5945     
5946     n = 0;
5947     iloop = gmx_mtop_ilistloop_init(mtop);
5948     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
5949     {
5950         for(ftype=0; ftype<F_NRE; ftype++)
5951         {
5952             if ((interaction_function[ftype].flags & IF_BOND) &&
5953                 NRAL(ftype) >  2)
5954             {
5955                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
5956             }
5957         }
5958   }
5959
5960   return n;
5961 }
5962
5963 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
5964 {
5965     char *val;
5966     int  nst;
5967     
5968     nst = def;
5969     val = getenv(env_var);
5970     if (val)
5971     {
5972         if (sscanf(val,"%d",&nst) <= 0)
5973         {
5974             nst = 1;
5975         }
5976         if (fplog)
5977         {
5978             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
5979                     env_var,val,nst);
5980         }
5981     }
5982     
5983     return nst;
5984 }
5985
5986 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
5987 {
5988     if (MASTER(cr))
5989     {
5990         fprintf(stderr,"\n%s\n",warn_string);
5991     }
5992     if (fplog)
5993     {
5994         fprintf(fplog,"\n%s\n",warn_string);
5995     }
5996 }
5997
5998 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
5999                                   t_inputrec *ir,FILE *fplog)
6000 {
6001     if (ir->ePBC == epbcSCREW &&
6002         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6003     {
6004         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6005     }
6006
6007     if (ir->ns_type == ensSIMPLE)
6008     {
6009         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6010     }
6011
6012     if (ir->nstlist == 0)
6013     {
6014         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6015     }
6016
6017     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6018     {
6019         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6020     }
6021 }
6022
6023 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6024 {
6025     int  di,d;
6026     real r;
6027
6028     r = ddbox->box_size[XX];
6029     for(di=0; di<dd->ndim; di++)
6030     {
6031         d = dd->dim[di];
6032         /* Check using the initial average cell size */
6033         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6034     }
6035
6036     return r;
6037 }
6038
6039 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6040                              const char *dlb_opt,gmx_bool bRecordLoad,
6041                              unsigned long Flags,t_inputrec *ir)
6042 {
6043     gmx_domdec_t *dd;
6044     int  eDLB=-1;
6045     char buf[STRLEN];
6046
6047     switch (dlb_opt[0])
6048     {
6049     case 'a': eDLB = edlbAUTO; break;
6050     case 'n': eDLB = edlbNO;   break;
6051     case 'y': eDLB = edlbYES;  break;
6052     default: gmx_incons("Unknown dlb_opt");
6053     }
6054
6055     if (Flags & MD_RERUN)
6056     {
6057         return edlbNO;
6058     }
6059
6060     if (!EI_DYNAMICS(ir->eI))
6061     {
6062         if (eDLB == edlbYES)
6063         {
6064             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6065             dd_warning(cr,fplog,buf);
6066         }
6067             
6068         return edlbNO;
6069     }
6070
6071     if (!bRecordLoad)
6072     {
6073         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6074
6075         return edlbNO;
6076     }
6077
6078     if (Flags & MD_REPRODUCIBLE)
6079     {
6080         switch (eDLB)
6081         {
6082                         case edlbNO: 
6083                                 break;
6084                         case edlbAUTO:
6085                                 dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6086                                 eDLB = edlbNO;
6087                                 break;
6088                         case edlbYES:
6089                                 dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6090                                 break;
6091                         default:
6092                                 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6093                                 break;
6094         }
6095     }
6096
6097     return eDLB;
6098 }
6099
6100 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6101 {
6102     int dim;
6103
6104     dd->ndim = 0;
6105     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6106     {
6107         /* Decomposition order z,y,x */
6108         if (fplog)
6109         {
6110             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6111         }
6112         for(dim=DIM-1; dim>=0; dim--)
6113         {
6114             if (dd->nc[dim] > 1)
6115             {
6116                 dd->dim[dd->ndim++] = dim;
6117             }
6118         }
6119     }
6120     else
6121     {
6122         /* Decomposition order x,y,z */
6123         for(dim=0; dim<DIM; dim++)
6124         {
6125             if (dd->nc[dim] > 1)
6126             {
6127                 dd->dim[dd->ndim++] = dim;
6128             }
6129         }
6130     }
6131 }
6132
6133 static gmx_domdec_comm_t *init_dd_comm()
6134 {
6135     gmx_domdec_comm_t *comm;
6136     int  i;
6137
6138     snew(comm,1);
6139     snew(comm->cggl_flag,DIM*2);
6140     snew(comm->cgcm_state,DIM*2);
6141     for(i=0; i<DIM*2; i++)
6142     {
6143         comm->cggl_flag_nalloc[i]  = 0;
6144         comm->cgcm_state_nalloc[i] = 0;
6145     }
6146     
6147     comm->nalloc_int = 0;
6148     comm->buf_int    = NULL;
6149
6150     vec_rvec_init(&comm->vbuf);
6151
6152     comm->n_load_have    = 0;
6153     comm->n_load_collect = 0;
6154
6155     for(i=0; i<ddnatNR-ddnatZONE; i++)
6156     {
6157         comm->sum_nat[i] = 0;
6158     }
6159     comm->ndecomp = 0;
6160     comm->nload   = 0;
6161     comm->load_step = 0;
6162     comm->load_sum  = 0;
6163     comm->load_max  = 0;
6164     clear_ivec(comm->load_lim);
6165     comm->load_mdf  = 0;
6166     comm->load_pme  = 0;
6167
6168     return comm;
6169 }
6170
6171 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6172                                         unsigned long Flags,
6173                                         ivec nc,
6174                                         real comm_distance_min,real rconstr,
6175                                         const char *dlb_opt,real dlb_scale,
6176                                         const char *sizex,const char *sizey,const char *sizez,
6177                                         gmx_mtop_t *mtop,t_inputrec *ir,
6178                                         matrix box,rvec *x,
6179                                         gmx_ddbox_t *ddbox,
6180                                         int *npme_x,int *npme_y)
6181 {
6182     gmx_domdec_t *dd;
6183     gmx_domdec_comm_t *comm;
6184     int  recload;
6185     int  d,i,j;
6186     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6187     gmx_bool bC;
6188     char buf[STRLEN];
6189     
6190     if (fplog)
6191     {
6192         fprintf(fplog,
6193                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6194     }
6195     
6196     snew(dd,1);
6197
6198     dd->comm = init_dd_comm();
6199     comm = dd->comm;
6200     snew(comm->cggl_flag,DIM*2);
6201     snew(comm->cgcm_state,DIM*2);
6202
6203     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6204     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6205     
6206     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6207     comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6208     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6209     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6210     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6211     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6212     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6213     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6214
6215     dd->pme_recv_f_alloc = 0;
6216     dd->pme_recv_f_buf = NULL;
6217
6218     if (dd->bSendRecv2 && fplog)
6219     {
6220         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6221     }
6222     if (comm->eFlop)
6223     {
6224         if (fplog)
6225         {
6226             fprintf(fplog,"Will load balance based on FLOP count\n");
6227         }
6228         if (comm->eFlop > 1)
6229         {
6230             srand(1+cr->nodeid);
6231         }
6232         comm->bRecordLoad = TRUE;
6233     }
6234     else
6235     {
6236         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6237                              
6238     }
6239     
6240     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6241     
6242     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6243     if (fplog)
6244     {
6245         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6246     }
6247     dd->bGridJump = comm->bDynLoadBal;
6248     
6249     if (comm->nstSortCG)
6250     {
6251         if (fplog)
6252         {
6253             if (comm->nstSortCG == 1)
6254             {
6255                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6256             }
6257             else
6258             {
6259                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6260                         comm->nstSortCG);
6261             }
6262         }
6263         snew(comm->sort,1);
6264     }
6265     else
6266     {
6267         if (fplog)
6268         {
6269             fprintf(fplog,"Will not sort the charge groups\n");
6270         }
6271     }
6272     
6273     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6274     if (comm->bInterCGBondeds)
6275     {
6276         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6277     }
6278     else
6279     {
6280         comm->bInterCGMultiBody = FALSE;
6281     }
6282     
6283     dd->bInterCGcons = inter_charge_group_constraints(mtop);
6284
6285     if (ir->rlistlong == 0)
6286     {
6287         /* Set the cut-off to some very large value,
6288          * so we don't need if statements everywhere in the code.
6289          * We use sqrt, since the cut-off is squared in some places.
6290          */
6291         comm->cutoff   = GMX_CUTOFF_INF;
6292     }
6293     else
6294     {
6295         comm->cutoff   = ir->rlistlong;
6296     }
6297     comm->cutoff_mbody = 0;
6298     
6299     comm->cellsize_limit = 0;
6300     comm->bBondComm = FALSE;
6301
6302     if (comm->bInterCGBondeds)
6303     {
6304         if (comm_distance_min > 0)
6305         {
6306             comm->cutoff_mbody = comm_distance_min;
6307             if (Flags & MD_DDBONDCOMM)
6308             {
6309                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6310             }
6311             else
6312             {
6313                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6314             }
6315             r_bonded_limit = comm->cutoff_mbody;
6316         }
6317         else if (ir->bPeriodicMols)
6318         {
6319             /* Can not easily determine the required cut-off */
6320             dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6321             comm->cutoff_mbody = comm->cutoff/2;
6322             r_bonded_limit = comm->cutoff_mbody;
6323         }
6324         else
6325         {
6326             if (MASTER(cr))
6327             {
6328                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6329                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6330             }
6331             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6332             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6333
6334             /* We use an initial margin of 10% for the minimum cell size,
6335              * except when we are just below the non-bonded cut-off.
6336              */
6337             if (Flags & MD_DDBONDCOMM)
6338             {
6339                 if (max(r_2b,r_mb) > comm->cutoff)
6340                 {
6341                     r_bonded       = max(r_2b,r_mb);
6342                     r_bonded_limit = 1.1*r_bonded;
6343                     comm->bBondComm = TRUE;
6344                 }
6345                 else
6346                 {
6347                     r_bonded       = r_mb;
6348                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6349                 }
6350                 /* We determine cutoff_mbody later */
6351             }
6352             else
6353             {
6354                 /* No special bonded communication,
6355                  * simply increase the DD cut-off.
6356                  */
6357                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6358                 comm->cutoff_mbody = r_bonded_limit;
6359                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6360             }
6361         }
6362         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6363         if (fplog)
6364         {
6365             fprintf(fplog,
6366                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6367                     comm->cellsize_limit);
6368         }
6369     }
6370
6371     if (dd->bInterCGcons && rconstr <= 0)
6372     {
6373         /* There is a cell size limit due to the constraints (P-LINCS) */
6374         rconstr = constr_r_max(fplog,mtop,ir);
6375         if (fplog)
6376         {
6377             fprintf(fplog,
6378                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6379                     rconstr);
6380             if (rconstr > comm->cellsize_limit)
6381             {
6382                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6383             }
6384         }
6385     }
6386     else if (rconstr > 0 && fplog)
6387     {
6388         /* Here we do not check for dd->bInterCGcons,
6389          * because one can also set a cell size limit for virtual sites only
6390          * and at this point we don't know yet if there are intercg v-sites.
6391          */
6392         fprintf(fplog,
6393                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6394                 rconstr);
6395     }
6396     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6397
6398     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6399
6400     if (nc[XX] > 0)
6401     {
6402         copy_ivec(nc,dd->nc);
6403         set_dd_dim(fplog,dd);
6404         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6405
6406         if (cr->npmenodes == -1)
6407         {
6408             cr->npmenodes = 0;
6409         }
6410         acs = average_cellsize_min(dd,ddbox);
6411         if (acs < comm->cellsize_limit)
6412         {
6413             if (fplog)
6414             {
6415                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6416             }
6417             gmx_fatal_collective(FARGS,cr,NULL,
6418                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6419                                  acs,comm->cellsize_limit);
6420         }
6421     }
6422     else
6423     {
6424         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6425
6426         /* We need to choose the optimal DD grid and possibly PME nodes */
6427         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6428                                comm->eDLB!=edlbNO,dlb_scale,
6429                                comm->cellsize_limit,comm->cutoff,
6430                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6431         
6432         if (dd->nc[XX] == 0)
6433         {
6434             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6435             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6436                     !bC ? "-rdd" : "-rcon",
6437                     comm->eDLB!=edlbNO ? " or -dds" : "",
6438                     bC ? " or your LINCS settings" : "");
6439
6440             gmx_fatal_collective(FARGS,cr,NULL,
6441                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6442                                  "%s\n"
6443                                  "Look in the log file for details on the domain decomposition",
6444                                  cr->nnodes-cr->npmenodes,limit,buf);
6445         }
6446         set_dd_dim(fplog,dd);
6447     }
6448
6449     if (fplog)
6450     {
6451         fprintf(fplog,
6452                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6453                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6454     }
6455     
6456     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6457     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6458     {
6459         gmx_fatal_collective(FARGS,cr,NULL,
6460                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6461                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6462     }
6463     if (cr->npmenodes > dd->nnodes)
6464     {
6465         gmx_fatal_collective(FARGS,cr,NULL,
6466                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6467     }
6468     if (cr->npmenodes > 0)
6469     {
6470         comm->npmenodes = cr->npmenodes;
6471     }
6472     else
6473     {
6474         comm->npmenodes = dd->nnodes;
6475     }
6476
6477     if (EEL_PME(ir->coulombtype))
6478     {
6479         /* The following choices should match those
6480          * in comm_cost_est in domdec_setup.c.
6481          * Note that here the checks have to take into account
6482          * that the decomposition might occur in a different order than xyz
6483          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6484          * in which case they will not match those in comm_cost_est,
6485          * but since that is mainly for testing purposes that's fine.
6486          */
6487         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6488             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6489             getenv("GMX_PMEONEDD") == NULL)
6490         {
6491             comm->npmedecompdim = 2;
6492             comm->npmenodes_x   = dd->nc[XX];
6493             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6494         }
6495         else
6496         {
6497             /* In case nc is 1 in both x and y we could still choose to
6498              * decompose pme in y instead of x, but we use x for simplicity.
6499              */
6500             comm->npmedecompdim = 1;
6501             if (dd->dim[0] == YY)
6502             {
6503                 comm->npmenodes_x = 1;
6504                 comm->npmenodes_y = comm->npmenodes;
6505             }
6506             else
6507             {
6508                 comm->npmenodes_x = comm->npmenodes;
6509                 comm->npmenodes_y = 1;
6510             }
6511         }    
6512         if (fplog)
6513         {
6514             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6515                     comm->npmenodes_x,comm->npmenodes_y,1);
6516         }
6517     }
6518     else
6519     {
6520         comm->npmedecompdim = 0;
6521         comm->npmenodes_x   = 0;
6522         comm->npmenodes_y   = 0;
6523     }
6524     
6525     /* Technically we don't need both of these,
6526      * but it simplifies code not having to recalculate it.
6527      */
6528     *npme_x = comm->npmenodes_x;
6529     *npme_y = comm->npmenodes_y;
6530         
6531     snew(comm->slb_frac,DIM);
6532     if (comm->eDLB == edlbNO)
6533     {
6534         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6535         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6536         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6537     }
6538
6539     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6540     {
6541         if (comm->bBondComm || comm->eDLB != edlbNO)
6542         {
6543             /* Set the bonded communication distance to halfway
6544              * the minimum and the maximum,
6545              * since the extra communication cost is nearly zero.
6546              */
6547             acs = average_cellsize_min(dd,ddbox);
6548             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6549             if (comm->eDLB != edlbNO)
6550             {
6551                 /* Check if this does not limit the scaling */
6552                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6553             }
6554             if (!comm->bBondComm)
6555             {
6556                 /* Without bBondComm do not go beyond the n.b. cut-off */
6557                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6558                 if (comm->cellsize_limit >= comm->cutoff)
6559                 {
6560                     /* We don't loose a lot of efficieny
6561                      * when increasing it to the n.b. cut-off.
6562                      * It can even be slightly faster, because we need
6563                      * less checks for the communication setup.
6564                      */
6565                     comm->cutoff_mbody = comm->cutoff;
6566                 }
6567             }
6568             /* Check if we did not end up below our original limit */
6569             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6570
6571             if (comm->cutoff_mbody > comm->cellsize_limit)
6572             {
6573                 comm->cellsize_limit = comm->cutoff_mbody;
6574             }
6575         }
6576         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6577     }
6578
6579     if (debug)
6580     {
6581         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6582                 "cellsize limit %f\n",
6583                 comm->bBondComm,comm->cellsize_limit);
6584     }
6585     
6586     if (MASTER(cr))
6587     {
6588         check_dd_restrictions(cr,dd,ir,fplog);
6589     }
6590
6591     comm->globalcomm_step = INT_MIN;
6592     dd->ddp_count = 0;
6593
6594     clear_dd_cycle_counts(dd);
6595
6596     return dd;
6597 }
6598
6599 static void set_dlb_limits(gmx_domdec_t *dd)
6600
6601 {
6602     int d;
6603
6604     for(d=0; d<dd->ndim; d++)
6605     {
6606         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6607         dd->comm->cellsize_min[dd->dim[d]] =
6608             dd->comm->cellsize_min_dlb[dd->dim[d]];
6609     }
6610 }
6611
6612
6613 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6614 {
6615     gmx_domdec_t *dd;
6616     gmx_domdec_comm_t *comm;
6617     real cellsize_min;
6618     int  d,nc,i;
6619     char buf[STRLEN];
6620     
6621     dd = cr->dd;
6622     comm = dd->comm;
6623     
6624     if (fplog)
6625     {
6626         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6627     }
6628
6629     cellsize_min = comm->cellsize_min[dd->dim[0]];
6630     for(d=1; d<dd->ndim; d++)
6631     {
6632         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6633     }
6634
6635     if (cellsize_min < comm->cellsize_limit*1.05)
6636     {
6637         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6638
6639         /* Change DLB from "auto" to "no". */
6640         comm->eDLB = edlbNO;
6641
6642         return;
6643     }
6644
6645     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6646     comm->bDynLoadBal = TRUE;
6647     dd->bGridJump = TRUE;
6648     
6649     set_dlb_limits(dd);
6650
6651     /* We can set the required cell size info here,
6652      * so we do not need to communicate this.
6653      * The grid is completely uniform.
6654      */
6655     for(d=0; d<dd->ndim; d++)
6656     {
6657         if (comm->root[d])
6658         {
6659             comm->load[d].sum_m = comm->load[d].sum;
6660
6661             nc = dd->nc[dd->dim[d]];
6662             for(i=0; i<nc; i++)
6663             {
6664                 comm->root[d]->cell_f[i]    = i/(real)nc;
6665                 if (d > 0)
6666                 {
6667                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6668                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6669                 }
6670             }
6671             comm->root[d]->cell_f[nc] = 1.0;
6672         }
6673     }
6674 }
6675
6676 static char *init_bLocalCG(gmx_mtop_t *mtop)
6677 {
6678     int  ncg,cg;
6679     char *bLocalCG;
6680     
6681     ncg = ncg_mtop(mtop);
6682     snew(bLocalCG,ncg);
6683     for(cg=0; cg<ncg; cg++)
6684     {
6685         bLocalCG[cg] = FALSE;
6686     }
6687
6688     return bLocalCG;
6689 }
6690
6691 void dd_init_bondeds(FILE *fplog,
6692                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6693                      gmx_vsite_t *vsite,gmx_constr_t constr,
6694                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6695 {
6696     gmx_domdec_comm_t *comm;
6697     gmx_bool bBondComm;
6698     int  d;
6699
6700     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6701
6702     comm = dd->comm;
6703
6704     if (comm->bBondComm)
6705     {
6706         /* Communicate atoms beyond the cut-off for bonded interactions */
6707         comm = dd->comm;
6708
6709         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6710
6711         comm->bLocalCG = init_bLocalCG(mtop);
6712     }
6713     else
6714     {
6715         /* Only communicate atoms based on cut-off */
6716         comm->cglink   = NULL;
6717         comm->bLocalCG = NULL;
6718     }
6719 }
6720
6721 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6722                               t_inputrec *ir,
6723                               gmx_bool bDynLoadBal,real dlb_scale,
6724                               gmx_ddbox_t *ddbox)
6725 {
6726     gmx_domdec_comm_t *comm;
6727     int  d;
6728     ivec np;
6729     real limit,shrink;
6730     char buf[64];
6731
6732     if (fplog == NULL)
6733     {
6734         return;
6735     }
6736
6737     comm = dd->comm;
6738
6739     if (bDynLoadBal)
6740     {
6741         fprintf(fplog,"The maximum number of communication pulses is:");
6742         for(d=0; d<dd->ndim; d++)
6743         {
6744             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6745         }
6746         fprintf(fplog,"\n");
6747         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6748         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6749         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6750         for(d=0; d<DIM; d++)
6751         {
6752             if (dd->nc[d] > 1)
6753             {
6754                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6755                 {
6756                     shrink = 0;
6757                 }
6758                 else
6759                 {
6760                     shrink =
6761                         comm->cellsize_min_dlb[d]/
6762                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6763                 }
6764                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6765             }
6766         }
6767         fprintf(fplog,"\n");
6768     }
6769     else
6770     {
6771         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6772         fprintf(fplog,"The initial number of communication pulses is:");
6773         for(d=0; d<dd->ndim; d++)
6774         {
6775             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
6776         }
6777         fprintf(fplog,"\n");
6778         fprintf(fplog,"The initial domain decomposition cell size is:");
6779         for(d=0; d<DIM; d++) {
6780             if (dd->nc[d] > 1)
6781             {
6782                 fprintf(fplog," %c %.2f nm",
6783                         dim2char(d),dd->comm->cellsize_min[d]);
6784             }
6785         }
6786         fprintf(fplog,"\n\n");
6787     }
6788     
6789     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
6790     {
6791         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
6792         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6793                 "non-bonded interactions","",comm->cutoff);
6794
6795         if (bDynLoadBal)
6796         {
6797             limit = dd->comm->cellsize_limit;
6798         }
6799         else
6800         {
6801             if (dynamic_dd_box(ddbox,ir))
6802             {
6803                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
6804             }
6805             limit = dd->comm->cellsize_min[XX];
6806             for(d=1; d<DIM; d++)
6807             {
6808                 limit = min(limit,dd->comm->cellsize_min[d]);
6809             }
6810         }
6811
6812         if (comm->bInterCGBondeds)
6813         {
6814             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6815                     "two-body bonded interactions","(-rdd)",
6816                     max(comm->cutoff,comm->cutoff_mbody));
6817             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6818                     "multi-body bonded interactions","(-rdd)",
6819                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
6820         }
6821         if (dd->vsite_comm)
6822         {
6823             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6824                     "virtual site constructions","(-rcon)",limit);
6825         }
6826         if (dd->constraint_comm)
6827         {
6828             sprintf(buf,"atoms separated by up to %d constraints",
6829                     1+ir->nProjOrder);
6830             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6831                     buf,"(-rcon)",limit);
6832         }
6833         fprintf(fplog,"\n");
6834     }
6835     
6836     fflush(fplog);
6837 }
6838
6839 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
6840                        t_inputrec *ir,t_forcerec *fr,
6841                        gmx_ddbox_t *ddbox)
6842 {
6843     gmx_domdec_comm_t *comm;
6844     int  d,dim,npulse,npulse_d_max,npulse_d;
6845     gmx_bool bNoCutOff;
6846     int  natoms_tot;
6847     real vol_frac;
6848
6849     comm = dd->comm;
6850
6851     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6852
6853     if (EEL_PME(ir->coulombtype))
6854     {
6855         init_ddpme(dd,&comm->ddpme[0],0);
6856         if (comm->npmedecompdim >= 2)
6857         {
6858             init_ddpme(dd,&comm->ddpme[1],1);
6859         }
6860     }
6861     else
6862     {
6863         comm->npmenodes = 0;
6864         if (dd->pme_nodeid >= 0)
6865         {
6866             gmx_fatal_collective(FARGS,NULL,dd,
6867                                  "Can not have separate PME nodes without PME electrostatics");
6868         }
6869     }
6870     
6871     /* If each molecule is a single charge group
6872      * or we use domain decomposition for each periodic dimension,
6873      * we do not need to take pbc into account for the bonded interactions.
6874      */
6875     if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
6876         (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY)))
6877     {
6878         fr->bMolPBC = FALSE;
6879     }
6880     else
6881     {
6882         fr->bMolPBC = TRUE;
6883     }
6884         
6885     if (debug)
6886     {
6887         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
6888     }
6889     if (comm->eDLB != edlbNO)
6890     {
6891         /* Determine the maximum number of comm. pulses in one dimension */
6892         
6893         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6894         
6895         /* Determine the maximum required number of grid pulses */
6896         if (comm->cellsize_limit >= comm->cutoff)
6897         {
6898             /* Only a single pulse is required */
6899             npulse = 1;
6900         }
6901         else if (!bNoCutOff && comm->cellsize_limit > 0)
6902         {
6903             /* We round down slightly here to avoid overhead due to the latency
6904              * of extra communication calls when the cut-off
6905              * would be only slightly longer than the cell size.
6906              * Later cellsize_limit is redetermined,
6907              * so we can not miss interactions due to this rounding.
6908              */
6909             npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
6910         }
6911         else
6912         {
6913             /* There is no cell size limit */
6914             npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
6915         }
6916
6917         if (!bNoCutOff && npulse > 1)
6918         {
6919             /* See if we can do with less pulses, based on dlb_scale */
6920             npulse_d_max = 0;
6921             for(d=0; d<dd->ndim; d++)
6922             {
6923                 dim = dd->dim[d];
6924                 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6925                                  /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6926                 npulse_d_max = max(npulse_d_max,npulse_d);
6927             }
6928             npulse = min(npulse,npulse_d_max);
6929         }
6930         
6931         /* This env var can override npulse */
6932         d = dd_nst_env(fplog,"GMX_DD_NPULSE",0);
6933         if (d > 0)
6934         {
6935             npulse = d;
6936         }
6937
6938         comm->maxpulse = 1;
6939         comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
6940         for(d=0; d<dd->ndim; d++)
6941         {
6942             comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
6943             comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6944             snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
6945             comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
6946             if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6947             {
6948                 comm->bVacDLBNoLimit = FALSE;
6949             }
6950         }
6951         
6952         /* cellsize_limit is set for LINCS in init_domain_decomposition */
6953         if (!comm->bVacDLBNoLimit)
6954         {
6955             comm->cellsize_limit = max(comm->cellsize_limit,
6956                                        comm->cutoff/comm->maxpulse);
6957         }
6958         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6959         /* Set the minimum cell size for each DD dimension */
6960         for(d=0; d<dd->ndim; d++)
6961         {
6962             if (comm->bVacDLBNoLimit ||
6963                 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
6964             {
6965                 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
6966             }
6967             else
6968             {
6969                 comm->cellsize_min_dlb[dd->dim[d]] =
6970                     comm->cutoff/comm->cd[d].np_dlb;
6971             }
6972         }
6973         if (comm->cutoff_mbody <= 0)
6974         {
6975             comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
6976         }
6977         if (comm->bDynLoadBal)
6978         {
6979             set_dlb_limits(dd);
6980         }
6981     }
6982     
6983     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
6984     if (comm->eDLB == edlbAUTO)
6985     {
6986         if (fplog)
6987         {
6988             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
6989         }
6990         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
6991     }
6992
6993     if (ir->ePBC == epbcNONE)
6994     {
6995         vol_frac = 1 - 1/(double)dd->nnodes;
6996     }
6997     else
6998     {
6999         vol_frac =
7000             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7001     }
7002     if (debug)
7003     {
7004         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7005     }
7006     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7007    
7008     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7009 }
7010
7011 static void merge_cg_buffers(int ncell,
7012                              gmx_domdec_comm_dim_t *cd, int pulse,
7013                              int  *ncg_cell,
7014                              int  *index_gl, int  *recv_i,
7015                              rvec *cg_cm,    rvec *recv_vr,
7016                              int *cgindex,
7017                              cginfo_mb_t *cginfo_mb,int *cginfo)
7018 {
7019     gmx_domdec_ind_t *ind,*ind_p;
7020     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7021     int shift,shift_at;
7022     
7023     ind = &cd->ind[pulse];
7024     
7025     /* First correct the already stored data */
7026     shift = ind->nrecv[ncell];
7027     for(cell=ncell-1; cell>=0; cell--)
7028     {
7029         shift -= ind->nrecv[cell];
7030         if (shift > 0)
7031         {
7032             /* Move the cg's present from previous grid pulses */
7033             cg0 = ncg_cell[ncell+cell];
7034             cg1 = ncg_cell[ncell+cell+1];
7035             cgindex[cg1+shift] = cgindex[cg1];
7036             for(cg=cg1-1; cg>=cg0; cg--)
7037             {
7038                 index_gl[cg+shift] = index_gl[cg];
7039                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7040                 cgindex[cg+shift] = cgindex[cg];
7041                 cginfo[cg+shift] = cginfo[cg];
7042             }
7043             /* Correct the already stored send indices for the shift */
7044             for(p=1; p<=pulse; p++)
7045             {
7046                 ind_p = &cd->ind[p];
7047                 cg0 = 0;
7048                 for(c=0; c<cell; c++)
7049                 {
7050                     cg0 += ind_p->nsend[c];
7051                 }
7052                 cg1 = cg0 + ind_p->nsend[cell];
7053                 for(cg=cg0; cg<cg1; cg++)
7054                 {
7055                     ind_p->index[cg] += shift;
7056                 }
7057             }
7058         }
7059     }
7060
7061     /* Merge in the communicated buffers */
7062     shift = 0;
7063     shift_at = 0;
7064     cg0 = 0;
7065     for(cell=0; cell<ncell; cell++)
7066     {
7067         cg1 = ncg_cell[ncell+cell+1] + shift;
7068         if (shift_at > 0)
7069         {
7070             /* Correct the old cg indices */
7071             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7072             {
7073                 cgindex[cg+1] += shift_at;
7074             }
7075         }
7076         for(cg=0; cg<ind->nrecv[cell]; cg++)
7077         {
7078             /* Copy this charge group from the buffer */
7079             index_gl[cg1] = recv_i[cg0];
7080             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7081             /* Add it to the cgindex */
7082             cg_gl = index_gl[cg1];
7083             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7084             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7085             cgindex[cg1+1] = cgindex[cg1] + nat;
7086             cg0++;
7087             cg1++;
7088             shift_at += nat;
7089         }
7090         shift += ind->nrecv[cell];
7091         ncg_cell[ncell+cell+1] = cg1;
7092     }
7093 }
7094
7095 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7096                                int nzone,int cg0,const int *cgindex)
7097 {
7098     int cg,zone,p;
7099     
7100     /* Store the atom block boundaries for easy copying of communication buffers
7101      */
7102     cg = cg0;
7103     for(zone=0; zone<nzone; zone++)
7104     {
7105         for(p=0; p<cd->np; p++) {
7106             cd->ind[p].cell2at0[zone] = cgindex[cg];
7107             cg += cd->ind[p].nrecv[zone];
7108             cd->ind[p].cell2at1[zone] = cgindex[cg];
7109         }
7110     }
7111 }
7112
7113 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7114 {
7115     int  i;
7116     gmx_bool bMiss;
7117
7118     bMiss = FALSE;
7119     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7120     {
7121         if (!bLocalCG[link->a[i]])
7122         {
7123             bMiss = TRUE;
7124         }
7125     }
7126
7127     return bMiss;
7128 }
7129
7130 static void setup_dd_communication(gmx_domdec_t *dd,
7131                                    matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr)
7132 {
7133     int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot;
7134     int nzone,nzone_send,zone,zonei,cg0,cg1;
7135     int c,i,j,cg,cg_gl,nrcg;
7136     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7137     gmx_domdec_comm_t *comm;
7138     gmx_domdec_zones_t *zones;
7139     gmx_domdec_comm_dim_t *cd;
7140     gmx_domdec_ind_t *ind;
7141     cginfo_mb_t *cginfo_mb;
7142     gmx_bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew;
7143     real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh;
7144     rvec rb,rn;
7145     real corner[DIM][4],corner_round_0=0,corner_round_1[4];
7146     real bcorner[DIM],bcorner_round_1=0;
7147     ivec tric_dist;
7148     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7149     real skew_fac2_d,skew_fac_01;
7150     rvec sf2_round;
7151     int  nsend,nat;
7152     
7153     if (debug)
7154     {
7155         fprintf(debug,"Setting up DD communication\n");
7156     }
7157     
7158     comm  = dd->comm;
7159     cg_cm = fr->cg_cm;
7160
7161     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7162     {
7163         dim = dd->dim[dim_ind];
7164
7165         /* Check if we need to use triclinic distances */
7166         tric_dist[dim_ind] = 0;
7167         for(i=0; i<=dim_ind; i++)
7168         {
7169             if (ddbox->tric_dir[dd->dim[i]])
7170             {
7171                 tric_dist[dim_ind] = 1;
7172             }
7173         }
7174     }
7175
7176     bBondComm = comm->bBondComm;
7177
7178     /* Do we need to determine extra distances for multi-body bondeds? */
7179     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7180     
7181     /* Do we need to determine extra distances for only two-body bondeds? */
7182     bDist2B = (bBondComm && !bDistMB);
7183
7184     r_comm2  = sqr(comm->cutoff);
7185     r_bcomm2 = sqr(comm->cutoff_mbody);
7186
7187     if (debug)
7188     {
7189         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7190     }
7191
7192     zones = &comm->zones;
7193     
7194     dim0 = dd->dim[0];
7195     /* The first dimension is equal for all cells */
7196     corner[0][0] = comm->cell_x0[dim0];
7197     if (bDistMB)
7198     {
7199         bcorner[0] = corner[0][0];
7200     }
7201     if (dd->ndim >= 2)
7202     {
7203         dim1 = dd->dim[1];
7204         /* This cell row is only seen from the first row */
7205         corner[1][0] = comm->cell_x0[dim1];
7206         /* All rows can see this row */
7207         corner[1][1] = comm->cell_x0[dim1];
7208         if (dd->bGridJump)
7209         {
7210             corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7211             if (bDistMB)
7212             {
7213                 /* For the multi-body distance we need the maximum */
7214                 bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7215             }
7216         }
7217         /* Set the upper-right corner for rounding */
7218         corner_round_0 = comm->cell_x1[dim0];
7219         
7220         if (dd->ndim >= 3)
7221         {
7222             dim2 = dd->dim[2];
7223             for(j=0; j<4; j++)
7224             {
7225                 corner[2][j] = comm->cell_x0[dim2];
7226             }
7227             if (dd->bGridJump)
7228             {
7229                 /* Use the maximum of the i-cells that see a j-cell */
7230                 for(i=0; i<zones->nizone; i++)
7231                 {
7232                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7233                     {
7234                         if (j >= 4)
7235                         {
7236                             corner[2][j-4] =
7237                                 max(corner[2][j-4],
7238                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7239                         }
7240                     }
7241                 }
7242                 if (bDistMB)
7243                 {
7244                     /* For the multi-body distance we need the maximum */
7245                     bcorner[2] = comm->cell_x0[dim2];
7246                     for(i=0; i<2; i++)
7247                     {
7248                         for(j=0; j<2; j++)
7249                         {
7250                             bcorner[2] = max(bcorner[2],
7251                                              comm->zone_d2[i][j].p1_0);
7252                         }
7253                     }
7254                 }
7255             }
7256             
7257             /* Set the upper-right corner for rounding */
7258             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7259              * Only cell (0,0,0) can see cell 7 (1,1,1)
7260              */
7261             corner_round_1[0] = comm->cell_x1[dim1];
7262             corner_round_1[3] = comm->cell_x1[dim1];
7263             if (dd->bGridJump)
7264             {
7265                 corner_round_1[0] = max(comm->cell_x1[dim1],
7266                                         comm->zone_d1[1].mch1);
7267                 if (bDistMB)
7268                 {
7269                     /* For the multi-body distance we need the maximum */
7270                     bcorner_round_1 = max(comm->cell_x1[dim1],
7271                                           comm->zone_d1[1].p1_1);
7272                 }
7273             }
7274         }
7275     }
7276     
7277     /* Triclinic stuff */
7278     normal = ddbox->normal;
7279     skew_fac_01 = 0;
7280     if (dd->ndim >= 2)
7281     {
7282         v_0 = ddbox->v[dim0];
7283         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7284         {
7285             /* Determine the coupling coefficient for the distances
7286              * to the cell planes along dim0 and dim1 through dim2.
7287              * This is required for correct rounding.
7288              */
7289             skew_fac_01 =
7290                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7291             if (debug)
7292             {
7293                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7294             }
7295         }
7296     }
7297     if (dd->ndim >= 3)
7298     {
7299         v_1 = ddbox->v[dim1];
7300     }
7301     
7302     zone_cg_range = zones->cg_range;
7303     index_gl = dd->index_gl;
7304     cgindex  = dd->cgindex;
7305     cginfo_mb = fr->cginfo_mb;
7306     
7307     zone_cg_range[0]   = 0;
7308     zone_cg_range[1]   = dd->ncg_home;
7309     comm->zone_ncg1[0] = dd->ncg_home;
7310     pos_cg             = dd->ncg_home;
7311     
7312     nat_tot = dd->nat_home;
7313     nzone = 1;
7314     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7315     {
7316         dim = dd->dim[dim_ind];
7317         cd = &comm->cd[dim_ind];
7318         
7319         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7320         {
7321             /* No pbc in this dimension, the first node should not comm. */
7322             nzone_send = 0;
7323         }
7324         else
7325         {
7326             nzone_send = nzone;
7327         }
7328
7329         bScrew = (dd->bScrewPBC && dim == XX);
7330         
7331         v_d = ddbox->v[dim];
7332         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7333
7334         cd->bInPlace = TRUE;
7335         for(p=0; p<cd->np; p++)
7336         {
7337             /* Only atoms communicated in the first pulse are used
7338              * for multi-body bonded interactions or for bBondComm.
7339              */
7340             bDistBonded   = ((bDistMB || bDist2B) && p == 0);
7341             bDistMB_pulse = (bDistMB && bDistBonded);
7342
7343             ind = &cd->ind[p];
7344             nsend = 0;
7345             nat = 0;
7346             for(zone=0; zone<nzone_send; zone++)
7347             {
7348                 if (tric_dist[dim_ind] && dim_ind > 0)
7349                 {
7350                     /* Determine slightly more optimized skew_fac's
7351                      * for rounding.
7352                      * This reduces the number of communicated atoms
7353                      * by about 10% for 3D DD of rhombic dodecahedra.
7354                      */
7355                     for(dimd=0; dimd<dim; dimd++)
7356                     {
7357                         sf2_round[dimd] = 1;
7358                         if (ddbox->tric_dir[dimd])
7359                         {
7360                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7361                             {
7362                                 /* If we are shifted in dimension i
7363                                  * and the cell plane is tilted forward
7364                                  * in dimension i, skip this coupling.
7365                                  */
7366                                 if (!(zones->shift[nzone+zone][i] &&
7367                                       ddbox->v[dimd][i][dimd] >= 0))
7368                                 {
7369                                     sf2_round[dimd] +=
7370                                         sqr(ddbox->v[dimd][i][dimd]);
7371                                 }
7372                             }
7373                             sf2_round[dimd] = 1/sf2_round[dimd];
7374                         }
7375                     }
7376                 }
7377
7378                 zonei = zone_perm[dim_ind][zone];
7379                 if (p == 0)
7380                 {
7381                     /* Here we permutate the zones to obtain a convenient order
7382                      * for neighbor searching
7383                      */
7384                     cg0 = zone_cg_range[zonei];
7385                     cg1 = zone_cg_range[zonei+1];
7386                 }
7387                 else
7388                 {
7389                     /* Look only at the cg's received in the previous grid pulse
7390                      */
7391                     cg1 = zone_cg_range[nzone+zone+1];
7392                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
7393                 }
7394                 ind->nsend[zone] = 0;
7395                 for(cg=cg0; cg<cg1; cg++)
7396                 {
7397                     r2  = 0;
7398                     rb2 = 0;
7399                     if (tric_dist[dim_ind] == 0)
7400                     {
7401                         /* Rectangular direction, easy */
7402                         r = cg_cm[cg][dim] - corner[dim_ind][zone];
7403                         if (r > 0)
7404                         {
7405                             r2 += r*r;
7406                         }
7407                         if (bDistMB_pulse)
7408                         {
7409                             r = cg_cm[cg][dim] - bcorner[dim_ind];
7410                             if (r > 0)
7411                             {
7412                                 rb2 += r*r;
7413                             }
7414                         }
7415                         /* Rounding gives at most a 16% reduction
7416                          * in communicated atoms
7417                          */
7418                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7419                         {
7420                             r = cg_cm[cg][dim0] - corner_round_0;
7421                             /* This is the first dimension, so always r >= 0 */
7422                             r2 += r*r;
7423                             if (bDistMB_pulse)
7424                             {
7425                                 rb2 += r*r;
7426                             }
7427                         }
7428                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7429                         {
7430                             r = cg_cm[cg][dim1] - corner_round_1[zone];
7431                             if (r > 0)
7432                             {
7433                                 r2 += r*r;
7434                             }
7435                             if (bDistMB_pulse)
7436                             {
7437                                 r = cg_cm[cg][dim1] - bcorner_round_1;
7438                                 if (r > 0)
7439                                 {
7440                                     rb2 += r*r;
7441                                 }
7442                             }
7443                         }
7444                     }
7445                     else
7446                     {
7447                         /* Triclinic direction, more complicated */
7448                         clear_rvec(rn);
7449                         clear_rvec(rb);
7450                         /* Rounding, conservative as the skew_fac multiplication
7451                          * will slightly underestimate the distance.
7452                          */
7453                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7454                         {
7455                             rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
7456                             for(i=dim0+1; i<DIM; i++)
7457                             {
7458                                 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7459                             }
7460                             r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7461                             if (bDistMB_pulse)
7462                             {
7463                                 rb[dim0] = rn[dim0];
7464                                 rb2 = r2;
7465                             }
7466                             /* Take care that the cell planes along dim0 might not
7467                              * be orthogonal to those along dim1 and dim2.
7468                              */
7469                             for(i=1; i<=dim_ind; i++)
7470                             {
7471                                 dimd = dd->dim[i];
7472                                 if (normal[dim0][dimd] > 0)
7473                                 {
7474                                     rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7475                                     if (bDistMB_pulse)
7476                                     {
7477                                         rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7478                                     }
7479                                 }
7480                             }
7481                         }
7482                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7483                         {
7484                             rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
7485                             tric_sh = 0;
7486                             for(i=dim1+1; i<DIM; i++)
7487                             {
7488                                 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7489                             }
7490                             rn[dim1] += tric_sh;
7491                             if (rn[dim1] > 0)
7492                             {
7493                                 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7494                                 /* Take care of coupling of the distances
7495                                  * to the planes along dim0 and dim1 through dim2.
7496                                  */
7497                                 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7498                                 /* Take care that the cell planes along dim1
7499                                  * might not be orthogonal to that along dim2.
7500                                  */
7501                                 if (normal[dim1][dim2] > 0)
7502                                 {
7503                                     rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7504                                 }
7505                             }
7506                             if (bDistMB_pulse)
7507                             {
7508                                 rb[dim1] +=
7509                                     cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
7510                                 if (rb[dim1] > 0)
7511                                 {
7512                                     rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7513                                     /* Take care of coupling of the distances
7514                                      * to the planes along dim0 and dim1 through dim2.
7515                                      */
7516                                     rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7517                                     /* Take care that the cell planes along dim1
7518                                      * might not be orthogonal to that along dim2.
7519                                      */
7520                                     if (normal[dim1][dim2] > 0)
7521                                     {
7522                                         rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7523                                     }
7524                                 }
7525                             }
7526                         }
7527                         /* The distance along the communication direction */
7528                         rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
7529                         tric_sh = 0;
7530                         for(i=dim+1; i<DIM; i++)
7531                         {
7532                             tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7533                         }
7534                         rn[dim] += tric_sh;
7535                         if (rn[dim] > 0)
7536                         {
7537                             r2 += rn[dim]*rn[dim]*skew_fac2_d;
7538                             /* Take care of coupling of the distances
7539                              * to the planes along dim0 and dim1 through dim2.
7540                              */
7541                             if (dim_ind == 1 && zonei == 1)
7542                             {
7543                                 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7544                             }
7545                         }
7546                         if (bDistMB_pulse)
7547                         {
7548                             clear_rvec(rb);
7549                             rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
7550                             if (rb[dim] > 0)
7551                             {
7552                                 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7553                                 /* Take care of coupling of the distances
7554                                  * to the planes along dim0 and dim1 through dim2.
7555                                  */
7556                                 if (dim_ind == 1 && zonei == 1)
7557                                 {
7558                                     rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7559                                 }
7560                             }
7561                         }
7562                     }
7563                     
7564                     if (r2 < r_comm2 ||
7565                         (bDistBonded &&
7566                          ((bDistMB && rb2 < r_bcomm2) ||
7567                           (bDist2B && r2  < r_bcomm2)) &&
7568                          (!bBondComm ||
7569                           (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
7570                            missing_link(comm->cglink,index_gl[cg],
7571                                         comm->bLocalCG)))))
7572                     {
7573                         /* Make an index to the local charge groups */
7574                         if (nsend+1 > ind->nalloc)
7575                         {
7576                             ind->nalloc = over_alloc_large(nsend+1);
7577                             srenew(ind->index,ind->nalloc);
7578                         }
7579                         if (nsend+1 > comm->nalloc_int)
7580                         {
7581                             comm->nalloc_int = over_alloc_large(nsend+1);
7582                             srenew(comm->buf_int,comm->nalloc_int);
7583                         }
7584                         ind->index[nsend] = cg;
7585                         comm->buf_int[nsend] = index_gl[cg];
7586                         ind->nsend[zone]++;
7587                         vec_rvec_check_alloc(&comm->vbuf,nsend+1);
7588
7589                         if (dd->ci[dim] == 0)
7590                         {
7591                             /* Correct cg_cm for pbc */
7592                             rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]);
7593                             if (bScrew)
7594                             {
7595                                 comm->vbuf.v[nsend][YY] =
7596                                     box[YY][YY]-comm->vbuf.v[nsend][YY];
7597                                 comm->vbuf.v[nsend][ZZ] =
7598                                     box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
7599                             }
7600                         }
7601                         else
7602                         {
7603                             copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]);
7604                         }
7605                         nsend++;
7606                         nat += cgindex[cg+1] - cgindex[cg];
7607                     }
7608                 }
7609             }
7610             /* Clear the counts in case we do not have pbc */
7611             for(zone=nzone_send; zone<nzone; zone++)
7612             {
7613                 ind->nsend[zone] = 0;
7614             }
7615             ind->nsend[nzone]   = nsend;
7616             ind->nsend[nzone+1] = nat;
7617             /* Communicate the number of cg's and atoms to receive */
7618             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7619                             ind->nsend, nzone+2,
7620                             ind->nrecv, nzone+2);
7621             
7622             /* The rvec buffer is also required for atom buffers of size nsend
7623              * in dd_move_x and dd_move_f.
7624              */
7625             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
7626
7627             if (p > 0)
7628             {
7629                 /* We can receive in place if only the last zone is not empty */
7630                 for(zone=0; zone<nzone-1; zone++)
7631                 {
7632                     if (ind->nrecv[zone] > 0)
7633                     {
7634                         cd->bInPlace = FALSE;
7635                     }
7636                 }
7637                 if (!cd->bInPlace)
7638                 {
7639                     /* The int buffer is only required here for the cg indices */
7640                     if (ind->nrecv[nzone] > comm->nalloc_int2)
7641                     {
7642                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
7643                         srenew(comm->buf_int2,comm->nalloc_int2);
7644                     }
7645                     /* The rvec buffer is also required for atom buffers
7646                      * of size nrecv in dd_move_x and dd_move_f.
7647                      */
7648                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
7649                     vec_rvec_check_alloc(&comm->vbuf2,i);
7650                 }
7651             }
7652             
7653             /* Make space for the global cg indices */
7654             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
7655                 || dd->cg_nalloc == 0)
7656             {
7657                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
7658                 srenew(index_gl,dd->cg_nalloc);
7659                 srenew(cgindex,dd->cg_nalloc+1);
7660             }
7661             /* Communicate the global cg indices */
7662             if (cd->bInPlace)
7663             {
7664                 recv_i = index_gl + pos_cg;
7665             }
7666             else
7667             {
7668                 recv_i = comm->buf_int2;
7669             }
7670             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7671                             comm->buf_int, nsend,
7672                             recv_i,        ind->nrecv[nzone]);
7673
7674             /* Make space for cg_cm */
7675             if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc)
7676             {
7677                 dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]);
7678                 cg_cm = fr->cg_cm;
7679             }
7680             /* Communicate cg_cm */
7681             if (cd->bInPlace)
7682             {
7683                 recv_vr = cg_cm + pos_cg;
7684             }
7685             else
7686             {
7687                 recv_vr = comm->vbuf2.v;
7688             }
7689             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
7690                              comm->vbuf.v, nsend,
7691                              recv_vr,      ind->nrecv[nzone]);
7692             
7693             /* Make the charge group index */
7694             if (cd->bInPlace)
7695             {
7696                 zone = (p == 0 ? 0 : nzone - 1);
7697                 while (zone < nzone)
7698                 {
7699                     for(cg=0; cg<ind->nrecv[zone]; cg++)
7700                     {
7701                         cg_gl = index_gl[pos_cg];
7702                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
7703                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
7704                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
7705                         if (bBondComm)
7706                         {
7707                             /* Update the charge group presence,
7708                              * so we can use it in the next pass of the loop.
7709                              */
7710                             comm->bLocalCG[cg_gl] = TRUE;
7711                         }
7712                         pos_cg++;
7713                     }
7714                     if (p == 0)
7715                     {
7716                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
7717                     }
7718                     zone++;
7719                     zone_cg_range[nzone+zone] = pos_cg;
7720                 }
7721             }
7722             else
7723             {
7724                 /* This part of the code is never executed with bBondComm. */
7725                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
7726                                  index_gl,recv_i,cg_cm,recv_vr,
7727                                  cgindex,fr->cginfo_mb,fr->cginfo);
7728                 pos_cg += ind->nrecv[nzone];
7729             }
7730             nat_tot += ind->nrecv[nzone+1];
7731         }
7732         if (!cd->bInPlace)
7733         {
7734             /* Store the atom block for easy copying of communication buffers */
7735             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
7736         }
7737         nzone += nzone;
7738     }
7739     dd->index_gl = index_gl;
7740     dd->cgindex  = cgindex;
7741     
7742     dd->ncg_tot = zone_cg_range[zones->n];
7743     dd->nat_tot = nat_tot;
7744     comm->nat[ddnatHOME] = dd->nat_home;
7745     for(i=ddnatZONE; i<ddnatNR; i++)
7746     {
7747         comm->nat[i] = dd->nat_tot;
7748     }
7749
7750     if (!bBondComm)
7751     {
7752         /* We don't need to update cginfo, since that was alrady done above.
7753          * So we pass NULL for the forcerec.
7754          */
7755         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
7756                       NULL,comm->bLocalCG);
7757     }
7758
7759     if (debug)
7760     {
7761         fprintf(debug,"Finished setting up DD communication, zones:");
7762         for(c=0; c<zones->n; c++)
7763         {
7764             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
7765         }
7766         fprintf(debug,"\n");
7767     }
7768 }
7769
7770 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
7771 {
7772     int c;
7773     
7774     for(c=0; c<zones->nizone; c++)
7775     {
7776         zones->izone[c].cg1  = zones->cg_range[c+1];
7777         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
7778         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
7779     }
7780 }
7781
7782 static int comp_cgsort(const void *a,const void *b)
7783 {
7784     int comp;
7785     
7786     gmx_cgsort_t *cga,*cgb;
7787     cga = (gmx_cgsort_t *)a;
7788     cgb = (gmx_cgsort_t *)b;
7789     
7790     comp = cga->nsc - cgb->nsc;
7791     if (comp == 0)
7792     {
7793         comp = cga->ind_gl - cgb->ind_gl;
7794     }
7795     
7796     return comp;
7797 }
7798
7799 static void order_int_cg(int n,gmx_cgsort_t *sort,
7800                          int *a,int *buf)
7801 {
7802     int i;
7803     
7804     /* Order the data */
7805     for(i=0; i<n; i++)
7806     {
7807         buf[i] = a[sort[i].ind];
7808     }
7809     
7810     /* Copy back to the original array */
7811     for(i=0; i<n; i++)
7812     {
7813         a[i] = buf[i];
7814     }
7815 }
7816
7817 static void order_vec_cg(int n,gmx_cgsort_t *sort,
7818                          rvec *v,rvec *buf)
7819 {
7820     int i;
7821     
7822     /* Order the data */
7823     for(i=0; i<n; i++)
7824     {
7825         copy_rvec(v[sort[i].ind],buf[i]);
7826     }
7827     
7828     /* Copy back to the original array */
7829     for(i=0; i<n; i++)
7830     {
7831         copy_rvec(buf[i],v[i]);
7832     }
7833 }
7834
7835 static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort,
7836                            rvec *v,rvec *buf)
7837 {
7838     int a,atot,cg,cg0,cg1,i;
7839     
7840     /* Order the data */
7841     a = 0;
7842     for(cg=0; cg<ncg; cg++)
7843     {
7844         cg0 = cgindex[sort[cg].ind];
7845         cg1 = cgindex[sort[cg].ind+1];
7846         for(i=cg0; i<cg1; i++)
7847         {
7848             copy_rvec(v[i],buf[a]);
7849             a++;
7850         }
7851     }
7852     atot = a;
7853     
7854     /* Copy back to the original array */
7855     for(a=0; a<atot; a++)
7856     {
7857         copy_rvec(buf[a],v[a]);
7858     }
7859 }
7860
7861 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
7862                          int nsort_new,gmx_cgsort_t *sort_new,
7863                          gmx_cgsort_t *sort1)
7864 {
7865     int i1,i2,i_new;
7866     
7867     /* The new indices are not very ordered, so we qsort them */
7868     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
7869     
7870     /* sort2 is already ordered, so now we can merge the two arrays */
7871     i1 = 0;
7872     i2 = 0;
7873     i_new = 0;
7874     while(i2 < nsort2 || i_new < nsort_new)
7875     {
7876         if (i2 == nsort2)
7877         {
7878             sort1[i1++] = sort_new[i_new++];
7879         }
7880         else if (i_new == nsort_new)
7881         {
7882             sort1[i1++] = sort2[i2++];
7883         }
7884         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
7885                  (sort2[i2].nsc == sort_new[i_new].nsc &&
7886                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
7887         {
7888             sort1[i1++] = sort2[i2++];
7889         }
7890         else
7891         {
7892             sort1[i1++] = sort_new[i_new++];
7893         }
7894     }
7895 }
7896
7897 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
7898                           rvec *cgcm,t_forcerec *fr,t_state *state,
7899                           int ncg_home_old)
7900 {
7901     gmx_domdec_sort_t *sort;
7902     gmx_cgsort_t *cgsort,*sort_i;
7903     int  ncg_new,nsort2,nsort_new,i,cell_index,*ibuf,cgsize;
7904     rvec *vbuf;
7905     
7906     sort = dd->comm->sort;
7907     
7908     if (dd->ncg_home > sort->sort_nalloc)
7909     {
7910         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
7911         srenew(sort->sort1,sort->sort_nalloc);
7912         srenew(sort->sort2,sort->sort_nalloc);
7913     }
7914     
7915     if (ncg_home_old >= 0)
7916     {
7917         /* The charge groups that remained in the same ns grid cell
7918          * are completely ordered. So we can sort efficiently by sorting
7919          * the charge groups that did move into the stationary list.
7920          */
7921         ncg_new = 0;
7922         nsort2 = 0;
7923         nsort_new = 0;
7924         for(i=0; i<dd->ncg_home; i++)
7925         {
7926             /* Check if this cg did not move to another node */
7927             cell_index = fr->ns.grid->cell_index[i];
7928             if (cell_index !=  4*fr->ns.grid->ncells)
7929             {
7930                 if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
7931                 {
7932                     /* This cg is new on this node or moved ns grid cell */
7933                     if (nsort_new >= sort->sort_new_nalloc)
7934                     {
7935                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
7936                         srenew(sort->sort_new,sort->sort_new_nalloc);
7937                     }
7938                     sort_i = &(sort->sort_new[nsort_new++]);
7939                 }
7940                 else
7941                 {
7942                     /* This cg did not move */
7943                     sort_i = &(sort->sort2[nsort2++]);
7944                 }
7945                 /* Sort on the ns grid cell indices
7946                  * and the global topology index
7947                  */
7948                 sort_i->nsc    = cell_index;
7949                 sort_i->ind_gl = dd->index_gl[i];
7950                 sort_i->ind    = i;
7951                 ncg_new++;
7952             }
7953         }
7954         if (debug)
7955         {
7956             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
7957                     nsort2,nsort_new);
7958         }
7959         /* Sort efficiently */
7960         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1);
7961     }
7962     else
7963     {
7964         cgsort = sort->sort1;
7965         ncg_new = 0;
7966         for(i=0; i<dd->ncg_home; i++)
7967         {
7968             /* Sort on the ns grid cell indices
7969              * and the global topology index
7970              */
7971             cgsort[i].nsc    = fr->ns.grid->cell_index[i];
7972             cgsort[i].ind_gl = dd->index_gl[i];
7973             cgsort[i].ind    = i;
7974             if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
7975             {
7976                 ncg_new++;
7977             }
7978         }
7979         if (debug)
7980         {
7981             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
7982         }
7983         /* Determine the order of the charge groups using qsort */
7984         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
7985     }
7986     cgsort = sort->sort1;
7987     
7988     /* We alloc with the old size, since cgindex is still old */
7989     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
7990     vbuf = dd->comm->vbuf.v;
7991     
7992     /* Remove the charge groups which are no longer at home here */
7993     dd->ncg_home = ncg_new;
7994     
7995     /* Reorder the state */
7996     for(i=0; i<estNR; i++)
7997     {
7998         if (EST_DISTR(i) && (state->flags & (1<<i)))
7999         {
8000             switch (i)
8001             {
8002             case estX:
8003                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->x,vbuf);
8004                 break;
8005             case estV:
8006                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf);
8007                 break;
8008             case estSDX:
8009                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf);
8010                 break;
8011             case estCGP:
8012                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf);
8013                 break;
8014             case estLD_RNG:
8015             case estLD_RNGI:
8016             case estDISRE_INITF:
8017             case estDISRE_RM3TAV:
8018             case estORIRE_INITF:
8019             case estORIRE_DTAV:
8020                 /* No ordering required */
8021                 break;
8022             default:
8023                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8024                 break;
8025             }
8026         }
8027     }
8028     /* Reorder cgcm */
8029     order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8030     
8031     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8032     {
8033         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8034         srenew(sort->ibuf,sort->ibuf_nalloc);
8035     }
8036     ibuf = sort->ibuf;
8037     /* Reorder the global cg index */
8038     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8039     /* Reorder the cginfo */
8040     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8041     /* Rebuild the local cg index */
8042     ibuf[0] = 0;
8043     for(i=0; i<dd->ncg_home; i++)
8044     {
8045         cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8046         ibuf[i+1] = ibuf[i] + cgsize;
8047     }
8048     for(i=0; i<dd->ncg_home+1; i++)
8049     {
8050         dd->cgindex[i] = ibuf[i];
8051     }
8052     /* Set the home atom number */
8053     dd->nat_home = dd->cgindex[dd->ncg_home];
8054     
8055     /* Copy the sorted ns cell indices back to the ns grid struct */
8056     for(i=0; i<dd->ncg_home; i++)
8057     {
8058         fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8059     }
8060     fr->ns.grid->nr = dd->ncg_home;
8061 }
8062
8063 static void add_dd_statistics(gmx_domdec_t *dd)
8064 {
8065     gmx_domdec_comm_t *comm;
8066     int ddnat;
8067     
8068     comm = dd->comm;
8069     
8070     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8071     {
8072         comm->sum_nat[ddnat-ddnatZONE] +=
8073             comm->nat[ddnat] - comm->nat[ddnat-1];
8074     }
8075     comm->ndecomp++;
8076 }
8077
8078 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8079 {
8080     gmx_domdec_comm_t *comm;
8081     int ddnat;
8082     
8083     comm = dd->comm;
8084
8085     /* Reset all the statistics and counters for total run counting */
8086     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8087     {
8088         comm->sum_nat[ddnat-ddnatZONE] = 0;
8089     }
8090     comm->ndecomp = 0;
8091     comm->nload = 0;
8092     comm->load_step = 0;
8093     comm->load_sum = 0;
8094     comm->load_max = 0;
8095     clear_ivec(comm->load_lim);
8096     comm->load_mdf = 0;
8097     comm->load_pme = 0;
8098 }
8099
8100 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8101 {
8102     gmx_domdec_comm_t *comm;
8103     int ddnat;
8104     double av;
8105    
8106     comm = cr->dd->comm;
8107     
8108     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8109     
8110     if (fplog == NULL)
8111     {
8112         return;
8113     }
8114     
8115     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8116             
8117     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8118     {
8119         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8120         switch(ddnat)
8121         {
8122         case ddnatZONE:
8123             fprintf(fplog,
8124                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8125                     2,av);
8126             break;
8127         case ddnatVSITE:
8128             if (cr->dd->vsite_comm)
8129             {
8130                 fprintf(fplog,
8131                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8132                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8133                         av);
8134             }
8135             break;
8136         case ddnatCON:
8137             if (cr->dd->constraint_comm)
8138             {
8139                 fprintf(fplog,
8140                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
8141                         1 + ir->nLincsIter,av);
8142             }
8143             break;
8144         default:
8145             gmx_incons(" Unknown type for DD statistics");
8146         }
8147     }
8148     fprintf(fplog,"\n");
8149     
8150     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8151     {
8152         print_dd_load_av(fplog,cr->dd);
8153     }
8154 }
8155
8156 void dd_partition_system(FILE            *fplog,
8157                          gmx_large_int_t      step,
8158                          t_commrec       *cr,
8159                          gmx_bool            bMasterState,
8160                          int             nstglobalcomm,
8161                          t_state         *state_global,
8162                          gmx_mtop_t      *top_global,
8163                          t_inputrec      *ir,
8164                          t_state         *state_local,
8165                          rvec            **f,
8166                          t_mdatoms       *mdatoms,
8167                          gmx_localtop_t  *top_local,
8168                          t_forcerec      *fr,
8169                          gmx_vsite_t     *vsite,
8170                          gmx_shellfc_t   shellfc,
8171                          gmx_constr_t    constr,
8172                          t_nrnb          *nrnb,
8173                          gmx_wallcycle_t wcycle,
8174                          gmx_bool            bVerbose)
8175 {
8176     gmx_domdec_t *dd;
8177     gmx_domdec_comm_t *comm;
8178     gmx_ddbox_t ddbox={0};
8179     t_block *cgs_gl;
8180     gmx_large_int_t step_pcoupl;
8181     rvec cell_ns_x0,cell_ns_x1;
8182     int  i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
8183     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8184     gmx_bool bRedist,bSortCG,bResortAll;
8185     ivec ncells_old,np;
8186     real grid_density;
8187     char sbuf[22];
8188         
8189     dd = cr->dd;
8190     comm = dd->comm;
8191
8192     bBoxChanged = (bMasterState || DEFORM(*ir));
8193     if (ir->epc != epcNO)
8194     {
8195         /* With nstpcouple > 1 pressure coupling happens.
8196          * one step after calculating the pressure.
8197          * Box scaling happens at the end of the MD step,
8198          * after the DD partitioning.
8199          * We therefore have to do DLB in the first partitioning
8200          * after an MD step where P-coupling occured.
8201          * We need to determine the last step in which p-coupling occurred.
8202          * MRS -- need to validate this for vv?
8203          */
8204         n = ir->nstpcouple;
8205         if (n == 1)
8206         {
8207             step_pcoupl = step - 1;
8208         }
8209         else
8210         {
8211             step_pcoupl = ((step - 1)/n)*n + 1;
8212         }
8213         if (step_pcoupl >= comm->globalcomm_step)
8214         {
8215             bBoxChanged = TRUE;
8216         }
8217     }
8218
8219     bNStGlobalComm = (step >= comm->globalcomm_step + nstglobalcomm);
8220
8221     if (!comm->bDynLoadBal)
8222     {
8223         bDoDLB = FALSE;
8224     }
8225     else
8226     {
8227         /* Should we do dynamic load balacing this step?
8228          * Since it requires (possibly expensive) global communication,
8229          * we might want to do DLB less frequently.
8230          */
8231         if (bBoxChanged || ir->epc != epcNO)
8232         {
8233             bDoDLB = bBoxChanged;
8234         }
8235         else
8236         {
8237             bDoDLB = bNStGlobalComm;
8238         }
8239     }
8240
8241     /* Check if we have recorded loads on the nodes */
8242     if (comm->bRecordLoad && dd_load_count(comm))
8243     {
8244         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
8245         {
8246             /* Check if we should use DLB at the second partitioning
8247              * and every 100 partitionings,
8248              * so the extra communication cost is negligible.
8249              */
8250             n = max(100,nstglobalcomm);
8251             bCheckDLB = (comm->n_load_collect == 0 ||
8252                          comm->n_load_have % n == n-1);
8253         }
8254         else
8255         {
8256             bCheckDLB = FALSE;
8257         }
8258         
8259         /* Print load every nstlog, first and last step to the log file */
8260         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
8261                     comm->n_load_collect == 0 ||
8262                     (ir->nsteps >= 0 &&
8263                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
8264
8265         /* Avoid extra communication due to verbose screen output
8266          * when nstglobalcomm is set.
8267          */
8268         if (bDoDLB || bLogLoad || bCheckDLB ||
8269             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
8270         {
8271             get_load_distribution(dd,wcycle);
8272             if (DDMASTER(dd))
8273             {
8274                 if (bLogLoad)
8275                 {
8276                     dd_print_load(fplog,dd,step-1);
8277                 }
8278                 if (bVerbose)
8279                 {
8280                     dd_print_load_verbose(dd);
8281                 }
8282             }
8283             comm->n_load_collect++;
8284
8285             if (bCheckDLB) {
8286                 /* Since the timings are node dependent, the master decides */
8287                 if (DDMASTER(dd))
8288                 {
8289                     bTurnOnDLB =
8290                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
8291                     if (debug)
8292                     {
8293                         fprintf(debug,"step %s, imb loss %f\n",
8294                                 gmx_step_str(step,sbuf),
8295                                 dd_force_imb_perf_loss(dd));
8296                     }
8297                 }
8298                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
8299                 if (bTurnOnDLB)
8300                 {
8301                     turn_on_dlb(fplog,cr,step);
8302                     bDoDLB = TRUE;
8303                 }
8304             }
8305         }
8306         comm->n_load_have++;
8307     }
8308
8309     cgs_gl = &comm->cgs_gl;
8310
8311     bRedist = FALSE;
8312     if (bMasterState)
8313     {
8314         /* Clear the old state */
8315         clear_dd_indices(dd,0,0);
8316
8317         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
8318                   TRUE,cgs_gl,state_global->x,&ddbox);
8319     
8320         get_cg_distribution(fplog,step,dd,cgs_gl,
8321                             state_global->box,&ddbox,state_global->x);
8322         
8323         dd_distribute_state(dd,cgs_gl,
8324                             state_global,state_local,f);
8325         
8326         dd_make_local_cgs(dd,&top_local->cgs);
8327         
8328         if (dd->ncg_home > fr->cg_nalloc)
8329         {
8330             dd_realloc_fr_cg(fr,dd->ncg_home);
8331         }
8332         calc_cgcm(fplog,0,dd->ncg_home,
8333                   &top_local->cgs,state_local->x,fr->cg_cm);
8334         
8335         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8336         
8337         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8338
8339         cg0 = 0;
8340     }
8341     else if (state_local->ddp_count != dd->ddp_count)
8342     {
8343         if (state_local->ddp_count > dd->ddp_count)
8344         {
8345             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
8346         }
8347         
8348         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
8349         {
8350             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
8351         }
8352         
8353         /* Clear the old state */
8354         clear_dd_indices(dd,0,0);
8355         
8356         /* Build the new indices */
8357         rebuild_cgindex(dd,cgs_gl->index,state_local);
8358         make_dd_indices(dd,cgs_gl->index,0);
8359         
8360         /* Redetermine the cg COMs */
8361         calc_cgcm(fplog,0,dd->ncg_home,
8362                   &top_local->cgs,state_local->x,fr->cg_cm);
8363         
8364         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8365
8366         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8367
8368         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8369                   TRUE,&top_local->cgs,state_local->x,&ddbox);
8370
8371         bRedist = comm->bDynLoadBal;
8372     }
8373     else
8374     {
8375         /* We have the full state, only redistribute the cgs */
8376
8377         /* Clear the non-home indices */
8378         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
8379
8380         /* Avoid global communication for dim's without pbc and -gcom */
8381         if (!bNStGlobalComm)
8382         {
8383             copy_rvec(comm->box0    ,ddbox.box0    );
8384             copy_rvec(comm->box_size,ddbox.box_size);
8385         }
8386         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8387                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
8388
8389         bBoxChanged = TRUE;
8390         bRedist = TRUE;
8391     }
8392     /* For dim's without pbc and -gcom */
8393     copy_rvec(ddbox.box0    ,comm->box0    );
8394     copy_rvec(ddbox.box_size,comm->box_size);
8395     
8396     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
8397                       step,wcycle);
8398     
8399     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
8400     {
8401         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
8402     }
8403     
8404     /* Check if we should sort the charge groups */
8405     if (comm->nstSortCG > 0)
8406     {
8407         bSortCG = (bMasterState ||
8408                    (bRedist && (step % comm->nstSortCG == 0)));
8409     }
8410     else
8411     {
8412         bSortCG = FALSE;
8413     }
8414
8415     ncg_home_old = dd->ncg_home;
8416
8417     if (bRedist)
8418     {
8419         cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
8420                                  state_local,f,fr,mdatoms,
8421                                  !bSortCG,nrnb);
8422     }
8423     
8424     get_nsgrid_boundaries(fr->ns.grid,dd,
8425                           state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
8426                           dd->ncg_home,fr->cg_cm,
8427                           cell_ns_x0,cell_ns_x1,&grid_density);
8428
8429     if (bBoxChanged)
8430     {
8431         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
8432     }
8433
8434     copy_ivec(fr->ns.grid->n,ncells_old);
8435     grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
8436                state_local->box,cell_ns_x0,cell_ns_x1,
8437                fr->rlistlong,grid_density);
8438     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
8439     copy_ivec(ddbox.tric_dir,comm->tric_dir);
8440
8441     if (bSortCG)
8442     {
8443         /* Sort the state on charge group position.
8444          * This enables exact restarts from this step.
8445          * It also improves performance by about 15% with larger numbers
8446          * of atoms per node.
8447          */
8448         
8449         /* Fill the ns grid with the home cell,
8450          * so we can sort with the indices.
8451          */
8452         set_zones_ncg_home(dd);
8453         fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
8454                   0,dd->ncg_home,fr->cg_cm);
8455         
8456         /* Check if we can user the old order and ns grid cell indices
8457          * of the charge groups to sort the charge groups efficiently.
8458          */
8459         bResortAll = (bMasterState ||
8460                       fr->ns.grid->n[XX] != ncells_old[XX] ||
8461                       fr->ns.grid->n[YY] != ncells_old[YY] ||
8462                       fr->ns.grid->n[ZZ] != ncells_old[ZZ]);
8463
8464         if (debug)
8465         {
8466             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
8467                     gmx_step_str(step,sbuf),dd->ncg_home);
8468         }
8469         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
8470                       bResortAll ? -1 : ncg_home_old);
8471         /* Rebuild all the indices */
8472         cg0 = 0;
8473         ga2la_clear(dd->ga2la);
8474     }
8475     
8476     /* Setup up the communication and communicate the coordinates */
8477     setup_dd_communication(dd,state_local->box,&ddbox,fr);
8478     
8479     /* Set the indices */
8480     make_dd_indices(dd,cgs_gl->index,cg0);
8481
8482     /* Set the charge group boundaries for neighbor searching */
8483     set_cg_boundaries(&comm->zones);
8484     
8485     /*
8486     write_dd_pdb("dd_home",step,"dump",top_global,cr,
8487                  -1,state_local->x,state_local->box);
8488     */
8489     
8490     /* Extract a local topology from the global topology */
8491     for(i=0; i<dd->ndim; i++)
8492     {
8493         np[dd->dim[i]] = comm->cd[i].np;
8494     }
8495     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
8496                       comm->cellsize_min,np,
8497                       fr,vsite,top_global,top_local);
8498     
8499     /* Set up the special atom communication */
8500     n = comm->nat[ddnatZONE];
8501     for(i=ddnatZONE+1; i<ddnatNR; i++)
8502     {
8503         switch(i)
8504         {
8505         case ddnatVSITE:
8506             if (vsite && vsite->n_intercg_vsite)
8507             {
8508                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
8509             }
8510             break;
8511         case ddnatCON:
8512             if (dd->bInterCGcons)
8513             {
8514                 /* Only for inter-cg constraints we need special code */
8515                 n = dd_make_local_constraints(dd,n,top_global,
8516                                               constr,ir->nProjOrder,
8517                                               &top_local->idef.il[F_CONSTR]);
8518             }
8519             break;
8520         default:
8521             gmx_incons("Unknown special atom type setup");
8522         }
8523         comm->nat[i] = n;
8524     }
8525     
8526     /* Make space for the extra coordinates for virtual site
8527      * or constraint communication.
8528      */
8529     state_local->natoms = comm->nat[ddnatNR-1];
8530     if (state_local->natoms > state_local->nalloc)
8531     {
8532         dd_realloc_state(state_local,f,state_local->natoms);
8533     }
8534
8535     if (fr->bF_NoVirSum)
8536     {
8537         if (vsite && vsite->n_intercg_vsite)
8538         {
8539             nat_f_novirsum = comm->nat[ddnatVSITE];
8540         }
8541         else
8542         {
8543             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
8544             {
8545                 nat_f_novirsum = dd->nat_tot;
8546             }
8547             else
8548             {
8549                 nat_f_novirsum = dd->nat_home;
8550             }
8551         }
8552     }
8553     else
8554     {
8555         nat_f_novirsum = 0;
8556     }
8557
8558     /* Set the number of atoms required for the force calculation.
8559      * Forces need to be constrained when using a twin-range setup
8560      * or with energy minimization. For simple simulations we could
8561      * avoid some allocation, zeroing and copying, but this is
8562      * probably not worth the complications ande checking.
8563      */
8564     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
8565                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
8566
8567     /* We make the all mdatoms up to nat_tot_con.
8568      * We could save some work by only setting invmass
8569      * between nat_tot and nat_tot_con.
8570      */
8571     /* This call also sets the new number of home particles to dd->nat_home */
8572     atoms2md(top_global,ir,
8573              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
8574
8575     /* Now we have the charges we can sort the FE interactions */
8576     dd_sort_local_top(dd,mdatoms,top_local);
8577
8578     if (shellfc)
8579     {
8580         /* Make the local shell stuff, currently no communication is done */
8581         make_local_shells(cr,mdatoms,shellfc);
8582     }
8583     
8584         if (ir->implicit_solvent)
8585     {
8586         make_local_gb(cr,fr->born,ir->gb_algorithm);
8587     }
8588         
8589     if (!(cr->duty & DUTY_PME))
8590     {
8591         /* Send the charges to our PME only node */
8592         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
8593                        mdatoms->chargeA,mdatoms->chargeB,
8594                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
8595     }
8596     
8597     if (constr)
8598     {
8599         set_constraints(constr,top_local,ir,mdatoms,cr);
8600     }
8601     
8602     if (ir->ePull != epullNO)
8603     {
8604         /* Update the local pull groups */
8605         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
8606     }
8607     
8608     if (ir->bRot)
8609     {
8610         /* Update the local rotation groups */
8611         dd_make_local_rotation_groups(dd,ir->rot);
8612     }
8613
8614
8615     add_dd_statistics(dd);
8616     
8617     /* Make sure we only count the cycles for this DD partitioning */
8618     clear_dd_cycle_counts(dd);
8619     
8620     /* Because the order of the atoms might have changed since
8621      * the last vsite construction, we need to communicate the constructing
8622      * atom coordinates again (for spreading the forces this MD step).
8623      */
8624     dd_move_x_vsites(dd,state_local->box,state_local->x);
8625     
8626     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
8627     {
8628         dd_move_x(dd,state_local->box,state_local->x);
8629         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
8630                      -1,state_local->x,state_local->box);
8631     }
8632
8633     if (bNStGlobalComm)
8634     {
8635         /* Store the global communication step */
8636         comm->globalcomm_step = step;
8637     }
8638     
8639     /* Increase the DD partitioning counter */
8640     dd->ddp_count++;
8641     /* The state currently matches this DD partitioning count, store it */
8642     state_local->ddp_count = dd->ddp_count;
8643     if (bMasterState)
8644     {
8645         /* The DD master node knows the complete cg distribution,
8646          * store the count so we can possibly skip the cg info communication.
8647          */
8648         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
8649     }
8650
8651     if (comm->DD_debug > 0)
8652     {
8653         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
8654         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
8655                                 "after partitioning");
8656     }
8657 }