src/mdlib/domdec.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  * 
4  * This file is part of Gromacs        Copyright (c) 1991-2008
5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * To help us fund GROMACS development, we humbly ask that you cite
13  * the research papers on the package. Check out http://www.gromacs.org
14  * 
15  * And Hey:
16  * Gnomes, ROck Monsters And Chili Sauce
17  */
18
19 #ifdef HAVE_CONFIG_H
20 #include <config.h>
21 #endif
22
23 #include <stdio.h>
24 #include <time.h>
25 #include <math.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include "typedefs.h"
29 #include "smalloc.h"
30 #include "vec.h"
31 #include "domdec.h"
32 #include "domdec_network.h"
33 #include "nrnb.h"
34 #include "pbc.h"
35 #include "chargegroup.h"
36 #include "constr.h"
37 #include "mdatoms.h"
38 #include "names.h"
39 #include "pdbio.h"
40 #include "futil.h"
41 #include "force.h"
42 #include "pme.h"
43 #include "pull.h"
44 #include "pull_rotation.h"
45 #include "gmx_wallcycle.h"
46 #include "mdrun.h"
47 #include "nsgrid.h"
48 #include "shellfc.h"
49 #include "mtop_util.h"
50 #include "gmxfio.h"
51 #include "gmx_ga2la.h"
52 #include "gmx_sort.h"
53
54 #ifdef GMX_LIB_MPI
55 #include <mpi.h>
56 #endif
57 #ifdef GMX_THREAD_MPI
58 #include "tmpi.h"
59 #endif
60
61 #define DDRANK(dd,rank)    (rank)
62 #define DDMASTERRANK(dd)   (dd->masterrank)
63
64 typedef struct gmx_domdec_master
65 {
66     /* The cell boundaries */
67     real **cell_x;
68     /* The global charge group division */
69     int  *ncg;     /* Number of home charge groups for each node */
70     int  *index;   /* Index (size nnodes+1) into cg */
71     int  *cg;      /* Global charge group index */
72     int  *nat;     /* Number of home atoms for each node. */
73     int  *ibuf;    /* Buffer for communication */
74     rvec *vbuf;    /* Buffer for state scattering and gathering */
75 } gmx_domdec_master_t;
76
77 typedef struct
78 {
79     /* The numbers of charge groups to send and receive for each cell
80      * that requires communication; the last entry contains the total
81      * number of atoms that need to be communicated.
82      */
83     int nsend[DD_MAXIZONE+2];
84     int nrecv[DD_MAXIZONE+2];
85     /* The charge groups to send */
86     int *index;
87     int nalloc;
88     /* The atom range for non-in-place communication */
89     int cell2at0[DD_MAXIZONE];
90     int cell2at1[DD_MAXIZONE];
91 } gmx_domdec_ind_t;
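
/* Illustrative note (added comment, not from the original source):
 * dd_move_x() and dd_move_f() below read this struct as follows for one
 * communication pulse, where nzone is the number of i-zones handled so far:
 *
 *     ncg_send = ind->nsend[nzone];     -- charge groups in this pulse
 *     nat_send = ind->nsend[nzone+1];   -- atoms, used as the send count
 *
 * The nrecv array is laid out the same way on the receiving side.
 */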
92
93 typedef struct
94 {
95     int  np;                   /* Number of grid pulses in this dimension */
96     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
97     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
98     int  np_nalloc;
99     gmx_bool bInPlace;             /* Can we communicate in place?            */
100 } gmx_domdec_comm_dim_t;
101
102 typedef struct
103 {
104     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
105     real *cell_f;      /* State var.: cell boundaries, box relative      */
106     real *old_cell_f;  /* Temp. var.: old cell size                      */
107     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
108     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
109     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
110     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
111     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
112     real *buf_ncd;     /* Temp. var.                                     */
113 } gmx_domdec_root_t;
114
115 #define DD_NLOAD_MAX 9
116
117 /* Here floats are accurate enough, since these variables
118  * only influence the load balancing, not the actual MD results.
119  */
120 typedef struct
121 {
122     int  nload;
123     float *load;
124     float sum;
125     float max;
126     float sum_m;
127     float cvol_min;
128     float mdf;
129     float pme;
130     int   flags;
131 } gmx_domdec_load_t;
132
133 typedef struct
134 {
135     int  nsc;
136     int  ind_gl;
137     int  ind;
138 } gmx_cgsort_t;
139
140 typedef struct
141 {
142     gmx_cgsort_t *sort1,*sort2;
143     int  sort_nalloc;
144     gmx_cgsort_t *sort_new;
145     int  sort_new_nalloc;
146     int  *ibuf;
147     int  ibuf_nalloc;
148 } gmx_domdec_sort_t;
149
150 typedef struct
151 {
152     rvec *v;
153     int  nalloc;
154 } vec_rvec_t;
155
156 /* This enum determines the order of the coordinates.
157  * ddnatHOME and ddnatZONE should be first and second,
158  * the others can be ordered as wanted.
159  */
160 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
161
162 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
163 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
164
165 typedef struct
166 {
167     int  dim;      /* The dimension                                          */
168     gmx_bool dim_match;/* Tells if DD and PME dims match                         */
169     int  nslab;    /* The number of PME slabs in this dimension              */
170     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
171     int  *pp_min;  /* The minimum pp node location, size nslab               */
172     int  *pp_max;  /* The maximum pp node location, size nslab               */
173     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
174 } gmx_ddpme_t;
175
176 typedef struct
177 {
178     real min0;    /* The minimum bottom of this zone                        */
179     real max1;    /* The maximum top of this zone                           */
180     real mch0;    /* The maximum bottom communication height for this zone  */
181     real mch1;    /* The maximum top communication height for this zone     */
182     real p1_0;    /* The bottom value of the first cell in this zone        */
183     real p1_1;    /* The top value of the first cell in this zone           */
184 } gmx_ddzone_t;
185
186 typedef struct gmx_domdec_comm
187 {
188     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
189      * unless stated otherwise.
190      */
191
192     /* The number of decomposition dimensions for PME, 0: no PME */
193     int  npmedecompdim;
194     /* The number of nodes doing PME (PP/PME or only PME) */
195     int  npmenodes;
196     int  npmenodes_x;
197     int  npmenodes_y;
198     /* The communication setup including the PME only nodes */
199     gmx_bool bCartesianPP_PME;
200     ivec ntot;
201     int  cartpmedim;
202     int  *pmenodes;          /* size npmenodes                         */
203     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
204                               * but not with bCartesianPP_PME          */
205     gmx_ddpme_t ddpme[2];
206     
207     /* The DD particle-particle nodes only */
208     gmx_bool bCartesianPP;
209     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
210     
211     /* The global charge groups */
212     t_block cgs_gl;
213
214     /* Should we sort the cgs */
215     int  nstSortCG;
216     gmx_domdec_sort_t *sort;
217     
218     /* Are there bonded and multi-body interactions between charge groups? */
219     gmx_bool bInterCGBondeds;
220     gmx_bool bInterCGMultiBody;
221
222     /* Data for the optional bonded interaction atom communication range */
223     gmx_bool bBondComm;
224     t_blocka *cglink;
225     char *bLocalCG;
226
227     /* The DLB option */
228     int  eDLB;
229     /* Are we actually using DLB? */
230     gmx_bool bDynLoadBal;
231
232     /* Cell sizes for static load balancing, first index cartesian */
233     real **slb_frac;
234     
235     /* The width of the communicated boundaries */
236     real cutoff_mbody;
237     real cutoff;
238     /* The minimum cell size (including triclinic correction) */
239     rvec cellsize_min;
240     /* For dlb, for use with edlbAUTO */
241     rvec cellsize_min_dlb;
242     /* The lower limit for the DD cell size with DLB */
243     real cellsize_limit;
244     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
245     gmx_bool bVacDLBNoLimit;
246
247     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
248     ivec tric_dir;
249     /* box0 and box_size are required with dim's without pbc and -gcom */
250     rvec box0;
251     rvec box_size;
252     
253     /* The cell boundaries */
254     rvec cell_x0;
255     rvec cell_x1;
256
257     /* The old location of the cell boundaries, to check cg displacements */
258     rvec old_cell_x0;
259     rvec old_cell_x1;
260
261     /* The communication setup and charge group boundaries for the zones */
262     gmx_domdec_zones_t zones;
263     
264     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
265      * cell boundaries of neighboring cells for dynamic load balancing.
266      */
267     gmx_ddzone_t zone_d1[2];
268     gmx_ddzone_t zone_d2[2][2];
269     
270     /* The coordinate/force communication setup and indices */
271     gmx_domdec_comm_dim_t cd[DIM];
272     /* The maximum number of cells to communicate with in one dimension */
273     int  maxpulse;
274     
275     /* Which cg distribution is stored on the master node */
276     int master_cg_ddp_count;
277     
278     /* The number of cg's received from the direct neighbors */
279     int  zone_ncg1[DD_MAXZONE];
280     
281     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
282     int  nat[ddnatNR];
283     
284     /* Communication buffer for general use */
285     int  *buf_int;
286     int  nalloc_int;
287
288     /* rvec communication buffer for general use */
289     vec_rvec_t vbuf;
290     
291     /* Communication buffers only used with multiple grid pulses */
292     int  *buf_int2;
293     int  nalloc_int2;
294     vec_rvec_t vbuf2;
295     
296     /* Communication buffers for local redistribution */
297     int  **cggl_flag;
298     int  cggl_flag_nalloc[DIM*2];
299     rvec **cgcm_state;
300     int  cgcm_state_nalloc[DIM*2];
301     
302     /* Cell sizes for dynamic load balancing */
303     gmx_domdec_root_t **root;
304     real *cell_f_row;
305     real cell_f0[DIM];
306     real cell_f1[DIM];
307     real cell_f_max0[DIM];
308     real cell_f_min1[DIM];
309     
310     /* Stuff for load communication */
311     gmx_bool bRecordLoad;
312     gmx_domdec_load_t *load;
313 #ifdef GMX_MPI
314     MPI_Comm *mpi_comm_load;
315 #endif
316
317     /* Maximum DLB scaling per load balancing step in percent */
318     int dlb_scale_lim;
319
320     /* Cycle counters */
321     float cycl[ddCyclNr];
322     int   cycl_n[ddCyclNr];
323     float cycl_max[ddCyclNr];
324     /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
325     int eFlop;
326     double flop;
327     int    flop_n;
328     /* How often we have had load measurements */
329     int    n_load_have;
330     /* How often we have collected the load measurements */
331     int    n_load_collect;
332     
333     /* Statistics */
334     double sum_nat[ddnatNR-ddnatZONE];
335     int    ndecomp;
336     int    nload;
337     double load_step;
338     double load_sum;
339     double load_max;
340     ivec   load_lim;
341     double load_mdf;
342     double load_pme;
343
344     /* The last partition step */
345     gmx_large_int_t globalcomm_step;
346
347     /* Debugging */
348     int  nstDDDump;
349     int  nstDDDumpGrid;
350     int  DD_debug;
351 } gmx_domdec_comm_t;
352
353 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
354 #define DD_CGIBS 2
355
356 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
357 #define DD_FLAG_NRCG  65535
358 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
359 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
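
/* Worked example (added as illustration): DD_FLAG_NRCG masks the lower
 * 16 bits of a flag word, while DD_FLAG_FW/DD_FLAG_BW set one bit per DD
 * dimension and direction above them, so one flag word can carry both an
 * atom count and move directions, e.g.
 *
 *     flag = nrcg | DD_FLAG_FW(1);   -- count plus "forward along DD dim 1"
 *     nrcg = flag & DD_FLAG_NRCG;    -- recover the count
 *
 * (nrcg is a made-up variable name for the charge group size here.)
 */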
360
361 /* Zone permutation required to obtain consecutive charge groups
362  * for neighbor searching.
363  */
364 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
365
366 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
367  * components see only j zones with that component 0.
368  */
369
370 /* The DD zone order */
371 static const ivec dd_zo[DD_MAXZONE] =
372   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
373
374 /* The 3D setup */
375 #define dd_z3n  8
376 #define dd_zp3n 4
377 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
378
379 /* The 2D setup */
380 #define dd_z2n  4
381 #define dd_zp2n 2
382 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
383
384 /* The 1D setup */
385 #define dd_z1n  2
386 #define dd_zp1n 1
387 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
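
/* Illustrative reading of these tables (added comment, not in the
 * original source): each dd_zpXn row appears to encode
 * {i-zone, first j-zone, last j-zone + 1}. For example dd_zp3[1] = {1,3,6}
 * means i-zone 1 ({1,0,0} in dd_zo) searches j-zones 3 to 5
 * ({0,1,0},{0,1,1},{0,0,1}), all of which have x-component 0, consistent
 * with the rule stated above dd_zo.
 */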
388
389 /* Factors used to avoid problems due to rounding issues */
390 #define DD_CELL_MARGIN       1.0001
391 #define DD_CELL_MARGIN2      1.00005
392 /* Factor to account for pressure scaling during nstlist steps */
393 #define DD_PRES_SCALE_MARGIN 1.02
394
395 /* Allowed performance loss before we turn on DLB or warn about imbalance */
396 #define DD_PERF_LOSS 0.05
397
398 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
399
400 /* Use separate MPI send and receive commands
401  * when nnodes <= GMX_DD_NNODES_SENDRECV.
402  * This saves memory (and some copying for small nnodes).
403  * For high parallelization scatter and gather calls are used.
404  */
405 #define GMX_DD_NNODES_SENDRECV 4
406
407
408 /*
409 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
410
411 static void index2xyz(ivec nc,int ind,ivec xyz)
412 {
413   xyz[XX] = ind % nc[XX];
414   xyz[YY] = (ind / nc[XX]) % nc[YY];
415   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
416 }
417 */
418
419 /* This order is required to minimize the coordinate communication in PME
420  * which uses decomposition in the x direction.
421  */
422 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
423
424 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
425 {
426     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
427     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
428     xyz[ZZ] = ind % nc[ZZ];
429 }
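
/* Worked example (added for illustration): with nc = {2,3,4} the node at
 * Cartesian coordinates {1,2,3} gets
 *     dd_index(nc,{1,2,3}) = ((1*3 + 2)*4) + 3 = 23
 * and ddindex2xyz(nc,23,xyz) recovers xyz = {23/(3*4), (23/4)%3, 23%4}
 * = {1,2,3}, so the x coordinate varies slowest, as the comment above
 * dd_index requires for PME.
 */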
430
431 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
432 {
433     int ddindex;
434     int ddnodeid=-1;
435     
436     ddindex = dd_index(dd->nc,c);
437     if (dd->comm->bCartesianPP_PME)
438     {
439         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
440     }
441     else if (dd->comm->bCartesianPP)
442     {
443 #ifdef GMX_MPI
444         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
445 #endif
446     }
447     else
448     {
449         ddnodeid = ddindex;
450     }
451     
452     return ddnodeid;
453 }
454
455 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
456 {
457     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
458 }
459
460 int ddglatnr(gmx_domdec_t *dd,int i)
461 {
462     int atnr;
463     
464     if (dd == NULL)
465     {
466         atnr = i + 1;
467     }
468     else
469     {
470         if (i >= dd->comm->nat[ddnatNR-1])
471         {
472             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
473         }
474         atnr = dd->gatindex[i] + 1;
475     }
476     
477     return atnr;
478 }
479
480 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
481 {
482     return &dd->comm->cgs_gl;
483 }
484
485 static void vec_rvec_init(vec_rvec_t *v)
486 {
487     v->nalloc = 0;
488     v->v      = NULL;
489 }
490
491 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
492 {
493     if (n > v->nalloc)
494     {
495         v->nalloc = over_alloc_dd(n);
496         srenew(v->v,v->nalloc);
497     }
498 }
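
/* Typical usage (sketch added for illustration; nat_send is a made-up
 * name): grow the buffer before filling it,
 *     vec_rvec_check_alloc(&comm->vbuf, nat_send);
 *     -- comm->vbuf.v[0 .. nat_send-1] can now be written safely
 * over_alloc_dd() over-allocates so the buffer does not have to be
 * reallocated every step.
 */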
499
500 void dd_store_state(gmx_domdec_t *dd,t_state *state)
501 {
502     int i;
503     
504     if (state->ddp_count != dd->ddp_count)
505     {
506         gmx_incons("The state does not the domain decomposition state");
507     }
508     
509     state->ncg_gl = dd->ncg_home;
510     if (state->ncg_gl > state->cg_gl_nalloc)
511     {
512         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
513         srenew(state->cg_gl,state->cg_gl_nalloc);
514     }
515     for(i=0; i<state->ncg_gl; i++)
516     {
517         state->cg_gl[i] = dd->index_gl[i];
518     }
519     
520     state->ddp_count_cg_gl = dd->ddp_count;
521 }
522
523 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
524 {
525     return &dd->comm->zones;
526 }
527
528 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
529                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
530 {
531     gmx_domdec_zones_t *zones;
532     int izone,d,dim;
533
534     zones = &dd->comm->zones;
535
536     izone = 0;
537     while (icg >= zones->izone[izone].cg1)
538     {
539         izone++;
540     }
541     
542     if (izone == 0)
543     {
544         *jcg0 = icg;
545     }
546     else if (izone < zones->nizone)
547     {
548         *jcg0 = zones->izone[izone].jcg0;
549     }
550     else
551     {
552         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
553                   icg,izone,zones->nizone);
554     }
555         
556     *jcg1 = zones->izone[izone].jcg1;
557     
558     for(d=0; d<dd->ndim; d++)
559     {
560         dim = dd->dim[d];
561         shift0[dim] = zones->izone[izone].shift0[dim];
562         shift1[dim] = zones->izone[izone].shift1[dim];
563         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
564         {
565             /* A conservative approach, this can be optimized */
566             shift0[dim] -= 1;
567             shift1[dim] += 1;
568         }
569     }
570 }
571
572 int dd_natoms_vsite(gmx_domdec_t *dd)
573 {
574     return dd->comm->nat[ddnatVSITE];
575 }
576
577 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
578 {
579     *at_start = dd->comm->nat[ddnatCON-1];
580     *at_end   = dd->comm->nat[ddnatCON];
581 }
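
/* Worked example (added for illustration; the numbers are made up): with
 * comm->nat = {1000, 1500, 1520, 1600} the home atoms are 0..999, the
 * communicated zone atoms 1000..1499, the extra vsite atoms 1500..1519
 * and the extra constraint atoms 1520..1599, so dd_get_constraint_range()
 * returns at_start=1520 and at_end=1600 (see the nat[] comment in
 * gmx_domdec_comm above: nat[t-1] <= at < nat[t]).
 */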
582
583 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
584 {
585     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
586     int  *index,*cgindex;
587     gmx_domdec_comm_t *comm;
588     gmx_domdec_comm_dim_t *cd;
589     gmx_domdec_ind_t *ind;
590     rvec shift={0,0,0},*buf,*rbuf;
591     gmx_bool bPBC,bScrew;
592     
593     comm = dd->comm;
594     
595     cgindex = dd->cgindex;
596     
597     buf = comm->vbuf.v;
598
599     nzone = 1;
600     nat_tot = dd->nat_home;
601     for(d=0; d<dd->ndim; d++)
602     {
603         bPBC   = (dd->ci[dd->dim[d]] == 0);
604         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
605         if (bPBC)
606         {
607             copy_rvec(box[dd->dim[d]],shift);
608         }
609         cd = &comm->cd[d];
610         for(p=0; p<cd->np; p++)
611         {
612             ind = &cd->ind[p];
613             index = ind->index;
614             n = 0;
615             if (!bPBC)
616             {
617                 for(i=0; i<ind->nsend[nzone]; i++)
618                 {
619                     at0 = cgindex[index[i]];
620                     at1 = cgindex[index[i]+1];
621                     for(j=at0; j<at1; j++)
622                     {
623                         copy_rvec(x[j],buf[n]);
624                         n++;
625                     }
626                 }
627             }
628             else if (!bScrew)
629             {
630                 for(i=0; i<ind->nsend[nzone]; i++)
631                 {
632                     at0 = cgindex[index[i]];
633                     at1 = cgindex[index[i]+1];
634                     for(j=at0; j<at1; j++)
635                     {
636                         /* We need to shift the coordinates */
637                         rvec_add(x[j],shift,buf[n]);
638                         n++;
639                     }
640                 }
641             }
642             else
643             {
644                 for(i=0; i<ind->nsend[nzone]; i++)
645                 {
646                     at0 = cgindex[index[i]];
647                     at1 = cgindex[index[i]+1];
648                     for(j=at0; j<at1; j++)
649                     {
650                         /* Shift x */
651                         buf[n][XX] = x[j][XX] + shift[XX];
652                         /* Rotate y and z.
653                          * This operation requires a special shift force
654                          * treatment, which is performed in calc_vir.
655                          */
656                         buf[n][YY] = box[YY][YY] - x[j][YY];
657                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
658                         n++;
659                     }
660                 }
661             }
662             
663             if (cd->bInPlace)
664             {
665                 rbuf = x + nat_tot;
666             }
667             else
668             {
669                 rbuf = comm->vbuf2.v;
670             }
671             /* Send and receive the coordinates */
672             dd_sendrecv_rvec(dd, d, dddirBackward,
673                              buf,  ind->nsend[nzone+1],
674                              rbuf, ind->nrecv[nzone+1]);
675             if (!cd->bInPlace)
676             {
677                 j = 0;
678                 for(zone=0; zone<nzone; zone++)
679                 {
680                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
681                     {
682                         copy_rvec(rbuf[j],x[i]);
683                         j++;
684                     }
685                 }
686             }
687             nat_tot += ind->nrecv[nzone+1];
688         }
689         nzone += nzone;
690     }
691 }
692
693 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
694 {
695     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
696     int  *index,*cgindex;
697     gmx_domdec_comm_t *comm;
698     gmx_domdec_comm_dim_t *cd;
699     gmx_domdec_ind_t *ind;
700     rvec *buf,*sbuf;
701     ivec vis;
702     int  is;
703     gmx_bool bPBC,bScrew;
704     
705     comm = dd->comm;
706     
707     cgindex = dd->cgindex;
708
709     buf = comm->vbuf.v;
710
711     n = 0;
712     nzone = comm->zones.n/2;
713     nat_tot = dd->nat_tot;
714     for(d=dd->ndim-1; d>=0; d--)
715     {
716         bPBC   = (dd->ci[dd->dim[d]] == 0);
717         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
718         if (fshift == NULL && !bScrew)
719         {
720             bPBC = FALSE;
721         }
722         /* Determine which shift vector we need */
723         clear_ivec(vis);
724         vis[dd->dim[d]] = 1;
725         is = IVEC2IS(vis);
726         
727         cd = &comm->cd[d];
728         for(p=cd->np-1; p>=0; p--) {
729             ind = &cd->ind[p];
730             nat_tot -= ind->nrecv[nzone+1];
731             if (cd->bInPlace)
732             {
733                 sbuf = f + nat_tot;
734             }
735             else
736             {
737                 sbuf = comm->vbuf2.v;
738                 j = 0;
739                 for(zone=0; zone<nzone; zone++)
740                 {
741                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
742                     {
743                         copy_rvec(f[i],sbuf[j]);
744                         j++;
745                     }
746                 }
747             }
748             /* Communicate the forces */
749             dd_sendrecv_rvec(dd, d, dddirForward,
750                              sbuf, ind->nrecv[nzone+1],
751                              buf,  ind->nsend[nzone+1]);
752             index = ind->index;
753             /* Add the received forces */
754             n = 0;
755             if (!bPBC)
756             {
757                 for(i=0; i<ind->nsend[nzone]; i++)
758                 {
759                     at0 = cgindex[index[i]];
760                     at1 = cgindex[index[i]+1];
761                     for(j=at0; j<at1; j++)
762                     {
763                         rvec_inc(f[j],buf[n]);
764                         n++;
765                     }
766                 } 
767             }
768             else if (!bScrew)
769             {
770                 for(i=0; i<ind->nsend[nzone]; i++)
771                 {
772                     at0 = cgindex[index[i]];
773                     at1 = cgindex[index[i]+1];
774                     for(j=at0; j<at1; j++)
775                     {
776                         rvec_inc(f[j],buf[n]);
777                         /* Add this force to the shift force */
778                         rvec_inc(fshift[is],buf[n]);
779                         n++;
780                     }
781                 }
782             }
783             else
784             {
785                 for(i=0; i<ind->nsend[nzone]; i++)
786                 {
787                     at0 = cgindex[index[i]];
788                     at1 = cgindex[index[i]+1];
789                     for(j=at0; j<at1; j++)
790                     {
791                         /* Rotate the force */
792                         f[j][XX] += buf[n][XX];
793                         f[j][YY] -= buf[n][YY];
794                         f[j][ZZ] -= buf[n][ZZ];
795                         if (fshift)
796                         {
797                             /* Add this force to the shift force */
798                             rvec_inc(fshift[is],buf[n]);
799                         }
800                         n++;
801                     }
802                 }
803             }
804         }
805         nzone /= 2;
806     }
807 }
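
/* Usage sketch (added for illustration; the real call sites live in the
 * MD-loop/force code, not in this file, and the variable names below are
 * only indicative): halo coordinates are gathered before the force
 * evaluation and the resulting forces are reduced back afterwards, roughly
 *     dd_move_x(cr->dd, box, x);           -- fill non-home coordinates
 *     ...compute forces for home + communicated atoms...
 *     dd_move_f(cr->dd, f, fr->fshift);    -- add forces back to home atoms
 */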
808
809 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
810 {
811     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
812     int  *index,*cgindex;
813     gmx_domdec_comm_t *comm;
814     gmx_domdec_comm_dim_t *cd;
815     gmx_domdec_ind_t *ind;
816     real *buf,*rbuf;
817     
818     comm = dd->comm;
819     
820     cgindex = dd->cgindex;
821     
822     buf = &comm->vbuf.v[0][0];
823
824     nzone = 1;
825     nat_tot = dd->nat_home;
826     for(d=0; d<dd->ndim; d++)
827     {
828         cd = &comm->cd[d];
829         for(p=0; p<cd->np; p++)
830         {
831             ind = &cd->ind[p];
832             index = ind->index;
833             n = 0;
834             for(i=0; i<ind->nsend[nzone]; i++)
835             {
836                 at0 = cgindex[index[i]];
837                 at1 = cgindex[index[i]+1];
838                 for(j=at0; j<at1; j++)
839                 {
840                     buf[n] = v[j];
841                     n++;
842                 }
843             }
844             
845             if (cd->bInPlace)
846             {
847                 rbuf = v + nat_tot;
848             }
849             else
850             {
851                 rbuf = &comm->vbuf2.v[0][0];
852             }
853             /* Send and receive the real values */
854             dd_sendrecv_real(dd, d, dddirBackward,
855                              buf,  ind->nsend[nzone+1],
856                              rbuf, ind->nrecv[nzone+1]);
857             if (!cd->bInPlace)
858             {
859                 j = 0;
860                 for(zone=0; zone<nzone; zone++)
861                 {
862                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
863                     {
864                         v[i] = rbuf[j];
865                         j++;
866                     }
867                 }
868             }
869             nat_tot += ind->nrecv[nzone+1];
870         }
871         nzone += nzone;
872     }
873 }
874
875 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
876 {
877     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
878     int  *index,*cgindex;
879     gmx_domdec_comm_t *comm;
880     gmx_domdec_comm_dim_t *cd;
881     gmx_domdec_ind_t *ind;
882     real *buf,*sbuf;
883     
884     comm = dd->comm;
885     
886     cgindex = dd->cgindex;
887
888     buf = &comm->vbuf.v[0][0];
889
890     n = 0;
891     nzone = comm->zones.n/2;
892     nat_tot = dd->nat_tot;
893     for(d=dd->ndim-1; d>=0; d--)
894     {
895         cd = &comm->cd[d];
896         for(p=cd->np-1; p>=0; p--) {
897             ind = &cd->ind[p];
898             nat_tot -= ind->nrecv[nzone+1];
899             if (cd->bInPlace)
900             {
901                 sbuf = v + nat_tot;
902             }
903             else
904             {
905                 sbuf = &comm->vbuf2.v[0][0];
906                 j = 0;
907                 for(zone=0; zone<nzone; zone++)
908                 {
909                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
910                     {
911                         sbuf[j] = v[i];
912                         j++;
913                     }
914                 }
915             }
916             /* Communicate the values */
917             dd_sendrecv_real(dd, d, dddirForward,
918                              sbuf, ind->nrecv[nzone+1],
919                              buf,  ind->nsend[nzone+1]);
920             index = ind->index;
921             /* Add the received values */
922             n = 0;
923             for(i=0; i<ind->nsend[nzone]; i++)
924             {
925                 at0 = cgindex[index[i]];
926                 at1 = cgindex[index[i]+1];
927                 for(j=at0; j<at1; j++)
928                 {
929                     v[j] += buf[n];
930                     n++;
931                 }
932             } 
933         }
934         nzone /= 2;
935     }
936 }
937
938 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
939 {
940     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
941             d,i,j,
942             zone->min0,zone->max1,
943             zone->mch0,zone->mch0,
944             zone->p1_0,zone->p1_1);
945 }
946
947 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
948                                int ddimind,int direction,
949                                gmx_ddzone_t *buf_s,int n_s,
950                                gmx_ddzone_t *buf_r,int n_r)
951 {
952     rvec vbuf_s[5*2],vbuf_r[5*2];
953     int i;
954
955     for(i=0; i<n_s; i++)
956     {
957         vbuf_s[i*2  ][0] = buf_s[i].min0;
958         vbuf_s[i*2  ][1] = buf_s[i].max1;
959         vbuf_s[i*2  ][2] = buf_s[i].mch0;
960         vbuf_s[i*2+1][0] = buf_s[i].mch1;
961         vbuf_s[i*2+1][1] = buf_s[i].p1_0;
962         vbuf_s[i*2+1][2] = buf_s[i].p1_1;
963     }
964
965     dd_sendrecv_rvec(dd, ddimind, direction,
966                      vbuf_s, n_s*2,
967                      vbuf_r, n_r*2);
968
969     for(i=0; i<n_r; i++)
970     {
971         buf_r[i].min0 = vbuf_r[i*2  ][0];
972         buf_r[i].max1 = vbuf_r[i*2  ][1];
973         buf_r[i].mch0 = vbuf_r[i*2  ][2];
974         buf_r[i].mch1 = vbuf_r[i*2+1][0];
975         buf_r[i].p1_0 = vbuf_r[i*2+1][1];
976         buf_r[i].p1_1 = vbuf_r[i*2+1][2];
977     }
978 }
979
980 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
981                           rvec cell_ns_x0,rvec cell_ns_x1)
982 {
983     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
984     gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
985     rvec extr_s[2],extr_r[2];
986     rvec dh;
987     real dist_d,c=0,det;
988     gmx_domdec_comm_t *comm;
989     gmx_bool bPBC,bUse;
990
991     comm = dd->comm;
992
993     for(d=1; d<dd->ndim; d++)
994     {
995         dim = dd->dim[d];
996         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
997         zp->min0 = cell_ns_x0[dim];
998         zp->max1 = cell_ns_x1[dim];
999         zp->mch0 = cell_ns_x0[dim];
1000         zp->mch1 = cell_ns_x1[dim];
1001         zp->p1_0 = cell_ns_x0[dim];
1002         zp->p1_1 = cell_ns_x1[dim];
1003     }
1004     
1005     for(d=dd->ndim-2; d>=0; d--)
1006     {
1007         dim  = dd->dim[d];
1008         bPBC = (dim < ddbox->npbcdim);
1009
1010         /* Use an rvec to store two reals */
1011         extr_s[d][0] = comm->cell_f0[d+1];
1012         extr_s[d][1] = comm->cell_f1[d+1];
1013         extr_s[d][2] = 0;
1014
1015         pos = 0;
1016         /* Store the extremes in the backward sending buffer,
1017          * so they get updated separately from the forward communication.
1018          */
1019         for(d1=d; d1<dd->ndim-1; d1++)
1020         {
1021             /* We invert the order to be able to use the same loop for buf_e */
1022             buf_s[pos].min0 = extr_s[d1][1];
1023             buf_s[pos].max1 = extr_s[d1][0];
1024             buf_s[pos].mch0 = 0;
1025             buf_s[pos].mch1 = 0;
1026             /* Store the cell corner of the dimension we communicate along */
1027             buf_s[pos].p1_0 = comm->cell_x0[dim];
1028             buf_s[pos].p1_1 = 0;
1029             pos++;
1030         }
1031
1032         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1033         pos++;
1034
1035         if (dd->ndim == 3 && d == 0)
1036         {
1037             buf_s[pos] = comm->zone_d2[0][1];
1038             pos++;
1039             buf_s[pos] = comm->zone_d1[0];
1040             pos++;
1041         }
1042
1043         /* We only need to communicate the extremes
1044          * in the forward direction
1045          */
1046         npulse = comm->cd[d].np;
1047         if (bPBC)
1048         {
1049             /* Take the minimum to avoid double communication */
1050             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1051         }
1052         else
1053         {
1054             /* Without PBC we should really not communicate over
1055              * the boundaries, but implementing that complicates
1056              * the communication setup and therefore we simply
1057              * do all communication, but ignore some data.
1058              */
1059             npulse_min = npulse;
1060         }
1061         for(p=0; p<npulse_min; p++)
1062         {
1063             /* Communicate the extremes forward */
1064             bUse = (bPBC || dd->ci[dim] > 0);
1065
1066             dd_sendrecv_rvec(dd, d, dddirForward,
1067                              extr_s+d, dd->ndim-d-1,
1068                              extr_r+d, dd->ndim-d-1);
1069
1070             if (bUse)
1071             {
1072                 for(d1=d; d1<dd->ndim-1; d1++)
1073                 {
1074                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1075                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1076                 }
1077             }
1078         }
1079
1080         buf_size = pos;
1081         for(p=0; p<npulse; p++)
1082         {
1083             /* Communicate all the zone information backward */
1084             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1085
1086             dd_sendrecv_ddzone(dd, d, dddirBackward,
1087                                buf_s, buf_size,
1088                                buf_r, buf_size);
1089
1090             clear_rvec(dh);
1091             if (p > 0)
1092             {
1093                 for(d1=d+1; d1<dd->ndim; d1++)
1094                 {
1095                     /* Determine the decrease of maximum required
1096                      * communication height along d1 due to the distance along d;
1097                      * this avoids a lot of useless atom communication.
1098                      */
1099                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1100
1101                     if (ddbox->tric_dir[dim])
1102                     {
1103                         /* c is the off-diagonal coupling between the cell planes
1104                          * along directions d and d1.
1105                          */
1106                         c = ddbox->v[dim][dd->dim[d1]][dim];
1107                     }
1108                     else
1109                     {
1110                         c = 0;
1111                     }
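                    /* Added explanatory note: for a rectangular box
                     * (c = 0) the expression below reduces to
                     *     dh[d1] = cutoff - sqrt(cutoff^2 - dist_d^2),
                     * i.e. the reduction in the required communication
                     * height along d1 when the cells are already
                     * separated by dist_d along dimension d.
                     */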
1112                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1113                     if (det > 0)
1114                     {
1115                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1116                     }
1117                     else
1118                     {
1119                         /* A negative value signals out of range */
1120                         dh[d1] = -1;
1121                     }
1122                 }
1123             }
1124
1125             /* Accumulate the extremes over all pulses */
1126             for(i=0; i<buf_size; i++)
1127             {
1128                 if (p == 0)
1129                 {
1130                     buf_e[i] = buf_r[i];
1131                 }
1132                 else
1133                 {
1134                     if (bUse)
1135                     {
1136                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1137                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1138                     }
1139
1140                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1141                     {
1142                         d1 = 1;
1143                     }
1144                     else
1145                     {
1146                         d1 = d + 1;
1147                     }
1148                     if (bUse && dh[d1] >= 0)
1149                     {
1150                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1151                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1152                     }
1153                 }
1154                 /* Copy the received buffer to the send buffer,
1155                  * to pass the data through with the next pulse.
1156                  */
1157                 buf_s[i] = buf_r[i];
1158             }
1159             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1160                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1161             {
1162                 /* Store the extremes */ 
1163                 pos = 0;
1164
1165                 for(d1=d; d1<dd->ndim-1; d1++)
1166                 {
1167                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1168                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1169                     pos++;
1170                 }
1171
1172                 if (d == 1 || (d == 0 && dd->ndim == 3))
1173                 {
1174                     for(i=d; i<2; i++)
1175                     {
1176                         comm->zone_d2[1-d][i] = buf_e[pos];
1177                         pos++;
1178                     }
1179                 }
1180                 if (d == 0)
1181                 {
1182                     comm->zone_d1[1] = buf_e[pos];
1183                     pos++;
1184                 }
1185             }
1186         }
1187     }
1188     
1189     if (dd->ndim >= 2)
1190     {
1191         dim = dd->dim[1];
1192         for(i=0; i<2; i++)
1193         {
1194             if (debug)
1195             {
1196                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1197             }
1198             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1199             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1200         }
1201     }
1202     if (dd->ndim >= 3)
1203     {
1204         dim = dd->dim[2];
1205         for(i=0; i<2; i++)
1206         {
1207             for(j=0; j<2; j++)
1208             {
1209                 if (debug)
1210                 {
1211                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1212                 }
1213                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1214                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1215             }
1216         }
1217     }
1218     for(d=1; d<dd->ndim; d++)
1219     {
1220         comm->cell_f_max0[d] = extr_s[d-1][0];
1221         comm->cell_f_min1[d] = extr_s[d-1][1];
1222         if (debug)
1223         {
1224             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1225                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1226         }
1227     }
1228 }
1229
1230 static void dd_collect_cg(gmx_domdec_t *dd,
1231                           t_state *state_local)
1232 {
1233     gmx_domdec_master_t *ma=NULL;
1234     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1235     t_block *cgs_gl;
1236
1237     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1238     {
1239         /* The master has the correct distribution */
1240         return;
1241     }
1242     
1243     if (state_local->ddp_count == dd->ddp_count)
1244     {
1245         ncg_home = dd->ncg_home;
1246         cg       = dd->index_gl;
1247         nat_home = dd->nat_home;
1248     } 
1249     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1250     {
1251         cgs_gl = &dd->comm->cgs_gl;
1252
1253         ncg_home = state_local->ncg_gl;
1254         cg       = state_local->cg_gl;
1255         nat_home = 0;
1256         for(i=0; i<ncg_home; i++)
1257         {
1258             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1259         }
1260     }
1261     else
1262     {
1263         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1264     }
1265     
1266     buf2[0] = dd->ncg_home;
1267     buf2[1] = dd->nat_home;
1268     if (DDMASTER(dd))
1269     {
1270         ma = dd->ma;
1271         ibuf = ma->ibuf;
1272     }
1273     else
1274     {
1275         ibuf = NULL;
1276     }
1277     /* Collect the charge group and atom counts on the master */
1278     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1279     
1280     if (DDMASTER(dd))
1281     {
1282         ma->index[0] = 0;
1283         for(i=0; i<dd->nnodes; i++)
1284         {
1285             ma->ncg[i] = ma->ibuf[2*i];
1286             ma->nat[i] = ma->ibuf[2*i+1];
1287             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1288             
1289         }
1290         /* Make byte counts and indices */
1291         for(i=0; i<dd->nnodes; i++)
1292         {
1293             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1294             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1295         }
1296         if (debug)
1297         {
1298             fprintf(debug,"Initial charge group distribution: ");
1299             for(i=0; i<dd->nnodes; i++)
1300                 fprintf(debug," %d",ma->ncg[i]);
1301             fprintf(debug,"\n");
1302         }
1303     }
1304     
1305     /* Collect the charge group indices on the master */
1306     dd_gatherv(dd,
1307                dd->ncg_home*sizeof(int),dd->index_gl,
1308                DDMASTER(dd) ? ma->ibuf : NULL,
1309                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1310                DDMASTER(dd) ? ma->cg : NULL);
1311     
1312     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1313 }
1314
1315 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1316                                     rvec *lv,rvec *v)
1317 {
1318     gmx_domdec_master_t *ma;
1319     int  n,i,c,a,nalloc=0;
1320     rvec *buf=NULL;
1321     t_block *cgs_gl;
1322
1323     ma = dd->ma;
1324     
1325     if (!DDMASTER(dd))
1326     {
1327 #ifdef GMX_MPI
1328         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1329                  dd->rank,dd->mpi_comm_all);
1330 #endif
1331     } else {
1332         /* Copy the master coordinates to the global array */
1333         cgs_gl = &dd->comm->cgs_gl;
1334
1335         n = DDMASTERRANK(dd);
1336         a = 0;
1337         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1338         {
1339             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1340             {
1341                 copy_rvec(lv[a++],v[c]);
1342             }
1343         }
1344         
1345         for(n=0; n<dd->nnodes; n++)
1346         {
1347             if (n != dd->rank)
1348             {
1349                 if (ma->nat[n] > nalloc)
1350                 {
1351                     nalloc = over_alloc_dd(ma->nat[n]);
1352                     srenew(buf,nalloc);
1353                 }
1354 #ifdef GMX_MPI
1355                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1356                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1357 #endif
1358                 a = 0;
1359                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1360                 {
1361                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1362                     {
1363                         copy_rvec(buf[a++],v[c]);
1364                     }
1365                 }
1366             }
1367         }
1368         sfree(buf);
1369     }
1370 }
1371
1372 static void get_commbuffer_counts(gmx_domdec_t *dd,
1373                                   int **counts,int **disps)
1374 {
1375     gmx_domdec_master_t *ma;
1376     int n;
1377
1378     ma = dd->ma;
1379     
1380     /* Make the rvec count and displacement arrays */
1381     *counts  = ma->ibuf;
1382     *disps   = ma->ibuf + dd->nnodes;
1383     for(n=0; n<dd->nnodes; n++)
1384     {
1385         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1386         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1387     }
1388 }
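
/* Worked example (added for illustration; made-up atom counts): with
 * three nodes and ma->nat = {10, 20, 30} this produces, in bytes,
 *     counts = {10*sizeof(rvec), 20*sizeof(rvec), 30*sizeof(rvec)}
 *     disps  = {0, 10*sizeof(rvec), 30*sizeof(rvec)}
 * which is the form that dd_gatherv() and dd_scatterv() below expect.
 */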
1389
1390 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1391                                    rvec *lv,rvec *v)
1392 {
1393     gmx_domdec_master_t *ma;
1394     int  *rcounts=NULL,*disps=NULL;
1395     int  n,i,c,a;
1396     rvec *buf=NULL;
1397     t_block *cgs_gl;
1398     
1399     ma = dd->ma;
1400     
1401     if (DDMASTER(dd))
1402     {
1403         get_commbuffer_counts(dd,&rcounts,&disps);
1404
1405         buf = ma->vbuf;
1406     }
1407     
1408     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1409
1410     if (DDMASTER(dd))
1411     {
1412         cgs_gl = &dd->comm->cgs_gl;
1413
1414         a = 0;
1415         for(n=0; n<dd->nnodes; n++)
1416         {
1417             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1418             {
1419                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1420                 {
1421                     copy_rvec(buf[a++],v[c]);
1422                 }
1423             }
1424         }
1425     }
1426 }
1427
1428 void dd_collect_vec(gmx_domdec_t *dd,
1429                     t_state *state_local,rvec *lv,rvec *v)
1430 {
1431     gmx_domdec_master_t *ma;
1432     int  n,i,c,a,nalloc=0;
1433     rvec *buf=NULL;
1434     
1435     dd_collect_cg(dd,state_local);
1436
1437     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1438     {
1439         dd_collect_vec_sendrecv(dd,lv,v);
1440     }
1441     else
1442     {
1443         dd_collect_vec_gatherv(dd,lv,v);
1444     }
1445 }
1446
1447
1448 void dd_collect_state(gmx_domdec_t *dd,
1449                       t_state *state_local,t_state *state)
1450 {
1451     int est,i,j,nh;
1452
1453     nh = state->nhchainlength;
1454
1455     if (DDMASTER(dd))
1456     {
1457         for (i=0;i<efptNR;i++) {
1458             state->lambda[i] = state_local->lambda[i];
1459         }
1460         state->fep_state = state_local->fep_state;
1461         state->veta = state_local->veta;
1462         state->vol0 = state_local->vol0;
1463         copy_mat(state_local->box,state->box);
1464         copy_mat(state_local->boxv,state->boxv);
1465         copy_mat(state_local->svir_prev,state->svir_prev);
1466         copy_mat(state_local->fvir_prev,state->fvir_prev);
1467         copy_mat(state_local->pres_prev,state->pres_prev);
1468
1469
1470         for(i=0; i<state_local->ngtc; i++)
1471         {
1472             for(j=0; j<nh; j++) {
1473                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1474                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1475             }
1476             state->therm_integral[i] = state_local->therm_integral[i];            
1477         }
1478         for(i=0; i<state_local->nnhpres; i++) 
1479         {
1480             for(j=0; j<nh; j++) {
1481                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1482                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1483             }
1484         }
1485     }
1486     for(est=0; est<estNR; est++)
1487     {
1488         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1489         {
1490             switch (est) {
1491             case estX:
1492                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1493                 break;
1494             case estV:
1495                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1496                 break;
1497             case estSDX:
1498                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1499                 break;
1500             case estCGP:
1501                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1502                 break;
1503             case estLD_RNG:
1504                 if (state->nrngi == 1)
1505                 {
1506                     if (DDMASTER(dd))
1507                     {
1508                         for(i=0; i<state_local->nrng; i++)
1509                         {
1510                             state->ld_rng[i] = state_local->ld_rng[i];
1511                         }
1512                     }
1513                 }
1514                 else
1515                 {
1516                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1517                               state_local->ld_rng,state->ld_rng);
1518                 }
1519                 break;
1520             case estLD_RNGI:
1521                 if (state->nrngi == 1)
1522                 {
1523                    if (DDMASTER(dd))
1524                     {
1525                         state->ld_rngi[0] = state_local->ld_rngi[0];
1526                     } 
1527                 }
1528                 else
1529                 {
1530                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1531                               state_local->ld_rngi,state->ld_rngi);
1532                 }
1533                 break;
1534             case estDISRE_INITF:
1535             case estDISRE_RM3TAV:
1536             case estORIRE_INITF:
1537             case estORIRE_DTAV:
1538                 break;
1539             default:
1540                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1541             }
1542         }
1543     }
1544 }
1545
1546 static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
1547 {
1548     if (debug)
1549     {
1550         fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1551     }
1552     fr->cg_nalloc = over_alloc_dd(nalloc);
1553     srenew(fr->cg_cm,fr->cg_nalloc);
1554     srenew(fr->cginfo,fr->cg_nalloc);
1555 }
1556
1557 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1558 {
1559     int est;
1560
1561     if (debug)
1562     {
1563         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1564     }
1565
1566     state->nalloc = over_alloc_dd(nalloc);
1567     
1568     for(est=0; est<estNR; est++)
1569     {
1570         if (EST_DISTR(est) && (state->flags & (1<<est)))
1571         {
1572             switch(est) {
1573             case estX:
1574                 srenew(state->x,state->nalloc);
1575                 break;
1576             case estV:
1577                 srenew(state->v,state->nalloc);
1578                 break;
1579             case estSDX:
1580                 srenew(state->sd_X,state->nalloc);
1581                 break;
1582             case estCGP:
1583                 srenew(state->cg_p,state->nalloc);
1584                 break;
1585             case estLD_RNG:
1586             case estLD_RNGI:
1587             case estDISRE_INITF:
1588             case estDISRE_RM3TAV:
1589             case estORIRE_INITF:
1590             case estORIRE_DTAV:
1591                 /* No reallocation required */
1592                 break;
1593             default:
1594                 gmx_incons("Unknown state entry encountered in dd_realloc_state");            
1595             }
1596         }
1597     }
1598     
1599     if (f != NULL)
1600     {
1601         srenew(*f,state->nalloc);
1602     }
1603 }
1604
1605 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1606                                        rvec *v,rvec *lv)
1607 {
1608     gmx_domdec_master_t *ma;
1609     int  n,i,c,a,nalloc=0;
1610     rvec *buf=NULL;
1611     
1612     if (DDMASTER(dd))
1613     {
1614         ma  = dd->ma;
1615         
1616         for(n=0; n<dd->nnodes; n++)
1617         {
1618             if (n != dd->rank)
1619             {
1620                 if (ma->nat[n] > nalloc)
1621                 {
1622                     nalloc = over_alloc_dd(ma->nat[n]);
1623                     srenew(buf,nalloc);
1624                 }
1625                 /* Use lv as a temporary buffer */
1626                 a = 0;
1627                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1628                 {
1629                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1630                     {
1631                         copy_rvec(v[c],buf[a++]);
1632                     }
1633                 }
1634                 if (a != ma->nat[n])
1635                 {
1636                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1637                               a,ma->nat[n]);
1638                 }
1639                 
1640 #ifdef GMX_MPI
1641                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1642                          DDRANK(dd,n),n,dd->mpi_comm_all);
1643 #endif
1644             }
1645         }
1646         sfree(buf);
1647         n = DDMASTERRANK(dd);
1648         a = 0;
1649         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1650         {
1651             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1652             {
1653                 copy_rvec(v[c],lv[a++]);
1654             }
1655         }
1656     }
1657     else
1658     {
1659 #ifdef GMX_MPI
1660         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1661                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1662 #endif
1663     }
1664 }
1665
1666 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1667                                        rvec *v,rvec *lv)
1668 {
1669     gmx_domdec_master_t *ma;
1670     int  *scounts=NULL,*disps=NULL;
1671     int  n,i,c,a,nalloc=0;
1672     rvec *buf=NULL;
1673     
1674     if (DDMASTER(dd))
1675     {
1676         ma  = dd->ma;
1677      
1678         get_commbuffer_counts(dd,&scounts,&disps);
1679
1680         buf = ma->vbuf;
1681         a = 0;
1682         for(n=0; n<dd->nnodes; n++)
1683         {
1684             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1685             {
1686                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1687                 {
1688                     copy_rvec(v[c],buf[a++]);
1689                 }
1690             }
1691         }
1692     }
1693
1694     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1695 }
1696
1697 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1698 {
1699     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1700     {
1701         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1702     }
1703     else
1704     {
1705         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1706     }
1707 }
1708
1709 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1710                                 t_state *state,t_state *state_local,
1711                                 rvec **f)
1712 {
1713     int  i,j,nh;
1714
1715     nh = state->nhchainlength;
1716
1717     if (DDMASTER(dd))
1718     {
1719         for(i=0;i<efptNR;i++)
1720         {
1721             state_local->lambda[i] = state->lambda[i];
1722         }
1723         state_local->fep_state = state->fep_state;
1724         state_local->veta   = state->veta;
1725         state_local->vol0   = state->vol0;
1726         copy_mat(state->box,state_local->box);
1727         copy_mat(state->box_rel,state_local->box_rel);
1728         copy_mat(state->boxv,state_local->boxv);
1729         copy_mat(state->svir_prev,state_local->svir_prev);
1730         copy_mat(state->fvir_prev,state_local->fvir_prev);
1731         for(i=0; i<state_local->ngtc; i++)
1732         {
1733             for(j=0; j<nh; j++) {
1734                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1735                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1736             }
1737             state_local->therm_integral[i] = state->therm_integral[i];
1738         }
1739         for(i=0; i<state_local->nnhpres; i++)
1740         {
1741             for(j=0; j<nh; j++) {
1742                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1743                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1744             }
1745         }
1746     }
1747     dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
1748     dd_bcast(dd,sizeof(int),&state_local->fep_state);
1749     dd_bcast(dd,sizeof(real),&state_local->veta);
1750     dd_bcast(dd,sizeof(real),&state_local->vol0);
1751     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1752     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1753     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1754     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1755     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1756     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1757     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1758     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1759     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1760     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1761
1762     if (dd->nat_home > state_local->nalloc)
1763     {
1764         dd_realloc_state(state_local,f,dd->nat_home);
1765     }
1766     for(i=0; i<estNR; i++)
1767     {
1768         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1769         {
1770             switch (i) {
1771             case estX:
1772                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1773                 break;
1774             case estV:
1775                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1776                 break;
1777             case estSDX:
1778                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1779                 break;
1780             case estCGP:
1781                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1782                 break;
1783             case estLD_RNG:
1784                 if (state->nrngi == 1)
1785                 {
1786                     dd_bcastc(dd,
1787                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1788                               state->ld_rng,state_local->ld_rng);
1789                 }
1790                 else
1791                 {
1792                     dd_scatter(dd,
1793                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1794                                state->ld_rng,state_local->ld_rng);
1795                 }
1796                 break;
1797             case estLD_RNGI:
1798                 if (state->nrngi == 1)
1799                 {
1800                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1801                               state->ld_rngi,state_local->ld_rngi);
1802                 }
1803                 else
1804                 {
1805                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1806                                state->ld_rngi,state_local->ld_rngi);
1807                 }   
1808                 break;
1809             case estDISRE_INITF:
1810             case estDISRE_RM3TAV:
1811             case estORIRE_INITF:
1812             case estORIRE_DTAV:
1813                 /* Not implemented yet */
1814                 break;
1815             default:
1816                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1817             }
1818         }
1819     }
1820 }
1821
1822 static char dim2char(int dim)
1823 {
1824     char c='?';
1825     
1826     switch (dim)
1827     {
1828     case XX: c = 'X'; break;
1829     case YY: c = 'Y'; break;
1830     case ZZ: c = 'Z'; break;
1831     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1832     }
1833     
1834     return c;
1835 }
1836
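/* Write the DD grid to a PDB file for visualization: each cell is output as
 * its 8 corners (CA pseudo-atoms, coordinates converted from nm to Angstrom)
 * connected by CONECT records along the cell edges; the B-factor column holds
 * the cell volume relative to the average cell volume.
 */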
1837 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1838                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1839 {
1840     rvec grid_s[2],*grid_r=NULL,cx,r;
1841     char fname[STRLEN],format[STRLEN],buf[22];
1842     FILE *out;
1843     int  a,i,d,z,y,x;
1844     matrix tric;
1845     real vol;
1846
1847     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1848     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1849     
1850     if (DDMASTER(dd))
1851     {
1852         snew(grid_r,2*dd->nnodes);
1853     }
1854     
1855     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1856     
1857     if (DDMASTER(dd))
1858     {
1859         for(d=0; d<DIM; d++)
1860         {
1861             for(i=0; i<DIM; i++)
1862             {
1863                 if (d == i)
1864                 {
1865                     tric[d][i] = 1;
1866                 }
1867                 else
1868                 {
1869                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1870                     {
1871                         tric[d][i] = box[i][d]/box[i][i];
1872                     }
1873                     else
1874                     {
1875                         tric[d][i] = 0;
1876                     }
1877                 }
1878             }
1879         }
1880         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1881         sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1882         out = gmx_fio_fopen(fname,"w");
1883         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1884         a = 1;
1885         for(i=0; i<dd->nnodes; i++)
1886         {
1887             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1888             for(d=0; d<DIM; d++)
1889             {
1890                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1891             }
1892             for(z=0; z<2; z++)
1893             {
1894                 for(y=0; y<2; y++)
1895                 {
1896                     for(x=0; x<2; x++)
1897                     {
1898                         cx[XX] = grid_r[i*2+x][XX];
1899                         cx[YY] = grid_r[i*2+y][YY];
1900                         cx[ZZ] = grid_r[i*2+z][ZZ];
1901                         mvmul(tric,cx,r);
1902                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1903                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1904                     }
1905                 }
1906             }
1907             for(d=0; d<DIM; d++)
1908             {
1909                 for(x=0; x<4; x++)
1910                 {
1911                     switch(d)
1912                     {
1913                     case 0: y = 1 + i*8 + 2*x; break;
1914                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1915                     case 2: y = 1 + i*8 + x; break;
1916                     }
1917                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1918                 }
1919             }
1920         }
1921         gmx_fio_fclose(out);
1922         sfree(grid_r);
1923     }
1924 }
1925
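/* Write the local atoms of this node to a PDB file (one file per node).
 * The B-factor column encodes the origin of each atom: its DD zone index
 * for zone atoms, zones.n for atoms communicated for virtual sites and
 * zones.n+1 for the remaining communicated atoms (constraints).
 */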
1926 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1927                   gmx_mtop_t *mtop,t_commrec *cr,
1928                   int natoms,rvec x[],matrix box)
1929 {
1930     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1931     FILE *out;
1932     int  i,ii,resnr,c;
1933     char *atomname,*resname;
1934     real b;
1935     gmx_domdec_t *dd;
1936     
1937     dd = cr->dd;
1938     if (natoms == -1)
1939     {
1940         natoms = dd->comm->nat[ddnatVSITE];
1941     }
1942     
1943     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
1944     
1945     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1946     sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");
1947     
1948     out = gmx_fio_fopen(fname,"w");
1949     
1950     fprintf(out,"TITLE     %s\n",title);
1951     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1952     for(i=0; i<natoms; i++)
1953     {
1954         ii = dd->gatindex[i];
1955         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
1956         if (i < dd->comm->nat[ddnatZONE])
1957         {
1958             c = 0;
1959             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1960             {
1961                 c++;
1962             }
1963             b = c;
1964         }
1965         else if (i < dd->comm->nat[ddnatVSITE])
1966         {
1967             b = dd->comm->zones.n;
1968         }
1969         else
1970         {
1971             b = dd->comm->zones.n + 1;
1972         }
1973         fprintf(out,strlen(atomname)<4 ? format : format4,
1974                 "ATOM",(ii+1)%100000,
1975                 atomname,resname,' ',resnr%10000,' ',
1976                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
1977     }
1978     fprintf(out,"TER\n");
1979     
1980     gmx_fio_fclose(out);
1981 }
1982
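/* Return the cut-off distance used for assigning multi-body (more than
 * two atoms) bonded interactions, or -1 when there are no bonded
 * interactions between charge groups and no such limit applies.
 */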
1983 real dd_cutoff_mbody(gmx_domdec_t *dd)
1984 {
1985     gmx_domdec_comm_t *comm;
1986     int  di;
1987     real r;
1988
1989     comm = dd->comm;
1990
1991     r = -1;
1992     if (comm->bInterCGBondeds)
1993     {
1994         if (comm->cutoff_mbody > 0)
1995         {
1996             r = comm->cutoff_mbody;
1997         }
1998         else
1999         {
2000             /* cutoff_mbody=0 means we do not have DLB */
2001             r = comm->cellsize_min[dd->dim[0]];
2002             for(di=1; di<dd->ndim; di++)
2003             {
2004                 r = min(r,comm->cellsize_min[dd->dim[di]]);
2005             }
2006             if (comm->bBondComm)
2007             {
2008                 r = max(r,comm->cutoff_mbody);
2009             }
2010             else
2011             {
2012                 r = min(r,comm->cutoff);
2013             }
2014         }
2015     }
2016
2017     return r;
2018 }
2019
2020 real dd_cutoff_twobody(gmx_domdec_t *dd)
2021 {
2022     real r_mb;
2023
2024     r_mb = dd_cutoff_mbody(dd);
2025
2026     return max(dd->comm->cutoff,r_mb);
2027 }
2028
2029
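/* Map the Cartesian coordinate of a PP cell to the coordinate of the PME
 * node it communicates with: along cartpmedim the ntot-nc PME ranks are
 * placed after the nc PP ranks, and the PP cell index is scaled onto the
 * PME range with rounding towards the middle of each group.
 */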
2030 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2031 {
2032     int nc,ntot;
2033     
2034     nc   = dd->nc[dd->comm->cartpmedim];
2035     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2036     copy_ivec(coord,coord_pme);
2037     coord_pme[dd->comm->cartpmedim] =
2038         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2039 }
2040
2041 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2042 {
2043     /* Here we assign a PME node to communicate with this DD node
2044      * by assuming that the major index of both is x.
2045      * We add cr->npmenodes/2 to obtain an even distribution.
2046      */
2047     return (ddindex*npme + npme/2)/ndd;
2048 }
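
/* Example: with ndd=8 PP nodes and npme=4 PME nodes the formula maps
 * ddindex 0,1 -> 0, 2,3 -> 1, 4,5 -> 2 and 6,7 -> 3, i.e. consecutive
 * DD nodes share a PME node and the +npme/2 term centers the rounding.
 */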
2049
2050 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2051 {
2052     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2053 }
2054
2055 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2056 {
2057     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2058 }
2059
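/* Build the list of simulation node ids that are PME-only nodes for the
 * interleaved PP/PME ordering: after each group of PP nodes mapping to the
 * same PME index one PME rank is inserted, so its id is the last PP index
 * of the group plus one plus the number of PME nodes already placed.
 */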
2060 static int *dd_pmenodes(t_commrec *cr)
2061 {
2062     int *pmenodes;
2063     int n,i,p0,p1;
2064     
2065     snew(pmenodes,cr->npmenodes);
2066     n = 0;
2067     for(i=0; i<cr->dd->nnodes; i++) {
2068         p0 = cr_ddindex2pmeindex(cr,i);
2069         p1 = cr_ddindex2pmeindex(cr,i+1);
2070         if (i+1 == cr->dd->nnodes || p1 > p0) {
2071             if (debug)
2072                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2073             pmenodes[n] = i + 1 + n;
2074             n++;
2075         }
2076     }
2077
2078     return pmenodes;
2079 }
2080
2081 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2082 {
2083     gmx_domdec_t *dd;
2084     ivec coords,coords_pme,nc;
2085     int  slab;
2086     
2087     dd = cr->dd;
2088     /*
2089       if (dd->comm->bCartesian) {
2090       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2091       dd_coords2pmecoords(dd,coords,coords_pme);
2092       copy_ivec(dd->ntot,nc);
2093       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2094       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2095       
2096       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2097       } else {
2098       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2099       }
2100     */
2101     coords[XX] = x;
2102     coords[YY] = y;
2103     coords[ZZ] = z;
2104     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2105     
2106     return slab;
2107 }
2108
2109 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2110 {
2111     gmx_domdec_comm_t *comm;
2112     ivec coords;
2113     int  ddindex,nodeid=-1;
2114     
2115     comm = cr->dd->comm;
2116     
2117     coords[XX] = x;
2118     coords[YY] = y;
2119     coords[ZZ] = z;
2120     if (comm->bCartesianPP_PME)
2121     {
2122 #ifdef GMX_MPI
2123         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2124 #endif
2125     }
2126     else
2127     {
2128         ddindex = dd_index(cr->dd->nc,coords);
2129         if (comm->bCartesianPP)
2130         {
2131             nodeid = comm->ddindex2simnodeid[ddindex];
2132         }
2133         else
2134         {
2135             if (comm->pmenodes)
2136             {
2137                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2138             }
2139             else
2140             {
2141                 nodeid = ddindex;
2142             }
2143         }
2144     }
2145   
2146     return nodeid;
2147 }
2148
2149 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2150 {
2151     gmx_domdec_t *dd;
2152     gmx_domdec_comm_t *comm;
2153     ivec coord,coord_pme;
2154     int  i;
2155     int  pmenode=-1;
2156     
2157     dd = cr->dd;
2158     comm = dd->comm;
2159     
2160     /* This assumes a uniform x domain decomposition grid cell size */
2161     if (comm->bCartesianPP_PME)
2162     {
2163 #ifdef GMX_MPI
2164         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2165         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2166         {
2167             /* This is a PP node */
2168             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2169             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2170         }
2171 #endif
2172     }
2173     else if (comm->bCartesianPP)
2174     {
2175         if (sim_nodeid < dd->nnodes)
2176         {
2177             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2178         }
2179     }
2180     else
2181     {
2182         /* This assumes DD cells with identical x coordinates
2183          * are numbered sequentially.
2184          */
2185         if (dd->comm->pmenodes == NULL)
2186         {
2187             if (sim_nodeid < dd->nnodes)
2188             {
2189                 /* The DD index equals the nodeid */
2190                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2191             }
2192         }
2193         else
2194         {
2195             i = 0;
2196             while (sim_nodeid > dd->comm->pmenodes[i])
2197             {
2198                 i++;
2199             }
2200             if (sim_nodeid < dd->comm->pmenodes[i])
2201             {
2202                 pmenode = dd->comm->pmenodes[i];
2203             }
2204         }
2205     }
2206     
2207     return pmenode;
2208 }
2209
2210 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2211 {
2212     gmx_bool bPMEOnlyNode;
2213     
2214     if (DOMAINDECOMP(cr))
2215     {
2216         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2217     }
2218     else
2219     {
2220         bPMEOnlyNode = FALSE;
2221     }
2222     
2223     return bPMEOnlyNode;
2224 }
2225
2226 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2227                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2228 {
2229     gmx_domdec_t *dd;
2230     int x,y,z;
2231     ivec coord,coord_pme;
2232     
2233     dd = cr->dd;
2234     
2235     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2236     
2237     *nmy_ddnodes = 0;
2238     for(x=0; x<dd->nc[XX]; x++)
2239     {
2240         for(y=0; y<dd->nc[YY]; y++)
2241         {
2242             for(z=0; z<dd->nc[ZZ]; z++)
2243             {
2244                 if (dd->comm->bCartesianPP_PME)
2245                 {
2246                     coord[XX] = x;
2247                     coord[YY] = y;
2248                     coord[ZZ] = z;
2249                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2250                     if (dd->ci[XX] == coord_pme[XX] &&
2251                         dd->ci[YY] == coord_pme[YY] &&
2252                         dd->ci[ZZ] == coord_pme[ZZ])
2253                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2254                 }
2255                 else
2256                 {
2257                     /* The slab corresponds to the nodeid in the PME group */
2258                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2259                     {
2260                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2261                     }
2262                 }
2263             }
2264         }
2265     }
2266     
2267     /* The last PP-only node is the peer node */
2268     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2269     
2270     if (debug)
2271     {
2272         fprintf(debug,"Receive coordinates from PP nodes:");
2273         for(x=0; x<*nmy_ddnodes; x++)
2274         {
2275             fprintf(debug," %d",(*my_ddnodes)[x]);
2276         }
2277         fprintf(debug,"\n");
2278     }
2279 }
2280
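/* Determine whether this PP node should receive the virial and energy
 * contributions from its PME-only node. Only one PP node per PME node
 * does so: the last PP node that maps to that PME node.
 */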
2281 static gmx_bool receive_vir_ener(t_commrec *cr)
2282 {
2283     gmx_domdec_comm_t *comm;
2284     int  pmenode,coords[DIM],rank;
2285     gmx_bool bReceive;
2286     
2287     bReceive = TRUE;
2288     if (cr->npmenodes < cr->dd->nnodes)
2289     {
2290         comm = cr->dd->comm;
2291         if (comm->bCartesianPP_PME)
2292         {
2293             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2294 #ifdef GMX_MPI
2295             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2296             coords[comm->cartpmedim]++;
2297             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2298             {
2299                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2300                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2301                 {
2302                     /* This is not the last PP node for pmenode */
2303                     bReceive = FALSE;
2304                 }
2305             }
2306 #endif  
2307         }
2308         else
2309         {
2310             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2311             if (cr->sim_nodeid+1 < cr->nnodes &&
2312                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2313             {
2314                 /* This is not the last PP node for pmenode */
2315                 bReceive = FALSE;
2316             }
2317         }
2318     }
2319     
2320     return bReceive;
2321 }
2322
2323 static void set_zones_ncg_home(gmx_domdec_t *dd)
2324 {
2325     gmx_domdec_zones_t *zones;
2326     int i;
2327
2328     zones = &dd->comm->zones;
2329
2330     zones->cg_range[0] = 0;
2331     for(i=1; i<zones->n+1; i++)
2332     {
2333         zones->cg_range[i] = dd->ncg_home;
2334     }
2335 }
2336
2337 static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
2338 {
2339     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2340     
2341     ind = state->cg_gl;
2342     dd_cg_gl = dd->index_gl;
2343     cgindex  = dd->cgindex;
2344     nat = 0;
2345     cgindex[0] = nat;
2346     for(i=0; i<state->ncg_gl; i++)
2347     {
2348         cgindex[i] = nat;
2349         cg_gl = ind[i];
2350         dd_cg_gl[i] = cg_gl;
2351         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2352     }
2353     cgindex[i] = nat;
2354     
2355     dd->ncg_home = state->ncg_gl;
2356     dd->nat_home = nat;
2357
2358     set_zones_ncg_home(dd);
2359 }
2360
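/* Look up the cginfo flags of global charge group cg. The cginfo_mb blocks
 * store the flags for a single molecule per molecule block and repeat them
 * periodically, hence the modulo cg_mod indexing once the block containing
 * cg has been found (an assumption based on how cginfo_mb_t is filled).
 */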
2361 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2362 {
2363     while (cg >= cginfo_mb->cg_end)
2364     {
2365         cginfo_mb++;
2366     }
2367
2368     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2369 }
2370
2371 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2372                           t_forcerec *fr,char *bLocalCG)
2373 {
2374     cginfo_mb_t *cginfo_mb;
2375     int *cginfo;
2376     int cg;
2377
2378     if (fr != NULL)
2379     {
2380         cginfo_mb = fr->cginfo_mb;
2381         cginfo    = fr->cginfo;
2382
2383         for(cg=cg0; cg<cg1; cg++)
2384         {
2385             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2386         }
2387     }
2388
2389     if (bLocalCG != NULL)
2390     {
2391         for(cg=cg0; cg<cg1; cg++)
2392         {
2393             bLocalCG[index_gl[cg]] = TRUE;
2394         }
2395     }
2396 }
2397
2398 static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
2399 {
2400     int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
2401     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2402     gmx_ga2la_t *ga2la;
2403     char *bLocalCG;
2404
2405     bLocalCG = dd->comm->bLocalCG;
2406
2407     if (dd->nat_tot > dd->gatindex_nalloc)
2408     {
2409         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2410         srenew(dd->gatindex,dd->gatindex_nalloc);
2411     }
2412
2413     nzone      = dd->comm->zones.n;
2414     zone2cg    = dd->comm->zones.cg_range;
2415     zone_ncg1  = dd->comm->zone_ncg1;
2416     index_gl   = dd->index_gl;
2417     gatindex   = dd->gatindex;
2418
2419     if (zone2cg[1] != dd->ncg_home)
2420     {
2421         gmx_incons("dd->ncg_zone is not up to date");
2422     }
2423     
2424     /* Make the local to global and global to local atom index */
2425     a = dd->cgindex[cg_start];
2426     for(zone=0; zone<nzone; zone++)
2427     {
2428         if (zone == 0)
2429         {
2430             cg0 = cg_start;
2431         }
2432         else
2433         {
2434             cg0 = zone2cg[zone];
2435         }
2436         for(cg=cg0; cg<zone2cg[zone+1]; cg++)
2437         {
2438             zone1 = zone;
2439             if (cg - cg0 >= zone_ncg1[zone])
2440             {
2441                 /* Signal that this cg is from more than one zone away */
2442                 zone1 += nzone;
2443             }
2444             cg_gl = index_gl[cg];
2445             for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2446             {
2447                 gatindex[a] = a_gl;
2448                 ga2la_set(dd->ga2la,a_gl,a,zone1);
2449                 a++;
2450             }
2451         }
2452     }
2453 }
2454
2455 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2456                           const char *where)
2457 {
2458     int ncg,i,ngl,nerr;
2459
2460     nerr = 0;
2461     if (bLocalCG == NULL)
2462     {
2463         return nerr;
2464     }
2465     for(i=0; i<dd->ncg_tot; i++)
2466     {
2467         if (!bLocalCG[dd->index_gl[i]])
2468         {
2469             fprintf(stderr,
2470                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2471             nerr++;
2472         }
2473     }
2474     ngl = 0;
2475     for(i=0; i<ncg_sys; i++)
2476     {
2477         if (bLocalCG[i])
2478         {
2479             ngl++;
2480         }
2481     }
2482     if (ngl != dd->ncg_tot)
2483     {
2484         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2485         nerr++;
2486     }
2487
2488     return nerr;
2489 }
2490
2491 static void check_index_consistency(gmx_domdec_t *dd,
2492                                     int natoms_sys,int ncg_sys,
2493                                     const char *where)
2494 {
2495     int  nerr,ngl,i,a,cell;
2496     int  *have;
2497
2498     nerr = 0;
2499
2500     if (dd->comm->DD_debug > 1)
2501     {
2502         snew(have,natoms_sys);
2503         for(a=0; a<dd->nat_tot; a++)
2504         {
2505             if (have[dd->gatindex[a]] > 0)
2506             {
2507                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2508             }
2509             else
2510             {
2511                 have[dd->gatindex[a]] = a + 1;
2512             }
2513         }
2514         sfree(have);
2515     }
2516
2517     snew(have,dd->nat_tot);
2518
2519     ngl  = 0;
2520     for(i=0; i<natoms_sys; i++)
2521     {
2522         if (ga2la_get(dd->ga2la,i,&a,&cell))
2523         {
2524             if (a >= dd->nat_tot)
2525             {
2526                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2527                 nerr++;
2528             }
2529             else
2530             {
2531                 have[a] = 1;
2532                 if (dd->gatindex[a] != i)
2533                 {
2534                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2535                     nerr++;
2536                 }
2537             }
2538             ngl++;
2539         }
2540     }
2541     if (ngl != dd->nat_tot)
2542     {
2543         fprintf(stderr,
2544                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2545                 dd->rank,where,ngl,dd->nat_tot);
2546     }
2547     for(a=0; a<dd->nat_tot; a++)
2548     {
2549         if (have[a] == 0)
2550         {
2551             fprintf(stderr,
2552                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2553                     dd->rank,where,a+1,dd->gatindex[a]+1);
2554         }
2555     }
2556     sfree(have);
2557
2558     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2559
2560     if (nerr > 0) {
2561         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2562                   dd->rank,where,nerr);
2563     }
2564 }
2565
2566 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2567 {
2568     int  i;
2569     char *bLocalCG;
2570
2571     if (a_start == 0)
2572     {
2573         /* Clear the whole list without searching */
2574         ga2la_clear(dd->ga2la);
2575     }
2576     else
2577     {
2578         for(i=a_start; i<dd->nat_tot; i++)
2579         {
2580             ga2la_del(dd->ga2la,dd->gatindex[i]);
2581         }
2582     }
2583
2584     bLocalCG = dd->comm->bLocalCG;
2585     if (bLocalCG)
2586     {
2587         for(i=cg_start; i<dd->ncg_tot; i++)
2588         {
2589             bLocalCG[dd->index_gl[i]] = FALSE;
2590         }
2591     }
2592
2593     dd_clear_local_vsite_indices(dd);
2594     
2595     if (dd->constraints)
2596     {
2597         dd_clear_local_constraint_indices(dd);
2598     }
2599 }
2600
2601 static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
2602 {
2603     real grid_jump_limit;
2604
2605     /* The distance between the boundaries of cells at distance
2606      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2607      * and by the fact that cells should not be shifted by more than
2608      * half their size, such that cg's only shift by one cell
2609      * at redecomposition.
2610      */
2611     grid_jump_limit = comm->cellsize_limit;
2612     if (!comm->bVacDLBNoLimit)
2613     {
2614         grid_jump_limit = max(grid_jump_limit,
2615                               comm->cutoff/comm->cd[dim_ind].np);
2616     }
2617
2618     return grid_jump_limit;
2619 }
2620
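/* With dynamic load balancing the staggered cell boundaries can drift.
 * Check for every decomposed dimension that the boundaries of this cell
 * and of its diagonal neighbors stay within the grid jump limit (converted
 * to an absolute distance using the, possibly skew-corrected, box size);
 * otherwise charge groups could move more than one cell at redecomposition.
 */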
2621 static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2622 {
2623     gmx_domdec_comm_t *comm;
2624     int  d,dim;
2625     real limit,bfac;
2626     
2627     comm = dd->comm;
2628     
2629     for(d=1; d<dd->ndim; d++)
2630     {
2631         dim = dd->dim[d];
2632         limit = grid_jump_limit(comm,d);
2633         bfac = ddbox->box_size[dim];
2634         if (ddbox->tric_dir[dim])
2635         {
2636             bfac *= ddbox->skew_fac[dim];
2637         }
2638         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2639             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2640         {
2641             char buf[22];
2642             gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
2643                       gmx_step_str(step,buf),
2644                       dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2645         }
2646     }
2647 }
2648
2649 static int dd_load_count(gmx_domdec_comm_t *comm)
2650 {
2651     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2652 }
2653
2654 static float dd_force_load(gmx_domdec_comm_t *comm)
2655 {
2656     float load;
2657     
2658     if (comm->eFlop)
2659     {
2660         load = comm->flop;
2661         if (comm->eFlop > 1)
2662         {
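            /* With eFlop > 1 add up to +/- 5%*(eFlop-1) of random noise to
             * the flop-based load, presumably to create artificial load
             * imbalance for testing the dynamic load balancing.
             */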
2663             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2664         }
2665     } 
2666     else
2667     {
2668         load = comm->cycl[ddCyclF];
2669         if (comm->cycl_n[ddCyclF] > 1)
2670         {
2671             /* Subtract the maximum of the last n cycle counts
2672              * to get rid of possible high counts due to other sources,
2673              * for instance system activity, that would otherwise
2674              * affect the dynamic load balancing.
2675              */
2676             load -= comm->cycl_max[ddCyclF];
2677         }
2678     }
2679     
2680     return load;
2681 }
2682
2683 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2684 {
2685     gmx_domdec_comm_t *comm;
2686     int i;
2687     
2688     comm = dd->comm;
2689     
2690     snew(*dim_f,dd->nc[dim]+1);
2691     (*dim_f)[0] = 0;
2692     for(i=1; i<dd->nc[dim]; i++)
2693     {
2694         if (comm->slb_frac[dim])
2695         {
2696             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2697         }
2698         else
2699         {
2700             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2701         }
2702     }
2703     (*dim_f)[dd->nc[dim]] = 1;
2704 }
2705
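/* Set up the DD to PME slab mapping for decomposition dimension dimind:
 * determine which PME slab each PP cell maps to and record per slab the
 * lowest and highest PP cell index along that dimension; these ranges are
 * used by set_pme_maxshift() to determine the PME slab communication range.
 */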
2706 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2707 {
2708     int  pmeindex,slab,nso,i;
2709     ivec xyz;
2710     
2711     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2712     {
2713         ddpme->dim = YY;
2714     }
2715     else
2716     {
2717         ddpme->dim = dimind;
2718     }
2719     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2720     
2721     ddpme->nslab = (ddpme->dim == 0 ?
2722                     dd->comm->npmenodes_x :
2723                     dd->comm->npmenodes_y);
2724
2725     if (ddpme->nslab <= 1)
2726     {
2727         return;
2728     }
2729
2730     nso = dd->comm->npmenodes/ddpme->nslab;
2731     /* Determine for each PME slab the PP location range for dimension dim */
2732     snew(ddpme->pp_min,ddpme->nslab);
2733     snew(ddpme->pp_max,ddpme->nslab);
2734     for(slab=0; slab<ddpme->nslab; slab++) {
2735         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2736         ddpme->pp_max[slab] = 0;
2737     }
2738     for(i=0; i<dd->nnodes; i++) {
2739         ddindex2xyz(dd->nc,i,xyz);
2740         /* For the y dimension only use our own y/z slab.
2741          * This assumes that the PME x grid size matches the DD grid size.
2742          */
2743         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2744             pmeindex = ddindex2pmeindex(dd,i);
2745             if (dimind == 0) {
2746                 slab = pmeindex/nso;
2747             } else {
2748                 slab = pmeindex % ddpme->nslab;
2749             }
2750             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2751             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2752         }
2753     }
2754
2755     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2756 }
2757
2758 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2759 {
2760     if (dd->comm->ddpme[0].dim == XX)
2761     {
2762         return dd->comm->ddpme[0].maxshift;
2763     }
2764     else
2765     {
2766         return 0;
2767     }
2768 }
2769
2770 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2771 {
2772     if (dd->comm->ddpme[0].dim == YY)
2773     {
2774         return dd->comm->ddpme[0].maxshift;
2775     }
2776     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2777     {
2778         return dd->comm->ddpme[1].maxshift;
2779     }
2780     else
2781     {
2782         return 0;
2783     }
2784 }
2785
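/* Determine ddpme->maxshift: roughly, the maximum number of PME slabs, in
 * either direction, over which the atoms belonging to one PP cell can be
 * spread relative to the slab that cell maps to. It bounds the coordinate
 * communication between separate PME nodes (see set_dd_cell_sizes_dlb_root).
 */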
2786 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2787                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2788 {
2789     gmx_domdec_comm_t *comm;
2790     int  nc,ns,s;
2791     int  *xmin,*xmax;
2792     real range,pme_boundary;
2793     int  sh;
2794     
2795     comm = dd->comm;
2796     nc  = dd->nc[ddpme->dim];
2797     ns  = ddpme->nslab;
2798     
2799     if (!ddpme->dim_match)
2800     {
2801         /* PP decomposition is not along dim: the worst situation */
2802         sh = ns/2;
2803     }
2804     else if (ns <= 3 || (bUniform && ns == nc))
2805     {
2806         /* The optimal situation */
2807         sh = 1;
2808     }
2809     else
2810     {
2811         /* We need to check for all pme nodes which nodes they
2812          * could possibly need to communicate with.
2813          */
2814         xmin = ddpme->pp_min;
2815         xmax = ddpme->pp_max;
2816         /* Allow for atoms to be maximally 2/3 times the cut-off
2817          * out of their DD cell. This is a reasonable balance between
2818          * performance and support for most charge-group/cut-off
2819          * combinations.
2820          */
2821         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2822         /* Avoid extra communication when we are exactly at a boundary */
2823         range *= 0.999;
2824         
2825         sh = 1;
2826         for(s=0; s<ns; s++)
2827         {
2828             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2829             pme_boundary = (real)s/ns;
2830             while (sh+1 < ns &&
2831                    ((s-(sh+1) >= 0 &&
2832                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2833                     (s-(sh+1) <  0 &&
2834                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2835             {
2836                 sh++;
2837             }
2838             pme_boundary = (real)(s+1)/ns;
2839             while (sh+1 < ns &&
2840                    ((s+(sh+1) <  ns &&
2841                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2842                     (s+(sh+1) >= ns &&
2843                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2844             {
2845                 sh++;
2846             }
2847         }
2848     }
2849     
2850     ddpme->maxshift = sh;
2851     
2852     if (debug)
2853     {
2854         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2855                 ddpme->dim,ddpme->maxshift);
2856     }
2857 }
2858
2859 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2860 {
2861     int d,dim;
2862     
2863     for(d=0; d<dd->ndim; d++)
2864     {
2865         dim = dd->dim[d];
2866         if (dim < ddbox->nboundeddim &&
2867             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2868             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2869         {
2870             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2871                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2872                       dd->nc[dim],dd->comm->cellsize_limit);
2873         }
2874     }
2875 }
2876
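/* Set the cell boundaries for static (non-DLB) decomposition and determine
 * npulse[d]: the number of communication pulses needed in dimension d, i.e.
 * the smallest number of neighboring cells whose combined (skew-corrected)
 * width covers the cut-off distance.
 */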
2877 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2878                                   gmx_bool bMaster,ivec npulse)
2879 {
2880     gmx_domdec_comm_t *comm;
2881     int  d,j;
2882     rvec cellsize_min;
2883     real *cell_x,cell_dx,cellsize;
2884     
2885     comm = dd->comm;
2886     
2887     for(d=0; d<DIM; d++)
2888     {
2889         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2890         npulse[d] = 1;
2891         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2892         {
2893             /* Uniform grid */
2894             cell_dx = ddbox->box_size[d]/dd->nc[d];
2895             if (bMaster)
2896             {
2897                 for(j=0; j<dd->nc[d]+1; j++)
2898                 {
2899                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2900                 }
2901             }
2902             else
2903             {
2904                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
2905                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2906             }
2907             cellsize = cell_dx*ddbox->skew_fac[d];
2908             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
2909             {
2910                 npulse[d]++;
2911             }
2912             cellsize_min[d] = cellsize;
2913         }
2914         else
2915         {
2916             /* Statically load balanced grid */
2917             /* Even when we are not doing a master distribution, we determine
2918              * all cell borders in a loop to obtain identical values
2919              * to the master distribution case and to determine npulse.
2920              */
2921             if (bMaster)
2922             {
2923                 cell_x = dd->ma->cell_x[d];
2924             }
2925             else
2926             {
2927                 snew(cell_x,dd->nc[d]+1);
2928             }
2929             cell_x[0] = ddbox->box0[d];
2930             for(j=0; j<dd->nc[d]; j++)
2931             {
2932                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2933                 cell_x[j+1] = cell_x[j] + cell_dx;
2934                 cellsize = cell_dx*ddbox->skew_fac[d];
2935                 while (cellsize*npulse[d] < comm->cutoff &&
2936                        npulse[d] < dd->nc[d]-1)
2937                 {
2938                     npulse[d]++;
2939                 }
2940                 cellsize_min[d] = min(cellsize_min[d],cellsize);
2941             }
2942             if (!bMaster)
2943             {
2944                 comm->cell_x0[d] = cell_x[dd->ci[d]];
2945                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2946                 sfree(cell_x);
2947             }
2948         }
2949         /* The following limitation avoids a cell receiving some of
2950          * its own home charge groups back over the periodic boundary.
2951          * Duplicated charge groups cause trouble with the global indices.
2952          */
2953         if (d < ddbox->npbcdim &&
2954             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2955         {
2956             gmx_fatal_collective(FARGS,NULL,dd,
2957                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2958                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
2959                                  comm->cutoff,
2960                                  dd->nc[d],dd->nc[d],
2961                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
2962         }
2963     }
2964     
2965     if (!comm->bDynLoadBal)
2966     {
2967         copy_rvec(cellsize_min,comm->cellsize_min);
2968     }
2969    
2970     for(d=0; d<comm->npmedecompdim; d++)
2971     {
2972         set_pme_maxshift(dd,&comm->ddpme[d],
2973                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
2974                          comm->ddpme[d].slb_dim_f);
2975     }
2976 }
2977
2978
2979 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2980                                        int d,int dim,gmx_domdec_root_t *root,
2981                                        gmx_ddbox_t *ddbox,
2982                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
2983 {
2984     gmx_domdec_comm_t *comm;
2985     int  ncd,i,j,nmin,nmin_old;
2986     gmx_bool bLimLo,bLimHi;
2987     real *cell_size;
2988     real fac,halfway,cellsize_limit_f_i,region_size;
2989     gmx_bool bPBC,bLastHi=FALSE;
2990     int nrange[]={range[0],range[1]};
2991
2992     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
2993
2994     comm = dd->comm;
2995
2996     ncd = dd->nc[dim];
2997
2998     bPBC = (dim < ddbox->npbcdim);
2999
3000     cell_size = root->buf_ncd;
3001
3002     if (debug) 
3003     {
3004         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
3005     }
3006
3007     /* First we need to check if the scaling does not make cells
3008      * smaller than the smallest allowed size.
3009      * We need to do this iteratively, since if a cell is too small,
3010      * it needs to be enlarged, which makes all the other cells smaller,
3011      * which could in turn make another cell smaller than allowed.
3012      */
3013     for(i=range[0]; i<range[1]; i++)
3014     {
3015         root->bCellMin[i] = FALSE;
3016     }
3017     nmin = 0;
3018     do
3019     {
3020         nmin_old = nmin;
3021         /* We need the total for normalization */
3022         fac = 0;
3023         for(i=range[0]; i<range[1]; i++)
3024         {
3025             if (root->bCellMin[i] == FALSE)
3026             {
3027                 fac += cell_size[i];
3028             }
3029         }
3030         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
3031         /* Determine the cell boundaries */
3032         for(i=range[0]; i<range[1]; i++)
3033         {
3034             if (root->bCellMin[i] == FALSE)
3035             {
3036                 cell_size[i] *= fac;
3037                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3038                 {
3039                     cellsize_limit_f_i = 0;
3040                 }
3041                 else
3042                 {
3043                     cellsize_limit_f_i = cellsize_limit_f;
3044                 }
3045                 if (cell_size[i] < cellsize_limit_f_i)
3046                 {
3047                     root->bCellMin[i] = TRUE;
3048                     cell_size[i] = cellsize_limit_f_i;
3049                     nmin++;
3050                 }
3051             }
3052             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3053         }
3054     }
3055     while (nmin > nmin_old);
3056     
3057     i=range[1]-1;
3058     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3059     /* For this check we should not use DD_CELL_MARGIN,
3060      * but a slightly smaller factor,
3061      * since rounding could get us below the limit.
3062      */
3063     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3064     {
3065         char buf[22];
3066         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3067                   gmx_step_str(step,buf),
3068                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3069                   ncd,comm->cellsize_min[dim]);
3070     }
3071     
3072     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3073     
3074     if (!bUniform)
3075     {
3076         /* Check if the boundary did not displace more than halfway
3077          * each of the cells it bounds, as this could cause problems,
3078          * especially when the differences between cell sizes are large.
3079          * If changes are applied, they will not make cells smaller
3080          * than the cut-off, as we check all the boundaries which
3081          * might be affected by a change and if the old state was ok,
3082          * the cells will at most be shrunk back to their old size.
3083          */
3084         for(i=range[0]+1; i<range[1]; i++)
3085         {
3086             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3087             if (root->cell_f[i] < halfway)
3088             {
3089                 root->cell_f[i] = halfway;
3090                 /* Check if the change also causes shifts of the next boundaries */
3091                 for(j=i+1; j<range[1]; j++)
3092                 {
3093                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3094                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3095                 }
3096             }
3097             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3098             if (root->cell_f[i] > halfway)
3099             {
3100                 root->cell_f[i] = halfway;
3101                 /* Check if the change also causes shifts of the next boundaries */
3102                 for(j=i-1; j>=range[0]+1; j--)
3103                 {
3104                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3105                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3106                 }
3107             }
3108         }
3109     }
3110     
3111     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3112     /* Find the highest violation of LimLo (a) and the lowest following violation of LimHi (b),
3113      * then call enforce_limits for (oldb,a) and (a,b); in the next step for (b,nexta).
3114      * oldb and nexta can be the range boundaries; nrange is used to pass a and b. */
3115     if (d > 0)
3116     {
3117         /* Take care of the staggering of the cell boundaries */
3118         if (bUniform)
3119         {
3120             for(i=range[0]; i<range[1]; i++)
3121             {
3122                 root->cell_f_max0[i] = root->cell_f[i];
3123                 root->cell_f_min1[i] = root->cell_f[i+1];
3124             }
3125         }
3126         else
3127         {
3128             for(i=range[0]+1; i<range[1]; i++)
3129             {
3130                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3131                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3132                 if (bLimLo && bLimHi)
3133                 {
3134                     /* Both limits violated, try the best we can */
3135                     /* For this case we split the original range (range) into two parts and handle the other limitations in the next iteration. */
3136                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3137                     nrange[0]=range[0];
3138                     nrange[1]=i;
3139                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3140
3141                     nrange[0]=i;
3142                     nrange[1]=range[1];
3143                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3144
3145                     return;
3146                 }
3147                 else if (bLimLo)
3148                 {
3149                     /* root->cell_f[i] = root->bound_min[i]; */
3150                     nrange[1]=i;  /* Only store the violation location; a LimLo violation with a higher index could still follow */
3151                     bLastHi=FALSE;
3152                 }
3153                 else if (bLimHi && !bLastHi)
3154                 {
3155                     bLastHi=TRUE;
3156                     if (nrange[1] < range[1])   /* found a LimLo before */
3157                     {
3158                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3159                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3160                         nrange[0]=nrange[1];
3161                     }
3162                     root->cell_f[i] = root->bound_max[i];
3163                     nrange[1]=i; 
3164                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3165                     nrange[0]=i;
3166                     nrange[1]=range[1];
3167                 }
3168             }
3169             if (nrange[1] < range[1])   /* the last violation found was a LimLo */
3170             {
3171                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3172                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3173                 nrange[0]=nrange[1];
3174                 nrange[1]=range[1];
3175                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3176             } 
3177             else if (nrange[0] > range[0]) /* found at least one LimHi */
3178             {
3179                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3180             }
3181         }
3182     }
3183 }
3184
3185
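/* Row-root routine of the dynamic load balancing: rescale the cell
 * boundaries along dimension dim so that cells with above-average load
 * shrink and cells with below-average load grow. The relative imbalance of
 * each cell is underrelaxed (factor 0.5) and the overall change is clipped
 * to comm->dlb_scale_lim percent, after which
 * dd_cell_sizes_dlb_root_enforce_limits() restores the minimum cell size
 * and staggering constraints.
 */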
3186 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3187                                        int d,int dim,gmx_domdec_root_t *root,
3188                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3189                                        gmx_bool bUniform,gmx_large_int_t step)
3190 {
3191     gmx_domdec_comm_t *comm;
3192     int  ncd,d1,i,j,pos;
3193     real *cell_size;
3194     real load_aver,load_i,imbalance,change,change_max,sc;
3195     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3196     real change_limit;
3197     real relax = 0.5;
3198     gmx_bool bPBC;
3199     int range[] = { 0, 0 };
3200
3201     comm = dd->comm;
3202
3203     /* Convert the maximum change from the input percentage to a fraction */
3204     change_limit = comm->dlb_scale_lim*0.01;
3205
3206     ncd = dd->nc[dim];
3207
3208     bPBC = (dim < ddbox->npbcdim);
3209
3210     cell_size = root->buf_ncd;
3211
3212     /* Store the original boundaries */
3213     for(i=0; i<ncd+1; i++)
3214     {
3215         root->old_cell_f[i] = root->cell_f[i];
3216     }
3217     if (bUniform) {
3218         for(i=0; i<ncd; i++)
3219         {
3220             cell_size[i] = 1.0/ncd;
3221         }
3222     }
3223     else if (dd_load_count(comm))
3224     {
3225         load_aver = comm->load[d].sum_m/ncd;
3226         change_max = 0;
3227         for(i=0; i<ncd; i++)
3228         {
3229             /* Determine the relative imbalance of cell i */
3230             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3231             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3232             /* Determine the change of the cell size using underrelaxation */
3233             change = -relax*imbalance;
3234             change_max = max(change_max,max(change,-change));
3235         }
3236         /* Limit the amount of scaling.
3237          * We need to use the same rescaling for all cells in one row,
3238          * otherwise the load balancing might not converge.
3239          */
3240         sc = relax;
3241         if (change_max > change_limit)
3242         {
3243             sc *= change_limit/change_max;
3244         }
3245         for(i=0; i<ncd; i++)
3246         {
3247             /* Determine the relative imbalance of cell i */
3248             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3249             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3250             /* Determine the change of the cell size using underrelaxation */
3251             change = -sc*imbalance;
3252             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3253         }
3254     }
3255     
3256     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3257     cellsize_limit_f *= DD_CELL_MARGIN;
3258     dist_min_f_hard        = grid_jump_limit(comm,d)/ddbox->box_size[dim];
3259     dist_min_f       = dist_min_f_hard * DD_CELL_MARGIN;
3260     if (ddbox->tric_dir[dim])
3261     {
3262         cellsize_limit_f /= ddbox->skew_fac[dim];
3263         dist_min_f       /= ddbox->skew_fac[dim];
3264     }
3265     if (bDynamicBox && d > 0)
3266     {
3267         dist_min_f *= DD_PRES_SCALE_MARGIN;
3268     }
3269     if (d > 0 && !bUniform)
3270     {
3271         /* Make sure that the grid is not shifted too much */
3272         for(i=1; i<ncd; i++) {
3273             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
3274             {
3275                 gmx_incons("Inconsistent DD boundary staggering limits!");
3276             }
3277             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3278             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3279             if (space > 0) {
3280                 root->bound_min[i] += 0.5*space;
3281             }
3282             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3283             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3284             if (space < 0) {
3285                 root->bound_max[i] += 0.5*space;
3286             }
3287             if (debug)
3288             {
3289                 fprintf(debug,
3290                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3291                         d,i,
3292                         root->cell_f_max0[i-1] + dist_min_f,
3293                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3294                         root->cell_f_min1[i] - dist_min_f);
3295             }
3296         }
3297     }
3298     range[1]=ncd;
3299     root->cell_f[0] = 0;
3300     root->cell_f[ncd] = 1;
3301     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3302
3303
3304     /* After the checks above, the cells should obey the cut-off
3305      * restrictions, but it does not hurt to check.
3306      */
3307     for(i=0; i<ncd; i++)
3308     {
3309         if (debug)
3310         {
3311             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3312                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3313         }
3314
3315         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3316             root->cell_f[i+1] - root->cell_f[i] <
3317             cellsize_limit_f/DD_CELL_MARGIN)
3318         {
3319             char buf[22];
3320             fprintf(stderr,
3321                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3322                     gmx_step_str(step,buf),dim2char(dim),i,
3323                     (root->cell_f[i+1] - root->cell_f[i])
3324                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3325         }
3326     }
3327     
3328     pos = ncd + 1;
3329     /* Store the cell boundaries of the lower dimensions at the end */
3330     for(d1=0; d1<d; d1++)
3331     {
3332         root->cell_f[pos++] = comm->cell_f0[d1];
3333         root->cell_f[pos++] = comm->cell_f1[d1];
3334     }
3335     
3336     if (d < comm->npmedecompdim)
3337     {
3338         /* The master determines the maximum shift for
3339          * the coordinate communication between separate PME nodes.
3340          */
3341         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3342     }
3343     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3344     if (d >= 1)
3345     {
3346         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3347     }
3348 }    
3349
3350 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3351                                              gmx_ddbox_t *ddbox,int dimind)
3352 {
3353     gmx_domdec_comm_t *comm;
3354     int dim;
3355
3356     comm = dd->comm;
3357
3358     /* Set the cell dimensions */
3359     dim = dd->dim[dimind];
3360     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3361     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3362     if (dim >= ddbox->nboundeddim)
3363     {
3364         comm->cell_x0[dim] += ddbox->box0[dim];
3365         comm->cell_x1[dim] += ddbox->box0[dim];
3366     }
3367 }
3368
3369 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3370                                          int d,int dim,real *cell_f_row,
3371                                          gmx_ddbox_t *ddbox)
3372 {
3373     gmx_domdec_comm_t *comm;
3374     int d1,dim1,pos;
3375
3376     comm = dd->comm;
3377
3378 #ifdef GMX_MPI
3379     /* Each node would only need to know two fractions,
3380      * but it is probably cheaper to broadcast the whole array.
3381      */
3382     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3383               0,comm->mpi_comm_load[d]);
3384 #endif
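    /* cell_f_row holds the nc[dim]+1 relative boundaries for this dimension,
     * followed by the cell_f0/cell_f1 pairs of the lower dimensions and the
     * PME maxshift value(s), matching the packing in
     * set_dd_cell_sizes_dlb_root.
     */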
3385     /* Copy the fractions for this dimension from the buffer */
3386     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3387     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3388     /* The whole array was communicated, so set the buffer position */
3389     pos = dd->nc[dim] + 1;
3390     for(d1=0; d1<=d; d1++)
3391     {
3392         if (d1 < d)
3393         {
3394             /* Copy the cell fractions of the lower dimensions */
3395             comm->cell_f0[d1] = cell_f_row[pos++];
3396             comm->cell_f1[d1] = cell_f_row[pos++];
3397         }
3398         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3399     }
3400     /* Convert the communicated shift from float to int */
3401     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3402     if (d >= 1)
3403     {
3404         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3405     }
3406 }
3407
3408 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3409                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3410                                          gmx_bool bUniform,gmx_large_int_t step)
3411 {
3412     gmx_domdec_comm_t *comm;
3413     int d,dim,d1;
3414     gmx_bool bRowMember,bRowRoot;
3415     real *cell_f_row;
3416     
3417     comm = dd->comm;
3418
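    /* For dimension d this rank takes part in the row communication when
     * its cell indices along all higher DD dimensions are zero; it is the
     * row root when its index along dimension d is zero as well.
     */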
3419     for(d=0; d<dd->ndim; d++)
3420     {
3421         dim = dd->dim[d];
3422         bRowMember = TRUE;
3423         bRowRoot = TRUE;
3424         for(d1=d; d1<dd->ndim; d1++)
3425         {
3426             if (dd->ci[dd->dim[d1]] > 0)
3427             {
3428                 if (d1 > d)
3429                 {
3430                     bRowMember = FALSE;
3431                 }
3432                 bRowRoot = FALSE;
3433             }
3434         }
3435         if (bRowMember)
3436         {
3437             if (bRowRoot)
3438             {
3439                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3440                                            ddbox,bDynamicBox,bUniform,step);
3441                 cell_f_row = comm->root[d]->cell_f;
3442             }
3443             else
3444             {
3445                 cell_f_row = comm->cell_f_row;
3446             }
3447             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3448         }
3449     }
3450 }    
3451
3452 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3453 {
3454     int d;
3455
3456     /* The relative (DLB) cell boundaries are kept unchanged here;
3457      * only the absolute boundaries are recomputed from the current box,
3458      * which is needed when the box has changed since the last call.
3459      */
3460     for(d=0; d<dd->ndim; d++)
3461     {
3462         relative_to_absolute_cell_bounds(dd,ddbox,d); 
3463     }
3464 }
3465
3466
3467
3468 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3469                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3470                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3471                                   gmx_wallcycle_t wcycle)
3472 {
3473     gmx_domdec_comm_t *comm;
3474     int dim;
3475
3476     comm = dd->comm;
3477     
3478     if (bDoDLB)
3479     {
3480         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3481         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3482         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3483     }
3484     else if (bDynamicBox)
3485     {
3486         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3487     }
3488     
3489     /* Set the dimensions for which no DD is used */
3490     for(dim=0; dim<DIM; dim++) {
3491         if (dd->nc[dim] == 1) {
3492             comm->cell_x0[dim] = 0;
3493             comm->cell_x1[dim] = ddbox->box_size[dim];
3494             if (dim >= ddbox->nboundeddim)
3495             {
3496                 comm->cell_x0[dim] += ddbox->box0[dim];
3497                 comm->cell_x1[dim] += ddbox->box0[dim];
3498             }
3499         }
3500     }
3501 }
3502
3503 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3504 {
3505     int d,np,i;
3506     gmx_domdec_comm_dim_t *cd;
3507     
3508     for(d=0; d<dd->ndim; d++)
3509     {
3510         cd = &dd->comm->cd[d];
3511         np = npulse[dd->dim[d]];
3512         if (np > cd->np_nalloc)
3513         {
3514             if (debug)
3515             {
3516                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3517                         dim2char(dd->dim[d]),np);
3518             }
3519             if (DDMASTER(dd) && cd->np_nalloc > 0)
3520             {
3521                 fprintf(stderr,"\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3522             }
3523             srenew(cd->ind,np);
3524             for(i=cd->np_nalloc; i<np; i++)
3525             {
3526                 cd->ind[i].index  = NULL;
3527                 cd->ind[i].nalloc = 0;
3528             }
3529             cd->np_nalloc = np;
3530         }
3531         cd->np = np;
3532     }
3533 }
3534
3535
3536 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3537                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3538                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3539                               gmx_wallcycle_t wcycle)
3540 {
3541     gmx_domdec_comm_t *comm;
3542     int  d;
3543     ivec npulse;
3544     
3545     comm = dd->comm;
3546
3547     /* Copy the old cell boundaries for the cg displacement check */
3548     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3549     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3550     
3551     if (comm->bDynLoadBal)
3552     {
3553         if (DDMASTER(dd))
3554         {
3555             check_box_size(dd,ddbox);
3556         }
3557         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3558     }
3559     else
3560     {
3561         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3562         realloc_comm_ind(dd,npulse);
3563     }
3564     
3565     if (debug)
3566     {
3567         for(d=0; d<DIM; d++)
3568         {
3569             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3570                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3571         }
3572     }
3573 }
3574
3575 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3576                                   gmx_ddbox_t *ddbox,
3577                                   rvec cell_ns_x0,rvec cell_ns_x1,
3578                                   gmx_large_int_t step)
3579 {
3580     gmx_domdec_comm_t *comm;
3581     int dim_ind,dim;
3582     
3583     comm = dd->comm;
3584
3585     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3586     {
3587         dim = dd->dim[dim_ind];
3588         
3589         /* Without PBC we don't have restrictions on the outer cells */
3590         if (!(dim >= ddbox->npbcdim && 
3591               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3592             comm->bDynLoadBal &&
3593             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3594             comm->cellsize_min[dim])
3595         {
3596             char buf[22];
3597             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3598                       gmx_step_str(step,buf),dim2char(dim),
3599                       comm->cell_x1[dim] - comm->cell_x0[dim],
3600                       ddbox->skew_fac[dim],
3601                       dd->comm->cellsize_min[dim],
3602                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3603         }
3604     }
3605     
3606     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3607     {
3608         /* Communicate the boundaries and update cell_ns_x0/1 */
3609         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3610         if (dd->bGridJump && dd->ndim > 1)
3611         {
3612             check_grid_jump(step,dd,ddbox);
3613         }
3614     }
3615 }
3616
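/* Fill the triclinic correction matrix tcm: for a position r,
 * r[d] + sum_{j>d} r[j]*tcm[j][d] equals the fractional coordinate along
 * box vector d times box[d][d], so it can be compared directly with the
 * box-scaled cell boundaries.
 */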
3617 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3618 {
3619     if (YY < npbcdim)
3620     {
3621         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3622     }
3623     else
3624     {
3625         tcm[YY][XX] = 0;
3626     }
3627     if (ZZ < npbcdim)
3628     {
3629         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3630         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3631     }
3632     else
3633     {
3634         tcm[ZZ][XX] = 0;
3635         tcm[ZZ][YY] = 0;
3636     }
3637 }
3638
3639 static void check_screw_box(matrix box)
3640 {
3641     /* Mathematical limitation */
3642     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3643     {
3644         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3645     }
3646     
3647     /* Limitation due to the asymmetry of the eighth shell method */
3648     if (box[ZZ][YY] != 0)
3649     {
3650         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3651     }
3652 }
3653
3654 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3655                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3656                           gmx_domdec_t *dd)
3657 {
3658     gmx_domdec_master_t *ma;
3659     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3660     int  i,icg,j,k,k0,k1,d,npbcdim;
3661     matrix tcm;
3662     rvec box_size,cg_cm;
3663     ivec ind;
3664     real nrcg,inv_ncg,pos_d;
3665     atom_id *cgindex;
3666     gmx_bool bUnbounded,bScrew;
3667
3668     ma = dd->ma;
3669     
3670     if (tmp_ind == NULL)
3671     {
3672         snew(tmp_nalloc,dd->nnodes);
3673         snew(tmp_ind,dd->nnodes);
3674         for(i=0; i<dd->nnodes; i++)
3675         {
3676             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3677             snew(tmp_ind[i],tmp_nalloc[i]);
3678         }
3679     }
3680     
3681     /* Clear the count */
3682     for(i=0; i<dd->nnodes; i++)
3683     {
3684         ma->ncg[i] = 0;
3685         ma->nat[i] = 0;
3686     }
3687     
3688     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3689     
3690     cgindex = cgs->index;
3691     
3692     /* Compute the center of geometry for all charge groups */
3693     for(icg=0; icg<cgs->nr; icg++)
3694     {
3695         k0      = cgindex[icg];
3696         k1      = cgindex[icg+1];
3697         nrcg    = k1 - k0;
3698         if (nrcg == 1)
3699         {
3700             copy_rvec(pos[k0],cg_cm);
3701         }
3702         else
3703         {
3704             inv_ncg = 1.0/nrcg;
3705             
3706             clear_rvec(cg_cm);
3707             for(k=k0; (k<k1); k++)
3708             {
3709                 rvec_inc(cg_cm,pos[k]);
3710             }
3711             for(d=0; (d<DIM); d++)
3712             {
3713                 cg_cm[d] *= inv_ncg;
3714             }
3715         }
3716         /* Put the charge group in the box and determine the cell index */
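        /* The dimensions are handled from the last one down, so that the
         * higher components used in the triclinic correction below have
         * already been put in the box.
         */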
3717         for(d=DIM-1; d>=0; d--) {
3718             pos_d = cg_cm[d];
3719             if (d < dd->npbcdim)
3720             {
3721                 bScrew = (dd->bScrewPBC && d == XX);
3722                 if (tric_dir[d] && dd->nc[d] > 1)
3723                 {
3724                     /* Use triclinic coordinates for this dimension */
3725                     for(j=d+1; j<DIM; j++)
3726                     {
3727                         pos_d += cg_cm[j]*tcm[j][d];
3728                     }
3729                 }
3730                 while(pos_d >= box[d][d])
3731                 {
3732                     pos_d -= box[d][d];
3733                     rvec_dec(cg_cm,box[d]);
3734                     if (bScrew)
3735                     {
3736                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3737                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3738                     }
3739                     for(k=k0; (k<k1); k++)
3740                     {
3741                         rvec_dec(pos[k],box[d]);
3742                         if (bScrew)
3743                         {
3744                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3745                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3746                         }
3747                     }
3748                 }
3749                 while(pos_d < 0)
3750                 {
3751                     pos_d += box[d][d];
3752                     rvec_inc(cg_cm,box[d]);
3753                     if (bScrew)
3754                     {
3755                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3756                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3757                     }
3758                     for(k=k0; (k<k1); k++)
3759                     {
3760                         rvec_inc(pos[k],box[d]);
3761                         if (bScrew) {
3762                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3763                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3764                         }
3765                     }
3766                 }
3767             }
3768             /* This could be done more efficiently */
3769             ind[d] = 0;
3770             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3771             {
3772                 ind[d]++;
3773             }
3774         }
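        /* Convert the cell coordinates in ind to the linear home node index */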
3775         i = dd_index(dd->nc,ind);
3776         if (ma->ncg[i] == tmp_nalloc[i])
3777         {
3778             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3779             srenew(tmp_ind[i],tmp_nalloc[i]);
3780         }
3781         tmp_ind[i][ma->ncg[i]] = icg;
3782         ma->ncg[i]++;
3783         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3784     }
3785     
3786     k1 = 0;
3787     for(i=0; i<dd->nnodes; i++)
3788     {
3789         ma->index[i] = k1;
3790         for(k=0; k<ma->ncg[i]; k++)
3791         {
3792             ma->cg[k1++] = tmp_ind[i][k];
3793         }
3794     }
3795     ma->index[dd->nnodes] = k1;
3796     
3797     for(i=0; i<dd->nnodes; i++)
3798     {
3799         sfree(tmp_ind[i]);
3800     }
3801     sfree(tmp_ind);
3802     sfree(tmp_nalloc);
3803     
3804     if (fplog)
3805     {
3806         char buf[22];
3807         fprintf(fplog,"Charge group distribution at step %s:",
3808                 gmx_step_str(step,buf));
3809         for(i=0; i<dd->nnodes; i++)
3810         {
3811             fprintf(fplog," %d",ma->ncg[i]);
3812         }
3813         fprintf(fplog,"\n");
3814     }
3815 }
3816
3817 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3818                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3819                                 rvec pos[])
3820 {
3821     gmx_domdec_master_t *ma=NULL;
3822     ivec npulse;
3823     int  i,cg_gl;
3824     int  *ibuf,buf2[2] = { 0, 0 };
3825     gmx_bool bMaster = DDMASTER(dd);
3826     if (bMaster)
3827     {
3828         ma = dd->ma;
3829         
3830         if (dd->bScrewPBC)
3831         {
3832             check_screw_box(box);
3833         }
3834     
3835         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3836     
3837         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3838         for(i=0; i<dd->nnodes; i++)
3839         {
3840             ma->ibuf[2*i]   = ma->ncg[i];
3841             ma->ibuf[2*i+1] = ma->nat[i];
3842         }
3843         ibuf = ma->ibuf;
3844     }
3845     else
3846     {
3847         ibuf = NULL;
3848     }
3849     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3850     
3851     dd->ncg_home = buf2[0];
3852     dd->nat_home = buf2[1];
3853     dd->ncg_tot  = dd->ncg_home;
3854     dd->nat_tot  = dd->nat_home;
3855     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3856     {
3857         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3858         srenew(dd->index_gl,dd->cg_nalloc);
3859         srenew(dd->cgindex,dd->cg_nalloc+1);
3860     }
3861     if (bMaster)
3862     {
3863         for(i=0; i<dd->nnodes; i++)
3864         {
3865             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3866             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3867         }
3868     }
3869     
3870     dd_scatterv(dd,
3871                 DDMASTER(dd) ? ma->ibuf : NULL,
3872                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3873                 DDMASTER(dd) ? ma->cg : NULL,
3874                 dd->ncg_home*sizeof(int),dd->index_gl);
3875     
3876     /* Determine the home charge group sizes */
3877     dd->cgindex[0] = 0;
3878     for(i=0; i<dd->ncg_home; i++)
3879     {
3880         cg_gl = dd->index_gl[i];
3881         dd->cgindex[i+1] =
3882             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3883     }
3884     
3885     if (debug)
3886     {
3887         fprintf(debug,"Home charge groups:\n");
3888         for(i=0; i<dd->ncg_home; i++)
3889         {
3890             fprintf(debug," %d",dd->index_gl[i]);
3891             if (i % 10 == 9) 
3892                 fprintf(debug,"\n");
3893         }
3894         fprintf(debug,"\n");
3895     }
3896 }
3897
3898 static int compact_and_copy_vec_at(int ncg,int *move,
3899                                    int *cgindex,
3900                                    int nvec,int vec,
3901                                    rvec *src,gmx_domdec_comm_t *comm,
3902                                    gmx_bool bCompact)
3903 {
3904     int m,icg,i,i0,i1,nrcg;
3905     int home_pos;
3906     int pos_vec[DIM*2];
3907     
3908     home_pos = 0;
3909
3910     for(m=0; m<DIM*2; m++)
3911     {
3912         pos_vec[m] = 0;
3913     }
3914     
3915     i0 = 0;
3916     for(icg=0; icg<ncg; icg++)
3917     {
3918         i1 = cgindex[icg+1];
3919         m = move[icg];
3920         if (m == -1)
3921         {
3922             if (bCompact)
3923             {
3924                 /* Compact the home array in place */
3925                 for(i=i0; i<i1; i++)
3926                 {
3927                     copy_rvec(src[i],src[home_pos++]);
3928                 }
3929             }
3930         }
3931         else
3932         {
3933             /* Copy to the communication buffer */
3934             nrcg = i1 - i0;
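            /* Per cg the send buffer holds cg_cm followed by the nrcg atoms
             * of each of the nvec state vectors; skip cg_cm and the vec
             * vectors written by earlier calls, copy this vector and leave
             * room for the remaining nvec-vec-1 vectors.
             */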
3935             pos_vec[m] += 1 + vec*nrcg;
3936             for(i=i0; i<i1; i++)
3937             {
3938                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
3939             }
3940             pos_vec[m] += (nvec - vec - 1)*nrcg;
3941         }
3942         if (!bCompact)
3943         {
3944             home_pos += i1 - i0;
3945         }
3946         i0 = i1;
3947     }
3948     
3949     return home_pos;
3950 }
3951
3952 static int compact_and_copy_vec_cg(int ncg,int *move,
3953                                    int *cgindex,
3954                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
3955                                    gmx_bool bCompact)
3956 {
3957     int m,icg,i0,i1,nrcg;
3958     int home_pos;
3959     int pos_vec[DIM*2];
3960     
3961     home_pos = 0;
3962     
3963     for(m=0; m<DIM*2; m++)
3964     {
3965         pos_vec[m] = 0;
3966     }
3967     
3968     i0 = 0;
3969     for(icg=0; icg<ncg; icg++)
3970     {
3971         i1 = cgindex[icg+1];
3972         m = move[icg];
3973         if (m == -1)
3974         {
3975             if (bCompact)
3976             {
3977                 /* Compact the home array in place */
3978                 copy_rvec(src[icg],src[home_pos++]);
3979             }
3980         }
3981         else
3982         {
3983             nrcg = i1 - i0;
3984             /* Copy to the communication buffer */
3985             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
3986             pos_vec[m] += 1 + nrcg*nvec;
3987         }
3988         i0 = i1;
3989     }
3990     if (!bCompact)
3991     {
3992         home_pos = ncg;
3993     }
3994     
3995     return home_pos;
3996 }
3997
3998 static int compact_ind(int ncg,int *move,
3999                        int *index_gl,int *cgindex,
4000                        int *gatindex,
4001                        gmx_ga2la_t ga2la,char *bLocalCG,
4002                        int *cginfo)
4003 {
4004     int cg,nat,a0,a1,a,a_gl;
4005     int home_pos;
4006
4007     home_pos = 0;
4008     nat = 0;
4009     for(cg=0; cg<ncg; cg++)
4010     {
4011         a0 = cgindex[cg];
4012         a1 = cgindex[cg+1];
4013         if (move[cg] == -1)
4014         {
4015             /* Compact the home arrays in place.
4016              * Anything that can be done here avoids access to global arrays.
4017              */
4018             cgindex[home_pos] = nat;
4019             for(a=a0; a<a1; a++)
4020             {
4021                 a_gl = gatindex[a];
4022                 gatindex[nat] = a_gl;
4023                 /* The cell number stays 0, so we don't need to set it */
4024                 ga2la_change_la(ga2la,a_gl,nat);
4025                 nat++;
4026             }
4027             index_gl[home_pos] = index_gl[cg];
4028             cginfo[home_pos]   = cginfo[cg];
4029             /* The charge group remains local, so bLocalCG does not change */
4030             home_pos++;
4031         }
4032         else
4033         {
4034             /* Clear the global indices */
4035             for(a=a0; a<a1; a++)
4036             {
4037                 ga2la_del(ga2la,gatindex[a]);
4038             }
4039             if (bLocalCG)
4040             {
4041                 bLocalCG[index_gl[cg]] = FALSE;
4042             }
4043         }
4044     }
4045     cgindex[home_pos] = nat;
4046     
4047     return home_pos;
4048 }
4049
4050 static void clear_and_mark_ind(int ncg,int *move,
4051                                int *index_gl,int *cgindex,int *gatindex,
4052                                gmx_ga2la_t ga2la,char *bLocalCG,
4053                                int *cell_index)
4054 {
4055     int cg,a0,a1,a;
4056     
4057     for(cg=0; cg<ncg; cg++)
4058     {
4059         if (move[cg] >= 0)
4060         {
4061             a0 = cgindex[cg];
4062             a1 = cgindex[cg+1];
4063             /* Clear the global indices */
4064             for(a=a0; a<a1; a++)
4065             {
4066                 ga2la_del(ga2la,gatindex[a]);
4067             }
4068             if (bLocalCG)
4069             {
4070                 bLocalCG[index_gl[cg]] = FALSE;
4071             }
4072             /* Signal that this cg has moved using the ns cell index.
4073              * Here we set it to -1.
4074              * fill_grid will change it from -1 to 4*grid->ncells.
4075              */
4076             cell_index[cg] = -1;
4077         }
4078     }
4079 }
4080
4081 static void print_cg_move(FILE *fplog,
4082                           gmx_domdec_t *dd,
4083                           gmx_large_int_t step,int cg,int dim,int dir,
4084                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4085                           rvec cm_old,rvec cm_new,real pos_d)
4086 {
4087     gmx_domdec_comm_t *comm;
4088     char buf[22];
4089
4090     comm = dd->comm;
4091
4092     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4093     if (bHaveLimitdAndCMOld)
4094     {
4095         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4096                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4097     }
4098     else
4099     {
4100         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4101                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4102     }
4103     fprintf(fplog,"distance out of cell %f\n",
4104             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4105     if (bHaveLimitdAndCMOld)
4106     {
4107         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4108                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4109     }
4110     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4111             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4112     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4113             dim2char(dim),
4114             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4115     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4116             dim2char(dim),
4117             comm->cell_x0[dim],comm->cell_x1[dim]);
4118 }
4119
4120 static void cg_move_error(FILE *fplog,
4121                           gmx_domdec_t *dd,
4122                           gmx_large_int_t step,int cg,int dim,int dir,
4123                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4124                           rvec cm_old,rvec cm_new,real pos_d)
4125 {
4126     if (fplog)
4127     {
4128         print_cg_move(fplog, dd,step,cg,dim,dir,
4129                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4130     }
4131     print_cg_move(stderr,dd,step,cg,dim,dir,
4132                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4133     gmx_fatal(FARGS,
4134               "A charge group moved too far between two domain decomposition steps\n"
4135               "This usually means that your system is not well equilibrated");
4136 }
4137
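/* Apply the screw pbc transformation to the state of atom a after it has
 * been shifted over the x box vector: positions are mirrored in y and z
 * within the (rectangular) box, velocity-like quantities change sign
 * in y and z.
 */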
4138 static void rotate_state_atom(t_state *state,int a)
4139 {
4140     int est;
4141
4142     for(est=0; est<estNR; est++)
4143     {
4144         if (EST_DISTR(est) && (state->flags & (1<<est))) {
4145             switch (est) {
4146             case estX:
4147                 /* Rotate the complete state; for a rectangular box only */
4148                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4149                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4150                 break;
4151             case estV:
4152                 state->v[a][YY] = -state->v[a][YY];
4153                 state->v[a][ZZ] = -state->v[a][ZZ];
4154                 break;
4155             case estSDX:
4156                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4157                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4158                 break;
4159             case estCGP:
4160                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4161                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4162                 break;
4163             case estDISRE_INITF:
4164             case estDISRE_RM3TAV:
4165             case estORIRE_INITF:
4166             case estORIRE_DTAV:
4167                 /* These are distances, so not affected by rotation */
4168                 break;
4169             default:
4170                 gmx_incons("Unknown state entry encountered in rotate_state_atom");            
4171             }
4172         }
4173     }
4174 }
4175
4176 static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4177                               gmx_domdec_t *dd,ivec tric_dir,
4178                               t_state *state,rvec **f,
4179                               t_forcerec *fr,t_mdatoms *md,
4180                               gmx_bool bCompact,
4181                               t_nrnb *nrnb)
4182 {
4183     int  *move;
4184     int  npbcdim;
4185     int  ncg[DIM*2],nat[DIM*2];
4186     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4187     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4188     int  sbuf[2],rbuf[2];
4189     int  home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
4190     int  flag;
4191     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4192     gmx_bool bScrew;
4193     ivec dev;
4194     real inv_ncg,pos_d;
4195     matrix tcm;
4196     rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4197     atom_id *cgindex;
4198     cginfo_mb_t *cginfo_mb;
4199     gmx_domdec_comm_t *comm;
4200     
4201     if (dd->bScrewPBC)
4202     {
4203         check_screw_box(state->box);
4204     }
4205     
4206     comm  = dd->comm;
4207     cg_cm = fr->cg_cm;
4208     
4209     for(i=0; i<estNR; i++)
4210     {
4211         if (EST_DISTR(i))
4212         {
4213             switch (i)
4214             {
4215             case estX:   /* Always present */            break;
4216             case estV:   bV   = (state->flags & (1<<i)); break;
4217             case estSDX: bSDX = (state->flags & (1<<i)); break;
4218             case estCGP: bCGP = (state->flags & (1<<i)); break;
4219             case estLD_RNG:
4220             case estLD_RNGI:
4221             case estDISRE_INITF:
4222             case estDISRE_RM3TAV:
4223             case estORIRE_INITF:
4224             case estORIRE_DTAV:
4225                 /* No processing required */
4226                 break;
4227             default:
4228                 gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4229             }
4230         }
4231     }
4232     
4233     if (dd->ncg_tot > comm->nalloc_int)
4234     {
4235         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4236         srenew(comm->buf_int,comm->nalloc_int);
4237     }
4238     move = comm->buf_int;
4239     
4240     /* Clear the count */
4241     for(c=0; c<dd->ndim*2; c++)
4242     {
4243         ncg[c] = 0;
4244         nat[c] = 0;
4245     }
4246
4247     npbcdim = dd->npbcdim;
4248
4249     for(d=0; (d<DIM); d++)
4250     {
4251         limitd[d] = dd->comm->cellsize_min[d];
4252         if (d >= npbcdim && dd->ci[d] == 0)
4253         {
4254             cell_x0[d] = -GMX_FLOAT_MAX;
4255         }
4256         else
4257         {
4258             cell_x0[d] = comm->cell_x0[d];
4259         }
4260         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4261         {
4262             cell_x1[d] = GMX_FLOAT_MAX;
4263         }
4264         else
4265         {
4266             cell_x1[d] = comm->cell_x1[d];
4267         }
4268         if (d < npbcdim)
4269         {
4270             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4271             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4272         }
4273         else
4274         {
4275             /* We check after communication if a charge group moved
4276              * more than one cell. Set the pre-comm check limit to float_max.
4277              */
4278             limit0[d] = -GMX_FLOAT_MAX;
4279             limit1[d] =  GMX_FLOAT_MAX;
4280         }
4281     }
4282     
4283     make_tric_corr_matrix(npbcdim,state->box,tcm);
4284     
4285     cgindex = dd->cgindex;
4286     
4287     /* Compute the center of geometry for all home charge groups
4288      * and put them in the box and determine where they should go.
4289      */
4290     for(cg=0; cg<dd->ncg_home; cg++)
4291     {
4292         k0   = cgindex[cg];
4293         k1   = cgindex[cg+1];
4294         nrcg = k1 - k0;
4295         if (nrcg == 1)
4296         {
4297             copy_rvec(state->x[k0],cm_new);
4298         }
4299         else
4300         {
4301             inv_ncg = 1.0/nrcg;
4302             
4303             clear_rvec(cm_new);
4304             for(k=k0; (k<k1); k++)
4305             {
4306                 rvec_inc(cm_new,state->x[k]);
4307             }
4308             for(d=0; (d<DIM); d++)
4309             {
4310                 cm_new[d] = inv_ncg*cm_new[d];
4311             }
4312         }
4313         
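        /* dev[d] records whether this cg leaves the home cell along
         * dimension d: -1 backward, 0 stays, +1 forward.
         */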
4314         clear_ivec(dev);
4315         /* Do pbc and check DD cell boundary crossings */
4316         for(d=DIM-1; d>=0; d--)
4317         {
4318             if (dd->nc[d] > 1)
4319             {
4320                 bScrew = (dd->bScrewPBC && d == XX);
4321                 /* Determine the location of this cg in lattice coordinates */
4322                 pos_d = cm_new[d];
4323                 if (tric_dir[d])
4324                 {
4325                     for(d2=d+1; d2<DIM; d2++)
4326                     {
4327                         pos_d += cm_new[d2]*tcm[d2][d];
4328                     }
4329                 }
4330                 /* Put the charge group in the triclinic unit-cell */
4331                 if (pos_d >= cell_x1[d])
4332                 {
4333                     if (pos_d >= limit1[d])
4334                     {
4335                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4336                                       cg_cm[cg],cm_new,pos_d);
4337                     }
4338                     dev[d] = 1;
4339                     if (dd->ci[d] == dd->nc[d] - 1)
4340                     {
4341                         rvec_dec(cm_new,state->box[d]);
4342                         if (bScrew)
4343                         {
4344                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4345                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4346                         }
4347                         for(k=k0; (k<k1); k++)
4348                         {
4349                             rvec_dec(state->x[k],state->box[d]);
4350                             if (bScrew)
4351                             {
4352                                 rotate_state_atom(state,k);
4353                             }
4354                         }
4355                     }
4356                 }
4357                 else if (pos_d < cell_x0[d])
4358                 {
4359                     if (pos_d < limit0[d])
4360                     {
4361                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4362                                       cg_cm[cg],cm_new,pos_d);
4363                     }
4364                     dev[d] = -1;
4365                     if (dd->ci[d] == 0)
4366                     {
4367                         rvec_inc(cm_new,state->box[d]);
4368                         if (bScrew)
4369                         {
4370                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4371                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4372                         }
4373                         for(k=k0; (k<k1); k++)
4374                         {
4375                             rvec_inc(state->x[k],state->box[d]);
4376                             if (bScrew)
4377                             {
4378                                 rotate_state_atom(state,k);
4379                             }
4380                         }
4381                     }
4382                 }
4383             }
4384             else if (d < npbcdim)
4385             {
4386                 /* Put the charge group in the rectangular unit-cell */
4387                 while (cm_new[d] >= state->box[d][d])
4388                 {
4389                     rvec_dec(cm_new,state->box[d]);
4390                     for(k=k0; (k<k1); k++)
4391                     {
4392                         rvec_dec(state->x[k],state->box[d]);
4393                     }
4394                 }
4395                 while (cm_new[d] < 0)
4396                 {
4397                     rvec_inc(cm_new,state->box[d]);
4398                     for(k=k0; (k<k1); k++)
4399                     {
4400                         rvec_inc(state->x[k],state->box[d]);
4401                     }
4402                 }
4403             }
4404         }
4405     
4406         copy_rvec(cm_new,cg_cm[cg]);
4407         
4408         /* Determine where this cg should go */
4409         flag = 0;
4410         mc = -1;
4411         for(d=0; d<dd->ndim; d++)
4412         {
4413             dim = dd->dim[d];
4414             if (dev[dim] == 1)
4415             {
4416                 flag |= DD_FLAG_FW(d);
4417                 if (mc == -1)
4418                 {
4419                     mc = d*2;
4420                 }
4421             }
4422             else if (dev[dim] == -1)
4423             {
4424                 flag |= DD_FLAG_BW(d);
4425                 if (mc == -1) {
4426                     if (dd->nc[dim] > 2)
4427                     {
4428                         mc = d*2 + 1;
4429                     }
4430                     else
4431                     {
4432                         mc = d*2;
4433                     }
4434                 }
4435             }
4436         }
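        /* mc is the communication buffer this cg goes to: 2*d for forward
         * and 2*d+1 for backward along the first dimension it moves in
         * (backward shares the forward buffer when there are only two
         * cells); -1 means the cg stays home.
         */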
4437         move[cg] = mc;
4438         if (mc >= 0)
4439         {
4440             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4441             {
4442                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4443                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4444             }
4445             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4446             /* We store the cg size in the lower 16 bits
4447              * and the place where the charge group should go
4448              * in the next 6 bits. This saves some communication volume.
4449              */
4450             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4451             ncg[mc] += 1;
4452             nat[mc] += nrcg;
4453         }
4454     }
4455     
4456     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4457     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4458     
4459     nvec = 1;
4460     if (bV)
4461     {
4462         nvec++;
4463     }
4464     if (bSDX)
4465     {
4466         nvec++;
4467     }
4468     if (bCGP)
4469     {
4470         nvec++;
4471     }
4472     
4473     /* Make sure the communication buffers are large enough */
4474     for(mc=0; mc<dd->ndim*2; mc++)
4475     {
4476         nvr = ncg[mc] + nat[mc]*nvec;
4477         if (nvr > comm->cgcm_state_nalloc[mc])
4478         {
4479             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4480             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4481         }
4482     }
4483     
4484     /* Recalculating cg_cm might be cheaper than communicating,
4485      * but that could give rise to rounding issues.
4486      */
4487     home_pos_cg =
4488         compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4489                                 nvec,cg_cm,comm,bCompact);
4490     
4491     vec = 0;
4492     home_pos_at =
4493         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4494                                 nvec,vec++,state->x,comm,bCompact);
4495     if (bV)
4496     {
4497         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4498                                 nvec,vec++,state->v,comm,bCompact);
4499     }
4500     if (bSDX)
4501     {
4502         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4503                                 nvec,vec++,state->sd_X,comm,bCompact);
4504     }
4505     if (bCGP)
4506     {
4507         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4508                                 nvec,vec++,state->cg_p,comm,bCompact);
4509     }
4510     
4511     if (bCompact)
4512     {
4513         compact_ind(dd->ncg_home,move,
4514                     dd->index_gl,dd->cgindex,dd->gatindex,
4515                     dd->ga2la,comm->bLocalCG,
4516                     fr->cginfo);
4517     }
4518     else
4519     {
4520         clear_and_mark_ind(dd->ncg_home,move,
4521                            dd->index_gl,dd->cgindex,dd->gatindex,
4522                            dd->ga2la,comm->bLocalCG,
4523                            fr->ns.grid->cell_index);
4524     }
4525     
4526     cginfo_mb = fr->cginfo_mb;
4527
4528     ncg_stay_home = home_pos_cg;
4529     for(d=0; d<dd->ndim; d++)
4530     {
4531         dim = dd->dim[d];
4532         ncg_recv = 0;
4533         nat_recv = 0;
4534         nvr      = 0;
4535         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4536         {
4537             cdd = d*2 + dir;
4538             /* Communicate the cg and atom counts */
4539             sbuf[0] = ncg[cdd];
4540             sbuf[1] = nat[cdd];
4541             if (debug)
4542             {
4543                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4544                         d,dir,sbuf[0],sbuf[1]);
4545             }
4546             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4547             
4548             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4549             {
4550                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4551                 srenew(comm->buf_int,comm->nalloc_int);
4552             }
4553             
4554             /* Communicate the charge group indices, sizes and flags */
4555             dd_sendrecv_int(dd, d, dir,
4556                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4557                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4558             
4559             nvs = ncg[cdd] + nat[cdd]*nvec;
4560             i   = rbuf[0]  + rbuf[1] *nvec;
4561             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4562             
4563             /* Communicate cgcm and state */
4564             dd_sendrecv_rvec(dd, d, dir,
4565                              comm->cgcm_state[cdd], nvs,
4566                              comm->vbuf.v+nvr, i);
4567             ncg_recv += rbuf[0];
4568             nat_recv += rbuf[1];
4569             nvr      += i;
4570         }
4571         
4572         /* Process the received charge groups */
4573         buf_pos = 0;
4574         for(cg=0; cg<ncg_recv; cg++)
4575         {
4576             flag = comm->buf_int[cg*DD_CGIBS+1];
4577
4578             if (dim >= npbcdim && dd->nc[dim] > 2)
4579             {
4580                 /* No pbc in this dim and more than one domain boundary.
4581                  * We do a separate check to make sure the charge group did not move too far.
4582                  */
4583                 if (((flag & DD_FLAG_FW(d)) &&
4584                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4585                     ((flag & DD_FLAG_BW(d)) &&
4586                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4587                 {
4588                     cg_move_error(fplog,dd,step,cg,dim,
4589                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4590                                    FALSE,0,
4591                                    comm->vbuf.v[buf_pos],
4592                                    comm->vbuf.v[buf_pos],
4593                                    comm->vbuf.v[buf_pos][dim]);
4594                 }
4595             }
4596
4597             mc = -1;
4598             if (d < dd->ndim-1)
4599             {
4600                 /* Check which direction this cg should go */
4601                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4602                 {
4603                     if (dd->bGridJump)
4604                     {
4605                         /* The cell boundaries for dimension d2 are not equal
4606                          * for each cell row of the lower dimension(s),
4607                          * therefore we might need to redetermine where
4608                          * this cg should go.
4609                          */
4610                         dim2 = dd->dim[d2];
4611                         /* If this cg crosses the box boundary in dimension d2
4612                          * we can use the communicated flag, so we do not
4613                          * have to worry about pbc.
4614                          */
4615                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4616                                (flag & DD_FLAG_FW(d2))) ||
4617                               (dd->ci[dim2] == 0 &&
4618                                (flag & DD_FLAG_BW(d2)))))
4619                         {
4620                             /* Clear the two flags for this dimension */
4621                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4622                             /* Determine the location of this cg
4623                              * in lattice coordinates
4624                              */
4625                             pos_d = comm->vbuf.v[buf_pos][dim2];
4626                             if (tric_dir[dim2])
4627                             {
4628                                 for(d3=dim2+1; d3<DIM; d3++)
4629                                 {
4630                                     pos_d +=
4631                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4632                                 }
4633                             }
4634                             /* Check that we are not at the box edge.
4635                              * pbc is only handled in the first step above,
4636                              * but this check could move over pbc while
4637                              * the first step did not due to different rounding.
4638                              */
4639                             if (pos_d >= cell_x1[dim2] &&
4640                                 dd->ci[dim2] != dd->nc[dim2]-1)
4641                             {
4642                                 flag |= DD_FLAG_FW(d2);
4643                             }
4644                             else if (pos_d < cell_x0[dim2] &&
4645                                      dd->ci[dim2] != 0)
4646                             {
4647                                 flag |= DD_FLAG_BW(d2);
4648                             }
4649                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4650                         }
4651                     }
4652                     /* Set to which neighboring cell this cg should go */
4653                     if (flag & DD_FLAG_FW(d2))
4654                     {
4655                         mc = d2*2;
4656                     }
4657                     else if (flag & DD_FLAG_BW(d2))
4658                     {
4659                         if (dd->nc[dd->dim[d2]] > 2)
4660                         {
4661                             mc = d2*2+1;
4662                         }
4663                         else
4664                         {
4665                             mc = d2*2;
4666                         }
4667                     }
4668                 }
4669             }
4670             
4671             nrcg = flag & DD_FLAG_NRCG;
4672             if (mc == -1)
4673             {
4674                 if (home_pos_cg+1 > dd->cg_nalloc)
4675                 {
4676                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4677                     srenew(dd->index_gl,dd->cg_nalloc);
4678                     srenew(dd->cgindex,dd->cg_nalloc+1);
4679                 }
4680                 /* Set the global charge group index and size */
4681                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4682                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4683                 /* Copy the state from the buffer */
4684                 if (home_pos_cg >= fr->cg_nalloc)
4685                 {
4686                     dd_realloc_fr_cg(fr,home_pos_cg+1);
4687                     cg_cm = fr->cg_cm;
4688                 }
4689                 copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
4690                 /* Set the cginfo */
4691                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4692                                                    dd->index_gl[home_pos_cg]);
4693                 if (comm->bLocalCG)
4694                 {
4695                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4696                 }
4697
4698                 if (home_pos_at+nrcg > state->nalloc)
4699                 {
4700                     dd_realloc_state(state,f,home_pos_at+nrcg);
4701                 }
4702                 for(i=0; i<nrcg; i++)
4703                 {
4704                     copy_rvec(comm->vbuf.v[buf_pos++],
4705                               state->x[home_pos_at+i]);
4706                 }
4707                 if (bV)
4708                 {
4709                     for(i=0; i<nrcg; i++)
4710                     {
4711                         copy_rvec(comm->vbuf.v[buf_pos++],
4712                                   state->v[home_pos_at+i]);
4713                     }
4714                 }
4715                 if (bSDX)
4716                 {
4717                     for(i=0; i<nrcg; i++)
4718                     {
4719                         copy_rvec(comm->vbuf.v[buf_pos++],
4720                                   state->sd_X[home_pos_at+i]);
4721                     }
4722                 }
4723                 if (bCGP)
4724                 {
4725                     for(i=0; i<nrcg; i++)
4726                     {
4727                         copy_rvec(comm->vbuf.v[buf_pos++],
4728                                   state->cg_p[home_pos_at+i]);
4729                     }
4730                 }
4731                 home_pos_cg += 1;
4732                 home_pos_at += nrcg;
4733             }
4734             else
4735             {
4736                 /* Reallocate the buffers if necessary  */
4737                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4738                 {
4739                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4740                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4741                 }
4742                 nvr = ncg[mc] + nat[mc]*nvec;
4743                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4744                 {
4745                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4746                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4747                 }
4748                 /* Copy from the receive to the send buffers */
4749                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4750                        comm->buf_int + cg*DD_CGIBS,
4751                        DD_CGIBS*sizeof(int));
4752                 memcpy(comm->cgcm_state[mc][nvr],
4753                        comm->vbuf.v[buf_pos],
4754                        (1+nrcg*nvec)*sizeof(rvec));
4755                 buf_pos += 1 + nrcg*nvec;
4756                 ncg[mc] += 1;
4757                 nat[mc] += nrcg;
4758             }
4759         }
4760     }
4761     
4762     /* With sorting (!bCompact) the indices are now only partially up to date
4763      * and ncg_home and nat_home are not the real count, since there are
4764      * "holes" in the arrays for the charge groups that moved to neighbors.
4765      */
4766     dd->ncg_home = home_pos_cg;
4767     dd->nat_home = home_pos_at;
4768
4769     if (debug)
4770     {
4771         fprintf(debug,"Finished repartitioning\n");
4772     }
4773
4774     return ncg_stay_home;
4775 }
4776
4777 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4778 {
4779     dd->comm->cycl[ddCycl] += cycles;
4780     dd->comm->cycl_n[ddCycl]++;
4781     if (cycles > dd->comm->cycl_max[ddCycl])
4782     {
4783         dd->comm->cycl_max[ddCycl] = cycles;
4784     }
4785 }
4786
4787 static double force_flop_count(t_nrnb *nrnb)
4788 {
4789     int i;
4790     double sum;
4791     const char *name;
4792
4793     sum = 0;
4794     for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
4795     {
4796         /* To get closer to the real timings, we halve the count
4797          * for the normal loops and halve it again for the water loops.
4798          */
4799         name = nrnb_str(i);
4800         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4801         {
4802             sum += nrnb->n[i]*0.25*cost_nrnb(i);
4803         }
4804         else
4805         {
4806             sum += nrnb->n[i]*0.50*cost_nrnb(i);
4807         }
4808     }
4809     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
4810     {
4811         name = nrnb_str(i);
4812         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4813             sum += nrnb->n[i]*cost_nrnb(i);
4814     }
4815     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
4816     {
4817         sum += nrnb->n[i]*cost_nrnb(i);
4818     }
4819
4820     return sum;
4821 }
4822
4823 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
4824 {
4825     if (dd->comm->eFlop)
4826     {
4827         dd->comm->flop -= force_flop_count(nrnb);
4828     }
4829 }
4830 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
4831 {
4832     if (dd->comm->eFlop)
4833     {
4834         dd->comm->flop += force_flop_count(nrnb);
4835         dd->comm->flop_n++;
4836     }
4837 }  
4838
4839 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4840 {
4841     int i;
4842     
4843     for(i=0; i<ddCyclNr; i++)
4844     {
4845         dd->comm->cycl[i] = 0;
4846         dd->comm->cycl_n[i] = 0;
4847         dd->comm->cycl_max[i] = 0;
4848     }
4849     dd->comm->flop = 0;
4850     dd->comm->flop_n = 0;
4851 }
4852
4853 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
4854 {
4855     gmx_domdec_comm_t *comm;
4856     gmx_domdec_load_t *load;
4857     gmx_domdec_root_t *root=NULL;
4858     int  d,dim,cid,i,pos;
4859     float cell_frac=0,sbuf[DD_NLOAD_MAX];
4860     gmx_bool bSepPME;
4861     
4862     if (debug)
4863     {
4864         fprintf(debug,"get_load_distribution start\n");
4865     }
4866
4867     wallcycle_start(wcycle,ewcDDCOMMLOAD);
4868     
4869     comm = dd->comm;
4870     
4871     bSepPME = (dd->pme_nodeid >= 0);
4872     
4873     for(d=dd->ndim-1; d>=0; d--)
4874     {
4875         dim = dd->dim[d];
4876         /* Check if we participate in the communication in this dimension */
4877         if (d == dd->ndim-1 || 
4878             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
4879         {
4880             load = &comm->load[d];
4881             if (dd->bGridJump)
4882             {
4883                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4884             }
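            /* Pack this rank's load data; the layout has to match the
             * unpacking by the row root further down.
             */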
4885             pos = 0;
4886             if (d == dd->ndim-1)
4887             {
4888                 sbuf[pos++] = dd_force_load(comm);
4889                 sbuf[pos++] = sbuf[0];
4890                 if (dd->bGridJump)
4891                 {
4892                     sbuf[pos++] = sbuf[0];
4893                     sbuf[pos++] = cell_frac;
4894                     if (d > 0)
4895                     {
4896                         sbuf[pos++] = comm->cell_f_max0[d];
4897                         sbuf[pos++] = comm->cell_f_min1[d];
4898                     }
4899                 }
4900                 if (bSepPME)
4901                 {
4902                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4903                     sbuf[pos++] = comm->cycl[ddCyclPME];
4904                 }
4905             }
4906             else
4907             {
4908                 sbuf[pos++] = comm->load[d+1].sum;
4909                 sbuf[pos++] = comm->load[d+1].max;
4910                 if (dd->bGridJump)
4911                 {
4912                     sbuf[pos++] = comm->load[d+1].sum_m;
4913                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4914                     sbuf[pos++] = comm->load[d+1].flags;
4915                     if (d > 0)
4916                     {
4917                         sbuf[pos++] = comm->cell_f_max0[d];
4918                         sbuf[pos++] = comm->cell_f_min1[d];
4919                     }
4920                 }
4921                 if (bSepPME)
4922                 {
4923                     sbuf[pos++] = comm->load[d+1].mdf;
4924                     sbuf[pos++] = comm->load[d+1].pme;
4925                 }
4926             }
4927             load->nload = pos;
4928             /* Communicate a row in DD direction d.
4929              * The communicators are set up such that the root always has rank 0.
4930              */
4931 #ifdef GMX_MPI
4932             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
4933                        load->load,load->nload*sizeof(float),MPI_BYTE,
4934                        0,comm->mpi_comm_load[d]);
4935 #endif
4936             if (dd->ci[dim] == dd->master_ci[dim])
4937             {
4938                 /* We are the root, process this row */
4939                 if (comm->bDynLoadBal)
4940                 {
4941                     root = comm->root[d];
4942                 }
4943                 load->sum = 0;
4944                 load->max = 0;
4945                 load->sum_m = 0;
4946                 load->cvol_min = 1;
4947                 load->flags = 0;
4948                 load->mdf = 0;
4949                 load->pme = 0;
4950                 pos = 0;
4951                 for(i=0; i<dd->nc[dim]; i++)
4952                 {
4953                     load->sum += load->load[pos++];
4954                     load->max = max(load->max,load->load[pos]);
4955                     pos++;
4956                     if (dd->bGridJump)
4957                     {
4958                         if (root->bLimited)
4959                         {
4960                             /* This direction could not be load balanced properly,
4961                              * therefore we need to use the maximum instead of the average load.
4962                              */
4963                             load->sum_m = max(load->sum_m,load->load[pos]);
4964                         }
4965                         else
4966                         {
4967                             load->sum_m += load->load[pos];
4968                         }
4969                         pos++;
4970                         load->cvol_min = min(load->cvol_min,load->load[pos]);
4971                         pos++;
4972                         if (d < dd->ndim-1)
4973                         {
4974                             load->flags = (int)(load->load[pos++] + 0.5);
4975                         }
4976                         if (d > 0)
4977                         {
4978                             root->cell_f_max0[i] = load->load[pos++];
4979                             root->cell_f_min1[i] = load->load[pos++];
4980                         }
4981                     }
4982                     if (bSepPME)
4983                     {
4984                         load->mdf = max(load->mdf,load->load[pos]);
4985                         pos++;
4986                         load->pme = max(load->pme,load->load[pos]);
4987                         pos++;
4988                     }
4989                 }
4990                 if (comm->bDynLoadBal && root->bLimited)
4991                 {
4992                     load->sum_m *= dd->nc[dim];
4993                     load->flags |= (1<<d);
4994                 }
4995             }
4996         }
4997     }
4998
4999     if (DDMASTER(dd))
5000     {
5001         comm->nload      += dd_load_count(comm);
5002         comm->load_step  += comm->cycl[ddCyclStep];
5003         comm->load_sum   += comm->load[0].sum;
5004         comm->load_max   += comm->load[0].max;
5005         if (comm->bDynLoadBal)
5006         {
5007             for(d=0; d<dd->ndim; d++)
5008             {
5009                 if (comm->load[0].flags & (1<<d))
5010                 {
5011                     comm->load_lim[d]++;
5012                 }
5013             }
5014         }
5015         if (bSepPME)
5016         {
5017             comm->load_mdf += comm->load[0].mdf;
5018             comm->load_pme += comm->load[0].pme;
5019         }
5020     }
5021
5022     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5023     
5024     if (debug)
5025     {
5026         fprintf(debug,"get_load_distribution finished\n");
5027     }
5028 }
5029
5030 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5031 {
5032     /* Return the relative performance loss on the total run time
5033      * due to the force calculation load imbalance.
5034      */
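    /* Illustrative numbers (not from an actual run): on 4 PP nodes with an
     * accumulated maximum force load of 120, a summed force load of 400 and
     * 130 step cycles, this returns (120*4 - 400)/(130*4) ~= 0.15, i.e.
     * roughly 15% of the run time is lost to waiting.
     */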
5035     if (dd->comm->nload > 0)
5036     {
5037         return
5038             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5039             (dd->comm->load_step*dd->nnodes);
5040     }
5041     else
5042     {
5043         return 0;
5044     }
5045 }
5046
5047 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5048 {
5049     char  buf[STRLEN];
5050     int   npp,npme,nnodes,d,limp;
5051     float imbal,pme_f_ratio,lossf,lossp=0;
5052     gmx_bool  bLim;
5053     gmx_domdec_comm_t *comm;
5054
5055     comm = dd->comm;
5056     if (DDMASTER(dd) && comm->nload > 0)
5057     {
5058         npp    = dd->nnodes;
5059         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5060         nnodes = npp + npme;
5061         imbal = comm->load_max*npp/comm->load_sum - 1;
5062         lossf = dd_force_imb_perf_loss(dd);
5063         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5064         fprintf(fplog,"%s",buf);
5065         fprintf(stderr,"\n");
5066         fprintf(stderr,"%s",buf);
5067         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5068         fprintf(fplog,"%s",buf);
5069         fprintf(stderr,"%s",buf);
5070         bLim = FALSE;
5071         if (comm->bDynLoadBal)
5072         {
5073             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5074             for(d=0; d<dd->ndim; d++)
5075             {
5076                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5077                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5078                 if (limp >= 50)
5079                 {
5080                     bLim = TRUE;
5081                 }
5082             }
5083             sprintf(buf+strlen(buf),"\n");
5084             fprintf(fplog,"%s",buf);
5085             fprintf(stderr,"%s",buf);
5086         }
5087         if (npme > 0)
5088         {
5089             pme_f_ratio = comm->load_pme/comm->load_mdf;
5090             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5091             if (lossp <= 0)
5092             {
5093                 lossp *= (float)npme/(float)nnodes;
5094             }
5095             else
5096             {
5097                 lossp *= (float)npp/(float)nnodes;
5098             }
5099             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5100             fprintf(fplog,"%s",buf);
5101             fprintf(stderr,"%s",buf);
5102             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5103             fprintf(fplog,"%s",buf);
5104             fprintf(stderr,"%s",buf);
5105         }
5106         fprintf(fplog,"\n");
5107         fprintf(stderr,"\n");
5108         
5109         if (lossf >= DD_PERF_LOSS)
5110         {
5111             sprintf(buf,
5112                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
5113                     "      in the domain decomposition.\n",lossf*100);
5114             if (!comm->bDynLoadBal)
5115             {
5116                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb).\n");
5117             }
5118             else if (bLim)
5119             {
5120                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5121             }
5122             fprintf(fplog,"%s\n",buf);
5123             fprintf(stderr,"%s\n",buf);
5124         }
5125         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5126         {
5127             sprintf(buf,
5128                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5129                     "      had %s work to do than the PP nodes.\n"
5130                     "      You might want to %s the number of PME nodes\n"
5131                     "      or %s the cut-off and the grid spacing.\n",
5132                     fabs(lossp*100),
5133                     (lossp < 0) ? "less"     : "more",
5134                     (lossp < 0) ? "decrease" : "increase",
5135                     (lossp < 0) ? "decrease" : "increase");
5136             fprintf(fplog,"%s\n",buf);
5137             fprintf(stderr,"%s\n",buf);
5138         }
5139     }
5140 }
5141
5142 static float dd_vol_min(gmx_domdec_t *dd)
5143 {
5144     return dd->comm->load[0].cvol_min*dd->nnodes;
5145 }
5146
5147 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5148 {
5149     return dd->comm->load[0].flags;
5150 }
5151
5152 static float dd_f_imbal(gmx_domdec_t *dd)
5153 {
5154     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5155 }
5156
5157 static float dd_pme_f_ratio(gmx_domdec_t *dd)
5158 {
5159     return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5160 }
5161
5162 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5163 {
5164     int flags,d;
5165     char buf[22];
5166     
5167     flags = dd_load_flags(dd);
5168     if (flags)
5169     {
5170         fprintf(fplog,
5171                 "DD  load balancing is limited by minimum cell size in dimension");
5172         for(d=0; d<dd->ndim; d++)
5173         {
5174             if (flags & (1<<d))
5175             {
5176                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5177             }
5178         }
5179         fprintf(fplog,"\n");
5180     }
5181     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5182     if (dd->comm->bDynLoadBal)
5183     {
5184         fprintf(fplog,"  vol min/aver %5.3f%c",
5185                 dd_vol_min(dd),flags ? '!' : ' ');
5186     }
5187     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5188     if (dd->comm->cycl_n[ddCyclPME])
5189     {
5190         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5191     }
5192     fprintf(fplog,"\n\n");
5193 }
5194
5195 static void dd_print_load_verbose(gmx_domdec_t *dd)
5196 {
5197     if (dd->comm->bDynLoadBal)
5198     {
5199         fprintf(stderr,"vol %4.2f%c ",
5200                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5201     }
5202     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5203     if (dd->comm->cycl_n[ddCyclPME])
5204     {
5205         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5206     }
5207 }
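/* With the formats used in dd_print_load_verbose() above, a step's stderr
 * output looks roughly like (illustrative values):
 *   vol 0.78! imb F 12% pme/F 0.85
 * where '!' marks that DLB was limited by the minimum cell size.
 */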
5208
5209 #ifdef GMX_MPI
5210 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
5211 {
5212     MPI_Comm  c_row;
5213     int  dim, i, rank;
5214     ivec loc_c;
5215     gmx_domdec_root_t *root;
5216     gmx_bool bPartOfGroup = FALSE;
5217     
5218     dim = dd->dim[dim_ind];
5219     copy_ivec(loc,loc_c);
5220     for(i=0; i<dd->nc[dim]; i++)
5221     {
5222         loc_c[dim] = i;
5223         rank = dd_index(dd->nc,loc_c);
5224         if (rank == dd->rank)
5225         {
5226             /* This process is part of the group */
5227             bPartOfGroup = TRUE;
5228         }
5229     }
5230     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup?0:MPI_UNDEFINED, dd->rank,
5231                    &c_row);
5232     if (bPartOfGroup)
5233     {
5234         dd->comm->mpi_comm_load[dim_ind] = c_row;
5235         if (dd->comm->eDLB != edlbNO)
5236         {
5237             if (dd->ci[dim] == dd->master_ci[dim])
5238             {
5239                 /* This is the root process of this row */
5240                 snew(dd->comm->root[dim_ind],1);
5241                 root = dd->comm->root[dim_ind];
5242                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5243                 snew(root->old_cell_f,dd->nc[dim]+1);
5244                 snew(root->bCellMin,dd->nc[dim]);
5245                 if (dim_ind > 0)
5246                 {
5247                     snew(root->cell_f_max0,dd->nc[dim]);
5248                     snew(root->cell_f_min1,dd->nc[dim]);
5249                     snew(root->bound_min,dd->nc[dim]);
5250                     snew(root->bound_max,dd->nc[dim]);
5251                 }
5252                 snew(root->buf_ncd,dd->nc[dim]);
5253             }
5254             else
5255             {
5256                 /* This is not a root process, we only need to receive cell_f */
5257                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5258             }
5259         }
5260         if (dd->ci[dim] == dd->master_ci[dim])
5261         {
5262             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5263         }
5264     }
5265 }
5266 #endif
5267
5268 static void make_load_communicators(gmx_domdec_t *dd)
5269 {
5270 #ifdef GMX_MPI
5271     int  dim0,dim1,i,j;
5272     ivec loc;
5273
5274     if (debug)
5275         fprintf(debug,"Making load communicators\n");
5276
5277     snew(dd->comm->load,dd->ndim);
5278     snew(dd->comm->mpi_comm_load,dd->ndim);
5279
5280     clear_ivec(loc);
5281     make_load_communicator(dd,0,loc);
5282     if (dd->ndim > 1) {
5283         dim0 = dd->dim[0];
5284         for(i=0; i<dd->nc[dim0]; i++) {
5285             loc[dim0] = i;
5286             make_load_communicator(dd,1,loc);
5287         }
5288     }
5289     if (dd->ndim > 2) {
5290         dim0 = dd->dim[0];
5291         for(i=0; i<dd->nc[dim0]; i++) {
5292             loc[dim0] = i;
5293             dim1 = dd->dim[1];
5294             for(j=0; j<dd->nc[dim1]; j++) {
5295                 loc[dim1] = j;
5296                 make_load_communicator(dd,2,loc);
5297             }
5298         }
5299     }
5300
5301     if (debug)
5302         fprintf(debug,"Finished making load communicators\n");
5303 #endif
5304 }
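/* Illustration of the resulting communicator tree, assuming decomposition
 * order x,y,z and the DD master at coordinate (0,0,0): for a 4 x 3 x 2 grid
 * every rank is in one of twelve z-rows of size 2, ranks with z=0 are
 * additionally in one of four y-rows of size 3, and ranks with y=z=0 form
 * the single x-row of size 4.  get_load_distribution() reduces the loads
 * along these rows, innermost dimension first, so everything funnels to
 * the DD master.
 */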
5305
5306 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5307 {
5308     gmx_bool bZYX;
5309     int  d,dim,i,j,m;
5310     ivec tmp,s;
5311     int  nzone,nzonep;
5312     ivec dd_zp[DD_MAXIZONE];
5313     gmx_domdec_zones_t *zones;
5314     gmx_domdec_ns_ranges_t *izone;
5315     
5316     for(d=0; d<dd->ndim; d++)
5317     {
5318         dim = dd->dim[d];
5319         copy_ivec(dd->ci,tmp);
5320         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5321         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5322         copy_ivec(dd->ci,tmp);
5323         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5324         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5325         if (debug)
5326         {
5327             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5328                     dd->rank,dim,
5329                     dd->neighbor[d][0],
5330                     dd->neighbor[d][1]);
5331         }
5332     }
5333     
5334     if (DDMASTER(dd))
5335     {
5336         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5337             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5338     }
5339     if (fplog)
5340     {
5341         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5342                 dd->ndim,
5343                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5344                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5345     }
5346     switch (dd->ndim)
5347     {
5348     case 3:
5349         nzone  = dd_z3n;
5350         nzonep = dd_zp3n;
5351         for(i=0; i<nzonep; i++)
5352         {
5353             copy_ivec(dd_zp3[i],dd_zp[i]);
5354         }
5355         break;
5356     case 2:
5357         nzone  = dd_z2n;
5358         nzonep = dd_zp2n;
5359         for(i=0; i<nzonep; i++)
5360         {
5361             copy_ivec(dd_zp2[i],dd_zp[i]);
5362         }
5363         break;
5364     case 1:
5365         nzone  = dd_z1n;
5366         nzonep = dd_zp1n;
5367         for(i=0; i<nzonep; i++)
5368         {
5369             copy_ivec(dd_zp1[i],dd_zp[i]);
5370         }
5371         break;
5372     default:
5373         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5374         nzone = 0;
5375         nzonep = 0;
5376     }
5377
5378     zones = &dd->comm->zones;
5379
5380     for(i=0; i<nzone; i++)
5381     {
5382         m = 0;
5383         clear_ivec(zones->shift[i]);
5384         for(d=0; d<dd->ndim; d++)
5385         {
5386             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5387         }
5388     }
5389     
5390     zones->n = nzone;
5391     for(i=0; i<nzone; i++)
5392     {
5393         for(d=0; d<DIM; d++)
5394         {
5395             s[d] = dd->ci[d] - zones->shift[i][d];
5396             if (s[d] < 0)
5397             {
5398                 s[d] += dd->nc[d];
5399             }
5400             else if (s[d] >= dd->nc[d])
5401             {
5402                 s[d] -= dd->nc[d];
5403             }
5404         }
5405     }
5406     zones->nizone = nzonep;
5407     for(i=0; i<zones->nizone; i++)
5408     {
5409         if (dd_zp[i][0] != i)
5410         {
5411             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5412         }
5413         izone = &zones->izone[i];
5414         izone->j0 = dd_zp[i][1];
5415         izone->j1 = dd_zp[i][2];
5416         for(dim=0; dim<DIM; dim++)
5417         {
5418             if (dd->nc[dim] == 1)
5419             {
5420                 /* All shifts should be allowed */
5421                 izone->shift0[dim] = -1;
5422                 izone->shift1[dim] = 1;
5423             }
5424             else
5425             {
5426                 /*
5427                   izone->shift0[d] = 0;
5428                   izone->shift1[d] = 0;
5429                   for(j=izone->j0; j<izone->j1; j++) {
5430                   if (dd->shift[j][d] > dd->shift[i][d])
5431                   izone->shift0[d] = -1;
5432                   if (dd->shift[j][d] < dd->shift[i][d])
5433                   izone->shift1[d] = 1;
5434                   }
5435                 */
5436                 
5437                 int shift_diff;
5438                 
5439                 /* Assume the shifts are not more than one cell */
5440                 izone->shift0[dim] = 1;
5441                 izone->shift1[dim] = -1;
5442                 for(j=izone->j0; j<izone->j1; j++)
5443                 {
5444                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5445                     if (shift_diff < izone->shift0[dim])
5446                     {
5447                         izone->shift0[dim] = shift_diff;
5448                     }
5449                     if (shift_diff > izone->shift1[dim])
5450                     {
5451                         izone->shift1[dim] = shift_diff;
5452                     }
5453                 }
5454             }
5455         }
5456     }
5457     
5458     if (dd->comm->eDLB != edlbNO)
5459     {
5460         snew(dd->comm->root,dd->ndim);
5461     }
5462     
5463     if (dd->comm->bRecordLoad)
5464     {
5465         make_load_communicators(dd);
5466     }
5467 }
5468
5469 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5470 {
5471     gmx_domdec_t *dd;
5472     gmx_domdec_comm_t *comm;
5473     int  i,rank,*buf;
5474     ivec periods;
5475 #ifdef GMX_MPI
5476     MPI_Comm comm_cart;
5477 #endif
5478     
5479     dd = cr->dd;
5480     comm = dd->comm;
5481     
5482 #ifdef GMX_MPI
5483     if (comm->bCartesianPP)
5484     {
5485         /* Set up cartesian communication for the particle-particle part */
5486         if (fplog)
5487         {
5488             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5489                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5490         }
5491         
5492         for(i=0; i<DIM; i++)
5493         {
5494             periods[i] = TRUE;
5495         }
5496         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5497                         &comm_cart);
5498         /* We overwrite the old communicator with the new cartesian one */
5499         cr->mpi_comm_mygroup = comm_cart;
5500     }
5501     
5502     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5503     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5504     
5505     if (comm->bCartesianPP_PME)
5506     {
5507         /* Since we want to use the original Cartesian setup for the simulation,
5508          * and not the one after the split, we need to make an index.
5509          */
5510         snew(comm->ddindex2ddnodeid,dd->nnodes);
5511         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5512         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5513         /* Get the rank of the DD master,
5514          * above we made sure that the master node is a PP node.
5515          */
5516         if (MASTER(cr))
5517         {
5518             rank = dd->rank;
5519         }
5520         else
5521         {
5522             rank = 0;
5523         }
5524         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5525     }
5526     else if (comm->bCartesianPP)
5527     {
5528         if (cr->npmenodes == 0)
5529         {
5530             /* The PP communicator is also
5531              * the communicator for this simulation
5532              */
5533             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5534         }
5535         cr->nodeid = dd->rank;
5536         
5537         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5538         
5539         /* We need to make an index to go from the coordinates
5540          * to the nodeid of this simulation.
5541          */
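        /* Each PP rank writes its sim_nodeid into the slot given by its DD
         * index while all other slots stay zero, so the MPI_Allreduce with
         * MPI_SUM below leaves every rank with the complete
         * ddindex -> simulation nodeid table (an all-gather done as a sum).
         */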
5542         snew(comm->ddindex2simnodeid,dd->nnodes);
5543         snew(buf,dd->nnodes);
5544         if (cr->duty & DUTY_PP)
5545         {
5546             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5547         }
5548         /* Communicate the ddindex to simulation nodeid index */
5549         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5550                       cr->mpi_comm_mysim);
5551         sfree(buf);
5552         
5553         /* Determine the master coordinates and rank.
5554          * The DD master should be the same node as the master of this sim.
5555          */
5556         for(i=0; i<dd->nnodes; i++)
5557         {
5558             if (comm->ddindex2simnodeid[i] == 0)
5559             {
5560                 ddindex2xyz(dd->nc,i,dd->master_ci);
5561                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5562             }
5563         }
5564         if (debug)
5565         {
5566             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5567         }
5568     }
5569     else
5570     {
5571         /* No Cartesian communicators */
5572         /* We use the rank in dd->comm->all as DD index */
5573         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5574         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5575         dd->masterrank = 0;
5576         clear_ivec(dd->master_ci);
5577     }
5578 #endif
5579   
5580     if (fplog)
5581     {
5582         fprintf(fplog,
5583                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5584                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5585     }
5586     if (debug)
5587     {
5588         fprintf(debug,
5589                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5590                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5591     }
5592 }
5593
5594 static void receive_ddindex2simnodeid(t_commrec *cr)
5595 {
5596     gmx_domdec_t *dd;
5597     
5598     gmx_domdec_comm_t *comm;
5599     int  *buf;
5600     
5601     dd = cr->dd;
5602     comm = dd->comm;
5603     
5604 #ifdef GMX_MPI
5605     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5606     {
5607         snew(comm->ddindex2simnodeid,dd->nnodes);
5608         snew(buf,dd->nnodes);
5609         if (cr->duty & DUTY_PP)
5610         {
5611             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5612         }
5613 #ifdef GMX_MPI
5614         /* Communicate the ddindex to simulation nodeid index */
5615         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5616                       cr->mpi_comm_mysim);
5617 #endif
5618         sfree(buf);
5619     }
5620 #endif
5621 }
5622
5623 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5624                                                      int ncg,int natoms)
5625 {
5626     gmx_domdec_master_t *ma;
5627     int i;
5628
5629     snew(ma,1);
5630     
5631     snew(ma->ncg,dd->nnodes);
5632     snew(ma->index,dd->nnodes+1);
5633     snew(ma->cg,ncg);
5634     snew(ma->nat,dd->nnodes);
5635     snew(ma->ibuf,dd->nnodes*2);
5636     snew(ma->cell_x,DIM);
5637     for(i=0; i<DIM; i++)
5638     {
5639         snew(ma->cell_x[i],dd->nc[i]+1);
5640     }
5641
5642     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5643     {
5644         ma->vbuf = NULL;
5645     }
5646     else
5647     {
5648         snew(ma->vbuf,natoms);
5649     }
5650
5651     return ma;
5652 }
5653
5654 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5655                                int reorder)
5656 {
5657     gmx_domdec_t *dd;
5658     gmx_domdec_comm_t *comm;
5659     int  i,rank;
5660     gmx_bool bDiv[DIM];
5661     ivec periods;
5662 #ifdef GMX_MPI
5663     MPI_Comm comm_cart;
5664 #endif
5665     
5666     dd = cr->dd;
5667     comm = dd->comm;
5668     
5669     if (comm->bCartesianPP)
5670     {
5671         for(i=1; i<DIM; i++)
5672         {
5673             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5674         }
5675         if (bDiv[YY] || bDiv[ZZ])
5676         {
5677             comm->bCartesianPP_PME = TRUE;
5678             /* If we have 2D PME decomposition, which is always in x+y,
5679              * we stack the PME only nodes in z.
5680              * Otherwise we choose the direction that provides the thinnest slab
5681              * of PME only nodes as this will have the least effect
5682              * on the PP communication.
5683              * But for the PME communication the opposite might be better.
5684              */
5685             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5686                              !bDiv[YY] ||
5687                              dd->nc[YY] > dd->nc[ZZ]))
5688             {
5689                 comm->cartpmedim = ZZ;
5690             }
5691             else
5692             {
5693                 comm->cartpmedim = YY;
5694             }
5695             comm->ntot[comm->cartpmedim]
5696                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5697         }
5698         else if (fplog)
5699         {
5700             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5701             fprintf(fplog,
5702                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5703         }
5704     }
5705     
5706 #ifdef GMX_MPI
5707     if (comm->bCartesianPP_PME)
5708     {
5709         if (fplog)
5710         {
5711             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5712         }
5713         
5714         for(i=0; i<DIM; i++)
5715         {
5716             periods[i] = TRUE;
5717         }
5718         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5719                         &comm_cart);
5720         
5721         MPI_Comm_rank(comm_cart,&rank);
5722         if (MASTERNODE(cr) && rank != 0)
5723         {
5724             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5725         }
5726         
5727         /* With this assignment we lose the link to the original communicator,
5728          * which will usually be MPI_COMM_WORLD, unless we have multisim.
5729          */
5730         cr->mpi_comm_mysim = comm_cart;
5731         cr->sim_nodeid = rank;
5732         
5733         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5734         
5735         if (fplog)
5736         {
5737             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5738                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5739         }
5740         
5741         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5742         {
5743             cr->duty = DUTY_PP;
5744         }
5745         if (cr->npmenodes == 0 ||
5746             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5747         {
5748             cr->duty = DUTY_PME;
5749         }
5750         
5751         /* Split the sim communicator into PP and PME only nodes */
5752         MPI_Comm_split(cr->mpi_comm_mysim,
5753                        cr->duty,
5754                        dd_index(comm->ntot,dd->ci),
5755                        &cr->mpi_comm_mygroup);
5756     }
5757     else
5758     {
5759         switch (dd_node_order)
5760         {
5761         case ddnoPP_PME:
5762             if (fplog)
5763             {
5764                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5765             }
5766             break;
5767         case ddnoINTERLEAVE:
5768             /* Interleave the PP-only and PME-only nodes,
5769              * as on clusters with dual-core machines this will double
5770              * the communication bandwidth of the PME processes
5771              * and thus speed up the PP <-> PME and inter PME communication.
5772              */
5773             if (fplog)
5774             {
5775                 fprintf(fplog,"Interleaving PP and PME nodes\n");
5776             }
5777             comm->pmenodes = dd_pmenodes(cr);
5778             break;
5779         case ddnoCARTESIAN:
5780             break;
5781         default:
5782             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
5783         }
5784     
5785         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
5786         {
5787             cr->duty = DUTY_PME;
5788         }
5789         else
5790         {
5791             cr->duty = DUTY_PP;
5792         }
5793         
5794         /* Split the sim communicator into PP and PME only nodes */
5795         MPI_Comm_split(cr->mpi_comm_mysim,
5796                        cr->duty,
5797                        cr->nodeid,
5798                        &cr->mpi_comm_mygroup);
5799         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
5800     }
5801 #endif
5802
5803     if (fplog)
5804     {
5805         fprintf(fplog,"This is a %s only node\n\n",
5806                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5807     }
5808 }
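/* Worked example for split_communicator() above (illustrative, Cartesian
 * PP ordering): with an 8-rank 4 x 2 x 1 PP grid and 4 separate PME ranks,
 * bDiv[YY] holds since 4*2 is divisible by 8, so cartpmedim = YY and
 * ntot becomes 4 x 3 x 1.  Ranks with y < 2 get DUTY_PP and the remaining
 * row of 4 ranks gets DUTY_PME.
 */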
5809
5810 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
5811 {
5812     gmx_domdec_t *dd;
5813     gmx_domdec_comm_t *comm;
5814     int CartReorder;
5815     
5816     dd = cr->dd;
5817     comm = dd->comm;
5818     
5819     copy_ivec(dd->nc,comm->ntot);
5820     
5821     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
5822     comm->bCartesianPP_PME = FALSE;
5823     
5824     /* Reorder the nodes by default. This might change the MPI ranks.
5825      * Real reordering is only supported on very few architectures;
5826      * Blue Gene is one of them.
5827      */
5828     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
5829     
5830     if (cr->npmenodes > 0)
5831     {
5832         /* Split the communicator into a PP and PME part */
5833         split_communicator(fplog,cr,dd_node_order,CartReorder);
5834         if (comm->bCartesianPP_PME)
5835         {
5836             /* We (possibly) reordered the nodes in split_communicator,
5837              * so it is no longer required in make_pp_communicator.
5838              */
5839             CartReorder = FALSE;
5840         }
5841     }
5842     else
5843     {
5844         /* All nodes do PP and PME */
5845 #ifdef GMX_MPI    
5846         /* We do not require separate communicators */
5847         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5848 #endif
5849     }
5850     
5851     if (cr->duty & DUTY_PP)
5852     {
5853         /* Copy or make a new PP communicator */
5854         make_pp_communicator(fplog,cr,CartReorder);
5855     }
5856     else
5857     {
5858         receive_ddindex2simnodeid(cr);
5859     }
5860     
5861     if (!(cr->duty & DUTY_PME))
5862     {
5863         /* Set up the communication to our PME node */
5864         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
5865         dd->pme_receive_vir_ener = receive_vir_ener(cr);
5866         if (debug)
5867         {
5868             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
5869                     dd->pme_nodeid,dd->pme_receive_vir_ener);
5870         }
5871     }
5872     else
5873     {
5874         dd->pme_nodeid = -1;
5875     }
5876
5877     if (DDMASTER(dd))
5878     {
5879         dd->ma = init_gmx_domdec_master_t(dd,
5880                                           comm->cgs_gl.nr,
5881                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
5882     }
5883 }
5884
5885 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
5886 {
5887     real *slb_frac,tot;
5888     int  i,n;
5889     double dbl;
5890     
5891     slb_frac = NULL;
5892     if (nc > 1 && size_string != NULL)
5893     {
5894         if (fplog)
5895         {
5896             fprintf(fplog,"Using static load balancing for the %s direction\n",
5897                     dir);
5898         }
5899         snew(slb_frac,nc);
5900         tot = 0;
5901         for (i=0; i<nc; i++)
5902         {
5903             dbl = 0;
5904             sscanf(size_string,"%lf%n",&dbl,&n);
5905             if (dbl == 0)
5906             {
5907                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
5908             }
5909             slb_frac[i] = dbl;
5910             size_string += n;
5911             tot += slb_frac[i];
5912         }
5913         /* Normalize */
5914         if (fplog)
5915         {
5916             fprintf(fplog,"Relative cell sizes:");
5917         }
5918         for (i=0; i<nc; i++)
5919         {
5920             slb_frac[i] /= tot;
5921             if (fplog)
5922             {
5923                 fprintf(fplog," %5.3f",slb_frac[i]);
5924             }
5925         }
5926         if (fplog)
5927         {
5928             fprintf(fplog,"\n");
5929         }
5930     }
5931     
5932     return slb_frac;
5933 }
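/* Example for get_slb_frac() above (illustrative): with nc = 3 and
 * size_string = "1 2 1" the entries are read sequentially with sscanf and
 * normalized to slb_frac = {0.25, 0.5, 0.25}; a zero or missing entry
 * triggers the fatal error above.
 */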
5934
5935 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
5936 {
5937     int n,nmol,ftype;
5938     gmx_mtop_ilistloop_t iloop;
5939     t_ilist *il;
5940     
5941     n = 0;
5942     iloop = gmx_mtop_ilistloop_init(mtop);
5943     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
5944     {
5945         for(ftype=0; ftype<F_NRE; ftype++)
5946         {
5947             if ((interaction_function[ftype].flags & IF_BOND) &&
5948                 NRAL(ftype) >  2)
5949             {
5950                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
5951             }
5952         }
5953     }
5954
5955     return n;
5956 }
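/* Example for multi_body_bondeds_count() above (illustrative): a molecule
 * type with 2 angle interactions (NRAL = 3, so il[F_ANGLES].nr = 8) that
 * occurs in 100 molecules contributes 100*8/(1+3) = 200; two-atom bondeds
 * are skipped by the NRAL > 2 check.
 */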
5957
5958 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
5959 {
5960     char *val;
5961     int  nst;
5962     
5963     nst = def;
5964     val = getenv(env_var);
5965     if (val)
5966     {
5967         if (sscanf(val,"%d",&nst) <= 0)
5968         {
5969             nst = 1;
5970         }
5971         if (fplog)
5972         {
5973             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
5974                     env_var,val,nst);
5975         }
5976     }
5977     
5978     return nst;
5979 }
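/* Example for dd_nst_env() above (illustrative): with GMX_DD_DUMP_GRID=100
 * set in the environment this returns 100 instead of the default, while a
 * value that does not parse as an integer falls back to 1.
 */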
5980
5981 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
5982 {
5983     if (MASTER(cr))
5984     {
5985         fprintf(stderr,"\n%s\n",warn_string);
5986     }
5987     if (fplog)
5988     {
5989         fprintf(fplog,"\n%s\n",warn_string);
5990     }
5991 }
5992
5993 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
5994                                   t_inputrec *ir,FILE *fplog)
5995 {
5996     if (ir->ePBC == epbcSCREW &&
5997         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
5998     {
5999         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6000     }
6001
6002     if (ir->ns_type == ensSIMPLE)
6003     {
6004         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6005     }
6006
6007     if (ir->nstlist == 0)
6008     {
6009         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6010     }
6011
6012     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6013     {
6014         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6015     }
6016 }
6017
6018 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6019 {
6020     int  di,d;
6021     real r;
6022
6023     r = ddbox->box_size[XX];
6024     for(di=0; di<dd->ndim; di++)
6025     {
6026         d = dd->dim[di];
6027         /* Check using the initial average cell size */
6028         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6029     }
6030
6031     return r;
6032 }
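/* Example for average_cellsize_min() above (illustrative): a rectangular
 * 6 x 6 x 6 nm box (skew_fac = 1) decomposed as 4 x 3 x 2 gives average
 * cell sizes of 1.5, 2 and 3 nm, so this returns 1.5 nm.
 */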
6033
6034 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6035                              const char *dlb_opt,gmx_bool bRecordLoad,
6036                              unsigned long Flags,t_inputrec *ir)
6037 {
6038     gmx_domdec_t *dd;
6039     int  eDLB=-1;
6040     char buf[STRLEN];
6041
6042     switch (dlb_opt[0])
6043     {
6044     case 'a': eDLB = edlbAUTO; break;
6045     case 'n': eDLB = edlbNO;   break;
6046     case 'y': eDLB = edlbYES;  break;
6047     default: gmx_incons("Unknown dlb_opt");
6048     }
6049
6050     if (Flags & MD_RERUN)
6051     {
6052         return edlbNO;
6053     }
6054
6055     if (!EI_DYNAMICS(ir->eI))
6056     {
6057         if (eDLB == edlbYES)
6058         {
6059             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6060             dd_warning(cr,fplog,buf);
6061         }
6062             
6063         return edlbNO;
6064     }
6065
6066     if (!bRecordLoad)
6067     {
6068         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6069
6070         return edlbNO;
6071     }
6072
6073     if (Flags & MD_REPRODUCIBLE)
6074     {
6075         switch (eDLB)
6076         {
6077         case edlbNO:
6078             break;
6079         case edlbAUTO:
6080             dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6081             eDLB = edlbNO;
6082             break;
6083         case edlbYES:
6084             dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6085             break;
6086         default:
6087             gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6088             break;
6089         }
6090     }
6091
6092     return eDLB;
6093 }
6094
6095 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6096 {
6097     int dim;
6098
6099     dd->ndim = 0;
6100     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6101     {
6102         /* Decomposition order z,y,x */
6103         if (fplog)
6104         {
6105             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6106         }
6107         for(dim=DIM-1; dim>=0; dim--)
6108         {
6109             if (dd->nc[dim] > 1)
6110             {
6111                 dd->dim[dd->ndim++] = dim;
6112             }
6113         }
6114     }
6115     else
6116     {
6117         /* Decomposition order x,y,z */
6118         for(dim=0; dim<DIM; dim++)
6119         {
6120             if (dd->nc[dim] > 1)
6121             {
6122                 dd->dim[dd->ndim++] = dim;
6123             }
6124         }
6125     }
6126 }
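/* Example for set_dd_dim() above (illustrative): with nc = {4,1,2} the
 * default order gives ndim = 2 and dim = {XX, ZZ}; with GMX_DD_ORDER_ZYX
 * set the order becomes dim = {ZZ, XX}.
 */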
6127
6128 static gmx_domdec_comm_t *init_dd_comm()
6129 {
6130     gmx_domdec_comm_t *comm;
6131     int  i;
6132
6133     snew(comm,1);
6134     snew(comm->cggl_flag,DIM*2);
6135     snew(comm->cgcm_state,DIM*2);
6136     for(i=0; i<DIM*2; i++)
6137     {
6138         comm->cggl_flag_nalloc[i]  = 0;
6139         comm->cgcm_state_nalloc[i] = 0;
6140     }
6141     
6142     comm->nalloc_int = 0;
6143     comm->buf_int    = NULL;
6144
6145     vec_rvec_init(&comm->vbuf);
6146
6147     comm->n_load_have    = 0;
6148     comm->n_load_collect = 0;
6149
6150     for(i=0; i<ddnatNR-ddnatZONE; i++)
6151     {
6152         comm->sum_nat[i] = 0;
6153     }
6154     comm->ndecomp = 0;
6155     comm->nload   = 0;
6156     comm->load_step = 0;
6157     comm->load_sum  = 0;
6158     comm->load_max  = 0;
6159     clear_ivec(comm->load_lim);
6160     comm->load_mdf  = 0;
6161     comm->load_pme  = 0;
6162
6163     return comm;
6164 }
6165
6166 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6167                                         unsigned long Flags,
6168                                         ivec nc,
6169                                         real comm_distance_min,real rconstr,
6170                                         const char *dlb_opt,real dlb_scale,
6171                                         const char *sizex,const char *sizey,const char *sizez,
6172                                         gmx_mtop_t *mtop,t_inputrec *ir,
6173                                         matrix box,rvec *x,
6174                                         gmx_ddbox_t *ddbox,
6175                                         int *npme_x,int *npme_y)
6176 {
6177     gmx_domdec_t *dd;
6178     gmx_domdec_comm_t *comm;
6179     int  recload;
6180     int  d,i,j;
6181     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6182     gmx_bool bC;
6183     char buf[STRLEN];
6184     
6185     if (fplog)
6186     {
6187         fprintf(fplog,
6188                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6189     }
6190     
6191     snew(dd,1);
6192
6193     dd->comm = init_dd_comm();
6194     comm = dd->comm;
6197
6198     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6199     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6200     
6201     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6202     comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6203     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6204     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6205     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6206     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6207     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6208     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6209
6210     dd->pme_recv_f_alloc = 0;
6211     dd->pme_recv_f_buf = NULL;
6212
6213     if (dd->bSendRecv2 && fplog)
6214     {
6215         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6216     }
6217     if (comm->eFlop)
6218     {
6219         if (fplog)
6220         {
6221             fprintf(fplog,"Will load balance based on FLOP count\n");
6222         }
6223         if (comm->eFlop > 1)
6224         {
6225             srand(1+cr->nodeid);
6226         }
6227         comm->bRecordLoad = TRUE;
6228     }
6229     else
6230     {
6231         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6232                              
6233     }
6234     
6235     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6236     
6237     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6238     if (fplog)
6239     {
6240         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6241     }
6242     dd->bGridJump = comm->bDynLoadBal;
6243     
6244     if (comm->nstSortCG)
6245     {
6246         if (fplog)
6247         {
6248             if (comm->nstSortCG == 1)
6249             {
6250                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6251             }
6252             else
6253             {
6254                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6255                         comm->nstSortCG);
6256             }
6257         }
6258         snew(comm->sort,1);
6259     }
6260     else
6261     {
6262         if (fplog)
6263         {
6264             fprintf(fplog,"Will not sort the charge groups\n");
6265         }
6266     }
6267     
6268     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6269     if (comm->bInterCGBondeds)
6270     {
6271         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6272     }
6273     else
6274     {
6275         comm->bInterCGMultiBody = FALSE;
6276     }
6277     
6278     dd->bInterCGcons = inter_charge_group_constraints(mtop);
6279
6280     if (ir->rlistlong == 0)
6281     {
6282         /* Set the cut-off to some very large value,
6283          * so we don't need if statements everywhere in the code.
6284          * We use sqrt, since the cut-off is squared in some places.
6285          */
6286         comm->cutoff   = GMX_CUTOFF_INF;
6287     }
6288     else
6289     {
6290         comm->cutoff   = ir->rlistlong;
6291     }
6292     comm->cutoff_mbody = 0;
6293     
6294     comm->cellsize_limit = 0;
6295     comm->bBondComm = FALSE;
6296
6297     if (comm->bInterCGBondeds)
6298     {
6299         if (comm_distance_min > 0)
6300         {
6301             comm->cutoff_mbody = comm_distance_min;
6302             if (Flags & MD_DDBONDCOMM)
6303             {
6304                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6305             }
6306             else
6307             {
6308                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6309             }
6310             r_bonded_limit = comm->cutoff_mbody;
6311         }
6312         else if (ir->bPeriodicMols)
6313         {
6314             /* Can not easily determine the required cut-off */
6315             dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6316             comm->cutoff_mbody = comm->cutoff/2;
6317             r_bonded_limit = comm->cutoff_mbody;
6318         }
6319         else
6320         {
6321             if (MASTER(cr))
6322             {
6323                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6324                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6325             }
6326             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6327             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6328
6329             /* We use an initial margin of 10% for the minimum cell size,
6330              * except when we are just below the non-bonded cut-off.
6331              */
6332             if (Flags & MD_DDBONDCOMM)
6333             {
6334                 if (max(r_2b,r_mb) > comm->cutoff)
6335                 {
6336                     r_bonded       = max(r_2b,r_mb);
6337                     r_bonded_limit = 1.1*r_bonded;
6338                     comm->bBondComm = TRUE;
6339                 }
6340                 else
6341                 {
6342                     r_bonded       = r_mb;
6343                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6344                 }
6345                 /* We determine cutoff_mbody later */
6346             }
6347             else
6348             {
6349                 /* No special bonded communication,
6350                  * simply increase the DD cut-off.
6351                  */
6352                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6353                 comm->cutoff_mbody = r_bonded_limit;
6354                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6355             }
6356         }
6357         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6358         if (fplog)
6359         {
6360             fprintf(fplog,
6361                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6362                     comm->cellsize_limit);
6363         }
6364     }
6365
6366     if (dd->bInterCGcons && rconstr <= 0)
6367     {
6368         /* There is a cell size limit due to the constraints (P-LINCS) */
6369         rconstr = constr_r_max(fplog,mtop,ir);
6370         if (fplog)
6371         {
6372             fprintf(fplog,
6373                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6374                     rconstr);
6375             if (rconstr > comm->cellsize_limit)
6376             {
6377                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6378             }
6379         }
6380     }
6381     else if (rconstr > 0 && fplog)
6382     {
6383         /* Here we do not check for dd->bInterCGcons,
6384          * because one can also set a cell size limit for virtual sites only
6385          * and at this point we don't know yet if there are intercg v-sites.
6386          */
6387         fprintf(fplog,
6388                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6389                 rconstr);
6390     }
6391     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6392
6393     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6394
6395     if (nc[XX] > 0)
6396     {
6397         copy_ivec(nc,dd->nc);
6398         set_dd_dim(fplog,dd);
6399         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6400
6401         if (cr->npmenodes == -1)
6402         {
6403             cr->npmenodes = 0;
6404         }
6405         acs = average_cellsize_min(dd,ddbox);
6406         if (acs < comm->cellsize_limit)
6407         {
6408             if (fplog)
6409             {
6410                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6411             }
6412             gmx_fatal_collective(FARGS,cr,NULL,
6413                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6414                                  acs,comm->cellsize_limit);
6415         }
6416     }
6417     else
6418     {
6419         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6420
6421         /* We need to choose the optimal DD grid and possibly PME nodes */
6422         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6423                                comm->eDLB!=edlbNO,dlb_scale,
6424                                comm->cellsize_limit,comm->cutoff,
6425                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6426         
6427         if (dd->nc[XX] == 0)
6428         {
6429             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6430             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6431                     !bC ? "-rdd" : "-rcon",
6432                     comm->eDLB!=edlbNO ? " or -dds" : "",
6433                     bC ? " or your LINCS settings" : "");
6434
6435             gmx_fatal_collective(FARGS,cr,NULL,
6436                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6437                                  "%s\n"
6438                                  "Look in the log file for details on the domain decomposition",
6439                                  cr->nnodes-cr->npmenodes,limit,buf);
6440         }
6441         set_dd_dim(fplog,dd);
6442     }
6443
6444     if (fplog)
6445     {
6446         fprintf(fplog,
6447                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6448                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6449     }
6450     
6451     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6452     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6453     {
6454         gmx_fatal_collective(FARGS,cr,NULL,
6455                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6456                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6457     }
6458     if (cr->npmenodes > dd->nnodes)
6459     {
6460         gmx_fatal_collective(FARGS,cr,NULL,
6461                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6462     }
6463     if (cr->npmenodes > 0)
6464     {
6465         comm->npmenodes = cr->npmenodes;
6466     }
6467     else
6468     {
6469         comm->npmenodes = dd->nnodes;
6470     }
6471
6472     if (EEL_PME(ir->coulombtype))
6473     {
6474         /* The following choices should match those
6475          * in comm_cost_est in domdec_setup.c.
6476          * Note that here the checks have to take into account
6477          * that the decomposition might occur in a different order than xyz
6478          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6479          * in which case they will not match those in comm_cost_est,
6480          * but since that is mainly for testing purposes that's fine.
6481          */
6482         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6483             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6484             getenv("GMX_PMEONEDD") == NULL)
6485         {
6486             comm->npmedecompdim = 2;
6487             comm->npmenodes_x   = dd->nc[XX];
6488             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6489         }
6490         else
6491         {
6492             /* In case nc is 1 in both x and y we could still choose to
6493              * decompose pme in y instead of x, but we use x for simplicity.
6494              */
6495             comm->npmedecompdim = 1;
6496             if (dd->dim[0] == YY)
6497             {
6498                 comm->npmenodes_x = 1;
6499                 comm->npmenodes_y = comm->npmenodes;
6500             }
6501             else
6502             {
6503                 comm->npmenodes_x = comm->npmenodes;
6504                 comm->npmenodes_y = 1;
6505             }
6506         }    
6507         if (fplog)
6508         {
6509             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6510                     comm->npmenodes_x,comm->npmenodes_y,1);
6511         }
6512     }
6513     else
6514     {
6515         comm->npmedecompdim = 0;
6516         comm->npmenodes_x   = 0;
6517         comm->npmenodes_y   = 0;
6518     }
6519     
6520     /* Technically we don't need both of these,
6521      * but keeping them simplifies the code by avoiding recalculation.
6522      */
6523     *npme_x = comm->npmenodes_x;
6524     *npme_y = comm->npmenodes_y;
6525         
6526     snew(comm->slb_frac,DIM);
6527     if (comm->eDLB == edlbNO)
6528     {
6529         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6530         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6531         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6532     }
6533
6534     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6535     {
6536         if (comm->bBondComm || comm->eDLB != edlbNO)
6537         {
6538             /* Set the bonded communication distance to halfway
6539              * the minimum and the maximum,
6540              * since the extra communication cost is nearly zero.
6541              */
6542             acs = average_cellsize_min(dd,ddbox);
6543             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6544             if (comm->eDLB != edlbNO)
6545             {
6546                 /* Check if this does not limit the scaling */
6547                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6548             }
6549             if (!comm->bBondComm)
6550             {
6551                 /* Without bBondComm do not go beyond the n.b. cut-off */
6552                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6553                 if (comm->cellsize_limit >= comm->cutoff)
6554                 {
6555                     /* We don't lose a lot of efficiency
6556                      * when increasing it to the n.b. cut-off.
6557                      * It can even be slightly faster, because we need
6558                      * less checks for the communication setup.
6559                      */
6560                     comm->cutoff_mbody = comm->cutoff;
6561                 }
6562             }
6563             /* Check if we did not end up below our original limit */
6564             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6565
6566             if (comm->cutoff_mbody > comm->cellsize_limit)
6567             {
6568                 comm->cellsize_limit = comm->cutoff_mbody;
6569             }
6570         }
6571         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6572     }
6573
6574     if (debug)
6575     {
6576         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6577                 "cellsize limit %f\n",
6578                 comm->bBondComm,comm->cellsize_limit);
6579     }
6580     
6581     if (MASTER(cr))
6582     {
6583         check_dd_restrictions(cr,dd,ir,fplog);
6584     }
6585
6586     comm->globalcomm_step = INT_MIN;
6587     dd->ddp_count = 0;
6588
6589     clear_dd_cycle_counts(dd);
6590
6591     return dd;
6592 }
6593
6594 static void set_dlb_limits(gmx_domdec_t *dd)
6596 {
6597     int d;
6598
6599     for(d=0; d<dd->ndim; d++)
6600     {
6601         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6602         dd->comm->cellsize_min[dd->dim[d]] =
6603             dd->comm->cellsize_min_dlb[dd->dim[d]];
6604     }
6605 }
6606
6607
6608 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6609 {
6610     gmx_domdec_t *dd;
6611     gmx_domdec_comm_t *comm;
6612     real cellsize_min;
6613     int  d,nc,i;
6614     char buf[STRLEN];
6615     
6616     dd = cr->dd;
6617     comm = dd->comm;
6618     
6619     if (fplog)
6620     {
6621         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6622     }
6623
6624     cellsize_min = comm->cellsize_min[dd->dim[0]];
6625     for(d=1; d<dd->ndim; d++)
6626     {
6627         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6628     }
6629
6630     if (cellsize_min < comm->cellsize_limit*1.05)
6631     {
6632         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6633
6634         /* Change DLB from "auto" to "no". */
6635         comm->eDLB = edlbNO;
6636
6637         return;
6638     }
6639
6640     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6641     comm->bDynLoadBal = TRUE;
6642     dd->bGridJump = TRUE;
6643     
6644     set_dlb_limits(dd);
6645
6646     /* We can set the required cell size info here,
6647      * so we do not need to communicate this.
6648      * The grid is completely uniform.
6649      */
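         /* For example, with nc = 4 in a dimension the relative boundaries
          * become 0, 0.25, 0.5, 0.75 and 1.
          */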
6650     for(d=0; d<dd->ndim; d++)
6651     {
6652         if (comm->root[d])
6653         {
6654             comm->load[d].sum_m = comm->load[d].sum;
6655
6656             nc = dd->nc[dd->dim[d]];
6657             for(i=0; i<nc; i++)
6658             {
6659                 comm->root[d]->cell_f[i]    = i/(real)nc;
6660                 if (d > 0)
6661                 {
6662                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6663                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6664                 }
6665             }
6666             comm->root[d]->cell_f[nc] = 1.0;
6667         }
6668     }
6669 }
6670
6671 static char *init_bLocalCG(gmx_mtop_t *mtop)
6672 {
6673     int  ncg,cg;
6674     char *bLocalCG;
6675     
6676     ncg = ncg_mtop(mtop);
6677     snew(bLocalCG,ncg);
6678     for(cg=0; cg<ncg; cg++)
6679     {
6680         bLocalCG[cg] = FALSE;
6681     }
6682
6683     return bLocalCG;
6684 }
6685
6686 void dd_init_bondeds(FILE *fplog,
6687                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6688                      gmx_vsite_t *vsite,gmx_constr_t constr,
6689                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6690 {
6691     gmx_domdec_comm_t *comm;
6692     gmx_bool bBondComm;
6693     int  d;
6694
6695     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6696
6697     comm = dd->comm;
6698
6699     if (comm->bBondComm)
6700     {
6701         /* Communicate atoms beyond the cut-off for bonded interactions */
6704         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6705
6706         comm->bLocalCG = init_bLocalCG(mtop);
6707     }
6708     else
6709     {
6710         /* Only communicate atoms based on cut-off */
6711         comm->cglink   = NULL;
6712         comm->bLocalCG = NULL;
6713     }
6714 }
6715
6716 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6717                               t_inputrec *ir,
6718                               gmx_bool bDynLoadBal,real dlb_scale,
6719                               gmx_ddbox_t *ddbox)
6720 {
6721     gmx_domdec_comm_t *comm;
6722     int  d;
6723     ivec np;
6724     real limit,shrink;
6725     char buf[64];
6726
6727     if (fplog == NULL)
6728     {
6729         return;
6730     }
6731
6732     comm = dd->comm;
6733
6734     if (bDynLoadBal)
6735     {
6736         fprintf(fplog,"The maximum number of communication pulses is:");
6737         for(d=0; d<dd->ndim; d++)
6738         {
6739             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6740         }
6741         fprintf(fplog,"\n");
6742         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6743         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6744         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6745         for(d=0; d<DIM; d++)
6746         {
6747             if (dd->nc[d] > 1)
6748             {
6749                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6750                 {
6751                     shrink = 0;
6752                 }
6753                 else
6754                 {
6755                     shrink =
6756                         comm->cellsize_min_dlb[d]/
6757                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6758                 }
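                     /* For example, a 12 nm box dimension over 8 cells gives
                      * 1.5 nm cells on average; a DLB minimum cell size of
                      * 0.9 nm is then printed as a shrink of 0.60.
                      */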
6759                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6760             }
6761         }
6762         fprintf(fplog,"\n");
6763     }
6764     else
6765     {
6766         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6767         fprintf(fplog,"The initial number of communication pulses is:");
6768         for(d=0; d<dd->ndim; d++)
6769         {
6770             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
6771         }
6772         fprintf(fplog,"\n");
6773         fprintf(fplog,"The initial domain decomposition cell size is:");
6774         for(d=0; d<DIM; d++)
             {
6775             if (dd->nc[d] > 1)
6776             {
6777                 fprintf(fplog," %c %.2f nm",
6778                         dim2char(d),dd->comm->cellsize_min[d]);
6779             }
6780         }
6781         fprintf(fplog,"\n\n");
6782     }
6783     
6784     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
6785     {
6786         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
6787         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6788                 "non-bonded interactions","",comm->cutoff);
6789
6790         if (bDynLoadBal)
6791         {
6792             limit = dd->comm->cellsize_limit;
6793         }
6794         else
6795         {
6796             if (dynamic_dd_box(ddbox,ir))
6797             {
6798                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
6799             }
6800             limit = dd->comm->cellsize_min[XX];
6801             for(d=1; d<DIM; d++)
6802             {
6803                 limit = min(limit,dd->comm->cellsize_min[d]);
6804             }
6805         }
6806
6807         if (comm->bInterCGBondeds)
6808         {
6809             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6810                     "two-body bonded interactions","(-rdd)",
6811                     max(comm->cutoff,comm->cutoff_mbody));
6812             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6813                     "multi-body bonded interactions","(-rdd)",
6814                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
6815         }
6816         if (dd->vsite_comm)
6817         {
6818             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6819                     "virtual site constructions","(-rcon)",limit);
6820         }
6821         if (dd->constraint_comm)
6822         {
6823             sprintf(buf,"atoms separated by up to %d constraints",
6824                     1+ir->nProjOrder);
6825             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6826                     buf,"(-rcon)",limit);
6827         }
6828         fprintf(fplog,"\n");
6829     }
6830     
6831     fflush(fplog);
6832 }
6833
6834 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
6835                        t_inputrec *ir,t_forcerec *fr,
6836                        gmx_ddbox_t *ddbox)
6837 {
6838     gmx_domdec_comm_t *comm;
6839     int  d,dim,npulse,npulse_d_max,npulse_d;
6840     gmx_bool bNoCutOff;
6841     int  natoms_tot;
6842     real vol_frac;
6843
6844     comm = dd->comm;
6845
6846     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6847
6848     if (EEL_PME(ir->coulombtype))
6849     {
6850         init_ddpme(dd,&comm->ddpme[0],0);
6851         if (comm->npmedecompdim >= 2)
6852         {
6853             init_ddpme(dd,&comm->ddpme[1],1);
6854         }
6855     }
6856     else
6857     {
6858         comm->npmenodes = 0;
6859         if (dd->pme_nodeid >= 0)
6860         {
6861             gmx_fatal_collective(FARGS,NULL,dd,
6862                                  "Can not have separate PME nodes without PME electrostatics");
6863         }
6864     }
6865     
6866     /* If each molecule is a single charge group
6867      * or we use domain decomposition for each periodic dimension,
6868      * we do not need to take pbc into account for the bonded interactions.
6869      */
6870     if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
6871         (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY)))
6872     {
6873         fr->bMolPBC = FALSE;
6874     }
6875     else
6876     {
6877         fr->bMolPBC = TRUE;
6878     }
6879         
6880     if (debug)
6881     {
6882         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
6883     }
6884     if (comm->eDLB != edlbNO)
6885     {
6886         /* Determine the maximum number of comm. pulses in one dimension */
6887         
6888         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6889         
6890         /* Determine the maximum required number of grid pulses */
6891         if (comm->cellsize_limit >= comm->cutoff)
6892         {
6893             /* Only a single pulse is required */
6894             npulse = 1;
6895         }
6896         else if (!bNoCutOff && comm->cellsize_limit > 0)
6897         {
6898             /* We round down slightly here to avoid overhead due to the latency
6899              * of extra communication calls when the cut-off
6900              * would be only slightly longer than the cell size.
6901              * Later cellsize_limit is redetermined,
6902              * so we cannot miss interactions due to this rounding.
6903              */
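                 /* For example, cutoff = 1.2 nm and cellsize_limit = 0.59 nm
                  * give a ratio of about 2.03; with the 0.96 offset npulse
                  * becomes 2 instead of the 3 a plain ceiling would give.
                  */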
6904             npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
6905         }
6906         else
6907         {
6908             /* There is no cell size limit */
6909             npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
6910         }
6911
6912         if (!bNoCutOff && npulse > 1)
6913         {
6914             /* See if we can do with fewer pulses, based on dlb_scale */
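                 /* npulse_d estimates the pulses needed when cells shrink to
                  * dlb_scale times their average size; e.g. nc[dim] = 8,
                  * cutoff = 1.2 nm, box_size*skew_fac = 12 nm and
                  * dlb_scale = 0.8 give npulse_d = (int)(1 + 9.6/9.6) = 2.
                  */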
6915             npulse_d_max = 0;
6916             for(d=0; d<dd->ndim; d++)
6917             {
6918                 dim = dd->dim[d];
6919                 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6920                                  /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6921                 npulse_d_max = max(npulse_d_max,npulse_d);
6922             }
6923             npulse = min(npulse,npulse_d_max);
6924         }
6925         
6926         /* This env var can override npulse */
6927         d = dd_nst_env(fplog,"GMX_DD_NPULSE",0);
6928         if (d > 0)
6929         {
6930             npulse = d;
6931         }
6932
6933         comm->maxpulse = 1;
6934         comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
6935         for(d=0; d<dd->ndim; d++)
6936         {
6937             comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
6938             comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6939             snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
6940             comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
6941             if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6942             {
6943                 comm->bVacDLBNoLimit = FALSE;
6944             }
6945         }
6946         
6947         /* cellsize_limit is set for LINCS in init_domain_decomposition */
6948         if (!comm->bVacDLBNoLimit)
6949         {
6950             comm->cellsize_limit = max(comm->cellsize_limit,
6951                                        comm->cutoff/comm->maxpulse);
6952         }
6953         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6954         /* Set the minimum cell size for each DD dimension */
6955         for(d=0; d<dd->ndim; d++)
6956         {
6957             if (comm->bVacDLBNoLimit ||
6958                 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
6959             {
6960                 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
6961             }
6962             else
6963             {
6964                 comm->cellsize_min_dlb[dd->dim[d]] =
6965                     comm->cutoff/comm->cd[d].np_dlb;
6966             }
6967         }
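             /* For example, with a 1.2 nm cut-off, np_dlb = 2 and a 0.5 nm
              * cellsize_limit, 2*0.5 < 1.2, so the DLB minimum cell size
              * for that dimension becomes 1.2/2 = 0.6 nm.
              */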
6968         if (comm->cutoff_mbody <= 0)
6969         {
6970             comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
6971         }
6972         if (comm->bDynLoadBal)
6973         {
6974             set_dlb_limits(dd);
6975         }
6976     }
6977     
6978     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
6979     if (comm->eDLB == edlbAUTO)
6980     {
6981         if (fplog)
6982         {
6983             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
6984         }
6985         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
6986     }
6987
6988     if (ir->ePBC == epbcNONE)
6989     {
6990         vol_frac = 1 - 1/(double)dd->nnodes;
6991     }
6992     else
6993     {
6994         vol_frac =
6995             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
6996     }
6997     if (debug)
6998     {
6999         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7000     }
7001     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7002    
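         /* vol_frac*natoms_tot estimates how many atoms (home zone plus
          * communicated zones) one node will hold; this estimate is used
          * to size the global-to-local atom lookup created here.
          */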
7003     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7004 }
7005
7006 static void merge_cg_buffers(int ncell,
7007                              gmx_domdec_comm_dim_t *cd, int pulse,
7008                              int  *ncg_cell,
7009                              int  *index_gl, int  *recv_i,
7010                              rvec *cg_cm,    rvec *recv_vr,
7011                              int *cgindex,
7012                              cginfo_mb_t *cginfo_mb,int *cginfo)
7013 {
7014     gmx_domdec_ind_t *ind,*ind_p;
7015     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7016     int shift,shift_at;
7017     
7018     ind = &cd->ind[pulse];
7019     
7020     /* First correct the already stored data */
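         /* The cg's received in this pulse for zone c have to be appended to
          * zone c's existing block, so the stored blocks of later zones (and
          * their send indices from earlier pulses) are shifted up by the
          * number of new cg's that end up before them.
          */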
7021     shift = ind->nrecv[ncell];
7022     for(cell=ncell-1; cell>=0; cell--)
7023     {
7024         shift -= ind->nrecv[cell];
7025         if (shift > 0)
7026         {
7027             /* Move the cg's already present from previous grid pulses */
7028             cg0 = ncg_cell[ncell+cell];
7029             cg1 = ncg_cell[ncell+cell+1];
7030             cgindex[cg1+shift] = cgindex[cg1];
7031             for(cg=cg1-1; cg>=cg0; cg--)
7032             {
7033                 index_gl[cg+shift] = index_gl[cg];
7034                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7035                 cgindex[cg+shift] = cgindex[cg];
7036                 cginfo[cg+shift] = cginfo[cg];
7037             }
7038             /* Correct the already stored send indices for the shift */
7039             for(p=1; p<=pulse; p++)
7040             {
7041                 ind_p = &cd->ind[p];
7042                 cg0 = 0;
7043                 for(c=0; c<cell; c++)
7044                 {
7045                     cg0 += ind_p->nsend[c];
7046                 }
7047                 cg1 = cg0 + ind_p->nsend[cell];
7048                 for(cg=cg0; cg<cg1; cg++)
7049                 {
7050                     ind_p->index[cg] += shift;
7051                 }
7052             }
7053         }
7054     }
7055
7056     /* Merge in the communicated buffers */
7057     shift = 0;
7058     shift_at = 0;
7059     cg0 = 0;
7060     for(cell=0; cell<ncell; cell++)
7061     {
7062         cg1 = ncg_cell[ncell+cell+1] + shift;
7063         if (shift_at > 0)
7064         {
7065             /* Correct the old cg indices */
7066             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7067             {
7068                 cgindex[cg+1] += shift_at;
7069             }
7070         }
7071         for(cg=0; cg<ind->nrecv[cell]; cg++)
7072         {
7073             /* Copy this charge group from the buffer */
7074             index_gl[cg1] = recv_i[cg0];
7075             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7076             /* Add it to the cgindex */
7077             cg_gl = index_gl[cg1];
7078             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7079             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7080             cgindex[cg1+1] = cgindex[cg1] + nat;
7081             cg0++;
7082             cg1++;
7083             shift_at += nat;
7084         }
7085         shift += ind->nrecv[cell];
7086         ncg_cell[ncell+cell+1] = cg1;
7087     }
7088 }
7089
7090 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7091                                int nzone,int cg0,const int *cgindex)
7092 {
7093     int cg,zone,p;
7094     
7095     /* Store the atom block boundaries for easy copying of communication buffers
7096      */
7097     cg = cg0;
7098     for(zone=0; zone<nzone; zone++)
7099     {
7100         for(p=0; p<cd->np; p++)
             {
7101             cd->ind[p].cell2at0[zone] = cgindex[cg];
7102             cg += cd->ind[p].nrecv[zone];
7103             cd->ind[p].cell2at1[zone] = cgindex[cg];
7104         }
7105     }
7106 }
7107
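     /* Returns TRUE when charge group cg_gl is linked through bonded
      * interactions to at least one charge group that is not flagged
      * in bLocalCG.
      */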
7108 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7109 {
7110     int  i;
7111     gmx_bool bMiss;
7112
7113     bMiss = FALSE;
7114     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7115     {
7116         if (!bLocalCG[link->a[i]])
7117         {
7118             bMiss = TRUE;
7119         }
7120     }
7121
7122     return bMiss;
7123 }
7124
7125 static void setup_dd_communication(gmx_domdec_t *dd,
7126                                    matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr)
7127 {
7128     int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot;
7129     int nzone,nzone_send,zone,zonei,cg0,cg1;
7130     int c,i,j,cg,cg_gl,nrcg;
7131     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7132     gmx_domdec_comm_t *comm;
7133     gmx_domdec_zones_t *zones;
7134     gmx_domdec_comm_dim_t *cd;
7135     gmx_domdec_ind_t *ind;
7136     cginfo_mb_t *cginfo_mb;
7137     gmx_bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew;
7138     real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh;
7139     rvec rb,rn;
7140     real corner[DIM][4],corner_round_0=0,corner_round_1[4];
7141     real bcorner[DIM],bcorner_round_1=0;
7142     ivec tric_dist;
7143     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7144     real skew_fac2_d,skew_fac_01;
7145     rvec sf2_round;
7146     int  nsend,nat;
7147     
7148     if (debug)
7149     {
7150         fprintf(debug,"Setting up DD communication\n");
7151     }
7152     
7153     comm  = dd->comm;
7154     cg_cm = fr->cg_cm;
7155
7156     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7157     {
7158         dim = dd->dim[dim_ind];
7159
7160         /* Check if we need to use triclinic distances */
7161         tric_dist[dim_ind] = 0;
7162         for(i=0; i<=dim_ind; i++)
7163         {
7164             if (ddbox->tric_dir[dd->dim[i]])
7165             {
7166                 tric_dist[dim_ind] = 1;
7167             }
7168         }
7169     }
7170
7171     bBondComm = comm->bBondComm;
7172
7173     /* Do we need to determine extra distances for multi-body bondeds? */
7174     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7175     
7176     /* Do we need to determine extra distances for only two-body bondeds? */
7177     bDist2B = (bBondComm && !bDistMB);
7178
7179     r_comm2  = sqr(comm->cutoff);
7180     r_bcomm2 = sqr(comm->cutoff_mbody);
7181
7182     if (debug)
7183     {
7184         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7185     }
7186
7187     zones = &comm->zones;
7188     
7189     dim0 = dd->dim[0];
7190     /* The first dimension is equal for all cells */
7191     corner[0][0] = comm->cell_x0[dim0];
7192     if (bDistMB)
7193     {
7194         bcorner[0] = corner[0][0];
7195     }
7196     if (dd->ndim >= 2)
7197     {
7198         dim1 = dd->dim[1];
7199         /* This cell row is only seen from the first row */
7200         corner[1][0] = comm->cell_x0[dim1];
7201         /* All rows can see this row */
7202         corner[1][1] = comm->cell_x0[dim1];
7203         if (dd->bGridJump)
7204         {
7205             corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7206             if (bDistMB)
7207             {
7208                 /* For the multi-body distance we need the maximum */
7209                 bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7210             }
7211         }
7212         /* Set the upper-right corner for rounding */
7213         corner_round_0 = comm->cell_x1[dim0];
7214         
7215         if (dd->ndim >= 3)
7216         {
7217             dim2 = dd->dim[2];
7218             for(j=0; j<4; j++)
7219             {
7220                 corner[2][j] = comm->cell_x0[dim2];
7221             }
7222             if (dd->bGridJump)
7223             {
7224                 /* Use the maximum of the i-cells that see a j-cell */
7225                 for(i=0; i<zones->nizone; i++)
7226                 {
7227                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7228                     {
7229                         if (j >= 4)
7230                         {
7231                             corner[2][j-4] =
7232                                 max(corner[2][j-4],
7233                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7234                         }
7235                     }
7236                 }
7237                 if (bDistMB)
7238                 {
7239                     /* For the multi-body distance we need the maximum */
7240                     bcorner[2] = comm->cell_x0[dim2];
7241                     for(i=0; i<2; i++)
7242                     {
7243                         for(j=0; j<2; j++)
7244                         {
7245                             bcorner[2] = max(bcorner[2],
7246                                              comm->zone_d2[i][j].p1_0);
7247                         }
7248                     }
7249                 }
7250             }
7251             
7252             /* Set the upper-right corner for rounding */
7253             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7254              * Only cell (0,0,0) can see cell 7 (1,1,1)
7255              */
7256             corner_round_1[0] = comm->cell_x1[dim1];
7257             corner_round_1[3] = comm->cell_x1[dim1];
7258             if (dd->bGridJump)
7259             {
7260                 corner_round_1[0] = max(comm->cell_x1[dim1],
7261                                         comm->zone_d1[1].mch1);
7262                 if (bDistMB)
7263                 {
7264                     /* For the multi-body distance we need the maximum */
7265                     bcorner_round_1 = max(comm->cell_x1[dim1],
7266                                           comm->zone_d1[1].p1_1);
7267                 }
7268             }
7269         }
7270     }
7271     
7272     /* Triclinic stuff */
7273     normal = ddbox->normal;
7274     skew_fac_01 = 0;
7275     if (dd->ndim >= 2)
7276     {
7277         v_0 = ddbox->v[dim0];
7278         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7279         {
7280             /* Determine the coupling coefficient for the distances
7281              * to the cell planes along dim0 and dim1 through dim2.
7282              * This is required for correct rounding.
7283              */
7284             skew_fac_01 =
7285                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7286             if (debug)
7287             {
7288                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7289             }
7290         }
7291     }
7292     if (dd->ndim >= 3)
7293     {
7294         v_1 = ddbox->v[dim1];
7295     }
7296     
7297     zone_cg_range = zones->cg_range;
7298     index_gl = dd->index_gl;
7299     cgindex  = dd->cgindex;
7300     cginfo_mb = fr->cginfo_mb;
7301     
7302     zone_cg_range[0]   = 0;
7303     zone_cg_range[1]   = dd->ncg_home;
7304     comm->zone_ncg1[0] = dd->ncg_home;
7305     pos_cg             = dd->ncg_home;
7306     
7307     nat_tot = dd->nat_home;
7308     nzone = 1;
7309     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7310     {
7311         dim = dd->dim[dim_ind];
7312         cd = &comm->cd[dim_ind];
7313         
7314         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7315         {
7316             /* No pbc in this dimension, the first node should not comm. */
7317             nzone_send = 0;
7318         }
7319         else
7320         {
7321             nzone_send = nzone;
7322         }
7323
7324         bScrew = (dd->bScrewPBC && dim == XX);
7325         
7326         v_d = ddbox->v[dim];
7327         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7328
7329         cd->bInPlace = TRUE;
7330         for(p=0; p<cd->np; p++)
7331         {
7332             /* Only atoms communicated in the first pulse are used
7333              * for multi-body bonded interactions or for bBondComm.
7334              */
7335             bDistBonded   = ((bDistMB || bDist2B) && p == 0);
7336             bDistMB_pulse = (bDistMB && bDistBonded);
7337
7338             ind = &cd->ind[p];
7339             nsend = 0;
7340             nat = 0;
7341             for(zone=0; zone<nzone_send; zone++)
7342             {
7343                 if (tric_dist[dim_ind] && dim_ind > 0)
7344                 {
7345                     /* Determine slightly more optimized skew_fac's
7346                      * for rounding.
7347                      * This reduces the number of communicated atoms
7348                      * by about 10% for 3D DD of rhombic dodecahedra.
7349                      */
7350                     for(dimd=0; dimd<dim; dimd++)
7351                     {
7352                         sf2_round[dimd] = 1;
7353                         if (ddbox->tric_dir[dimd])
7354                         {
7355                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7356                             {
7357                                 /* If we are shifted in dimension i
7358                                  * and the cell plane is tilted forward
7359                                  * in dimension i, skip this coupling.
7360                                  */
7361                                 if (!(zones->shift[nzone+zone][i] &&
7362                                       ddbox->v[dimd][i][dimd] >= 0))
7363                                 {
7364                                     sf2_round[dimd] +=
7365                                         sqr(ddbox->v[dimd][i][dimd]);
7366                                 }
7367                             }
7368                             sf2_round[dimd] = 1/sf2_round[dimd];
7369                         }
7370                     }
7371                 }
7372
7373                 zonei = zone_perm[dim_ind][zone];
7374                 if (p == 0)
7375                 {
7376                     /* Here we permute the zones to obtain a convenient order
7377                      * for neighbor searching
7378                      */
7379                     cg0 = zone_cg_range[zonei];
7380                     cg1 = zone_cg_range[zonei+1];
7381                 }
7382                 else
7383                 {
7384                     /* Look only at the cg's received in the previous grid pulse
7385                      */
7386                     cg1 = zone_cg_range[nzone+zone+1];
7387                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
7388                 }
7389                 ind->nsend[zone] = 0;
7390                 for(cg=cg0; cg<cg1; cg++)
7391                 {
7392                     r2  = 0;
7393                     rb2 = 0;
7394                     if (tric_dist[dim_ind] == 0)
7395                     {
7396                         /* Rectangular direction, easy */
7397                         r = cg_cm[cg][dim] - corner[dim_ind][zone];
7398                         if (r > 0)
7399                         {
7400                             r2 += r*r;
7401                         }
7402                         if (bDistMB_pulse)
7403                         {
7404                             r = cg_cm[cg][dim] - bcorner[dim_ind];
7405                             if (r > 0)
7406                             {
7407                                 rb2 += r*r;
7408                             }
7409                         }
7410                         /* Rounding gives at most a 16% reduction
7411                          * in communicated atoms
7412                          */
7413                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7414                         {
7415                             r = cg_cm[cg][dim0] - corner_round_0;
7416                             /* This is the first dimension, so always r >= 0 */
7417                             r2 += r*r;
7418                             if (bDistMB_pulse)
7419                             {
7420                                 rb2 += r*r;
7421                             }
7422                         }
7423                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7424                         {
7425                             r = cg_cm[cg][dim1] - corner_round_1[zone];
7426                             if (r > 0)
7427                             {
7428                                 r2 += r*r;
7429                             }
7430                             if (bDistMB_pulse)
7431                             {
7432                                 r = cg_cm[cg][dim1] - bcorner_round_1;
7433                                 if (r > 0)
7434                                 {
7435                                     rb2 += r*r;
7436                                 }
7437                             }
7438                         }
7439                     }
7440                     else
7441                     {
7442                         /* Triclinic direction, more complicated */
7443                         clear_rvec(rn);
7444                         clear_rvec(rb);
7445                         /* Rounding, conservative as the skew_fac multiplication
7446                          * will slightly underestimate the distance.
7447                          */
7448                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7449                         {
7450                             rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
7451                             for(i=dim0+1; i<DIM; i++)
7452                             {
7453                                 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7454                             }
7455                             r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7456                             if (bDistMB_pulse)
7457                             {
7458                                 rb[dim0] = rn[dim0];
7459                                 rb2 = r2;
7460                             }
7461                             /* Take care that the cell planes along dim0 might not
7462                              * be orthogonal to those along dim1 and dim2.
7463                              */
7464                             for(i=1; i<=dim_ind; i++)
7465                             {
7466                                 dimd = dd->dim[i];
7467                                 if (normal[dim0][dimd] > 0)
7468                                 {
7469                                     rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7470                                     if (bDistMB_pulse)
7471                                     {
7472                                         rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7473                                     }
7474                                 }
7475                             }
7476                         }
7477                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7478                         {
7479                             rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
7480                             tric_sh = 0;
7481                             for(i=dim1+1; i<DIM; i++)
7482                             {
7483                                 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7484                             }
7485                             rn[dim1] += tric_sh;
7486                             if (rn[dim1] > 0)
7487                             {
7488                                 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7489                                 /* Take care of coupling of the distances
7490                                  * to the planes along dim0 and dim1 through dim2.
7491                                  */
7492                                 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7493                                 /* Take care that the cell planes along dim1
7494                                  * might not be orthogonal to that along dim2.
7495                                  */
7496                                 if (normal[dim1][dim2] > 0)
7497                                 {
7498                                     rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7499                                 }
7500                             }
7501                             if (bDistMB_pulse)
7502                             {
7503                                 rb[dim1] +=
7504                                     cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
7505                                 if (rb[dim1] > 0)
7506                                 {
7507                                     rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7508                                     /* Take care of coupling of the distances
7509                                      * to the planes along dim0 and dim1 through dim2.
7510                                      */
7511                                     rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7512                                     /* Take care that the cell planes along dim1
7513                                      * might not be orthogonal to that along dim2.
7514                                      */
7515                                     if (normal[dim1][dim2] > 0)
7516                                     {
7517                                         rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7518                                     }
7519                                 }
7520                             }
7521                         }
7522                         /* The distance along the communication direction */
7523                         rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
7524                         tric_sh = 0;
7525                         for(i=dim+1; i<DIM; i++)
7526                         {
7527                             tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7528                         }
7529                         rn[dim] += tric_sh;
7530                         if (rn[dim] > 0)
7531                         {
7532                             r2 += rn[dim]*rn[dim]*skew_fac2_d;
7533                             /* Take care of coupling of the distances
7534                              * to the planes along dim0 and dim1 through dim2.
7535                              */
7536                             if (dim_ind == 1 && zonei == 1)
7537                             {
7538                                 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7539                             }
7540                         }
7541                         if (bDistMB_pulse)
7542                         {
7543                             clear_rvec(rb);
7544                             rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
7545                             if (rb[dim] > 0)
7546                             {
7547                                 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7548                                 /* Take care of coupling of the distances
7549                                  * to the planes along dim0 and dim1 through dim2.
7550                                  */
7551                                 if (dim_ind == 1 && zonei == 1)
7552                                 {
7553                                     rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7554                                 }
7555                             }
7556                         }
7557                     }
7558                     
7559                     if (r2 < r_comm2 ||
7560                         (bDistBonded &&
7561                          ((bDistMB && rb2 < r_bcomm2) ||
7562                           (bDist2B && r2  < r_bcomm2)) &&
7563                          (!bBondComm ||
7564                           (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
7565                            missing_link(comm->cglink,index_gl[cg],
7566                                         comm->bLocalCG)))))
7567                     {
7568                         /* Make an index to the local charge groups */
7569                         if (nsend+1 > ind->nalloc)
7570                         {
7571                             ind->nalloc = over_alloc_large(nsend+1);
7572                             srenew(ind->index,ind->nalloc);
7573                         }
7574                         if (nsend+1 > comm->nalloc_int)
7575                         {
7576                             comm->nalloc_int = over_alloc_large(nsend+1);
7577                             srenew(comm->buf_int,comm->nalloc_int);
7578                         }
7579                         ind->index[nsend] = cg;
7580                         comm->buf_int[nsend] = index_gl[cg];
7581                         ind->nsend[zone]++;
7582                         vec_rvec_check_alloc(&comm->vbuf,nsend+1);
7583
7584                         if (dd->ci[dim] == 0)
7585                         {
7586                             /* Correct cg_cm for pbc */
7587                             rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]);
7588                             if (bScrew)
7589                             {
7590                                 comm->vbuf.v[nsend][YY] =
7591                                     box[YY][YY]-comm->vbuf.v[nsend][YY];
7592                                 comm->vbuf.v[nsend][ZZ] =
7593                                     box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
7594                             }
7595                         }
7596                         else
7597                         {
7598                             copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]);
7599                         }
7600                         nsend++;
7601                         nat += cgindex[cg+1] - cgindex[cg];
7602                     }
7603                 }
7604             }
7605             /* Clear the counts in case we do not have pbc */
7606             for(zone=nzone_send; zone<nzone; zone++)
7607             {
7608                 ind->nsend[zone] = 0;
7609             }
7610             ind->nsend[nzone]   = nsend;
7611             ind->nsend[nzone+1] = nat;
7612             /* Communicate the number of cg's and atoms to receive */
7613             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7614                             ind->nsend, nzone+2,
7615                             ind->nrecv, nzone+2);
7616             
7617             /* The rvec buffer is also required for atom buffers of size nsend
7618              * in dd_move_x and dd_move_f.
7619              */
7620             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
7621
7622             if (p > 0)
7623             {
7624                 /* We can receive in place if only the last zone is not empty */
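                     /* If an earlier zone also received cg's, the new data
                      * would have to be interleaved with cg's already stored
                      * for later zones, so separate buffers are used and the
                      * data is merged afterwards in merge_cg_buffers.
                      */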
7625                 for(zone=0; zone<nzone-1; zone++)
7626                 {
7627                     if (ind->nrecv[zone] > 0)
7628                     {
7629                         cd->bInPlace = FALSE;
7630                     }
7631                 }
7632                 if (!cd->bInPlace)
7633                 {
7634                     /* The int buffer is only required here for the cg indices */
7635                     if (ind->nrecv[nzone] > comm->nalloc_int2)
7636                     {
7637                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
7638                         srenew(comm->buf_int2,comm->nalloc_int2);
7639                     }
7640                     /* The rvec buffer is also required for atom buffers
7641                      * of size nrecv in dd_move_x and dd_move_f.
7642                      */
7643                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
7644                     vec_rvec_check_alloc(&comm->vbuf2,i);
7645                 }
7646             }
7647             
7648             /* Make space for the global cg indices */
7649             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
7650                 || dd->cg_nalloc == 0)
7651             {
7652                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
7653                 srenew(index_gl,dd->cg_nalloc);
7654                 srenew(cgindex,dd->cg_nalloc+1);
7655             }
7656             /* Communicate the global cg indices */
7657             if (cd->bInPlace)
7658             {
7659                 recv_i = index_gl + pos_cg;
7660             }
7661             else
7662             {
7663                 recv_i = comm->buf_int2;
7664             }
7665             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7666                             comm->buf_int, nsend,
7667                             recv_i,        ind->nrecv[nzone]);
7668
7669             /* Make space for cg_cm */
7670             if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc)
7671             {
7672                 dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]);
7673                 cg_cm = fr->cg_cm;
7674             }
7675             /* Communicate cg_cm */
7676             if (cd->bInPlace)
7677             {
7678                 recv_vr = cg_cm + pos_cg;
7679             }
7680             else
7681             {
7682                 recv_vr = comm->vbuf2.v;
7683             }
7684             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
7685                              comm->vbuf.v, nsend,
7686                              recv_vr,      ind->nrecv[nzone]);
7687             
7688             /* Make the charge group index */
7689             if (cd->bInPlace)
7690             {
7691                 zone = (p == 0 ? 0 : nzone - 1);
7692                 while (zone < nzone)
7693                 {
7694                     for(cg=0; cg<ind->nrecv[zone]; cg++)
7695                     {
7696                         cg_gl = index_gl[pos_cg];
7697                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
7698                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
7699                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
7700                         if (bBondComm)
7701                         {
7702                             /* Update the charge group presence,
7703                              * so we can use it in the next pass of the loop.
7704                              */
7705                             comm->bLocalCG[cg_gl] = TRUE;
7706                         }
7707                         pos_cg++;
7708                     }
7709                     if (p == 0)
7710                     {
7711                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
7712                     }
7713                     zone++;
7714                     zone_cg_range[nzone+zone] = pos_cg;
7715                 }
7716             }
7717             else
7718             {
7719                 /* This part of the code is never executed with bBondComm. */
7720                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
7721                                  index_gl,recv_i,cg_cm,recv_vr,
7722                                  cgindex,fr->cginfo_mb,fr->cginfo);
7723                 pos_cg += ind->nrecv[nzone];
7724             }
7725             nat_tot += ind->nrecv[nzone+1];
7726         }
7727         if (!cd->bInPlace)
7728         {
7729             /* Store the atom block for easy copying of communication buffers */
7730             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
7731         }
7732         nzone += nzone;
7733     }
7734     dd->index_gl = index_gl;
7735     dd->cgindex  = cgindex;
7736     
7737     dd->ncg_tot = zone_cg_range[zones->n];
7738     dd->nat_tot = nat_tot;
7739     comm->nat[ddnatHOME] = dd->nat_home;
7740     for(i=ddnatZONE; i<ddnatNR; i++)
7741     {
7742         comm->nat[i] = dd->nat_tot;
7743     }
7744
7745     if (!bBondComm)
7746     {
7747         /* We don't need to update cginfo, since that was already done above.
7748          * So we pass NULL for the forcerec.
7749          */
7750         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
7751                       NULL,comm->bLocalCG);
7752     }
7753
7754     if (debug)
7755     {
7756         fprintf(debug,"Finished setting up DD communication, zones:");
7757         for(c=0; c<zones->n; c++)
7758         {
7759             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
7760         }
7761         fprintf(debug,"\n");
7762     }
7763 }
7764
7765 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
7766 {
7767     int c;
7768     
7769     for(c=0; c<zones->nizone; c++)
7770     {
7771         zones->izone[c].cg1  = zones->cg_range[c+1];
7772         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
7773         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
7774     }
7775 }
7776
7777 static int comp_cgsort(const void *a,const void *b)
7778 {
7779     int comp;
7780     
7781     gmx_cgsort_t *cga,*cgb;
7782     cga = (gmx_cgsort_t *)a;
7783     cgb = (gmx_cgsort_t *)b;
7784     
7785     comp = cga->nsc - cgb->nsc;
7786     if (comp == 0)
7787     {
7788         comp = cga->ind_gl - cgb->ind_gl;
7789     }
7790     
7791     return comp;
7792 }
7793
7794 static void order_int_cg(int n,gmx_cgsort_t *sort,
7795                          int *a,int *buf)
7796 {
7797     int i;
7798     
7799     /* Order the data */
7800     for(i=0; i<n; i++)
7801     {
7802         buf[i] = a[sort[i].ind];
7803     }
7804     
7805     /* Copy back to the original array */
7806     for(i=0; i<n; i++)
7807     {
7808         a[i] = buf[i];
7809     }
7810 }
7811
7812 static void order_vec_cg(int n,gmx_cgsort_t *sort,
7813                          rvec *v,rvec *buf)
7814 {
7815     int i;
7816     
7817     /* Order the data */
7818     for(i=0; i<n; i++)
7819     {
7820         copy_rvec(v[sort[i].ind],buf[i]);
7821     }
7822     
7823     /* Copy back to the original array */
7824     for(i=0; i<n; i++)
7825     {
7826         copy_rvec(buf[i],v[i]);
7827     }
7828 }
7829
7830 static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort,
7831                            rvec *v,rvec *buf)
7832 {
7833     int a,atot,cg,cg0,cg1,i;
7834     
7835     /* Order the data */
7836     a = 0;
7837     for(cg=0; cg<ncg; cg++)
7838     {
7839         cg0 = cgindex[sort[cg].ind];
7840         cg1 = cgindex[sort[cg].ind+1];
7841         for(i=cg0; i<cg1; i++)
7842         {
7843             copy_rvec(v[i],buf[a]);
7844             a++;
7845         }
7846     }
7847     atot = a;
7848     
7849     /* Copy back to the original array */
7850     for(a=0; a<atot; a++)
7851     {
7852         copy_rvec(buf[a],v[a]);
7853     }
7854 }
7855
7856 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
7857                          int nsort_new,gmx_cgsort_t *sort_new,
7858                          gmx_cgsort_t *sort1)
7859 {
7860     int i1,i2,i_new;
7861     
7862     /* The new indices are not very ordered, so we qsort them */
7863     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
7864     
7865     /* sort2 is already ordered, so now we can merge the two arrays */
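         /* Standard two-way merge: at each step the entry with the smallest
          * (nsc, ind_gl) key is appended to sort1.
          */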
7866     i1 = 0;
7867     i2 = 0;
7868     i_new = 0;
7869     while(i2 < nsort2 || i_new < nsort_new)
7870     {
7871         if (i2 == nsort2)
7872         {
7873             sort1[i1++] = sort_new[i_new++];
7874         }
7875         else if (i_new == nsort_new)
7876         {
7877             sort1[i1++] = sort2[i2++];
7878         }
7879         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
7880                  (sort2[i2].nsc == sort_new[i_new].nsc &&
7881                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
7882         {
7883             sort1[i1++] = sort2[i2++];
7884         }
7885         else
7886         {
7887             sort1[i1++] = sort_new[i_new++];
7888         }
7889     }
7890 }
7891
7892 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
7893                           rvec *cgcm,t_forcerec *fr,t_state *state,
7894                           int ncg_home_old)
7895 {
7896     gmx_domdec_sort_t *sort;
7897     gmx_cgsort_t *cgsort,*sort_i;
7898     int  ncg_new,nsort2,nsort_new,i,cell_index,*ibuf,cgsize;
7899     rvec *vbuf;
7900     
7901     sort = dd->comm->sort;
7902     
7903     if (dd->ncg_home > sort->sort_nalloc)
7904     {
7905         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
7906         srenew(sort->sort1,sort->sort_nalloc);
7907         srenew(sort->sort2,sort->sort_nalloc);
7908     }
7909     
7910     if (ncg_home_old >= 0)
7911     {
7912         /* The charge groups that remained in the same ns grid cell
7913          * are already completely ordered, so we only need to sort the
7914          * charge groups that moved and merge them into the stationary list.
7915          */
7916         ncg_new = 0;
7917         nsort2 = 0;
7918         nsort_new = 0;
7919         for(i=0; i<dd->ncg_home; i++)
7920         {
7921             /* Check if this cg did not move to another node */
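                 /* Charge groups that moved to another node are marked with
                  * cell_index 4*ncells; skipping them here removes them from
                  * the home list (ncg_home is set to ncg_new below).
                  */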
7922             cell_index = fr->ns.grid->cell_index[i];
7923             if (cell_index !=  4*fr->ns.grid->ncells)
7924             {
7925                 if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
7926                 {
7927                     /* This cg is new on this node or moved to another ns grid cell */
7928                     if (nsort_new >= sort->sort_new_nalloc)
7929                     {
7930                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
7931                         srenew(sort->sort_new,sort->sort_new_nalloc);
7932                     }
7933                     sort_i = &(sort->sort_new[nsort_new++]);
7934                 }
7935                 else
7936                 {
7937                     /* This cg did not move */
7938                     sort_i = &(sort->sort2[nsort2++]);
7939                 }
7940                 /* Sort on the ns grid cell indices
7941                  * and the global topology index
7942                  */
7943                 sort_i->nsc    = cell_index;
7944                 sort_i->ind_gl = dd->index_gl[i];
7945                 sort_i->ind    = i;
7946                 ncg_new++;
7947             }
7948         }
7949         if (debug)
7950         {
7951             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
7952                     nsort2,nsort_new);
7953         }
7954         /* Sort efficiently */
7955         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1);
7956     }
7957     else
7958     {
7959         cgsort = sort->sort1;
7960         ncg_new = 0;
7961         for(i=0; i<dd->ncg_home; i++)
7962         {
7963             /* Sort on the ns grid cell indices
7964              * and the global topology index
7965              */
7966             cgsort[i].nsc    = fr->ns.grid->cell_index[i];
7967             cgsort[i].ind_gl = dd->index_gl[i];
7968             cgsort[i].ind    = i;
7969             if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
7970             {
7971                 ncg_new++;
7972             }
7973         }
7974         if (debug)
7975         {
7976             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
7977         }
7978         /* Determine the order of the charge groups using qsort */
7979         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
7980     }
7981     cgsort = sort->sort1;
7982     
7983     /* We alloc with the old size, since cgindex is still old */
7984     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
7985     vbuf = dd->comm->vbuf.v;
7986     
7987     /* Remove the charge groups which are no longer at home here */
7988     dd->ncg_home = ncg_new;
7989     
7990     /* Reorder the state */
7991     for(i=0; i<estNR; i++)
7992     {
7993         if (EST_DISTR(i) && (state->flags & (1<<i)))
7994         {
7995             switch (i)
7996             {
7997             case estX:
7998                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->x,vbuf);
7999                 break;
8000             case estV:
8001                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf);
8002                 break;
8003             case estSDX:
8004                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf);
8005                 break;
8006             case estCGP:
8007                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf);
8008                 break;
8009             case estLD_RNG:
8010             case estLD_RNGI:
8011             case estDISRE_INITF:
8012             case estDISRE_RM3TAV:
8013             case estORIRE_INITF:
8014             case estORIRE_DTAV:
8015                 /* No ordering required */
8016                 break;
8017             default:
8018                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8019                 break;
8020             }
8021         }
8022     }
8023     /* Reorder cgcm */
8024     order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8025     
8026     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8027     {
8028         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8029         srenew(sort->ibuf,sort->ibuf_nalloc);
8030     }
8031     ibuf = sort->ibuf;
8032     /* Reorder the global cg index */
8033     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8034     /* Reorder the cginfo */
8035     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8036     /* Rebuild the local cg index */
8037     ibuf[0] = 0;
8038     for(i=0; i<dd->ncg_home; i++)
8039     {
8040         cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8041         ibuf[i+1] = ibuf[i] + cgsize;
8042     }
8043     for(i=0; i<dd->ncg_home+1; i++)
8044     {
8045         dd->cgindex[i] = ibuf[i];
8046     }
8047     /* Set the home atom number */
8048     dd->nat_home = dd->cgindex[dd->ncg_home];
8049     
8050     /* Copy the sorted ns cell indices back to the ns grid struct */
8051     for(i=0; i<dd->ncg_home; i++)
8052     {
8053         fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8054     }
8055     fr->ns.grid->nr = dd->ncg_home;
8056 }
8057
8058 static void add_dd_statistics(gmx_domdec_t *dd)
8059 {
8060     gmx_domdec_comm_t *comm;
8061     int ddnat;
8062     
8063     comm = dd->comm;
8064     
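         /* comm->nat[] appears to hold cumulative atom counts per
          * communication category (zone, vsite, constraint; see the setup
          * loop in dd_partition_system), so the difference taken below is
          * the number of atoms added for each category this partitioning.
          */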
8065     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8066     {
8067         comm->sum_nat[ddnat-ddnatZONE] +=
8068             comm->nat[ddnat] - comm->nat[ddnat-1];
8069     }
8070     comm->ndecomp++;
8071 }
8072
8073 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8074 {
8075     gmx_domdec_comm_t *comm;
8076     int ddnat;
8077     
8078     comm = dd->comm;
8079
8080     /* Reset all the statistics and counters for total run counting */
8081     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8082     {
8083         comm->sum_nat[ddnat-ddnatZONE] = 0;
8084     }
8085     comm->ndecomp = 0;
8086     comm->nload = 0;
8087     comm->load_step = 0;
8088     comm->load_sum = 0;
8089     comm->load_max = 0;
8090     clear_ivec(comm->load_lim);
8091     comm->load_mdf = 0;
8092     comm->load_pme = 0;
8093 }
8094
8095 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8096 {
8097     gmx_domdec_comm_t *comm;
8098     int ddnat;
8099     double av;
8100    
8101     comm = cr->dd->comm;
8102     
8103     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8104     
8105     if (fplog == NULL)
8106     {
8107         return;
8108     }
8109     
8110     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8111             
8112     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8113     {
8114         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8115         switch(ddnat)
8116         {
8117         case ddnatZONE:
8118             fprintf(fplog,
8119                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8120                     2,av);
8121             break;
8122         case ddnatVSITE:
8123             if (cr->dd->vsite_comm)
8124             {
8125                 fprintf(fplog,
8126                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8127                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8128                         av);
8129             }
8130             break;
8131         case ddnatCON:
8132             if (cr->dd->constraint_comm)
8133             {
8134                 fprintf(fplog,
8135                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
8136                         1 + ir->nLincsIter,av);
8137             }
8138             break;
8139         default:
8140             gmx_incons(" Unknown type for DD statistics");
8141         }
8142     }
8143     fprintf(fplog,"\n");
8144     
8145     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8146     {
8147         print_dd_load_av(fplog,cr->dd);
8148     }
8149 }
8150
8151 void dd_partition_system(FILE            *fplog,
8152                          gmx_large_int_t      step,
8153                          t_commrec       *cr,
8154                          gmx_bool            bMasterState,
8155                          int             nstglobalcomm,
8156                          t_state         *state_global,
8157                          gmx_mtop_t      *top_global,
8158                          t_inputrec      *ir,
8159                          t_state         *state_local,
8160                          rvec            **f,
8161                          t_mdatoms       *mdatoms,
8162                          gmx_localtop_t  *top_local,
8163                          t_forcerec      *fr,
8164                          gmx_vsite_t     *vsite,
8165                          gmx_shellfc_t   shellfc,
8166                          gmx_constr_t    constr,
8167                          t_nrnb          *nrnb,
8168                          gmx_wallcycle_t wcycle,
8169                          gmx_bool            bVerbose)
8170 {
8171     gmx_domdec_t *dd;
8172     gmx_domdec_comm_t *comm;
8173     gmx_ddbox_t ddbox={0};
8174     t_block *cgs_gl;
8175     gmx_large_int_t step_pcoupl;
8176     rvec cell_ns_x0,cell_ns_x1;
8177     int  i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
8178     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8179     gmx_bool bRedist,bSortCG,bResortAll;
8180     ivec ncells_old,np;
8181     real grid_density;
8182     char sbuf[22];
8183         
8184     dd = cr->dd;
8185     comm = dd->comm;
8186
8187     bBoxChanged = (bMasterState || DEFORM(*ir));
8188     if (ir->epc != epcNO)
8189     {
8190         /* With nstpcouple > 1 pressure coupling happens
8191          * one step after calculating the pressure.
8192          * Box scaling happens at the end of the MD step,
8193          * after the DD partitioning.
8194          * We therefore have to do DLB in the first partitioning
8195          * after an MD step where P-coupling occurred.
8196          * We need to determine the last step in which P-coupling occurred.
8197          * MRS -- need to validate this for vv?
8198          */
8199         n = ir->nstpcouple;
8200         if (n == 1)
8201         {
8202             step_pcoupl = step - 1;
8203         }
8204         else
8205         {
8206             step_pcoupl = ((step - 1)/n)*n + 1;
8207         }
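             /* Worked example of the formula above (integer division):
              * with nstpcouple = 5 and step = 12,
              * step_pcoupl = ((12 - 1)/5)*5 + 1 = 11.
              */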
8208         if (step_pcoupl >= comm->globalcomm_step)
8209         {
8210             bBoxChanged = TRUE;
8211         }
8212     }
8213
8214     bNStGlobalComm = (step >= comm->globalcomm_step + nstglobalcomm);
8215
8216     if (!comm->bDynLoadBal)
8217     {
8218         bDoDLB = FALSE;
8219     }
8220     else
8221     {
8222         /* Should we do dynamic load balancing this step?
8223          * Since it requires (possibly expensive) global communication,
8224          * we might want to do DLB less frequently.
8225          */
8226         if (bBoxChanged || ir->epc != epcNO)
8227         {
8228             bDoDLB = bBoxChanged;
8229         }
8230         else
8231         {
8232             bDoDLB = bNStGlobalComm;
8233         }
8234     }
8235
8236     /* Check if we have recorded loads on the nodes */
8237     if (comm->bRecordLoad && dd_load_count(comm))
8238     {
8239         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
8240         {
8241             /* Check if we should use DLB at the second partitioning
8242              * and every 100 partitionings,
8243              * so the extra communication cost is negligible.
8244              */
8245             n = max(100,nstglobalcomm);
8246             bCheckDLB = (comm->n_load_collect == 0 ||
8247                          comm->n_load_have % n == n-1);
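                 /* Example: with the default n = 100 this is TRUE at the
                  * first load collection and then roughly once every 100
                  * partitionings for which loads were recorded.
                  */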
8248         }
8249         else
8250         {
8251             bCheckDLB = FALSE;
8252         }
8253         
8254         /* Print the load every nstlog steps, and at the first and last step, to the log file */
8255         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
8256                     comm->n_load_collect == 0 ||
8257                     (ir->nsteps >= 0 &&
8258                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
8259
8260         /* Avoid extra communication due to verbose screen output
8261          * when nstglobalcomm is set.
8262          */
8263         if (bDoDLB || bLogLoad || bCheckDLB ||
8264             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
8265         {
8266             get_load_distribution(dd,wcycle);
8267             if (DDMASTER(dd))
8268             {
8269                 if (bLogLoad)
8270                 {
8271                     dd_print_load(fplog,dd,step-1);
8272                 }
8273                 if (bVerbose)
8274                 {
8275                     dd_print_load_verbose(dd);
8276                 }
8277             }
8278             comm->n_load_collect++;
8279
8280             if (bCheckDLB)
                 {
8281                 /* Since the timings are node dependent, the master decides */
8282                 if (DDMASTER(dd))
8283                 {
8284                     bTurnOnDLB =
8285                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
8286                     if (debug)
8287                     {
8288                         fprintf(debug,"step %s, imb loss %f\n",
8289                                 gmx_step_str(step,sbuf),
8290                                 dd_force_imb_perf_loss(dd));
8291                     }
8292                 }
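                     /* Broadcast the master's decision so all ranks switch
                      * DLB on (or leave it off) consistently.
                      */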
8293                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
8294                 if (bTurnOnDLB)
8295                 {
8296                     turn_on_dlb(fplog,cr,step);
8297                     bDoDLB = TRUE;
8298                 }
8299             }
8300         }
8301         comm->n_load_have++;
8302     }
8303
8304     cgs_gl = &comm->cgs_gl;
8305
8306     bRedist = FALSE;
8307     if (bMasterState)
8308     {
8309         /* Clear the old state */
8310         clear_dd_indices(dd,0,0);
8311
8312         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
8313                   TRUE,cgs_gl,state_global->x,&ddbox);
8314     
8315         get_cg_distribution(fplog,step,dd,cgs_gl,
8316                             state_global->box,&ddbox,state_global->x);
8317         
8318         dd_distribute_state(dd,cgs_gl,
8319                             state_global,state_local,f);
8320         
8321         dd_make_local_cgs(dd,&top_local->cgs);
8322         
8323         if (dd->ncg_home > fr->cg_nalloc)
8324         {
8325             dd_realloc_fr_cg(fr,dd->ncg_home);
8326         }
8327         calc_cgcm(fplog,0,dd->ncg_home,
8328                   &top_local->cgs,state_local->x,fr->cg_cm);
8329         
8330         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8331         
8332         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8333
8334         cg0 = 0;
8335     }
8336     else if (state_local->ddp_count != dd->ddp_count)
8337     {
8338         if (state_local->ddp_count > dd->ddp_count)
8339         {
8340             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
8341         }
8342         
8343         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
8344         {
8345             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
8346         }
8347         
8348         /* Clear the old state */
8349         clear_dd_indices(dd,0,0);
8350         
8351         /* Build the new indices */
8352         rebuild_cgindex(dd,cgs_gl->index,state_local);
8353         make_dd_indices(dd,cgs_gl->index,0);
8354         
8355         /* Redetermine the cg COMs */
8356         calc_cgcm(fplog,0,dd->ncg_home,
8357                   &top_local->cgs,state_local->x,fr->cg_cm);
8358         
8359         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8360
8361         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8362
8363         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8364                   TRUE,&top_local->cgs,state_local->x,&ddbox);
8365
8366         bRedist = comm->bDynLoadBal;
8367     }
8368     else
8369     {
8370         /* We have the full state, only redistribute the cgs */
8371
8372         /* Clear the non-home indices */
8373         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
8374
8375         /* Avoid global communication for dimensions without pbc when -gcom is used */
8376         if (!bNStGlobalComm)
8377         {
8378             copy_rvec(comm->box0    ,ddbox.box0    );
8379             copy_rvec(comm->box_size,ddbox.box_size);
8380         }
8381         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8382                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
8383
8384         bBoxChanged = TRUE;
8385         bRedist = TRUE;
8386     }
8387     /* Store the box for dimensions without pbc, for reuse with -gcom */
8388     copy_rvec(ddbox.box0    ,comm->box0    );
8389     copy_rvec(ddbox.box_size,comm->box_size);
8390     
8391     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
8392                       step,wcycle);
8393     
8394     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
8395     {
8396         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
8397     }
8398     
8399     /* Check if we should sort the charge groups */
8400     if (comm->nstSortCG > 0)
8401     {
8402         bSortCG = (bMasterState ||
8403                    (bRedist && (step % comm->nstSortCG == 0)));
8404     }
8405     else
8406     {
8407         bSortCG = FALSE;
8408     }
8409
8410     ncg_home_old = dd->ncg_home;
8411
8412     if (bRedist)
8413     {
8414         cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
8415                                  state_local,f,fr,mdatoms,
8416                                  !bSortCG,nrnb);
8417     }
8418     
8419     get_nsgrid_boundaries(fr->ns.grid,dd,
8420                           state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
8421                           dd->ncg_home,fr->cg_cm,
8422                           cell_ns_x0,cell_ns_x1,&grid_density);
8423
8424     if (bBoxChanged)
8425     {
8426         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
8427     }
8428
8429     copy_ivec(fr->ns.grid->n,ncells_old);
8430     grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
8431                state_local->box,cell_ns_x0,cell_ns_x1,
8432                fr->rlistlong,grid_density);
8433     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
8434     copy_ivec(ddbox.tric_dir,comm->tric_dir);
8435
8436     if (bSortCG)
8437     {
8438         /* Sort the state on charge group position.
8439          * This enables exact restarts from this step.
8440          * It also improves performance by about 15% with larger numbers
8441          * of atoms per node.
8442          */
8443         
8444         /* Fill the ns grid with the home cell,
8445          * so we can sort with the indices.
8446          */
8447         set_zones_ncg_home(dd);
8448         fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
8449                   0,dd->ncg_home,fr->cg_cm);
8450         
8451         /* Check if we can use the old order and ns grid cell indices
8452          * of the charge groups to sort the charge groups efficiently.
8453          */
8454         bResortAll = (bMasterState ||
8455                       fr->ns.grid->n[XX] != ncells_old[XX] ||
8456                       fr->ns.grid->n[YY] != ncells_old[YY] ||
8457                       fr->ns.grid->n[ZZ] != ncells_old[ZZ]);
8458
8459         if (debug)
8460         {
8461             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
8462                     gmx_step_str(step,sbuf),dd->ncg_home);
8463         }
8464         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
8465                       bResortAll ? -1 : ncg_home_old);
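             /* Passing -1 presumably forces dd_sort_state to resort all
              * charge groups from scratch; otherwise the old ordering of the
              * first ncg_home_old groups can be reused.
              */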
8466         /* Rebuild all the indices */
8467         cg0 = 0;
8468         ga2la_clear(dd->ga2la);
8469     }
8470     
8471     /* Set up the communication and communicate the coordinates */
8472     setup_dd_communication(dd,state_local->box,&ddbox,fr);
8473     
8474     /* Set the indices */
8475     make_dd_indices(dd,cgs_gl->index,cg0);
8476
8477     /* Set the charge group boundaries for neighbor searching */
8478     set_cg_boundaries(&comm->zones);
8479     
8480     /*
8481     write_dd_pdb("dd_home",step,"dump",top_global,cr,
8482                  -1,state_local->x,state_local->box);
8483     */
8484     
8485     /* Extract a local topology from the global topology */
8486     for(i=0; i<dd->ndim; i++)
8487     {
8488         np[dd->dim[i]] = comm->cd[i].np;
8489     }
8490     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
8491                       comm->cellsize_min,np,
8492                       fr,vsite,top_global,top_local);
8493     
8494     /* Set up the special atom communication */
8495     n = comm->nat[ddnatZONE];
8496     for(i=ddnatZONE+1; i<ddnatNR; i++)
8497     {
8498         switch(i)
8499         {
8500         case ddnatVSITE:
8501             if (vsite && vsite->n_intercg_vsite)
8502             {
8503                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
8504             }
8505             break;
8506         case ddnatCON:
8507             if (dd->bInterCGcons)
8508             {
8509                 /* Only for inter-cg constraints do we need special code */
8510                 n = dd_make_local_constraints(dd,n,top_global,
8511                                               constr,ir->nProjOrder,
8512                                               &top_local->idef.il[F_CONSTR]);
8513             }
8514             break;
8515         default:
8516             gmx_incons("Unknown special atom type setup");
8517         }
8518         comm->nat[i] = n;
8519     }
8520     
8521     /* Make space for the extra coordinates for virtual site
8522      * or constraint communication.
8523      */
8524     state_local->natoms = comm->nat[ddnatNR-1];
8525     if (state_local->natoms > state_local->nalloc)
8526     {
8527         dd_realloc_state(state_local,f,state_local->natoms);
8528     }
8529
8530     if (fr->bF_NoVirSum)
8531     {
8532         if (vsite && vsite->n_intercg_vsite)
8533         {
8534             nat_f_novirsum = comm->nat[ddnatVSITE];
8535         }
8536         else
8537         {
8538             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
8539             {
8540                 nat_f_novirsum = dd->nat_tot;
8541             }
8542             else
8543             {
8544                 nat_f_novirsum = dd->nat_home;
8545             }
8546         }
8547     }
8548     else
8549     {
8550         nat_f_novirsum = 0;
8551     }
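         /* Summary of the cases above: with inter-cg vsites the no-virial-sum
          * forces extend to the vsite-communicated atoms, with full
          * electrostatics plus inter-cg exclusions to all communicated atoms,
          * and otherwise only to the home atoms; 0 when fr->bF_NoVirSum is
          * not set.
          */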
8552
8553     /* Set the number of atoms required for the force calculation.
8554      * Forces need to be constrained when using a twin-range setup
8555      * or with energy minimization. For simple simulations we could
8556      * avoid some allocation, zeroing and copying, but this is
8557              * probably not worth the complications and checking.
8558      */
8559     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
8560                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
8561
8562     /* We make all the mdatoms up to nat_tot_con.
8563      * We could save some work by only setting invmass
8564      * between nat_tot and nat_tot_con.
8565      */
8566     /* This call also sets the new number of home particles to dd->nat_home */
8567     atoms2md(top_global,ir,
8568              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
8569
8570     /* Now we have the charges we can sort the FE interactions */
8571     dd_sort_local_top(dd,mdatoms,top_local);
8572
8573     if (shellfc)
8574     {
8575         /* Make the local shell stuff, currently no communication is done */
8576         make_local_shells(cr,mdatoms,shellfc);
8577     }
8578     
8579     if (ir->implicit_solvent)
8580     {
8581         make_local_gb(cr,fr->born,ir->gb_algorithm);
8582     }
8583         
8584     if (!(cr->duty & DUTY_PME))
8585     {
8586         /* Send the charges to our PME-only node */
8587         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
8588                        mdatoms->chargeA,mdatoms->chargeB,
8589                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
8590     }
8591     
8592     if (constr)
8593     {
8594         set_constraints(constr,top_local,ir,mdatoms,cr);
8595     }
8596     
8597     if (ir->ePull != epullNO)
8598     {
8599         /* Update the local pull groups */
8600         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
8601     }
8602     
8603     if (ir->bRot)
8604     {
8605         /* Update the local rotation groups */
8606         dd_make_local_rotation_groups(dd,ir->rot);
8607     }
8608
8609
8610     add_dd_statistics(dd);
8611     
8612     /* Make sure we only count the cycles for this DD partitioning */
8613     clear_dd_cycle_counts(dd);
8614     
8615     /* Because the order of the atoms might have changed since
8616      * the last vsite construction, we need to communicate the constructing
8617      * atom coordinates again (for spreading the forces this MD step).
8618      */
8619     dd_move_x_vsites(dd,state_local->box,state_local->x);
8620     
8621     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
8622     {
8623         dd_move_x(dd,state_local->box,state_local->x);
8624         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
8625                      -1,state_local->x,state_local->box);
8626     }
8627
8628     if (bNStGlobalComm)
8629     {
8630         /* Store the global communication step */
8631         comm->globalcomm_step = step;
8632     }
8633     
8634     /* Increase the DD partitioning counter */
8635     dd->ddp_count++;
8636     /* The state currently matches this DD partitioning count, store it */
8637     state_local->ddp_count = dd->ddp_count;
8638     if (bMasterState)
8639     {
8640         /* The DD master node knows the complete cg distribution;
8641          * store the count so we can possibly skip the cg info communication.
8642          */
8643         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
8644     }
8645
8646     if (comm->DD_debug > 0)
8647     {
8648         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
8649         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
8650                                 "after partitioning");
8651     }
8652 }