Merge release-4-6 into master
[alexxy/gromacs.git] / src / gromacs / mdlib / domdec.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  * 
4  * This file is part of Gromacs        Copyright (c) 1991-2008
5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * To help us fund GROMACS development, we humbly ask that you cite
13  * the research papers on the package. Check out http://www.gromacs.org
14  * 
15  * And Hey:
16  * Gnomes, ROck Monsters And Chili Sauce
17  */
18
19 #ifdef HAVE_CONFIG_H
20 #include <config.h>
21 #endif
22
23 #include <stdio.h>
24 #include <time.h>
25 #include <math.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include "typedefs.h"
29 #include "smalloc.h"
30 #include "gmx_fatal.h"
31 #include "gmx_fatal_collective.h"
32 #include "vec.h"
33 #include "domdec.h"
34 #include "domdec_network.h"
35 #include "nrnb.h"
36 #include "pbc.h"
37 #include "chargegroup.h"
38 #include "constr.h"
39 #include "mdatoms.h"
40 #include "names.h"
41 #include "pdbio.h"
42 #include "futil.h"
43 #include "force.h"
44 #include "pme.h"
45 #include "pull.h"
46 #include "pull_rotation.h"
47 #include "gmx_wallcycle.h"
48 #include "mdrun.h"
49 #include "nsgrid.h"
50 #include "shellfc.h"
51 #include "mtop_util.h"
52 #include "gmxfio.h"
53 #include "gmx_ga2la.h"
54 #include "gmx_sort.h"
55 #include "macros.h"
56 #include "nbnxn_search.h"
57 #include "bondf.h"
58 #include "gmx_omp_nthreads.h"
59
60 #ifdef GMX_LIB_MPI
61 #include <mpi.h>
62 #endif
63 #ifdef GMX_THREAD_MPI
64 #include "tmpi.h"
65 #endif
66
67 #define DDRANK(dd,rank)    (rank)
68 #define DDMASTERRANK(dd)   (dd->masterrank)
69
70 typedef struct gmx_domdec_master
71 {
72     /* The cell boundaries */
73     real **cell_x;
74     /* The global charge group division */
75     int  *ncg;     /* Number of home charge groups for each node */
76     int  *index;   /* Index of nnodes+1 into cg */
77     int  *cg;      /* Global charge group index */
78     int  *nat;     /* Number of home atoms for each node. */
79     int  *ibuf;    /* Buffer for communication */
80     rvec *vbuf;    /* Buffer for state scattering and gathering */
81 } gmx_domdec_master_t;
82
83 typedef struct
84 {
85     /* The numbers of charge groups to send and receive for each cell
86      * that requires communication, the last entry contains the total
87      * number of atoms that needs to be communicated.
88      */
89     int nsend[DD_MAXIZONE+2];
90     int nrecv[DD_MAXIZONE+2];
91     /* The charge groups to send */
92     int *index;
93     int nalloc;
94     /* The atom range for non-in-place communication */
95     int cell2at0[DD_MAXIZONE];
96     int cell2at1[DD_MAXIZONE];
97 } gmx_domdec_ind_t;
98
99 typedef struct
100 {
101     int  np;                   /* Number of grid pulses in this dimension */
102     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
103     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
104     int  np_nalloc;
105     gmx_bool bInPlace;             /* Can we communicate in place?            */
106 } gmx_domdec_comm_dim_t;
107
108 typedef struct
109 {
110     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
111     real *cell_f;      /* State var.: cell boundaries, box relative      */
112     real *old_cell_f;  /* Temp. var.: old cell size                      */
113     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
114     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
115     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
116     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
117     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
118     real *buf_ncd;     /* Temp. var.                                     */
119 } gmx_domdec_root_t;
120
121 #define DD_NLOAD_MAX 9
122
123 /* Here floats are accurate enough, since these variables
124  * only influence the load balancing, not the actual MD results.
125  */
126 typedef struct
127 {
128     int  nload;
129     float *load;
130     float sum;
131     float max;
132     float sum_m;
133     float cvol_min;
134     float mdf;
135     float pme;
136     int   flags;
137 } gmx_domdec_load_t;
138
139 typedef struct
140 {
141     int  nsc;
142     int  ind_gl;
143     int  ind;
144 } gmx_cgsort_t;
145
146 typedef struct
147 {
148     gmx_cgsort_t *sort;
149     gmx_cgsort_t *sort2;
150     int  sort_nalloc;
151     gmx_cgsort_t *sort_new;
152     int  sort_new_nalloc;
153     int  *ibuf;
154     int  ibuf_nalloc;
155 } gmx_domdec_sort_t;
156
157 typedef struct
158 {
159     rvec *v;
160     int  nalloc;
161 } vec_rvec_t;
162
163 /* This enum determines the order of the coordinates.
164  * ddnatHOME and ddnatZONE should be first and second,
165  * the others can be ordered as wanted.
166  */
167 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
168
169 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
170 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
171
172 typedef struct
173 {
174     int  dim;      /* The dimension                                          */
175     gmx_bool dim_match;/* Tells if DD and PME dims match                         */
176     int  nslab;    /* The number of PME slabs in this dimension              */
177     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
178     int  *pp_min;  /* The minimum pp node location, size nslab               */
179     int  *pp_max;  /* The maximum pp node location,size nslab                */
180     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
181 } gmx_ddpme_t;
182
183 typedef struct
184 {
185     real min0;    /* The minimum bottom of this zone                        */
186     real max1;    /* The maximum top of this zone                           */
187     real min1;    /* The minimum top of this zone                           */
188     real mch0;    /* The maximum bottom communicaton height for this zone   */
189     real mch1;    /* The maximum top communicaton height for this zone      */
190     real p1_0;    /* The bottom value of the first cell in this zone        */
191     real p1_1;    /* The top value of the first cell in this zone           */
192 } gmx_ddzone_t;
193
194 typedef struct
195 {
196     gmx_domdec_ind_t ind;
197     int *ibuf;
198     int ibuf_nalloc;
199     vec_rvec_t vbuf;
200     int nsend;
201     int nat;
202     int nsend_zone;
203 } dd_comm_setup_work_t;
204
205 typedef struct gmx_domdec_comm
206 {
207     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
208      * unless stated otherwise.
209      */
210
211     /* The number of decomposition dimensions for PME, 0: no PME */
212     int  npmedecompdim;
213     /* The number of nodes doing PME (PP/PME or only PME) */
214     int  npmenodes;
215     int  npmenodes_x;
216     int  npmenodes_y;
217     /* The communication setup including the PME only nodes */
218     gmx_bool bCartesianPP_PME;
219     ivec ntot;
220     int  cartpmedim;
221     int  *pmenodes;          /* size npmenodes                         */
222     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
223                               * but with bCartesianPP_PME              */
224     gmx_ddpme_t ddpme[2];
225     
226     /* The DD particle-particle nodes only */
227     gmx_bool bCartesianPP;
228     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
229     
230     /* The global charge groups */
231     t_block cgs_gl;
232
233     /* Should we sort the cgs */
234     int  nstSortCG;
235     gmx_domdec_sort_t *sort;
236     
237     /* Are there charge groups? */
238     gmx_bool bCGs;
239
240     /* Are there bonded and multi-body interactions between charge groups? */
241     gmx_bool bInterCGBondeds;
242     gmx_bool bInterCGMultiBody;
243
244     /* Data for the optional bonded interaction atom communication range */
245     gmx_bool bBondComm;
246     t_blocka *cglink;
247     char *bLocalCG;
248
249     /* The DLB option */
250     int  eDLB;
251     /* Are we actually using DLB? */
252     gmx_bool bDynLoadBal;
253
254     /* Cell sizes for static load balancing, first index cartesian */
255     real **slb_frac;
256
257     /* The width of the communicated boundaries */
258     real cutoff_mbody;
259     real cutoff;
260     /* The minimum cell size (including triclinic correction) */
261     rvec cellsize_min;
262     /* For dlb, for use with edlbAUTO */
263     rvec cellsize_min_dlb;
264     /* The lower limit for the DD cell size with DLB */
265     real cellsize_limit;
266     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
267     gmx_bool bVacDLBNoLimit;
268
269     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
270     ivec tric_dir;
271     /* box0 and box_size are required with dim's without pbc and -gcom */
272     rvec box0;
273     rvec box_size;
274     
275     /* The cell boundaries */
276     rvec cell_x0;
277     rvec cell_x1;
278
279     /* The old location of the cell boundaries, to check cg displacements */
280     rvec old_cell_x0;
281     rvec old_cell_x1;
282
283     /* The communication setup and charge group boundaries for the zones */
284     gmx_domdec_zones_t zones;
285     
286     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
287      * cell boundaries of neighboring cells for dynamic load balancing.
288      */
289     gmx_ddzone_t zone_d1[2];
290     gmx_ddzone_t zone_d2[2][2];
291     
292     /* The coordinate/force communication setup and indices */
293     gmx_domdec_comm_dim_t cd[DIM];
294     /* The maximum number of cells to communicate with in one dimension */
295     int  maxpulse;
296     
297     /* Which cg distribution is stored on the master node */
298     int master_cg_ddp_count;
299     
300     /* The number of cg's received from the direct neighbors */
301     int  zone_ncg1[DD_MAXZONE];
302     
303     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
304     int  nat[ddnatNR];
305
306     /* Array for signalling if atoms have moved to another domain */
307     int  *moved;
308     int  moved_nalloc;
309     
310     /* Communication buffer for general use */
311     int  *buf_int;
312     int  nalloc_int;
313
314     /* Communication buffer for general use */
315     vec_rvec_t vbuf;
316
317     /* Temporary storage for thread parallel communication setup */
318     int nth;
319     dd_comm_setup_work_t *dth;
320
321     /* Communication buffers only used with multiple grid pulses */
322     int  *buf_int2;
323     int  nalloc_int2;
324     vec_rvec_t vbuf2;
325     
326     /* Communication buffers for local redistribution */
327     int  **cggl_flag;
328     int  cggl_flag_nalloc[DIM*2];
329     rvec **cgcm_state;
330     int  cgcm_state_nalloc[DIM*2];
331     
332     /* Cell sizes for dynamic load balancing */
333     gmx_domdec_root_t **root;
334     real *cell_f_row;
335     real cell_f0[DIM];
336     real cell_f1[DIM];
337     real cell_f_max0[DIM];
338     real cell_f_min1[DIM];
339     
340     /* Stuff for load communication */
341     gmx_bool bRecordLoad;
342     gmx_domdec_load_t *load;
343 #ifdef GMX_MPI
344     MPI_Comm *mpi_comm_load;
345 #endif
346
347     /* Maximum DLB scaling per load balancing step in percent */
348     int dlb_scale_lim;
349
350     /* Cycle counters */
351     float cycl[ddCyclNr];
352     int   cycl_n[ddCyclNr];
353     float cycl_max[ddCyclNr];
354     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
355     int eFlop;
356     double flop;
357     int    flop_n;
358     /* Have often have did we have load measurements */
359     int    n_load_have;
360     /* Have often have we collected the load measurements */
361     int    n_load_collect;
362     
363     /* Statistics */
364     double sum_nat[ddnatNR-ddnatZONE];
365     int    ndecomp;
366     int    nload;
367     double load_step;
368     double load_sum;
369     double load_max;
370     ivec   load_lim;
371     double load_mdf;
372     double load_pme;
373
374     /* The last partition step */
375     gmx_large_int_t partition_step;
376
377     /* Debugging */
378     int  nstDDDump;
379     int  nstDDDumpGrid;
380     int  DD_debug;
381 } gmx_domdec_comm_t;
382
383 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
384 #define DD_CGIBS 2
385
386 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
387 #define DD_FLAG_NRCG  65535
388 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
389 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
390
391 /* Zone permutation required to obtain consecutive charge groups
392  * for neighbor searching.
393  */
394 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
395
396 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
397  * components see only j zones with that component 0.
398  */
399
400 /* The DD zone order */
401 static const ivec dd_zo[DD_MAXZONE] =
402   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
403
404 /* The 3D setup */
405 #define dd_z3n  8
406 #define dd_zp3n 4
407 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
408
409 /* The 2D setup */
410 #define dd_z2n  4
411 #define dd_zp2n 2
412 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
413
414 /* The 1D setup */
415 #define dd_z1n  2
416 #define dd_zp1n 1
417 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
418
419 /* Factors used to avoid problems due to rounding issues */
420 #define DD_CELL_MARGIN       1.0001
421 #define DD_CELL_MARGIN2      1.00005
422 /* Factor to account for pressure scaling during nstlist steps */
423 #define DD_PRES_SCALE_MARGIN 1.02
424
425 /* Allowed performance loss before we DLB or warn */
426 #define DD_PERF_LOSS 0.05
427
428 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
429
430 /* Use separate MPI send and receive commands
431  * when nnodes <= GMX_DD_NNODES_SENDRECV.
432  * This saves memory (and some copying for small nnodes).
433  * For high parallelization scatter and gather calls are used.
434  */
435 #define GMX_DD_NNODES_SENDRECV 4
436
437
438 /*
439 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
440
441 static void index2xyz(ivec nc,int ind,ivec xyz)
442 {
443   xyz[XX] = ind % nc[XX];
444   xyz[YY] = (ind / nc[XX]) % nc[YY];
445   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
446 }
447 */
448
449 /* This order is required to minimize the coordinate communication in PME
450  * which uses decomposition in the x direction.
451  */
452 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
453
454 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
455 {
456     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
457     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
458     xyz[ZZ] = ind % nc[ZZ];
459 }
460
461 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
462 {
463     int ddindex;
464     int ddnodeid=-1;
465     
466     ddindex = dd_index(dd->nc,c);
467     if (dd->comm->bCartesianPP_PME)
468     {
469         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
470     }
471     else if (dd->comm->bCartesianPP)
472     {
473 #ifdef GMX_MPI
474         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
475 #endif
476     }
477     else
478     {
479         ddnodeid = ddindex;
480     }
481     
482     return ddnodeid;
483 }
484
485 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
486 {
487     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
488 }
489
490 int ddglatnr(gmx_domdec_t *dd,int i)
491 {
492     int atnr;
493     
494     if (dd == NULL)
495     {
496         atnr = i + 1;
497     }
498     else
499     {
500         if (i >= dd->comm->nat[ddnatNR-1])
501         {
502             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
503         }
504         atnr = dd->gatindex[i] + 1;
505     }
506     
507     return atnr;
508 }
509
510 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
511 {
512     return &dd->comm->cgs_gl;
513 }
514
515 static void vec_rvec_init(vec_rvec_t *v)
516 {
517     v->nalloc = 0;
518     v->v      = NULL;
519 }
520
521 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
522 {
523     if (n > v->nalloc)
524     {
525         v->nalloc = over_alloc_dd(n);
526         srenew(v->v,v->nalloc);
527     }
528 }
529
530 void dd_store_state(gmx_domdec_t *dd,t_state *state)
531 {
532     int i;
533     
534     if (state->ddp_count != dd->ddp_count)
535     {
536         gmx_incons("The state does not the domain decomposition state");
537     }
538     
539     state->ncg_gl = dd->ncg_home;
540     if (state->ncg_gl > state->cg_gl_nalloc)
541     {
542         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
543         srenew(state->cg_gl,state->cg_gl_nalloc);
544     }
545     for(i=0; i<state->ncg_gl; i++)
546     {
547         state->cg_gl[i] = dd->index_gl[i];
548     }
549     
550     state->ddp_count_cg_gl = dd->ddp_count;
551 }
552
553 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
554 {
555     return &dd->comm->zones;
556 }
557
558 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
559                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
560 {
561     gmx_domdec_zones_t *zones;
562     int izone,d,dim;
563
564     zones = &dd->comm->zones;
565
566     izone = 0;
567     while (icg >= zones->izone[izone].cg1)
568     {
569         izone++;
570     }
571     
572     if (izone == 0)
573     {
574         *jcg0 = icg;
575     }
576     else if (izone < zones->nizone)
577     {
578         *jcg0 = zones->izone[izone].jcg0;
579     }
580     else
581     {
582         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
583                   icg,izone,zones->nizone);
584     }
585         
586     *jcg1 = zones->izone[izone].jcg1;
587     
588     for(d=0; d<dd->ndim; d++)
589     {
590         dim = dd->dim[d];
591         shift0[dim] = zones->izone[izone].shift0[dim];
592         shift1[dim] = zones->izone[izone].shift1[dim];
593         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
594         {
595             /* A conservative approach, this can be optimized */
596             shift0[dim] -= 1;
597             shift1[dim] += 1;
598         }
599     }
600 }
601
602 int dd_natoms_vsite(gmx_domdec_t *dd)
603 {
604     return dd->comm->nat[ddnatVSITE];
605 }
606
607 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
608 {
609     *at_start = dd->comm->nat[ddnatCON-1];
610     *at_end   = dd->comm->nat[ddnatCON];
611 }
612
613 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
614 {
615     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
616     int  *index,*cgindex;
617     gmx_domdec_comm_t *comm;
618     gmx_domdec_comm_dim_t *cd;
619     gmx_domdec_ind_t *ind;
620     rvec shift={0,0,0},*buf,*rbuf;
621     gmx_bool bPBC,bScrew;
622     
623     comm = dd->comm;
624     
625     cgindex = dd->cgindex;
626     
627     buf = comm->vbuf.v;
628
629     nzone = 1;
630     nat_tot = dd->nat_home;
631     for(d=0; d<dd->ndim; d++)
632     {
633         bPBC   = (dd->ci[dd->dim[d]] == 0);
634         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
635         if (bPBC)
636         {
637             copy_rvec(box[dd->dim[d]],shift);
638         }
639         cd = &comm->cd[d];
640         for(p=0; p<cd->np; p++)
641         {
642             ind = &cd->ind[p];
643             index = ind->index;
644             n = 0;
645             if (!bPBC)
646             {
647                 for(i=0; i<ind->nsend[nzone]; i++)
648                 {
649                     at0 = cgindex[index[i]];
650                     at1 = cgindex[index[i]+1];
651                     for(j=at0; j<at1; j++)
652                     {
653                         copy_rvec(x[j],buf[n]);
654                         n++;
655                     }
656                 }
657             }
658             else if (!bScrew)
659             {
660                 for(i=0; i<ind->nsend[nzone]; i++)
661                 {
662                     at0 = cgindex[index[i]];
663                     at1 = cgindex[index[i]+1];
664                     for(j=at0; j<at1; j++)
665                     {
666                         /* We need to shift the coordinates */
667                         rvec_add(x[j],shift,buf[n]);
668                         n++;
669                     }
670                 }
671             }
672             else
673             {
674                 for(i=0; i<ind->nsend[nzone]; i++)
675                 {
676                     at0 = cgindex[index[i]];
677                     at1 = cgindex[index[i]+1];
678                     for(j=at0; j<at1; j++)
679                     {
680                         /* Shift x */
681                         buf[n][XX] = x[j][XX] + shift[XX];
682                         /* Rotate y and z.
683                          * This operation requires a special shift force
684                          * treatment, which is performed in calc_vir.
685                          */
686                         buf[n][YY] = box[YY][YY] - x[j][YY];
687                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
688                         n++;
689                     }
690                 }
691             }
692             
693             if (cd->bInPlace)
694             {
695                 rbuf = x + nat_tot;
696             }
697             else
698             {
699                 rbuf = comm->vbuf2.v;
700             }
701             /* Send and receive the coordinates */
702             dd_sendrecv_rvec(dd, d, dddirBackward,
703                              buf,  ind->nsend[nzone+1],
704                              rbuf, ind->nrecv[nzone+1]);
705             if (!cd->bInPlace)
706             {
707                 j = 0;
708                 for(zone=0; zone<nzone; zone++)
709                 {
710                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
711                     {
712                         copy_rvec(rbuf[j],x[i]);
713                         j++;
714                     }
715                 }
716             }
717             nat_tot += ind->nrecv[nzone+1];
718         }
719         nzone += nzone;
720     }
721 }
722
723 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
724 {
725     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
726     int  *index,*cgindex;
727     gmx_domdec_comm_t *comm;
728     gmx_domdec_comm_dim_t *cd;
729     gmx_domdec_ind_t *ind;
730     rvec *buf,*sbuf;
731     ivec vis;
732     int  is;
733     gmx_bool bPBC,bScrew;
734     
735     comm = dd->comm;
736     
737     cgindex = dd->cgindex;
738
739     buf = comm->vbuf.v;
740
741     n = 0;
742     nzone = comm->zones.n/2;
743     nat_tot = dd->nat_tot;
744     for(d=dd->ndim-1; d>=0; d--)
745     {
746         bPBC   = (dd->ci[dd->dim[d]] == 0);
747         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
748         if (fshift == NULL && !bScrew)
749         {
750             bPBC = FALSE;
751         }
752         /* Determine which shift vector we need */
753         clear_ivec(vis);
754         vis[dd->dim[d]] = 1;
755         is = IVEC2IS(vis);
756         
757         cd = &comm->cd[d];
758         for(p=cd->np-1; p>=0; p--) {
759             ind = &cd->ind[p];
760             nat_tot -= ind->nrecv[nzone+1];
761             if (cd->bInPlace)
762             {
763                 sbuf = f + nat_tot;
764             }
765             else
766             {
767                 sbuf = comm->vbuf2.v;
768                 j = 0;
769                 for(zone=0; zone<nzone; zone++)
770                 {
771                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
772                     {
773                         copy_rvec(f[i],sbuf[j]);
774                         j++;
775                     }
776                 }
777             }
778             /* Communicate the forces */
779             dd_sendrecv_rvec(dd, d, dddirForward,
780                              sbuf, ind->nrecv[nzone+1],
781                              buf,  ind->nsend[nzone+1]);
782             index = ind->index;
783             /* Add the received forces */
784             n = 0;
785             if (!bPBC)
786             {
787                 for(i=0; i<ind->nsend[nzone]; i++)
788                 {
789                     at0 = cgindex[index[i]];
790                     at1 = cgindex[index[i]+1];
791                     for(j=at0; j<at1; j++)
792                     {
793                         rvec_inc(f[j],buf[n]);
794                         n++;
795                     }
796                 } 
797             }
798             else if (!bScrew)
799             {
800                 for(i=0; i<ind->nsend[nzone]; i++)
801                 {
802                     at0 = cgindex[index[i]];
803                     at1 = cgindex[index[i]+1];
804                     for(j=at0; j<at1; j++)
805                     {
806                         rvec_inc(f[j],buf[n]);
807                         /* Add this force to the shift force */
808                         rvec_inc(fshift[is],buf[n]);
809                         n++;
810                     }
811                 }
812             }
813             else
814             {
815                 for(i=0; i<ind->nsend[nzone]; i++)
816                 {
817                     at0 = cgindex[index[i]];
818                     at1 = cgindex[index[i]+1];
819                     for(j=at0; j<at1; j++)
820                     {
821                         /* Rotate the force */
822                         f[j][XX] += buf[n][XX];
823                         f[j][YY] -= buf[n][YY];
824                         f[j][ZZ] -= buf[n][ZZ];
825                         if (fshift)
826                         {
827                             /* Add this force to the shift force */
828                             rvec_inc(fshift[is],buf[n]);
829                         }
830                         n++;
831                     }
832                 }
833             }
834         }
835         nzone /= 2;
836     }
837 }
838
839 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
840 {
841     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
842     int  *index,*cgindex;
843     gmx_domdec_comm_t *comm;
844     gmx_domdec_comm_dim_t *cd;
845     gmx_domdec_ind_t *ind;
846     real *buf,*rbuf;
847     
848     comm = dd->comm;
849     
850     cgindex = dd->cgindex;
851     
852     buf = &comm->vbuf.v[0][0];
853
854     nzone = 1;
855     nat_tot = dd->nat_home;
856     for(d=0; d<dd->ndim; d++)
857     {
858         cd = &comm->cd[d];
859         for(p=0; p<cd->np; p++)
860         {
861             ind = &cd->ind[p];
862             index = ind->index;
863             n = 0;
864             for(i=0; i<ind->nsend[nzone]; i++)
865             {
866                 at0 = cgindex[index[i]];
867                 at1 = cgindex[index[i]+1];
868                 for(j=at0; j<at1; j++)
869                 {
870                     buf[n] = v[j];
871                     n++;
872                 }
873             }
874             
875             if (cd->bInPlace)
876             {
877                 rbuf = v + nat_tot;
878             }
879             else
880             {
881                 rbuf = &comm->vbuf2.v[0][0];
882             }
883             /* Send and receive the coordinates */
884             dd_sendrecv_real(dd, d, dddirBackward,
885                              buf,  ind->nsend[nzone+1],
886                              rbuf, ind->nrecv[nzone+1]);
887             if (!cd->bInPlace)
888             {
889                 j = 0;
890                 for(zone=0; zone<nzone; zone++)
891                 {
892                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
893                     {
894                         v[i] = rbuf[j];
895                         j++;
896                     }
897                 }
898             }
899             nat_tot += ind->nrecv[nzone+1];
900         }
901         nzone += nzone;
902     }
903 }
904
905 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
906 {
907     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
908     int  *index,*cgindex;
909     gmx_domdec_comm_t *comm;
910     gmx_domdec_comm_dim_t *cd;
911     gmx_domdec_ind_t *ind;
912     real *buf,*sbuf;
913     
914     comm = dd->comm;
915     
916     cgindex = dd->cgindex;
917
918     buf = &comm->vbuf.v[0][0];
919
920     n = 0;
921     nzone = comm->zones.n/2;
922     nat_tot = dd->nat_tot;
923     for(d=dd->ndim-1; d>=0; d--)
924     {
925         cd = &comm->cd[d];
926         for(p=cd->np-1; p>=0; p--) {
927             ind = &cd->ind[p];
928             nat_tot -= ind->nrecv[nzone+1];
929             if (cd->bInPlace)
930             {
931                 sbuf = v + nat_tot;
932             }
933             else
934             {
935                 sbuf = &comm->vbuf2.v[0][0];
936                 j = 0;
937                 for(zone=0; zone<nzone; zone++)
938                 {
939                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
940                     {
941                         sbuf[j] = v[i];
942                         j++;
943                     }
944                 }
945             }
946             /* Communicate the forces */
947             dd_sendrecv_real(dd, d, dddirForward,
948                              sbuf, ind->nrecv[nzone+1],
949                              buf,  ind->nsend[nzone+1]);
950             index = ind->index;
951             /* Add the received forces */
952             n = 0;
953             for(i=0; i<ind->nsend[nzone]; i++)
954             {
955                 at0 = cgindex[index[i]];
956                 at1 = cgindex[index[i]+1];
957                 for(j=at0; j<at1; j++)
958                 {
959                     v[j] += buf[n];
960                     n++;
961                 }
962             } 
963         }
964         nzone /= 2;
965     }
966 }
967
968 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
969 {
970     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
971             d,i,j,
972             zone->min0,zone->max1,
973             zone->mch0,zone->mch0,
974             zone->p1_0,zone->p1_1);
975 }
976
977
978 #define DDZONECOMM_MAXZONE  5
979 #define DDZONECOMM_BUFSIZE  3
980
981 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
982                                int ddimind,int direction,
983                                gmx_ddzone_t *buf_s,int n_s,
984                                gmx_ddzone_t *buf_r,int n_r)
985 {
986 #define ZBS  DDZONECOMM_BUFSIZE
987     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
988     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
989     int i;
990
991     for(i=0; i<n_s; i++)
992     {
993         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
994         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
995         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
996         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
997         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
998         vbuf_s[i*ZBS+1][2] = 0;
999         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1000         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1001         vbuf_s[i*ZBS+2][2] = 0;
1002     }
1003
1004     dd_sendrecv_rvec(dd, ddimind, direction,
1005                      vbuf_s, n_s*ZBS,
1006                      vbuf_r, n_r*ZBS);
1007
1008     for(i=0; i<n_r; i++)
1009     {
1010         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1011         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1012         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1013         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1014         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1015         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1016         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1017     }
1018
1019 #undef ZBS
1020 }
1021
1022 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
1023                           rvec cell_ns_x0,rvec cell_ns_x1)
1024 {
1025     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
1026     gmx_ddzone_t *zp;
1027     gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
1028     gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
1029     gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
1030     rvec extr_s[2],extr_r[2];
1031     rvec dh;
1032     real dist_d,c=0,det;
1033     gmx_domdec_comm_t *comm;
1034     gmx_bool bPBC,bUse;
1035
1036     comm = dd->comm;
1037
1038     for(d=1; d<dd->ndim; d++)
1039     {
1040         dim = dd->dim[d];
1041         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1042         zp->min0 = cell_ns_x0[dim];
1043         zp->max1 = cell_ns_x1[dim];
1044         zp->min1 = cell_ns_x1[dim];
1045         zp->mch0 = cell_ns_x0[dim];
1046         zp->mch1 = cell_ns_x1[dim];
1047         zp->p1_0 = cell_ns_x0[dim];
1048         zp->p1_1 = cell_ns_x1[dim];
1049     }
1050     
1051     for(d=dd->ndim-2; d>=0; d--)
1052     {
1053         dim  = dd->dim[d];
1054         bPBC = (dim < ddbox->npbcdim);
1055
1056         /* Use an rvec to store two reals */
1057         extr_s[d][0] = comm->cell_f0[d+1];
1058         extr_s[d][1] = comm->cell_f1[d+1];
1059         extr_s[d][2] = comm->cell_f1[d+1];
1060
1061         pos = 0;
1062         /* Store the extremes in the backward sending buffer,
1063          * so the get updated separately from the forward communication.
1064          */
1065         for(d1=d; d1<dd->ndim-1; d1++)
1066         {
1067             /* We invert the order to be able to use the same loop for buf_e */
1068             buf_s[pos].min0 = extr_s[d1][1];
1069             buf_s[pos].max1 = extr_s[d1][0];
1070             buf_s[pos].min1 = extr_s[d1][2];
1071             buf_s[pos].mch0 = 0;
1072             buf_s[pos].mch1 = 0;
1073             /* Store the cell corner of the dimension we communicate along */
1074             buf_s[pos].p1_0 = comm->cell_x0[dim];
1075             buf_s[pos].p1_1 = 0;
1076             pos++;
1077         }
1078
1079         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1080         pos++;
1081
1082         if (dd->ndim == 3 && d == 0)
1083         {
1084             buf_s[pos] = comm->zone_d2[0][1];
1085             pos++;
1086             buf_s[pos] = comm->zone_d1[0];
1087             pos++;
1088         }
1089
1090         /* We only need to communicate the extremes
1091          * in the forward direction
1092          */
1093         npulse = comm->cd[d].np;
1094         if (bPBC)
1095         {
1096             /* Take the minimum to avoid double communication */
1097             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1098         }
1099         else
1100         {
1101             /* Without PBC we should really not communicate over
1102              * the boundaries, but implementing that complicates
1103              * the communication setup and therefore we simply
1104              * do all communication, but ignore some data.
1105              */
1106             npulse_min = npulse;
1107         }
1108         for(p=0; p<npulse_min; p++)
1109         {
1110             /* Communicate the extremes forward */
1111             bUse = (bPBC || dd->ci[dim] > 0);
1112
1113             dd_sendrecv_rvec(dd, d, dddirForward,
1114                              extr_s+d, dd->ndim-d-1,
1115                              extr_r+d, dd->ndim-d-1);
1116
1117             if (bUse)
1118             {
1119                 for(d1=d; d1<dd->ndim-1; d1++)
1120                 {
1121                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1122                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1123                     extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]);
1124                 }
1125             }
1126         }
1127
1128         buf_size = pos;
1129         for(p=0; p<npulse; p++)
1130         {
1131             /* Communicate all the zone information backward */
1132             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1133
1134             dd_sendrecv_ddzone(dd, d, dddirBackward,
1135                                buf_s, buf_size,
1136                                buf_r, buf_size);
1137
1138             clear_rvec(dh);
1139             if (p > 0)
1140             {
1141                 for(d1=d+1; d1<dd->ndim; d1++)
1142                 {
1143                     /* Determine the decrease of maximum required
1144                      * communication height along d1 due to the distance along d,
1145                      * this avoids a lot of useless atom communication.
1146                      */
1147                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1148
1149                     if (ddbox->tric_dir[dim])
1150                     {
1151                         /* c is the off-diagonal coupling between the cell planes
1152                          * along directions d and d1.
1153                          */
1154                         c = ddbox->v[dim][dd->dim[d1]][dim];
1155                     }
1156                     else
1157                     {
1158                         c = 0;
1159                     }
1160                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1161                     if (det > 0)
1162                     {
1163                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1164                     }
1165                     else
1166                     {
1167                         /* A negative value signals out of range */
1168                         dh[d1] = -1;
1169                     }
1170                 }
1171             }
1172
1173             /* Accumulate the extremes over all pulses */
1174             for(i=0; i<buf_size; i++)
1175             {
1176                 if (p == 0)
1177                 {
1178                     buf_e[i] = buf_r[i];
1179                 }
1180                 else
1181                 {
1182                     if (bUse)
1183                     {
1184                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1185                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1186                         buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1);
1187                     }
1188
1189                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1190                     {
1191                         d1 = 1;
1192                     }
1193                     else
1194                     {
1195                         d1 = d + 1;
1196                     }
1197                     if (bUse && dh[d1] >= 0)
1198                     {
1199                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1200                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1201                     }
1202                 }
1203                 /* Copy the received buffer to the send buffer,
1204                  * to pass the data through with the next pulse.
1205                  */
1206                 buf_s[i] = buf_r[i];
1207             }
1208             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1209                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1210             {
1211                 /* Store the extremes */ 
1212                 pos = 0;
1213
1214                 for(d1=d; d1<dd->ndim-1; d1++)
1215                 {
1216                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1217                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1218                     extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1);
1219                     pos++;
1220                 }
1221
1222                 if (d == 1 || (d == 0 && dd->ndim == 3))
1223                 {
1224                     for(i=d; i<2; i++)
1225                     {
1226                         comm->zone_d2[1-d][i] = buf_e[pos];
1227                         pos++;
1228                     }
1229                 }
1230                 if (d == 0)
1231                 {
1232                     comm->zone_d1[1] = buf_e[pos];
1233                     pos++;
1234                 }
1235             }
1236         }
1237     }
1238     
1239     if (dd->ndim >= 2)
1240     {
1241         dim = dd->dim[1];
1242         for(i=0; i<2; i++)
1243         {
1244             if (debug)
1245             {
1246                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1247             }
1248             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1249             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1250         }
1251     }
1252     if (dd->ndim >= 3)
1253     {
1254         dim = dd->dim[2];
1255         for(i=0; i<2; i++)
1256         {
1257             for(j=0; j<2; j++)
1258             {
1259                 if (debug)
1260                 {
1261                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1262                 }
1263                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1264                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1265             }
1266         }
1267     }
1268     for(d=1; d<dd->ndim; d++)
1269     {
1270         comm->cell_f_max0[d] = extr_s[d-1][0];
1271         comm->cell_f_min1[d] = extr_s[d-1][1];
1272         if (debug)
1273         {
1274             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1275                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1276         }
1277     }
1278 }
1279
1280 static void dd_collect_cg(gmx_domdec_t *dd,
1281                           t_state *state_local)
1282 {
1283     gmx_domdec_master_t *ma=NULL;
1284     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1285     t_block *cgs_gl;
1286
1287     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1288     {
1289         /* The master has the correct distribution */
1290         return;
1291     }
1292     
1293     if (state_local->ddp_count == dd->ddp_count)
1294     {
1295         ncg_home = dd->ncg_home;
1296         cg       = dd->index_gl;
1297         nat_home = dd->nat_home;
1298     } 
1299     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1300     {
1301         cgs_gl = &dd->comm->cgs_gl;
1302
1303         ncg_home = state_local->ncg_gl;
1304         cg       = state_local->cg_gl;
1305         nat_home = 0;
1306         for(i=0; i<ncg_home; i++)
1307         {
1308             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1309         }
1310     }
1311     else
1312     {
1313         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1314     }
1315     
1316     buf2[0] = dd->ncg_home;
1317     buf2[1] = dd->nat_home;
1318     if (DDMASTER(dd))
1319     {
1320         ma = dd->ma;
1321         ibuf = ma->ibuf;
1322     }
1323     else
1324     {
1325         ibuf = NULL;
1326     }
1327     /* Collect the charge group and atom counts on the master */
1328     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1329     
1330     if (DDMASTER(dd))
1331     {
1332         ma->index[0] = 0;
1333         for(i=0; i<dd->nnodes; i++)
1334         {
1335             ma->ncg[i] = ma->ibuf[2*i];
1336             ma->nat[i] = ma->ibuf[2*i+1];
1337             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1338             
1339         }
1340         /* Make byte counts and indices */
1341         for(i=0; i<dd->nnodes; i++)
1342         {
1343             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1344             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1345         }
1346         if (debug)
1347         {
1348             fprintf(debug,"Initial charge group distribution: ");
1349             for(i=0; i<dd->nnodes; i++)
1350                 fprintf(debug," %d",ma->ncg[i]);
1351             fprintf(debug,"\n");
1352         }
1353     }
1354     
1355     /* Collect the charge group indices on the master */
1356     dd_gatherv(dd,
1357                dd->ncg_home*sizeof(int),dd->index_gl,
1358                DDMASTER(dd) ? ma->ibuf : NULL,
1359                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1360                DDMASTER(dd) ? ma->cg : NULL);
1361     
1362     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1363 }
1364
1365 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1366                                     rvec *lv,rvec *v)
1367 {
1368     gmx_domdec_master_t *ma;
1369     int  n,i,c,a,nalloc=0;
1370     rvec *buf=NULL;
1371     t_block *cgs_gl;
1372
1373     ma = dd->ma;
1374     
1375     if (!DDMASTER(dd))
1376     {
1377 #ifdef GMX_MPI
1378         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1379                  dd->rank,dd->mpi_comm_all);
1380 #endif
1381     } else {
1382         /* Copy the master coordinates to the global array */
1383         cgs_gl = &dd->comm->cgs_gl;
1384
1385         n = DDMASTERRANK(dd);
1386         a = 0;
1387         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1388         {
1389             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1390             {
1391                 copy_rvec(lv[a++],v[c]);
1392             }
1393         }
1394         
1395         for(n=0; n<dd->nnodes; n++)
1396         {
1397             if (n != dd->rank)
1398             {
1399                 if (ma->nat[n] > nalloc)
1400                 {
1401                     nalloc = over_alloc_dd(ma->nat[n]);
1402                     srenew(buf,nalloc);
1403                 }
1404 #ifdef GMX_MPI
1405                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1406                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1407 #endif
1408                 a = 0;
1409                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1410                 {
1411                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1412                     {
1413                         copy_rvec(buf[a++],v[c]);
1414                     }
1415                 }
1416             }
1417         }
1418         sfree(buf);
1419     }
1420 }
1421
1422 static void get_commbuffer_counts(gmx_domdec_t *dd,
1423                                   int **counts,int **disps)
1424 {
1425     gmx_domdec_master_t *ma;
1426     int n;
1427
1428     ma = dd->ma;
1429     
1430     /* Make the rvec count and displacment arrays */
1431     *counts  = ma->ibuf;
1432     *disps   = ma->ibuf + dd->nnodes;
1433     for(n=0; n<dd->nnodes; n++)
1434     {
1435         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1436         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1437     }
1438 }
1439
1440 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1441                                    rvec *lv,rvec *v)
1442 {
1443     gmx_domdec_master_t *ma;
1444     int  *rcounts=NULL,*disps=NULL;
1445     int  n,i,c,a;
1446     rvec *buf=NULL;
1447     t_block *cgs_gl;
1448     
1449     ma = dd->ma;
1450     
1451     if (DDMASTER(dd))
1452     {
1453         get_commbuffer_counts(dd,&rcounts,&disps);
1454
1455         buf = ma->vbuf;
1456     }
1457     
1458     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1459
1460     if (DDMASTER(dd))
1461     {
1462         cgs_gl = &dd->comm->cgs_gl;
1463
1464         a = 0;
1465         for(n=0; n<dd->nnodes; n++)
1466         {
1467             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1468             {
1469                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1470                 {
1471                     copy_rvec(buf[a++],v[c]);
1472                 }
1473             }
1474         }
1475     }
1476 }
1477
1478 void dd_collect_vec(gmx_domdec_t *dd,
1479                     t_state *state_local,rvec *lv,rvec *v)
1480 {
1481     gmx_domdec_master_t *ma;
1482     int  n,i,c,a,nalloc=0;
1483     rvec *buf=NULL;
1484     
1485     dd_collect_cg(dd,state_local);
1486
1487     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1488     {
1489         dd_collect_vec_sendrecv(dd,lv,v);
1490     }
1491     else
1492     {
1493         dd_collect_vec_gatherv(dd,lv,v);
1494     }
1495 }
1496
1497
1498 void dd_collect_state(gmx_domdec_t *dd,
1499                       t_state *state_local,t_state *state)
1500 {
1501     int est,i,j,nh;
1502
1503     nh = state->nhchainlength;
1504
1505     if (DDMASTER(dd))
1506     {
1507         for (i=0;i<efptNR;i++) {
1508             state->lambda[i] = state_local->lambda[i];
1509         }
1510         state->fep_state = state_local->fep_state;
1511         state->veta = state_local->veta;
1512         state->vol0 = state_local->vol0;
1513         copy_mat(state_local->box,state->box);
1514         copy_mat(state_local->boxv,state->boxv);
1515         copy_mat(state_local->svir_prev,state->svir_prev);
1516         copy_mat(state_local->fvir_prev,state->fvir_prev);
1517         copy_mat(state_local->pres_prev,state->pres_prev);
1518
1519
1520         for(i=0; i<state_local->ngtc; i++)
1521         {
1522             for(j=0; j<nh; j++) {
1523                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1524                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1525             }
1526             state->therm_integral[i] = state_local->therm_integral[i];            
1527         }
1528         for(i=0; i<state_local->nnhpres; i++) 
1529         {
1530             for(j=0; j<nh; j++) {
1531                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1532                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1533             }
1534         }
1535     }
1536     for(est=0; est<estNR; est++)
1537     {
1538         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1539         {
1540             switch (est) {
1541             case estX:
1542                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1543                 break;
1544             case estV:
1545                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1546                 break;
1547             case estSDX:
1548                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1549                 break;
1550             case estCGP:
1551                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1552                 break;
1553             case estLD_RNG:
1554                 if (state->nrngi == 1)
1555                 {
1556                     if (DDMASTER(dd))
1557                     {
1558                         for(i=0; i<state_local->nrng; i++)
1559                         {
1560                             state->ld_rng[i] = state_local->ld_rng[i];
1561                         }
1562                     }
1563                 }
1564                 else
1565                 {
1566                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1567                               state_local->ld_rng,state->ld_rng);
1568                 }
1569                 break;
1570             case estLD_RNGI:
1571                 if (state->nrngi == 1)
1572                 {
1573                    if (DDMASTER(dd))
1574                     {
1575                         state->ld_rngi[0] = state_local->ld_rngi[0];
1576                     } 
1577                 }
1578                 else
1579                 {
1580                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1581                               state_local->ld_rngi,state->ld_rngi);
1582                 }
1583                 break;
1584             case estDISRE_INITF:
1585             case estDISRE_RM3TAV:
1586             case estORIRE_INITF:
1587             case estORIRE_DTAV:
1588                 break;
1589             default:
1590                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1591             }
1592         }
1593     }
1594 }
1595
1596 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1597 {
1598     int est;
1599
1600     if (debug)
1601     {
1602         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1603     }
1604
1605     state->nalloc = over_alloc_dd(nalloc);
1606     
1607     for(est=0; est<estNR; est++)
1608     {
1609         if (EST_DISTR(est) && (state->flags & (1<<est)))
1610         {
1611             switch(est) {
1612             case estX:
1613                 srenew(state->x,state->nalloc);
1614                 break;
1615             case estV:
1616                 srenew(state->v,state->nalloc);
1617                 break;
1618             case estSDX:
1619                 srenew(state->sd_X,state->nalloc);
1620                 break;
1621             case estCGP:
1622                 srenew(state->cg_p,state->nalloc);
1623                 break;
1624             case estLD_RNG:
1625             case estLD_RNGI:
1626             case estDISRE_INITF:
1627             case estDISRE_RM3TAV:
1628             case estORIRE_INITF:
1629             case estORIRE_DTAV:
1630                 /* No reallocation required */
1631                 break;
1632             default:
1633                 gmx_incons("Unknown state entry encountered in dd_realloc_state");            
1634             }
1635         }
1636     }
1637     
1638     if (f != NULL)
1639     {
1640         srenew(*f,state->nalloc);
1641     }
1642 }
1643
1644 static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
1645                                int nalloc)
1646 {
1647     if (nalloc > fr->cg_nalloc)
1648     {
1649         if (debug)
1650         {
1651             fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1652         }
1653         fr->cg_nalloc = over_alloc_dd(nalloc);
1654         srenew(fr->cginfo,fr->cg_nalloc);
1655         if (fr->cutoff_scheme == ecutsGROUP)
1656         {
1657             srenew(fr->cg_cm,fr->cg_nalloc);
1658         }
1659     }
1660     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1661     {
1662         /* We don't use charge groups, we use x in state to set up
1663          * the atom communication.
1664          */
1665         dd_realloc_state(state,f,nalloc);
1666     }
1667 }
1668
1669 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1670                                        rvec *v,rvec *lv)
1671 {
1672     gmx_domdec_master_t *ma;
1673     int  n,i,c,a,nalloc=0;
1674     rvec *buf=NULL;
1675     
1676     if (DDMASTER(dd))
1677     {
1678         ma  = dd->ma;
1679         
1680         for(n=0; n<dd->nnodes; n++)
1681         {
1682             if (n != dd->rank)
1683             {
1684                 if (ma->nat[n] > nalloc)
1685                 {
1686                     nalloc = over_alloc_dd(ma->nat[n]);
1687                     srenew(buf,nalloc);
1688                 }
1689                 /* Use lv as a temporary buffer */
1690                 a = 0;
1691                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1692                 {
1693                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1694                     {
1695                         copy_rvec(v[c],buf[a++]);
1696                     }
1697                 }
1698                 if (a != ma->nat[n])
1699                 {
1700                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1701                               a,ma->nat[n]);
1702                 }
1703                 
1704 #ifdef GMX_MPI
1705                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1706                          DDRANK(dd,n),n,dd->mpi_comm_all);
1707 #endif
1708             }
1709         }
1710         sfree(buf);
1711         n = DDMASTERRANK(dd);
1712         a = 0;
1713         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1714         {
1715             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1716             {
1717                 copy_rvec(v[c],lv[a++]);
1718             }
1719         }
1720     }
1721     else
1722     {
1723 #ifdef GMX_MPI
1724         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1725                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1726 #endif
1727     }
1728 }
1729
1730 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1731                                        rvec *v,rvec *lv)
1732 {
1733     gmx_domdec_master_t *ma;
1734     int  *scounts=NULL,*disps=NULL;
1735     int  n,i,c,a,nalloc=0;
1736     rvec *buf=NULL;
1737     
1738     if (DDMASTER(dd))
1739     {
1740         ma  = dd->ma;
1741      
1742         get_commbuffer_counts(dd,&scounts,&disps);
1743
1744         buf = ma->vbuf;
1745         a = 0;
1746         for(n=0; n<dd->nnodes; n++)
1747         {
1748             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1749             {
1750                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1751                 {
1752                     copy_rvec(v[c],buf[a++]);
1753                 }
1754             }
1755         }
1756     }
1757
1758     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1759 }
1760
1761 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1762 {
1763     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1764     {
1765         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1766     }
1767     else
1768     {
1769         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1770     }
1771 }
1772
1773 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1774                                 t_state *state,t_state *state_local,
1775                                 rvec **f)
1776 {
1777     int  i,j,nh;
1778
1779     nh = state->nhchainlength;
1780
1781     if (DDMASTER(dd))
1782     {
1783         for(i=0;i<efptNR;i++)
1784         {
1785             state_local->lambda[i] = state->lambda[i];
1786         }
1787         state_local->fep_state = state->fep_state;
1788         state_local->veta   = state->veta;
1789         state_local->vol0   = state->vol0;
1790         copy_mat(state->box,state_local->box);
1791         copy_mat(state->box_rel,state_local->box_rel);
1792         copy_mat(state->boxv,state_local->boxv);
1793         copy_mat(state->svir_prev,state_local->svir_prev);
1794         copy_mat(state->fvir_prev,state_local->fvir_prev);
1795         for(i=0; i<state_local->ngtc; i++)
1796         {
1797             for(j=0; j<nh; j++) {
1798                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1799                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1800             }
1801             state_local->therm_integral[i] = state->therm_integral[i];
1802         }
1803         for(i=0; i<state_local->nnhpres; i++)
1804         {
1805             for(j=0; j<nh; j++) {
1806                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1807                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1808             }
1809         }
1810     }
1811     dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
1812     dd_bcast(dd,sizeof(int),&state_local->fep_state);
1813     dd_bcast(dd,sizeof(real),&state_local->veta);
1814     dd_bcast(dd,sizeof(real),&state_local->vol0);
1815     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1816     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1817     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1818     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1819     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1820     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1821     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1822     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1823     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1824     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1825
1826     if (dd->nat_home > state_local->nalloc)
1827     {
1828         dd_realloc_state(state_local,f,dd->nat_home);
1829     }
1830     for(i=0; i<estNR; i++)
1831     {
1832         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1833         {
1834             switch (i) {
1835             case estX:
1836                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1837                 break;
1838             case estV:
1839                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1840                 break;
1841             case estSDX:
1842                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1843                 break;
1844             case estCGP:
1845                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1846                 break;
1847             case estLD_RNG:
1848                 if (state->nrngi == 1)
1849                 {
1850                     dd_bcastc(dd,
1851                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1852                               state->ld_rng,state_local->ld_rng);
1853                 }
1854                 else
1855                 {
1856                     dd_scatter(dd,
1857                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1858                                state->ld_rng,state_local->ld_rng);
1859                 }
1860                 break;
1861             case estLD_RNGI:
1862                 if (state->nrngi == 1)
1863                 {
1864                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1865                               state->ld_rngi,state_local->ld_rngi);
1866                 }
1867                 else
1868                 {
1869                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1870                                state->ld_rngi,state_local->ld_rngi);
1871                 }   
1872                 break;
1873             case estDISRE_INITF:
1874             case estDISRE_RM3TAV:
1875             case estORIRE_INITF:
1876             case estORIRE_DTAV:
1877                 /* Not implemented yet */
1878                 break;
1879             default:
1880                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1881             }
1882         }
1883     }
1884 }
1885
1886 static char dim2char(int dim)
1887 {
1888     char c='?';
1889     
1890     switch (dim)
1891     {
1892     case XX: c = 'X'; break;
1893     case YY: c = 'Y'; break;
1894     case ZZ: c = 'Z'; break;
1895     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1896     }
1897     
1898     return c;
1899 }
1900
1901 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1902                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1903 {
1904     rvec grid_s[2],*grid_r=NULL,cx,r;
1905     char fname[STRLEN],format[STRLEN],buf[22];
1906     FILE *out;
1907     int  a,i,d,z,y,x;
1908     matrix tric;
1909     real vol;
1910
1911     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1912     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1913     
1914     if (DDMASTER(dd))
1915     {
1916         snew(grid_r,2*dd->nnodes);
1917     }
1918     
1919     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1920     
1921     if (DDMASTER(dd))
1922     {
1923         for(d=0; d<DIM; d++)
1924         {
1925             for(i=0; i<DIM; i++)
1926             {
1927                 if (d == i)
1928                 {
1929                     tric[d][i] = 1;
1930                 }
1931                 else
1932                 {
1933                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1934                     {
1935                         tric[d][i] = box[i][d]/box[i][i];
1936                     }
1937                     else
1938                     {
1939                         tric[d][i] = 0;
1940                     }
1941                 }
1942             }
1943         }
1944         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1945         sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
1946         out = gmx_fio_fopen(fname,"w");
1947         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1948         a = 1;
1949         for(i=0; i<dd->nnodes; i++)
1950         {
1951             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1952             for(d=0; d<DIM; d++)
1953             {
1954                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1955             }
1956             for(z=0; z<2; z++)
1957             {
1958                 for(y=0; y<2; y++)
1959                 {
1960                     for(x=0; x<2; x++)
1961                     {
1962                         cx[XX] = grid_r[i*2+x][XX];
1963                         cx[YY] = grid_r[i*2+y][YY];
1964                         cx[ZZ] = grid_r[i*2+z][ZZ];
1965                         mvmul(tric,cx,r);
1966                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1967                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1968                     }
1969                 }
1970             }
1971             for(d=0; d<DIM; d++)
1972             {
1973                 for(x=0; x<4; x++)
1974                 {
1975                     switch(d)
1976                     {
1977                     case 0: y = 1 + i*8 + 2*x; break;
1978                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1979                     case 2: y = 1 + i*8 + x; break;
1980                     }
1981                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1982                 }
1983             }
1984         }
1985         gmx_fio_fclose(out);
1986         sfree(grid_r);
1987     }
1988 }
1989
1990 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1991                   gmx_mtop_t *mtop,t_commrec *cr,
1992                   int natoms,rvec x[],matrix box)
1993 {
1994     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1995     FILE *out;
1996     int  i,ii,resnr,c;
1997     char *atomname,*resname;
1998     real b;
1999     gmx_domdec_t *dd;
2000     
2001     dd = cr->dd;
2002     if (natoms == -1)
2003     {
2004         natoms = dd->comm->nat[ddnatVSITE];
2005     }
2006     
2007     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
2008     
2009     sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
2010     sprintf(format4,"%s%s\n",get_pdbformat4(),"%6.2f%6.2f");
2011     
2012     out = gmx_fio_fopen(fname,"w");
2013     
2014     fprintf(out,"TITLE     %s\n",title);
2015     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
2016     for(i=0; i<natoms; i++)
2017     {
2018         ii = dd->gatindex[i];
2019         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
2020         if (i < dd->comm->nat[ddnatZONE])
2021         {
2022             c = 0;
2023             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2024             {
2025                 c++;
2026             }
2027             b = c;
2028         }
2029         else if (i < dd->comm->nat[ddnatVSITE])
2030         {
2031             b = dd->comm->zones.n;
2032         }
2033         else
2034         {
2035             b = dd->comm->zones.n + 1;
2036         }
2037         fprintf(out,strlen(atomname)<4 ? format : format4,
2038                 "ATOM",(ii+1)%100000,
2039                 atomname,resname,' ',resnr%10000,' ',
2040                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
2041     }
2042     fprintf(out,"TER\n");
2043     
2044     gmx_fio_fclose(out);
2045 }
2046
2047 real dd_cutoff_mbody(gmx_domdec_t *dd)
2048 {
2049     gmx_domdec_comm_t *comm;
2050     int  di;
2051     real r;
2052
2053     comm = dd->comm;
2054
2055     r = -1;
2056     if (comm->bInterCGBondeds)
2057     {
2058         if (comm->cutoff_mbody > 0)
2059         {
2060             r = comm->cutoff_mbody;
2061         }
2062         else
2063         {
2064             /* cutoff_mbody=0 means we do not have DLB */
2065             r = comm->cellsize_min[dd->dim[0]];
2066             for(di=1; di<dd->ndim; di++)
2067             {
2068                 r = min(r,comm->cellsize_min[dd->dim[di]]);
2069             }
2070             if (comm->bBondComm)
2071             {
2072                 r = max(r,comm->cutoff_mbody);
2073             }
2074             else
2075             {
2076                 r = min(r,comm->cutoff);
2077             }
2078         }
2079     }
2080
2081     return r;
2082 }
2083
2084 real dd_cutoff_twobody(gmx_domdec_t *dd)
2085 {
2086     real r_mb;
2087
2088     r_mb = dd_cutoff_mbody(dd);
2089
2090     return max(dd->comm->cutoff,r_mb);
2091 }
2092
2093
2094 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2095 {
2096     int nc,ntot;
2097     
2098     nc   = dd->nc[dd->comm->cartpmedim];
2099     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2100     copy_ivec(coord,coord_pme);
2101     coord_pme[dd->comm->cartpmedim] =
2102         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2103 }
2104
2105 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2106 {
2107     /* Here we assign a PME node to communicate with this DD node
2108      * by assuming that the major index of both is x.
2109      * We add cr->npmenodes/2 to obtain an even distribution.
2110      */
2111     return (ddindex*npme + npme/2)/ndd;
2112 }
2113
2114 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2115 {
2116     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2117 }
2118
2119 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2120 {
2121     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2122 }
2123
2124 static int *dd_pmenodes(t_commrec *cr)
2125 {
2126     int *pmenodes;
2127     int n,i,p0,p1;
2128     
2129     snew(pmenodes,cr->npmenodes);
2130     n = 0;
2131     for(i=0; i<cr->dd->nnodes; i++) {
2132         p0 = cr_ddindex2pmeindex(cr,i);
2133         p1 = cr_ddindex2pmeindex(cr,i+1);
2134         if (i+1 == cr->dd->nnodes || p1 > p0) {
2135             if (debug)
2136                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2137             pmenodes[n] = i + 1 + n;
2138             n++;
2139         }
2140     }
2141
2142     return pmenodes;
2143 }
2144
2145 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2146 {
2147     gmx_domdec_t *dd;
2148     ivec coords,coords_pme,nc;
2149     int  slab;
2150     
2151     dd = cr->dd;
2152     /*
2153       if (dd->comm->bCartesian) {
2154       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2155       dd_coords2pmecoords(dd,coords,coords_pme);
2156       copy_ivec(dd->ntot,nc);
2157       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2158       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2159       
2160       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2161       } else {
2162       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2163       }
2164     */
2165     coords[XX] = x;
2166     coords[YY] = y;
2167     coords[ZZ] = z;
2168     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2169     
2170     return slab;
2171 }
2172
2173 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2174 {
2175     gmx_domdec_comm_t *comm;
2176     ivec coords;
2177     int  ddindex,nodeid=-1;
2178     
2179     comm = cr->dd->comm;
2180     
2181     coords[XX] = x;
2182     coords[YY] = y;
2183     coords[ZZ] = z;
2184     if (comm->bCartesianPP_PME)
2185     {
2186 #ifdef GMX_MPI
2187         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2188 #endif
2189     }
2190     else
2191     {
2192         ddindex = dd_index(cr->dd->nc,coords);
2193         if (comm->bCartesianPP)
2194         {
2195             nodeid = comm->ddindex2simnodeid[ddindex];
2196         }
2197         else
2198         {
2199             if (comm->pmenodes)
2200             {
2201                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2202             }
2203             else
2204             {
2205                 nodeid = ddindex;
2206             }
2207         }
2208     }
2209   
2210     return nodeid;
2211 }
2212
2213 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2214 {
2215     gmx_domdec_t *dd;
2216     gmx_domdec_comm_t *comm;
2217     ivec coord,coord_pme;
2218     int  i;
2219     int  pmenode=-1;
2220     
2221     dd = cr->dd;
2222     comm = dd->comm;
2223     
2224     /* This assumes a uniform x domain decomposition grid cell size */
2225     if (comm->bCartesianPP_PME)
2226     {
2227 #ifdef GMX_MPI
2228         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2229         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2230         {
2231             /* This is a PP node */
2232             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2233             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2234         }
2235 #endif
2236     }
2237     else if (comm->bCartesianPP)
2238     {
2239         if (sim_nodeid < dd->nnodes)
2240         {
2241             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2242         }
2243     }
2244     else
2245     {
2246         /* This assumes DD cells with identical x coordinates
2247          * are numbered sequentially.
2248          */
2249         if (dd->comm->pmenodes == NULL)
2250         {
2251             if (sim_nodeid < dd->nnodes)
2252             {
2253                 /* The DD index equals the nodeid */
2254                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2255             }
2256         }
2257         else
2258         {
2259             i = 0;
2260             while (sim_nodeid > dd->comm->pmenodes[i])
2261             {
2262                 i++;
2263             }
2264             if (sim_nodeid < dd->comm->pmenodes[i])
2265             {
2266                 pmenode = dd->comm->pmenodes[i];
2267             }
2268         }
2269     }
2270     
2271     return pmenode;
2272 }
2273
2274 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2275 {
2276     gmx_bool bPMEOnlyNode;
2277     
2278     if (DOMAINDECOMP(cr))
2279     {
2280         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2281     }
2282     else
2283     {
2284         bPMEOnlyNode = FALSE;
2285     }
2286     
2287     return bPMEOnlyNode;
2288 }
2289
2290 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2291                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2292 {
2293     gmx_domdec_t *dd;
2294     int x,y,z;
2295     ivec coord,coord_pme;
2296     
2297     dd = cr->dd;
2298     
2299     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2300     
2301     *nmy_ddnodes = 0;
2302     for(x=0; x<dd->nc[XX]; x++)
2303     {
2304         for(y=0; y<dd->nc[YY]; y++)
2305         {
2306             for(z=0; z<dd->nc[ZZ]; z++)
2307             {
2308                 if (dd->comm->bCartesianPP_PME)
2309                 {
2310                     coord[XX] = x;
2311                     coord[YY] = y;
2312                     coord[ZZ] = z;
2313                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2314                     if (dd->ci[XX] == coord_pme[XX] &&
2315                         dd->ci[YY] == coord_pme[YY] &&
2316                         dd->ci[ZZ] == coord_pme[ZZ])
2317                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2318                 }
2319                 else
2320                 {
2321                     /* The slab corresponds to the nodeid in the PME group */
2322                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2323                     {
2324                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2325                     }
2326                 }
2327             }
2328         }
2329     }
2330     
2331     /* The last PP-only node is the peer node */
2332     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2333     
2334     if (debug)
2335     {
2336         fprintf(debug,"Receive coordinates from PP nodes:");
2337         for(x=0; x<*nmy_ddnodes; x++)
2338         {
2339             fprintf(debug," %d",(*my_ddnodes)[x]);
2340         }
2341         fprintf(debug,"\n");
2342     }
2343 }
2344
2345 static gmx_bool receive_vir_ener(t_commrec *cr)
2346 {
2347     gmx_domdec_comm_t *comm;
2348     int  pmenode,coords[DIM],rank;
2349     gmx_bool bReceive;
2350     
2351     bReceive = TRUE;
2352     if (cr->npmenodes < cr->dd->nnodes)
2353     {
2354         comm = cr->dd->comm;
2355         if (comm->bCartesianPP_PME)
2356         {
2357             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2358 #ifdef GMX_MPI
2359             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2360             coords[comm->cartpmedim]++;
2361             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2362             {
2363                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2364                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2365                 {
2366                     /* This is not the last PP node for pmenode */
2367                     bReceive = FALSE;
2368                 }
2369             }
2370 #endif  
2371         }
2372         else
2373         {
2374             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2375             if (cr->sim_nodeid+1 < cr->nnodes &&
2376                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2377             {
2378                 /* This is not the last PP node for pmenode */
2379                 bReceive = FALSE;
2380             }
2381         }
2382     }
2383     
2384     return bReceive;
2385 }
2386
2387 static void set_zones_ncg_home(gmx_domdec_t *dd)
2388 {
2389     gmx_domdec_zones_t *zones;
2390     int i;
2391
2392     zones = &dd->comm->zones;
2393
2394     zones->cg_range[0] = 0;
2395     for(i=1; i<zones->n+1; i++)
2396     {
2397         zones->cg_range[i] = dd->ncg_home;
2398     }
2399 }
2400
2401 static void rebuild_cgindex(gmx_domdec_t *dd,
2402                             const int *gcgs_index,t_state *state)
2403 {
2404     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2405     
2406     ind = state->cg_gl;
2407     dd_cg_gl = dd->index_gl;
2408     cgindex  = dd->cgindex;
2409     nat = 0;
2410     cgindex[0] = nat;
2411     for(i=0; i<state->ncg_gl; i++)
2412     {
2413         cgindex[i] = nat;
2414         cg_gl = ind[i];
2415         dd_cg_gl[i] = cg_gl;
2416         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2417     }
2418     cgindex[i] = nat;
2419     
2420     dd->ncg_home = state->ncg_gl;
2421     dd->nat_home = nat;
2422
2423     set_zones_ncg_home(dd);
2424 }
2425
2426 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2427 {
2428     while (cg >= cginfo_mb->cg_end)
2429     {
2430         cginfo_mb++;
2431     }
2432
2433     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2434 }
2435
2436 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2437                           t_forcerec *fr,char *bLocalCG)
2438 {
2439     cginfo_mb_t *cginfo_mb;
2440     int *cginfo;
2441     int cg;
2442
2443     if (fr != NULL)
2444     {
2445         cginfo_mb = fr->cginfo_mb;
2446         cginfo    = fr->cginfo;
2447
2448         for(cg=cg0; cg<cg1; cg++)
2449         {
2450             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2451         }
2452     }
2453
2454     if (bLocalCG != NULL)
2455     {
2456         for(cg=cg0; cg<cg1; cg++)
2457         {
2458             bLocalCG[index_gl[cg]] = TRUE;
2459         }
2460     }
2461 }
2462
2463 static void make_dd_indices(gmx_domdec_t *dd,
2464                             const int *gcgs_index,int cg_start)
2465 {
2466     int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl;
2467     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2468     gmx_ga2la_t *ga2la;
2469     char *bLocalCG;
2470     gmx_bool bCGs;
2471
2472     bLocalCG = dd->comm->bLocalCG;
2473
2474     if (dd->nat_tot > dd->gatindex_nalloc)
2475     {
2476         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2477         srenew(dd->gatindex,dd->gatindex_nalloc);
2478     }
2479
2480     nzone      = dd->comm->zones.n;
2481     zone2cg    = dd->comm->zones.cg_range;
2482     zone_ncg1  = dd->comm->zone_ncg1;
2483     index_gl   = dd->index_gl;
2484     gatindex   = dd->gatindex;
2485     bCGs       = dd->comm->bCGs;
2486
2487     if (zone2cg[1] != dd->ncg_home)
2488     {
2489         gmx_incons("dd->ncg_zone is not up to date");
2490     }
2491     
2492     /* Make the local to global and global to local atom index */
2493     a = dd->cgindex[cg_start];
2494     for(zone=0; zone<nzone; zone++)
2495     {
2496         if (zone == 0)
2497         {
2498             cg0 = cg_start;
2499         }
2500         else
2501         {
2502             cg0 = zone2cg[zone];
2503         }
2504         cg1    = zone2cg[zone+1];
2505         cg1_p1 = cg0 + zone_ncg1[zone];
2506
2507         for(cg=cg0; cg<cg1; cg++)
2508         {
2509             zone1 = zone;
2510             if (cg >= cg1_p1)
2511             {
2512                 /* Signal that this cg is from more than one pulse away */
2513                 zone1 += nzone;
2514             }
2515             cg_gl = index_gl[cg];
2516             if (bCGs)
2517             {
2518                 for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2519                 {
2520                     gatindex[a] = a_gl;
2521                     ga2la_set(dd->ga2la,a_gl,a,zone1);
2522                     a++;
2523                 }
2524             }
2525             else
2526             {
2527                 gatindex[a] = cg_gl;
2528                 ga2la_set(dd->ga2la,cg_gl,a,zone1);
2529                 a++;
2530             }
2531         }
2532     }
2533 }
2534
2535 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2536                           const char *where)
2537 {
2538     int ncg,i,ngl,nerr;
2539
2540     nerr = 0;
2541     if (bLocalCG == NULL)
2542     {
2543         return nerr;
2544     }
2545     for(i=0; i<dd->ncg_tot; i++)
2546     {
2547         if (!bLocalCG[dd->index_gl[i]])
2548         {
2549             fprintf(stderr,
2550                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2551             nerr++;
2552         }
2553     }
2554     ngl = 0;
2555     for(i=0; i<ncg_sys; i++)
2556     {
2557         if (bLocalCG[i])
2558         {
2559             ngl++;
2560         }
2561     }
2562     if (ngl != dd->ncg_tot)
2563     {
2564         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2565         nerr++;
2566     }
2567
2568     return nerr;
2569 }
2570
2571 static void check_index_consistency(gmx_domdec_t *dd,
2572                                     int natoms_sys,int ncg_sys,
2573                                     const char *where)
2574 {
2575     int  nerr,ngl,i,a,cell;
2576     int  *have;
2577
2578     nerr = 0;
2579
2580     if (dd->comm->DD_debug > 1)
2581     {
2582         snew(have,natoms_sys);
2583         for(a=0; a<dd->nat_tot; a++)
2584         {
2585             if (have[dd->gatindex[a]] > 0)
2586             {
2587                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2588             }
2589             else
2590             {
2591                 have[dd->gatindex[a]] = a + 1;
2592             }
2593         }
2594         sfree(have);
2595     }
2596
2597     snew(have,dd->nat_tot);
2598
2599     ngl  = 0;
2600     for(i=0; i<natoms_sys; i++)
2601     {
2602         if (ga2la_get(dd->ga2la,i,&a,&cell))
2603         {
2604             if (a >= dd->nat_tot)
2605             {
2606                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2607                 nerr++;
2608             }
2609             else
2610             {
2611                 have[a] = 1;
2612                 if (dd->gatindex[a] != i)
2613                 {
2614                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2615                     nerr++;
2616                 }
2617             }
2618             ngl++;
2619         }
2620     }
2621     if (ngl != dd->nat_tot)
2622     {
2623         fprintf(stderr,
2624                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2625                 dd->rank,where,ngl,dd->nat_tot);
2626     }
2627     for(a=0; a<dd->nat_tot; a++)
2628     {
2629         if (have[a] == 0)
2630         {
2631             fprintf(stderr,
2632                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2633                     dd->rank,where,a+1,dd->gatindex[a]+1);
2634         }
2635     }
2636     sfree(have);
2637
2638     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2639
2640     if (nerr > 0) {
2641         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2642                   dd->rank,where,nerr);
2643     }
2644 }
2645
2646 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2647 {
2648     int  i;
2649     char *bLocalCG;
2650
2651     if (a_start == 0)
2652     {
2653         /* Clear the whole list without searching */
2654         ga2la_clear(dd->ga2la);
2655     }
2656     else
2657     {
2658         for(i=a_start; i<dd->nat_tot; i++)
2659         {
2660             ga2la_del(dd->ga2la,dd->gatindex[i]);
2661         }
2662     }
2663
2664     bLocalCG = dd->comm->bLocalCG;
2665     if (bLocalCG)
2666     {
2667         for(i=cg_start; i<dd->ncg_tot; i++)
2668         {
2669             bLocalCG[dd->index_gl[i]] = FALSE;
2670         }
2671     }
2672
2673     dd_clear_local_vsite_indices(dd);
2674     
2675     if (dd->constraints)
2676     {
2677         dd_clear_local_constraint_indices(dd);
2678     }
2679 }
2680
2681 static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
2682                             int dim_ind)
2683 {
2684     real grid_jump_limit;
2685
2686     /* The distance between the boundaries of cells at distance
2687      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2688      * and by the fact that cells should not be shifted by more than
2689      * half their size, such that cg's only shift by one cell
2690      * at redecomposition.
2691      */
2692     grid_jump_limit = comm->cellsize_limit;
2693     if (!comm->bVacDLBNoLimit)
2694     {
2695         grid_jump_limit = max(grid_jump_limit,
2696                               cutoff/comm->cd[dim_ind].np);
2697     }
2698
2699     return grid_jump_limit;
2700 }
2701
2702 static gmx_bool check_grid_jump(gmx_large_int_t step,
2703                                 gmx_domdec_t *dd,
2704                                 real cutoff,
2705                                 gmx_ddbox_t *ddbox,
2706                                 gmx_bool bFatal)
2707 {
2708     gmx_domdec_comm_t *comm;
2709     int  d,dim;
2710     real limit,bfac;
2711     gmx_bool bInvalid;
2712
2713     bInvalid = FALSE;
2714
2715     comm = dd->comm;
2716     
2717     for(d=1; d<dd->ndim; d++)
2718     {
2719         dim = dd->dim[d];
2720         limit = grid_jump_limit(comm,cutoff,d);
2721         bfac = ddbox->box_size[dim];
2722         if (ddbox->tric_dir[dim])
2723         {
2724             bfac *= ddbox->skew_fac[dim];
2725         }
2726         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2727             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2728         {
2729             bInvalid = TRUE;
2730
2731             if (bFatal)
2732             {
2733                 char buf[22];
2734
2735                 /* This error should never be triggered under normal
2736                  * circumstances, but you never know ...
2737                  */
2738                 gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2739                           gmx_step_str(step,buf),
2740                           dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2741             }
2742         }
2743     }
2744
2745     return bInvalid;
2746 }
2747
2748 static int dd_load_count(gmx_domdec_comm_t *comm)
2749 {
2750     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2751 }
2752
2753 static float dd_force_load(gmx_domdec_comm_t *comm)
2754 {
2755     float load;
2756     
2757     if (comm->eFlop)
2758     {
2759         load = comm->flop;
2760         if (comm->eFlop > 1)
2761         {
2762             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2763         }
2764     } 
2765     else
2766     {
2767         load = comm->cycl[ddCyclF];
2768         if (comm->cycl_n[ddCyclF] > 1)
2769         {
2770             /* Subtract the maximum of the last n cycle counts
2771              * to get rid of possible high counts due to other soures,
2772              * for instance system activity, that would otherwise
2773              * affect the dynamic load balancing.
2774              */
2775             load -= comm->cycl_max[ddCyclF];
2776         }
2777     }
2778     
2779     return load;
2780 }
2781
2782 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2783 {
2784     gmx_domdec_comm_t *comm;
2785     int i;
2786     
2787     comm = dd->comm;
2788     
2789     snew(*dim_f,dd->nc[dim]+1);
2790     (*dim_f)[0] = 0;
2791     for(i=1; i<dd->nc[dim]; i++)
2792     {
2793         if (comm->slb_frac[dim])
2794         {
2795             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2796         }
2797         else
2798         {
2799             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2800         }
2801     }
2802     (*dim_f)[dd->nc[dim]] = 1;
2803 }
2804
2805 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2806 {
2807     int  pmeindex,slab,nso,i;
2808     ivec xyz;
2809     
2810     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2811     {
2812         ddpme->dim = YY;
2813     }
2814     else
2815     {
2816         ddpme->dim = dimind;
2817     }
2818     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2819     
2820     ddpme->nslab = (ddpme->dim == 0 ?
2821                     dd->comm->npmenodes_x :
2822                     dd->comm->npmenodes_y);
2823
2824     if (ddpme->nslab <= 1)
2825     {
2826         return;
2827     }
2828
2829     nso = dd->comm->npmenodes/ddpme->nslab;
2830     /* Determine for each PME slab the PP location range for dimension dim */
2831     snew(ddpme->pp_min,ddpme->nslab);
2832     snew(ddpme->pp_max,ddpme->nslab);
2833     for(slab=0; slab<ddpme->nslab; slab++) {
2834         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2835         ddpme->pp_max[slab] = 0;
2836     }
2837     for(i=0; i<dd->nnodes; i++) {
2838         ddindex2xyz(dd->nc,i,xyz);
2839         /* For y only use our y/z slab.
2840          * This assumes that the PME x grid size matches the DD grid size.
2841          */
2842         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2843             pmeindex = ddindex2pmeindex(dd,i);
2844             if (dimind == 0) {
2845                 slab = pmeindex/nso;
2846             } else {
2847                 slab = pmeindex % ddpme->nslab;
2848             }
2849             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2850             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2851         }
2852     }
2853
2854     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2855 }
2856
2857 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2858 {
2859     if (dd->comm->ddpme[0].dim == XX)
2860     {
2861         return dd->comm->ddpme[0].maxshift;
2862     }
2863     else
2864     {
2865         return 0;
2866     }
2867 }
2868
2869 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2870 {
2871     if (dd->comm->ddpme[0].dim == YY)
2872     {
2873         return dd->comm->ddpme[0].maxshift;
2874     }
2875     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2876     {
2877         return dd->comm->ddpme[1].maxshift;
2878     }
2879     else
2880     {
2881         return 0;
2882     }
2883 }
2884
2885 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2886                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2887 {
2888     gmx_domdec_comm_t *comm;
2889     int  nc,ns,s;
2890     int  *xmin,*xmax;
2891     real range,pme_boundary;
2892     int  sh;
2893     
2894     comm = dd->comm;
2895     nc  = dd->nc[ddpme->dim];
2896     ns  = ddpme->nslab;
2897     
2898     if (!ddpme->dim_match)
2899     {
2900         /* PP decomposition is not along dim: the worst situation */
2901         sh = ns/2;
2902     }
2903     else if (ns <= 3 || (bUniform && ns == nc))
2904     {
2905         /* The optimal situation */
2906         sh = 1;
2907     }
2908     else
2909     {
2910         /* We need to check for all pme nodes which nodes they
2911          * could possibly need to communicate with.
2912          */
2913         xmin = ddpme->pp_min;
2914         xmax = ddpme->pp_max;
2915         /* Allow for atoms to be maximally 2/3 times the cut-off
2916          * out of their DD cell. This is a reasonable balance between
2917          * between performance and support for most charge-group/cut-off
2918          * combinations.
2919          */
2920         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2921         /* Avoid extra communication when we are exactly at a boundary */
2922         range *= 0.999;
2923         
2924         sh = 1;
2925         for(s=0; s<ns; s++)
2926         {
2927             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2928             pme_boundary = (real)s/ns;
2929             while (sh+1 < ns &&
2930                    ((s-(sh+1) >= 0 &&
2931                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2932                     (s-(sh+1) <  0 &&
2933                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2934             {
2935                 sh++;
2936             }
2937             pme_boundary = (real)(s+1)/ns;
2938             while (sh+1 < ns &&
2939                    ((s+(sh+1) <  ns &&
2940                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2941                     (s+(sh+1) >= ns &&
2942                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2943             {
2944                 sh++;
2945             }
2946         }
2947     }
2948     
2949     ddpme->maxshift = sh;
2950     
2951     if (debug)
2952     {
2953         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2954                 ddpme->dim,ddpme->maxshift);
2955     }
2956 }
2957
2958 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2959 {
2960     int d,dim;
2961     
2962     for(d=0; d<dd->ndim; d++)
2963     {
2964         dim = dd->dim[d];
2965         if (dim < ddbox->nboundeddim &&
2966             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2967             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2968         {
2969             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2970                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2971                       dd->nc[dim],dd->comm->cellsize_limit);
2972         }
2973     }
2974 }
2975
2976 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2977                                   gmx_bool bMaster,ivec npulse)
2978 {
2979     gmx_domdec_comm_t *comm;
2980     int  d,j;
2981     rvec cellsize_min;
2982     real *cell_x,cell_dx,cellsize;
2983     
2984     comm = dd->comm;
2985     
2986     for(d=0; d<DIM; d++)
2987     {
2988         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2989         npulse[d] = 1;
2990         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2991         {
2992             /* Uniform grid */
2993             cell_dx = ddbox->box_size[d]/dd->nc[d];
2994             if (bMaster)
2995             {
2996                 for(j=0; j<dd->nc[d]+1; j++)
2997                 {
2998                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2999                 }
3000             }
3001             else
3002             {
3003                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3004                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3005             }
3006             cellsize = cell_dx*ddbox->skew_fac[d];
3007             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3008             {
3009                 npulse[d]++;
3010             }
3011             cellsize_min[d] = cellsize;
3012         }
3013         else
3014         {
3015             /* Statically load balanced grid */
3016             /* Also when we are not doing a master distribution we determine
3017              * all cell borders in a loop to obtain identical values
3018              * to the master distribution case and to determine npulse.
3019              */
3020             if (bMaster)
3021             {
3022                 cell_x = dd->ma->cell_x[d];
3023             }
3024             else
3025             {
3026                 snew(cell_x,dd->nc[d]+1);
3027             }
3028             cell_x[0] = ddbox->box0[d];
3029             for(j=0; j<dd->nc[d]; j++)
3030             {
3031                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
3032                 cell_x[j+1] = cell_x[j] + cell_dx;
3033                 cellsize = cell_dx*ddbox->skew_fac[d];
3034                 while (cellsize*npulse[d] < comm->cutoff &&
3035                        npulse[d] < dd->nc[d]-1)
3036                 {
3037                     npulse[d]++;
3038                 }
3039                 cellsize_min[d] = min(cellsize_min[d],cellsize);
3040             }
3041             if (!bMaster)
3042             {
3043                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3044                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3045                 sfree(cell_x);
3046             }
3047         }
3048         /* The following limitation is to avoid that a cell would receive
3049          * some of its own home charge groups back over the periodic boundary.
3050          * Double charge groups cause trouble with the global indices.
3051          */
3052         if (d < ddbox->npbcdim &&
3053             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3054         {
3055             gmx_fatal_collective(FARGS,NULL,dd,
3056                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3057                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
3058                                  comm->cutoff,
3059                                  dd->nc[d],dd->nc[d],
3060                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
3061         }
3062     }
3063     
3064     if (!comm->bDynLoadBal)
3065     {
3066         copy_rvec(cellsize_min,comm->cellsize_min);
3067     }
3068    
3069     for(d=0; d<comm->npmedecompdim; d++)
3070     {
3071         set_pme_maxshift(dd,&comm->ddpme[d],
3072                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
3073                          comm->ddpme[d].slb_dim_f);
3074     }
3075 }
3076
3077
3078 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3079                                        int d,int dim,gmx_domdec_root_t *root,
3080                                        gmx_ddbox_t *ddbox,
3081                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
3082 {
3083     gmx_domdec_comm_t *comm;
3084     int  ncd,i,j,nmin,nmin_old;
3085     gmx_bool bLimLo,bLimHi;
3086     real *cell_size;
3087     real fac,halfway,cellsize_limit_f_i,region_size;
3088     gmx_bool bPBC,bLastHi=FALSE;
3089     int nrange[]={range[0],range[1]};
3090
3091     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
3092
3093     comm = dd->comm;
3094
3095     ncd = dd->nc[dim];
3096
3097     bPBC = (dim < ddbox->npbcdim);
3098
3099     cell_size = root->buf_ncd;
3100
3101     if (debug) 
3102     {
3103         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
3104     }
3105
3106     /* First we need to check if the scaling does not make cells
3107      * smaller than the smallest allowed size.
3108      * We need to do this iteratively, since if a cell is too small,
3109      * it needs to be enlarged, which makes all the other cells smaller,
3110      * which could in turn make another cell smaller than allowed.
3111      */
3112     for(i=range[0]; i<range[1]; i++)
3113     {
3114         root->bCellMin[i] = FALSE;
3115     }
3116     nmin = 0;
3117     do
3118     {
3119         nmin_old = nmin;
3120         /* We need the total for normalization */
3121         fac = 0;
3122         for(i=range[0]; i<range[1]; i++)
3123         {
3124             if (root->bCellMin[i] == FALSE)
3125             {
3126                 fac += cell_size[i];
3127             }
3128         }
3129         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3130         /* Determine the cell boundaries */
3131         for(i=range[0]; i<range[1]; i++)
3132         {
3133             if (root->bCellMin[i] == FALSE)
3134             {
3135                 cell_size[i] *= fac;
3136                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3137                 {
3138                     cellsize_limit_f_i = 0;
3139                 }
3140                 else
3141                 {
3142                     cellsize_limit_f_i = cellsize_limit_f;
3143                 }
3144                 if (cell_size[i] < cellsize_limit_f_i)
3145                 {
3146                     root->bCellMin[i] = TRUE;
3147                     cell_size[i] = cellsize_limit_f_i;
3148                     nmin++;
3149                 }
3150             }
3151             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3152         }
3153     }
3154     while (nmin > nmin_old);
3155     
3156     i=range[1]-1;
3157     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3158     /* For this check we should not use DD_CELL_MARGIN,
3159      * but a slightly smaller factor,
3160      * since rounding could get use below the limit.
3161      */
3162     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3163     {
3164         char buf[22];
3165         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3166                   gmx_step_str(step,buf),
3167                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3168                   ncd,comm->cellsize_min[dim]);
3169     }
3170     
3171     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3172     
3173     if (!bUniform)
3174     {
3175         /* Check if the boundary did not displace more than halfway
3176          * each of the cells it bounds, as this could cause problems,
3177          * especially when the differences between cell sizes are large.
3178          * If changes are applied, they will not make cells smaller
3179          * than the cut-off, as we check all the boundaries which
3180          * might be affected by a change and if the old state was ok,
3181          * the cells will at most be shrunk back to their old size.
3182          */
3183         for(i=range[0]+1; i<range[1]; i++)
3184         {
3185             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3186             if (root->cell_f[i] < halfway)
3187             {
3188                 root->cell_f[i] = halfway;
3189                 /* Check if the change also causes shifts of the next boundaries */
3190                 for(j=i+1; j<range[1]; j++)
3191                 {
3192                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3193                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3194                 }
3195             }
3196             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3197             if (root->cell_f[i] > halfway)
3198             {
3199                 root->cell_f[i] = halfway;
3200                 /* Check if the change also causes shifts of the next boundaries */
3201                 for(j=i-1; j>=range[0]+1; j--)
3202                 {
3203                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3204                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3205                 }
3206             }
3207         }
3208     }
3209     
3210     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3211     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3212      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3213      * for a and b nrange is used */
3214     if (d > 0)
3215     {
3216         /* Take care of the staggering of the cell boundaries */
3217         if (bUniform)
3218         {
3219             for(i=range[0]; i<range[1]; i++)
3220             {
3221                 root->cell_f_max0[i] = root->cell_f[i];
3222                 root->cell_f_min1[i] = root->cell_f[i+1];
3223             }
3224         }
3225         else
3226         {
3227             for(i=range[0]+1; i<range[1]; i++)
3228             {
3229                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3230                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3231                 if (bLimLo && bLimHi)
3232                 {
3233                     /* Both limits violated, try the best we can */
3234                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3235                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3236                     nrange[0]=range[0];
3237                     nrange[1]=i;
3238                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3239
3240                     nrange[0]=i;
3241                     nrange[1]=range[1];
3242                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3243
3244                     return;
3245                 }
3246                 else if (bLimLo)
3247                 {
3248                     /* root->cell_f[i] = root->bound_min[i]; */
3249                     nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3250                     bLastHi=FALSE;
3251                 }
3252                 else if (bLimHi && !bLastHi)
3253                 {
3254                     bLastHi=TRUE;
3255                     if (nrange[1] < range[1])   /* found a LimLo before */
3256                     {
3257                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3258                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3259                         nrange[0]=nrange[1];
3260                     }
3261                     root->cell_f[i] = root->bound_max[i];
3262                     nrange[1]=i; 
3263                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3264                     nrange[0]=i;
3265                     nrange[1]=range[1];
3266                 }
3267             }
3268             if (nrange[1] < range[1])   /* found last a LimLo */
3269             {
3270                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3271                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3272                 nrange[0]=nrange[1];
3273                 nrange[1]=range[1];
3274                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3275             } 
3276             else if (nrange[0] > range[0]) /* found at least one LimHi */
3277             {
3278                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3279             }
3280         }
3281     }
3282 }
3283
3284
3285 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3286                                        int d,int dim,gmx_domdec_root_t *root,
3287                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3288                                        gmx_bool bUniform,gmx_large_int_t step)
3289 {
3290     gmx_domdec_comm_t *comm;
3291     int  ncd,d1,i,j,pos;
3292     real *cell_size;
3293     real load_aver,load_i,imbalance,change,change_max,sc;
3294     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3295     real change_limit;
3296     real relax = 0.5;
3297     gmx_bool bPBC;
3298     int range[] = { 0, 0 };
3299
3300     comm = dd->comm;
3301
3302     /* Convert the maximum change from the input percentage to a fraction */
3303     change_limit = comm->dlb_scale_lim*0.01;
3304
3305     ncd = dd->nc[dim];
3306
3307     bPBC = (dim < ddbox->npbcdim);
3308
3309     cell_size = root->buf_ncd;
3310
3311     /* Store the original boundaries */
3312     for(i=0; i<ncd+1; i++)
3313     {
3314         root->old_cell_f[i] = root->cell_f[i];
3315     }
3316     if (bUniform) {
3317         for(i=0; i<ncd; i++)
3318         {
3319             cell_size[i] = 1.0/ncd;
3320         }
3321     }
3322     else if (dd_load_count(comm))
3323     {
3324         load_aver = comm->load[d].sum_m/ncd;
3325         change_max = 0;
3326         for(i=0; i<ncd; i++)
3327         {
3328             /* Determine the relative imbalance of cell i */
3329             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3330             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3331             /* Determine the change of the cell size using underrelaxation */
3332             change = -relax*imbalance;
3333             change_max = max(change_max,max(change,-change));
3334         }
3335         /* Limit the amount of scaling.
3336          * We need to use the same rescaling for all cells in one row,
3337          * otherwise the load balancing might not converge.
3338          */
3339         sc = relax;
3340         if (change_max > change_limit)
3341         {
3342             sc *= change_limit/change_max;
3343         }
3344         for(i=0; i<ncd; i++)
3345         {
3346             /* Determine the relative imbalance of cell i */
3347             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3348             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3349             /* Determine the change of the cell size using underrelaxation */
3350             change = -sc*imbalance;
3351             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3352         }
3353     }
3354     
3355     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3356     cellsize_limit_f *= DD_CELL_MARGIN;
3357     dist_min_f_hard   = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim];
3358     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3359     if (ddbox->tric_dir[dim])
3360     {
3361         cellsize_limit_f /= ddbox->skew_fac[dim];
3362         dist_min_f       /= ddbox->skew_fac[dim];
3363     }
3364     if (bDynamicBox && d > 0)
3365     {
3366         dist_min_f *= DD_PRES_SCALE_MARGIN;
3367     }
3368     if (d > 0 && !bUniform)
3369     {
3370         /* Make sure that the grid is not shifted too much */
3371         for(i=1; i<ncd; i++) {
3372             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
3373             {
3374                 gmx_incons("Inconsistent DD boundary staggering limits!");
3375             }
3376             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3377             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3378             if (space > 0) {
3379                 root->bound_min[i] += 0.5*space;
3380             }
3381             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3382             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3383             if (space < 0) {
3384                 root->bound_max[i] += 0.5*space;
3385             }
3386             if (debug)
3387             {
3388                 fprintf(debug,
3389                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3390                         d,i,
3391                         root->cell_f_max0[i-1] + dist_min_f,
3392                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3393                         root->cell_f_min1[i] - dist_min_f);
3394             }
3395         }
3396     }
3397     range[1]=ncd;
3398     root->cell_f[0] = 0;
3399     root->cell_f[ncd] = 1;
3400     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3401
3402
3403     /* After the checks above, the cells should obey the cut-off
3404      * restrictions, but it does not hurt to check.
3405      */
3406     for(i=0; i<ncd; i++)
3407     {
3408         if (debug)
3409         {
3410             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3411                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3412         }
3413
3414         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3415             root->cell_f[i+1] - root->cell_f[i] <
3416             cellsize_limit_f/DD_CELL_MARGIN)
3417         {
3418             char buf[22];
3419             fprintf(stderr,
3420                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3421                     gmx_step_str(step,buf),dim2char(dim),i,
3422                     (root->cell_f[i+1] - root->cell_f[i])
3423                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3424         }
3425     }
3426     
3427     pos = ncd + 1;
3428     /* Store the cell boundaries of the lower dimensions at the end */
3429     for(d1=0; d1<d; d1++)
3430     {
3431         root->cell_f[pos++] = comm->cell_f0[d1];
3432         root->cell_f[pos++] = comm->cell_f1[d1];
3433     }
3434     
3435     if (d < comm->npmedecompdim)
3436     {
3437         /* The master determines the maximum shift for
3438          * the coordinate communication between separate PME nodes.
3439          */
3440         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3441     }
3442     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3443     if (d >= 1)
3444     {
3445         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3446     }
3447 }    
3448
3449 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3450                                              gmx_ddbox_t *ddbox,int dimind)
3451 {
3452     gmx_domdec_comm_t *comm;
3453     int dim;
3454
3455     comm = dd->comm;
3456
3457     /* Set the cell dimensions */
3458     dim = dd->dim[dimind];
3459     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3460     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3461     if (dim >= ddbox->nboundeddim)
3462     {
3463         comm->cell_x0[dim] += ddbox->box0[dim];
3464         comm->cell_x1[dim] += ddbox->box0[dim];
3465     }
3466 }
3467
3468 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3469                                          int d,int dim,real *cell_f_row,
3470                                          gmx_ddbox_t *ddbox)
3471 {
3472     gmx_domdec_comm_t *comm;
3473     int d1,dim1,pos;
3474
3475     comm = dd->comm;
3476
3477 #ifdef GMX_MPI
3478     /* Each node would only need to know two fractions,
3479      * but it is probably cheaper to broadcast the whole array.
3480      */
3481     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3482               0,comm->mpi_comm_load[d]);
3483 #endif
3484     /* Copy the fractions for this dimension from the buffer */
3485     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3486     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3487     /* The whole array was communicated, so set the buffer position */
3488     pos = dd->nc[dim] + 1;
3489     for(d1=0; d1<=d; d1++)
3490     {
3491         if (d1 < d)
3492         {
3493             /* Copy the cell fractions of the lower dimensions */
3494             comm->cell_f0[d1] = cell_f_row[pos++];
3495             comm->cell_f1[d1] = cell_f_row[pos++];
3496         }
3497         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3498     }
3499     /* Convert the communicated shift from float to int */
3500     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3501     if (d >= 1)
3502     {
3503         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3504     }
3505 }
3506
3507 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3508                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3509                                          gmx_bool bUniform,gmx_large_int_t step)
3510 {
3511     gmx_domdec_comm_t *comm;
3512     int d,dim,d1;
3513     gmx_bool bRowMember,bRowRoot;
3514     real *cell_f_row;
3515     
3516     comm = dd->comm;
3517
3518     for(d=0; d<dd->ndim; d++)
3519     {
3520         dim = dd->dim[d];
3521         bRowMember = TRUE;
3522         bRowRoot = TRUE;
3523         for(d1=d; d1<dd->ndim; d1++)
3524         {
3525             if (dd->ci[dd->dim[d1]] > 0)
3526             {
3527                 if (d1 > d)
3528                 {
3529                     bRowMember = FALSE;
3530                 }
3531                 bRowRoot = FALSE;
3532             }
3533         }
3534         if (bRowMember)
3535         {
3536             if (bRowRoot)
3537             {
3538                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3539                                            ddbox,bDynamicBox,bUniform,step);
3540                 cell_f_row = comm->root[d]->cell_f;
3541             }
3542             else
3543             {
3544                 cell_f_row = comm->cell_f_row;
3545             }
3546             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3547         }
3548     }
3549 }    
3550
3551 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3552 {
3553     int d;
3554
3555     /* This function assumes the box is static and should therefore
3556      * not be called when the box has changed since the last
3557      * call to dd_partition_system.
3558      */
3559     for(d=0; d<dd->ndim; d++)
3560     {
3561         relative_to_absolute_cell_bounds(dd,ddbox,d); 
3562     }
3563 }
3564
3565
3566
3567 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3568                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3569                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3570                                   gmx_wallcycle_t wcycle)
3571 {
3572     gmx_domdec_comm_t *comm;
3573     int dim;
3574
3575     comm = dd->comm;
3576     
3577     if (bDoDLB)
3578     {
3579         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3580         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3581         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3582     }
3583     else if (bDynamicBox)
3584     {
3585         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3586     }
3587     
3588     /* Set the dimensions for which no DD is used */
3589     for(dim=0; dim<DIM; dim++) {
3590         if (dd->nc[dim] == 1) {
3591             comm->cell_x0[dim] = 0;
3592             comm->cell_x1[dim] = ddbox->box_size[dim];
3593             if (dim >= ddbox->nboundeddim)
3594             {
3595                 comm->cell_x0[dim] += ddbox->box0[dim];
3596                 comm->cell_x1[dim] += ddbox->box0[dim];
3597             }
3598         }
3599     }
3600 }
3601
3602 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3603 {
3604     int d,np,i;
3605     gmx_domdec_comm_dim_t *cd;
3606     
3607     for(d=0; d<dd->ndim; d++)
3608     {
3609         cd = &dd->comm->cd[d];
3610         np = npulse[dd->dim[d]];
3611         if (np > cd->np_nalloc)
3612         {
3613             if (debug)
3614             {
3615                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3616                         dim2char(dd->dim[d]),np);
3617             }
3618             if (DDMASTER(dd) && cd->np_nalloc > 0)
3619             {
3620                 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3621             }
3622             srenew(cd->ind,np);
3623             for(i=cd->np_nalloc; i<np; i++)
3624             {
3625                 cd->ind[i].index  = NULL;
3626                 cd->ind[i].nalloc = 0;
3627             }
3628             cd->np_nalloc = np;
3629         }
3630         cd->np = np;
3631     }
3632 }
3633
3634
3635 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3636                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3637                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3638                               gmx_wallcycle_t wcycle)
3639 {
3640     gmx_domdec_comm_t *comm;
3641     int  d;
3642     ivec npulse;
3643     
3644     comm = dd->comm;
3645
3646     /* Copy the old cell boundaries for the cg displacement check */
3647     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3648     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3649     
3650     if (comm->bDynLoadBal)
3651     {
3652         if (DDMASTER(dd))
3653         {
3654             check_box_size(dd,ddbox);
3655         }
3656         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3657     }
3658     else
3659     {
3660         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3661         realloc_comm_ind(dd,npulse);
3662     }
3663     
3664     if (debug)
3665     {
3666         for(d=0; d<DIM; d++)
3667         {
3668             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3669                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3670         }
3671     }
3672 }
3673
3674 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3675                                   gmx_ddbox_t *ddbox,
3676                                   rvec cell_ns_x0,rvec cell_ns_x1,
3677                                   gmx_large_int_t step)
3678 {
3679     gmx_domdec_comm_t *comm;
3680     int dim_ind,dim;
3681     
3682     comm = dd->comm;
3683
3684     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3685     {
3686         dim = dd->dim[dim_ind];
3687         
3688         /* Without PBC we don't have restrictions on the outer cells */
3689         if (!(dim >= ddbox->npbcdim && 
3690               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3691             comm->bDynLoadBal &&
3692             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3693             comm->cellsize_min[dim])
3694         {
3695             char buf[22];
3696             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3697                       gmx_step_str(step,buf),dim2char(dim),
3698                       comm->cell_x1[dim] - comm->cell_x0[dim],
3699                       ddbox->skew_fac[dim],
3700                       dd->comm->cellsize_min[dim],
3701                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3702         }
3703     }
3704     
3705     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3706     {
3707         /* Communicate the boundaries and update cell_ns_x0/1 */
3708         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3709         if (dd->bGridJump && dd->ndim > 1)
3710         {
3711             check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE);
3712         }
3713     }
3714 }
3715
3716 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3717 {
3718     if (YY < npbcdim)
3719     {
3720         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3721     }
3722     else
3723     {
3724         tcm[YY][XX] = 0;
3725     }
3726     if (ZZ < npbcdim)
3727     {
3728         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3729         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3730     }
3731     else
3732     {
3733         tcm[ZZ][XX] = 0;
3734         tcm[ZZ][YY] = 0;
3735     }
3736 }
3737
3738 static void check_screw_box(matrix box)
3739 {
3740     /* Mathematical limitation */
3741     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3742     {
3743         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3744     }
3745     
3746     /* Limitation due to the asymmetry of the eighth shell method */
3747     if (box[ZZ][YY] != 0)
3748     {
3749         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3750     }
3751 }
3752
3753 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3754                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3755                           gmx_domdec_t *dd)
3756 {
3757     gmx_domdec_master_t *ma;
3758     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3759     int  i,icg,j,k,k0,k1,d,npbcdim;
3760     matrix tcm;
3761     rvec box_size,cg_cm;
3762     ivec ind;
3763     real nrcg,inv_ncg,pos_d;
3764     atom_id *cgindex;
3765     gmx_bool bUnbounded,bScrew;
3766
3767     ma = dd->ma;
3768     
3769     if (tmp_ind == NULL)
3770     {
3771         snew(tmp_nalloc,dd->nnodes);
3772         snew(tmp_ind,dd->nnodes);
3773         for(i=0; i<dd->nnodes; i++)
3774         {
3775             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3776             snew(tmp_ind[i],tmp_nalloc[i]);
3777         }
3778     }
3779     
3780     /* Clear the count */
3781     for(i=0; i<dd->nnodes; i++)
3782     {
3783         ma->ncg[i] = 0;
3784         ma->nat[i] = 0;
3785     }
3786     
3787     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3788     
3789     cgindex = cgs->index;
3790     
3791     /* Compute the center of geometry for all charge groups */
3792     for(icg=0; icg<cgs->nr; icg++)
3793     {
3794         k0      = cgindex[icg];
3795         k1      = cgindex[icg+1];
3796         nrcg    = k1 - k0;
3797         if (nrcg == 1)
3798         {
3799             copy_rvec(pos[k0],cg_cm);
3800         }
3801         else
3802         {
3803             inv_ncg = 1.0/nrcg;
3804             
3805             clear_rvec(cg_cm);
3806             for(k=k0; (k<k1); k++)
3807             {
3808                 rvec_inc(cg_cm,pos[k]);
3809             }
3810             for(d=0; (d<DIM); d++)
3811             {
3812                 cg_cm[d] *= inv_ncg;
3813             }
3814         }
3815         /* Put the charge group in the box and determine the cell index */
3816         for(d=DIM-1; d>=0; d--) {
3817             pos_d = cg_cm[d];
3818             if (d < dd->npbcdim)
3819             {
3820                 bScrew = (dd->bScrewPBC && d == XX);
3821                 if (tric_dir[d] && dd->nc[d] > 1)
3822                 {
3823                     /* Use triclinic coordintates for this dimension */
3824                     for(j=d+1; j<DIM; j++)
3825                     {
3826                         pos_d += cg_cm[j]*tcm[j][d];
3827                     }
3828                 }
3829                 while(pos_d >= box[d][d])
3830                 {
3831                     pos_d -= box[d][d];
3832                     rvec_dec(cg_cm,box[d]);
3833                     if (bScrew)
3834                     {
3835                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3836                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3837                     }
3838                     for(k=k0; (k<k1); k++)
3839                     {
3840                         rvec_dec(pos[k],box[d]);
3841                         if (bScrew)
3842                         {
3843                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3844                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3845                         }
3846                     }
3847                 }
3848                 while(pos_d < 0)
3849                 {
3850                     pos_d += box[d][d];
3851                     rvec_inc(cg_cm,box[d]);
3852                     if (bScrew)
3853                     {
3854                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3855                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3856                     }
3857                     for(k=k0; (k<k1); k++)
3858                     {
3859                         rvec_inc(pos[k],box[d]);
3860                         if (bScrew) {
3861                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3862                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3863                         }
3864                     }
3865                 }
3866             }
3867             /* This could be done more efficiently */
3868             ind[d] = 0;
3869             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3870             {
3871                 ind[d]++;
3872             }
3873         }
3874         i = dd_index(dd->nc,ind);
3875         if (ma->ncg[i] == tmp_nalloc[i])
3876         {
3877             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3878             srenew(tmp_ind[i],tmp_nalloc[i]);
3879         }
3880         tmp_ind[i][ma->ncg[i]] = icg;
3881         ma->ncg[i]++;
3882         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3883     }
3884     
3885     k1 = 0;
3886     for(i=0; i<dd->nnodes; i++)
3887     {
3888         ma->index[i] = k1;
3889         for(k=0; k<ma->ncg[i]; k++)
3890         {
3891             ma->cg[k1++] = tmp_ind[i][k];
3892         }
3893     }
3894     ma->index[dd->nnodes] = k1;
3895     
3896     for(i=0; i<dd->nnodes; i++)
3897     {
3898         sfree(tmp_ind[i]);
3899     }
3900     sfree(tmp_ind);
3901     sfree(tmp_nalloc);
3902     
3903     if (fplog)
3904     {
3905         char buf[22];
3906         fprintf(fplog,"Charge group distribution at step %s:",
3907                 gmx_step_str(step,buf));
3908         for(i=0; i<dd->nnodes; i++)
3909         {
3910             fprintf(fplog," %d",ma->ncg[i]);
3911         }
3912         fprintf(fplog,"\n");
3913     }
3914 }
3915
3916 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3917                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3918                                 rvec pos[])
3919 {
3920     gmx_domdec_master_t *ma=NULL;
3921     ivec npulse;
3922     int  i,cg_gl;
3923     int  *ibuf,buf2[2] = { 0, 0 };
3924     gmx_bool bMaster = DDMASTER(dd);
3925     if (bMaster)
3926     {
3927         ma = dd->ma;
3928         
3929         if (dd->bScrewPBC)
3930         {
3931             check_screw_box(box);
3932         }
3933     
3934         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3935     
3936         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3937         for(i=0; i<dd->nnodes; i++)
3938         {
3939             ma->ibuf[2*i]   = ma->ncg[i];
3940             ma->ibuf[2*i+1] = ma->nat[i];
3941         }
3942         ibuf = ma->ibuf;
3943     }
3944     else
3945     {
3946         ibuf = NULL;
3947     }
3948     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3949     
3950     dd->ncg_home = buf2[0];
3951     dd->nat_home = buf2[1];
3952     dd->ncg_tot  = dd->ncg_home;
3953     dd->nat_tot  = dd->nat_home;
3954     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3955     {
3956         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3957         srenew(dd->index_gl,dd->cg_nalloc);
3958         srenew(dd->cgindex,dd->cg_nalloc+1);
3959     }
3960     if (bMaster)
3961     {
3962         for(i=0; i<dd->nnodes; i++)
3963         {
3964             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3965             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3966         }
3967     }
3968     
3969     dd_scatterv(dd,
3970                 DDMASTER(dd) ? ma->ibuf : NULL,
3971                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3972                 DDMASTER(dd) ? ma->cg : NULL,
3973                 dd->ncg_home*sizeof(int),dd->index_gl);
3974     
3975     /* Determine the home charge group sizes */
3976     dd->cgindex[0] = 0;
3977     for(i=0; i<dd->ncg_home; i++)
3978     {
3979         cg_gl = dd->index_gl[i];
3980         dd->cgindex[i+1] =
3981             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3982     }
3983     
3984     if (debug)
3985     {
3986         fprintf(debug,"Home charge groups:\n");
3987         for(i=0; i<dd->ncg_home; i++)
3988         {
3989             fprintf(debug," %d",dd->index_gl[i]);
3990             if (i % 10 == 9) 
3991                 fprintf(debug,"\n");
3992         }
3993         fprintf(debug,"\n");
3994     }
3995 }
3996
3997 static int compact_and_copy_vec_at(int ncg,int *move,
3998                                    int *cgindex,
3999                                    int nvec,int vec,
4000                                    rvec *src,gmx_domdec_comm_t *comm,
4001                                    gmx_bool bCompact)
4002 {
4003     int m,icg,i,i0,i1,nrcg;
4004     int home_pos;
4005     int pos_vec[DIM*2];
4006     
4007     home_pos = 0;
4008
4009     for(m=0; m<DIM*2; m++)
4010     {
4011         pos_vec[m] = 0;
4012     }
4013     
4014     i0 = 0;
4015     for(icg=0; icg<ncg; icg++)
4016     {
4017         i1 = cgindex[icg+1];
4018         m = move[icg];
4019         if (m == -1)
4020         {
4021             if (bCompact)
4022             {
4023                 /* Compact the home array in place */
4024                 for(i=i0; i<i1; i++)
4025                 {
4026                     copy_rvec(src[i],src[home_pos++]);
4027                 }
4028             }
4029         }
4030         else
4031         {
4032             /* Copy to the communication buffer */
4033             nrcg = i1 - i0;
4034             pos_vec[m] += 1 + vec*nrcg;
4035             for(i=i0; i<i1; i++)
4036             {
4037                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
4038             }
4039             pos_vec[m] += (nvec - vec - 1)*nrcg;
4040         }
4041         if (!bCompact)
4042         {
4043             home_pos += i1 - i0;
4044         }
4045         i0 = i1;
4046     }
4047     
4048     return home_pos;
4049 }
4050
4051 static int compact_and_copy_vec_cg(int ncg,int *move,
4052                                    int *cgindex,
4053                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
4054                                    gmx_bool bCompact)
4055 {
4056     int m,icg,i0,i1,nrcg;
4057     int home_pos;
4058     int pos_vec[DIM*2];
4059     
4060     home_pos = 0;
4061     
4062     for(m=0; m<DIM*2; m++)
4063     {
4064         pos_vec[m] = 0;
4065     }
4066     
4067     i0 = 0;
4068     for(icg=0; icg<ncg; icg++)
4069     {
4070         i1 = cgindex[icg+1];
4071         m = move[icg];
4072         if (m == -1)
4073         {
4074             if (bCompact)
4075             {
4076                 /* Compact the home array in place */
4077                 copy_rvec(src[icg],src[home_pos++]);
4078             }
4079         }
4080         else
4081         {
4082             nrcg = i1 - i0;
4083             /* Copy to the communication buffer */
4084             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
4085             pos_vec[m] += 1 + nrcg*nvec;
4086         }
4087         i0 = i1;
4088     }
4089     if (!bCompact)
4090     {
4091         home_pos = ncg;
4092     }
4093     
4094     return home_pos;
4095 }
4096
4097 static int compact_ind(int ncg,int *move,
4098                        int *index_gl,int *cgindex,
4099                        int *gatindex,
4100                        gmx_ga2la_t ga2la,char *bLocalCG,
4101                        int *cginfo)
4102 {
4103     int cg,nat,a0,a1,a,a_gl;
4104     int home_pos;
4105
4106     home_pos = 0;
4107     nat = 0;
4108     for(cg=0; cg<ncg; cg++)
4109     {
4110         a0 = cgindex[cg];
4111         a1 = cgindex[cg+1];
4112         if (move[cg] == -1)
4113         {
4114             /* Compact the home arrays in place.
4115              * Anything that can be done here avoids access to global arrays.
4116              */
4117             cgindex[home_pos] = nat;
4118             for(a=a0; a<a1; a++)
4119             {
4120                 a_gl = gatindex[a];
4121                 gatindex[nat] = a_gl;
4122                 /* The cell number stays 0, so we don't need to set it */
4123                 ga2la_change_la(ga2la,a_gl,nat);
4124                 nat++;
4125             }
4126             index_gl[home_pos] = index_gl[cg];
4127             cginfo[home_pos]   = cginfo[cg];
4128             /* The charge group remains local, so bLocalCG does not change */
4129             home_pos++;
4130         }
4131         else
4132         {
4133             /* Clear the global indices */
4134             for(a=a0; a<a1; a++)
4135             {
4136                 ga2la_del(ga2la,gatindex[a]);
4137             }
4138             if (bLocalCG)
4139             {
4140                 bLocalCG[index_gl[cg]] = FALSE;
4141             }
4142         }
4143     }
4144     cgindex[home_pos] = nat;
4145     
4146     return home_pos;
4147 }
4148
4149 static void clear_and_mark_ind(int ncg,int *move,
4150                                int *index_gl,int *cgindex,int *gatindex,
4151                                gmx_ga2la_t ga2la,char *bLocalCG,
4152                                int *cell_index)
4153 {
4154     int cg,a0,a1,a;
4155     
4156     for(cg=0; cg<ncg; cg++)
4157     {
4158         if (move[cg] >= 0)
4159         {
4160             a0 = cgindex[cg];
4161             a1 = cgindex[cg+1];
4162             /* Clear the global indices */
4163             for(a=a0; a<a1; a++)
4164             {
4165                 ga2la_del(ga2la,gatindex[a]);
4166             }
4167             if (bLocalCG)
4168             {
4169                 bLocalCG[index_gl[cg]] = FALSE;
4170             }
4171             /* Signal that this cg has moved using the ns cell index.
4172              * Here we set it to -1. fill_grid will change it
4173              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4174              */
4175             cell_index[cg] = -1;
4176         }
4177     }
4178 }
4179
4180 static void print_cg_move(FILE *fplog,
4181                           gmx_domdec_t *dd,
4182                           gmx_large_int_t step,int cg,int dim,int dir,
4183                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4184                           rvec cm_old,rvec cm_new,real pos_d)
4185 {
4186     gmx_domdec_comm_t *comm;
4187     char buf[22];
4188
4189     comm = dd->comm;
4190
4191     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4192     if (bHaveLimitdAndCMOld)
4193     {
4194         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4195                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4196     }
4197     else
4198     {
4199         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4200                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4201     }
4202     fprintf(fplog,"distance out of cell %f\n",
4203             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4204     if (bHaveLimitdAndCMOld)
4205     {
4206         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4207                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4208     }
4209     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4210             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4211     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4212             dim2char(dim),
4213             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4214     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4215             dim2char(dim),
4216             comm->cell_x0[dim],comm->cell_x1[dim]);
4217 }
4218
4219 static void cg_move_error(FILE *fplog,
4220                           gmx_domdec_t *dd,
4221                           gmx_large_int_t step,int cg,int dim,int dir,
4222                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4223                           rvec cm_old,rvec cm_new,real pos_d)
4224 {
4225     if (fplog)
4226     {
4227         print_cg_move(fplog, dd,step,cg,dim,dir,
4228                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4229     }
4230     print_cg_move(stderr,dd,step,cg,dim,dir,
4231                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4232     gmx_fatal(FARGS,
4233               "A charge group moved too far between two domain decomposition steps\n"
4234               "This usually means that your system is not well equilibrated");
4235 }
4236
4237 static void rotate_state_atom(t_state *state,int a)
4238 {
4239     int est;
4240
4241     for(est=0; est<estNR; est++)
4242     {
4243         if (EST_DISTR(est) && (state->flags & (1<<est))) {
4244             switch (est) {
4245             case estX:
4246                 /* Rotate the complete state; for a rectangular box only */
4247                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4248                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4249                 break;
4250             case estV:
4251                 state->v[a][YY] = -state->v[a][YY];
4252                 state->v[a][ZZ] = -state->v[a][ZZ];
4253                 break;
4254             case estSDX:
4255                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4256                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4257                 break;
4258             case estCGP:
4259                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4260                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4261                 break;
4262             case estDISRE_INITF:
4263             case estDISRE_RM3TAV:
4264             case estORIRE_INITF:
4265             case estORIRE_DTAV:
4266                 /* These are distances, so not affected by rotation */
4267                 break;
4268             default:
4269                 gmx_incons("Unknown state entry encountered in rotate_state_atom");            
4270             }
4271         }
4272     }
4273 }
4274
4275 static int *get_moved(gmx_domdec_comm_t *comm,int natoms)
4276 {
4277     if (natoms > comm->moved_nalloc)
4278     {
4279         /* Contents should be preserved here */
4280         comm->moved_nalloc = over_alloc_dd(natoms);
4281         srenew(comm->moved,comm->moved_nalloc);
4282     }
4283
4284     return comm->moved;
4285 }
4286
4287 static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
4288                          gmx_domdec_t *dd,
4289                          t_state *state,
4290                          ivec tric_dir,matrix tcm,
4291                          rvec cell_x0,rvec cell_x1,
4292                          rvec limitd,rvec limit0,rvec limit1,
4293                          const int *cgindex,
4294                          int cg_start,int cg_end,
4295                          rvec *cg_cm,
4296                          int *move)
4297 {
4298     int  npbcdim;
4299     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4300     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4301     int  flag;
4302     gmx_bool bScrew;
4303     ivec dev;
4304     real inv_ncg,pos_d;
4305     rvec cm_new;
4306
4307     npbcdim = dd->npbcdim;
4308
4309     for(cg=cg_start; cg<cg_end; cg++)
4310     {
4311         k0   = cgindex[cg];
4312         k1   = cgindex[cg+1];
4313         nrcg = k1 - k0;
4314         if (nrcg == 1)
4315         {
4316             copy_rvec(state->x[k0],cm_new);
4317         }
4318         else
4319         {
4320             inv_ncg = 1.0/nrcg;
4321             
4322             clear_rvec(cm_new);
4323             for(k=k0; (k<k1); k++)
4324             {
4325                 rvec_inc(cm_new,state->x[k]);
4326             }
4327             for(d=0; (d<DIM); d++)
4328             {
4329                 cm_new[d] = inv_ncg*cm_new[d];
4330             }
4331         }
4332         
4333         clear_ivec(dev);
4334         /* Do pbc and check DD cell boundary crossings */
4335         for(d=DIM-1; d>=0; d--)
4336         {
4337             if (dd->nc[d] > 1)
4338             {
4339                 bScrew = (dd->bScrewPBC && d == XX);
4340                 /* Determine the location of this cg in lattice coordinates */
4341                 pos_d = cm_new[d];
4342                 if (tric_dir[d])
4343                 {
4344                     for(d2=d+1; d2<DIM; d2++)
4345                     {
4346                         pos_d += cm_new[d2]*tcm[d2][d];
4347                     }
4348                 }
4349                 /* Put the charge group in the triclinic unit-cell */
4350                 if (pos_d >= cell_x1[d])
4351                 {
4352                     if (pos_d >= limit1[d])
4353                     {
4354                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4355                                       cg_cm[cg],cm_new,pos_d);
4356                     }
4357                     dev[d] = 1;
4358                     if (dd->ci[d] == dd->nc[d] - 1)
4359                     {
4360                         rvec_dec(cm_new,state->box[d]);
4361                         if (bScrew)
4362                         {
4363                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4364                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4365                         }
4366                         for(k=k0; (k<k1); k++)
4367                         {
4368                             rvec_dec(state->x[k],state->box[d]);
4369                             if (bScrew)
4370                             {
4371                                 rotate_state_atom(state,k);
4372                             }
4373                         }
4374                     }
4375                 }
4376                 else if (pos_d < cell_x0[d])
4377                 {
4378                     if (pos_d < limit0[d])
4379                     {
4380                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4381                                       cg_cm[cg],cm_new,pos_d);
4382                     }
4383                     dev[d] = -1;
4384                     if (dd->ci[d] == 0)
4385                     {
4386                         rvec_inc(cm_new,state->box[d]);
4387                         if (bScrew)
4388                         {
4389                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4390                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4391                         }
4392                         for(k=k0; (k<k1); k++)
4393                         {
4394                             rvec_inc(state->x[k],state->box[d]);
4395                             if (bScrew)
4396                             {
4397                                 rotate_state_atom(state,k);
4398                             }
4399                         }
4400                     }
4401                 }
4402             }
4403             else if (d < npbcdim)
4404             {
4405                 /* Put the charge group in the rectangular unit-cell */
4406                 while (cm_new[d] >= state->box[d][d])
4407                 {
4408                     rvec_dec(cm_new,state->box[d]);
4409                     for(k=k0; (k<k1); k++)
4410                     {
4411                         rvec_dec(state->x[k],state->box[d]);
4412                     }
4413                 }
4414                 while (cm_new[d] < 0)
4415                 {
4416                     rvec_inc(cm_new,state->box[d]);
4417                     for(k=k0; (k<k1); k++)
4418                     {
4419                         rvec_inc(state->x[k],state->box[d]);
4420                     }
4421                 }
4422             }
4423         }
4424     
4425         copy_rvec(cm_new,cg_cm[cg]);
4426         
4427         /* Determine where this cg should go */
4428         flag = 0;
4429         mc = -1;
4430         for(d=0; d<dd->ndim; d++)
4431         {
4432             dim = dd->dim[d];
4433             if (dev[dim] == 1)
4434             {
4435                 flag |= DD_FLAG_FW(d);
4436                 if (mc == -1)
4437                 {
4438                     mc = d*2;
4439                 }
4440             }
4441             else if (dev[dim] == -1)
4442             {
4443                 flag |= DD_FLAG_BW(d);
4444                 if (mc == -1) {
4445                     if (dd->nc[dim] > 2)
4446                     {
4447                         mc = d*2 + 1;
4448                     }
4449                     else
4450                     {
4451                         mc = d*2;
4452                     }
4453                 }
4454             }
4455         }
4456         /* Temporarily store the flag in move */
4457         move[cg] = mc + flag;
4458     }
4459 }
4460
4461 static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4462                                gmx_domdec_t *dd,ivec tric_dir,
4463                                t_state *state,rvec **f,
4464                                t_forcerec *fr,t_mdatoms *md,
4465                                gmx_bool bCompact,
4466                                t_nrnb *nrnb,
4467                                int *ncg_stay_home,
4468                                int *ncg_moved)
4469 {
4470     int  *move;
4471     int  npbcdim;
4472     int  ncg[DIM*2],nat[DIM*2];
4473     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4474     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4475     int  sbuf[2],rbuf[2];
4476     int  home_pos_cg,home_pos_at,buf_pos;
4477     int  flag;
4478     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4479     gmx_bool bScrew;
4480     ivec dev;
4481     real inv_ncg,pos_d;
4482     matrix tcm;
4483     rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4484     atom_id *cgindex;
4485     cginfo_mb_t *cginfo_mb;
4486     gmx_domdec_comm_t *comm;
4487     int  *moved;
4488     int  nthread,thread;
4489     
4490     if (dd->bScrewPBC)
4491     {
4492         check_screw_box(state->box);
4493     }
4494     
4495     comm  = dd->comm;
4496     if (fr->cutoff_scheme == ecutsGROUP)
4497     {
4498         cg_cm = fr->cg_cm;
4499     }
4500     
4501     for(i=0; i<estNR; i++)
4502     {
4503         if (EST_DISTR(i))
4504         {
4505             switch (i)
4506             {
4507             case estX:   /* Always present */            break;
4508             case estV:   bV   = (state->flags & (1<<i)); break;
4509             case estSDX: bSDX = (state->flags & (1<<i)); break;
4510             case estCGP: bCGP = (state->flags & (1<<i)); break;
4511             case estLD_RNG:
4512             case estLD_RNGI:
4513             case estDISRE_INITF:
4514             case estDISRE_RM3TAV:
4515             case estORIRE_INITF:
4516             case estORIRE_DTAV:
4517                 /* No processing required */
4518                 break;
4519             default:
4520             gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4521             }
4522         }
4523     }
4524     
4525     if (dd->ncg_tot > comm->nalloc_int)
4526     {
4527         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4528         srenew(comm->buf_int,comm->nalloc_int);
4529     }
4530     move = comm->buf_int;
4531     
4532     /* Clear the count */
4533     for(c=0; c<dd->ndim*2; c++)
4534     {
4535         ncg[c] = 0;
4536         nat[c] = 0;
4537     }
4538
4539     npbcdim = dd->npbcdim;
4540
4541     for(d=0; (d<DIM); d++)
4542     {
4543         limitd[d] = dd->comm->cellsize_min[d];
4544         if (d >= npbcdim && dd->ci[d] == 0)
4545         {
4546             cell_x0[d] = -GMX_FLOAT_MAX;
4547         }
4548         else
4549         {
4550             cell_x0[d] = comm->cell_x0[d];
4551         }
4552         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4553         {
4554             cell_x1[d] = GMX_FLOAT_MAX;
4555         }
4556         else
4557         {
4558             cell_x1[d] = comm->cell_x1[d];
4559         }
4560         if (d < npbcdim)
4561         {
4562             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4563             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4564         }
4565         else
4566         {
4567             /* We check after communication if a charge group moved
4568              * more than one cell. Set the pre-comm check limit to float_max.
4569              */
4570             limit0[d] = -GMX_FLOAT_MAX;
4571             limit1[d] =  GMX_FLOAT_MAX;
4572         }
4573     }
4574     
4575     make_tric_corr_matrix(npbcdim,state->box,tcm);
4576     
4577     cgindex = dd->cgindex;
4578
4579     nthread = gmx_omp_nthreads_get(emntDomdec);
4580
4581     /* Compute the center of geometry for all home charge groups
4582      * and put them in the box and determine where they should go.
4583      */
4584 #pragma omp parallel for num_threads(nthread) schedule(static)
4585     for(thread=0; thread<nthread; thread++)
4586     {
4587         calc_cg_move(fplog,step,dd,state,tric_dir,tcm,
4588                      cell_x0,cell_x1,limitd,limit0,limit1,
4589                      cgindex,
4590                      ( thread   *dd->ncg_home)/nthread,
4591                      ((thread+1)*dd->ncg_home)/nthread,
4592                      fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x,
4593                      move);
4594     }
4595
4596     for(cg=0; cg<dd->ncg_home; cg++)
4597     {
4598         if (move[cg] >= 0)
4599         {
4600             mc = move[cg];
4601             flag     = mc & ~DD_FLAG_NRCG;
4602             mc       = mc & DD_FLAG_NRCG;
4603             move[cg] = mc;
4604
4605             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4606             {
4607                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4608                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4609             }
4610             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4611             /* We store the cg size in the lower 16 bits
4612              * and the place where the charge group should go
4613              * in the next 6 bits. This saves some communication volume.
4614              */
4615             nrcg = cgindex[cg+1] - cgindex[cg];
4616             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4617             ncg[mc] += 1;
4618             nat[mc] += nrcg;
4619         }
4620     }
4621     
4622     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4623     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4624
4625     *ncg_moved = 0;
4626     for(i=0; i<dd->ndim*2; i++)
4627     {
4628         *ncg_moved += ncg[i];
4629     }
4630     
4631     nvec = 1;
4632     if (bV)
4633     {
4634         nvec++;
4635     }
4636     if (bSDX)
4637     {
4638         nvec++;
4639     }
4640     if (bCGP)
4641     {
4642         nvec++;
4643     }
4644     
4645     /* Make sure the communication buffers are large enough */
4646     for(mc=0; mc<dd->ndim*2; mc++)
4647     {
4648         nvr = ncg[mc] + nat[mc]*nvec;
4649         if (nvr > comm->cgcm_state_nalloc[mc])
4650         {
4651             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4652             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4653         }
4654     }
4655     
4656     switch (fr->cutoff_scheme)
4657     {
4658     case ecutsGROUP:
4659         /* Recalculating cg_cm might be cheaper than communicating,
4660          * but that could give rise to rounding issues.
4661          */
4662         home_pos_cg =
4663             compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4664                                     nvec,cg_cm,comm,bCompact);
4665     break;
4666     case ecutsVERLET:
4667         /* Without charge groups we send the moved atom coordinates
4668          * over twice. This is so the code below can be used without
4669          * many conditionals for both for with and without charge groups.
4670          */
4671         home_pos_cg =
4672             compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4673                                     nvec,state->x,comm,FALSE);
4674         if (bCompact)
4675         {
4676             home_pos_cg -= *ncg_moved;
4677         }
4678         break;
4679     default:
4680         gmx_incons("unimplemented");
4681         home_pos_cg = 0;
4682     }
4683     
4684     vec = 0;
4685     home_pos_at =
4686         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4687                                 nvec,vec++,state->x,comm,bCompact);
4688     if (bV)
4689     {
4690         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4691                                 nvec,vec++,state->v,comm,bCompact);
4692     }
4693     if (bSDX)
4694     {
4695         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4696                                 nvec,vec++,state->sd_X,comm,bCompact);
4697     }
4698     if (bCGP)
4699     {
4700         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4701                                 nvec,vec++,state->cg_p,comm,bCompact);
4702     }
4703     
4704     if (bCompact)
4705     {
4706         compact_ind(dd->ncg_home,move,
4707                     dd->index_gl,dd->cgindex,dd->gatindex,
4708                     dd->ga2la,comm->bLocalCG,
4709                     fr->cginfo);
4710     }
4711     else
4712     {
4713         if (fr->cutoff_scheme == ecutsVERLET)
4714         {
4715             moved = get_moved(comm,dd->ncg_home);
4716
4717             for(k=0; k<dd->ncg_home; k++)
4718             {
4719                 moved[k] = 0;
4720             }
4721         }
4722         else
4723         {
4724             moved = fr->ns.grid->cell_index;
4725         }
4726
4727         clear_and_mark_ind(dd->ncg_home,move,
4728                            dd->index_gl,dd->cgindex,dd->gatindex,
4729                            dd->ga2la,comm->bLocalCG,
4730                            moved);
4731     }
4732     
4733     cginfo_mb = fr->cginfo_mb;
4734
4735     *ncg_stay_home = home_pos_cg;
4736     for(d=0; d<dd->ndim; d++)
4737     {
4738         dim = dd->dim[d];
4739         ncg_recv = 0;
4740         nat_recv = 0;
4741         nvr      = 0;
4742         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4743         {
4744             cdd = d*2 + dir;
4745             /* Communicate the cg and atom counts */
4746             sbuf[0] = ncg[cdd];
4747             sbuf[1] = nat[cdd];
4748             if (debug)
4749             {
4750                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4751                         d,dir,sbuf[0],sbuf[1]);
4752             }
4753             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4754             
4755             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4756             {
4757                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4758                 srenew(comm->buf_int,comm->nalloc_int);
4759             }
4760             
4761             /* Communicate the charge group indices, sizes and flags */
4762             dd_sendrecv_int(dd, d, dir,
4763                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4764                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4765             
4766             nvs = ncg[cdd] + nat[cdd]*nvec;
4767             i   = rbuf[0]  + rbuf[1] *nvec;
4768             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4769             
4770             /* Communicate cgcm and state */
4771             dd_sendrecv_rvec(dd, d, dir,
4772                              comm->cgcm_state[cdd], nvs,
4773                              comm->vbuf.v+nvr, i);
4774             ncg_recv += rbuf[0];
4775             nat_recv += rbuf[1];
4776             nvr      += i;
4777         }
4778         
4779         /* Process the received charge groups */
4780         buf_pos = 0;
4781         for(cg=0; cg<ncg_recv; cg++)
4782         {
4783             flag = comm->buf_int[cg*DD_CGIBS+1];
4784
4785             if (dim >= npbcdim && dd->nc[dim] > 2)
4786             {
4787                 /* No pbc in this dim and more than one domain boundary.
4788                  * We do a separate check if a charge group didn't move too far.
4789                  */
4790                 if (((flag & DD_FLAG_FW(d)) &&
4791                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4792                     ((flag & DD_FLAG_BW(d)) &&
4793                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4794                 {
4795                     cg_move_error(fplog,dd,step,cg,dim,
4796                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4797                                    FALSE,0,
4798                                    comm->vbuf.v[buf_pos],
4799                                    comm->vbuf.v[buf_pos],
4800                                    comm->vbuf.v[buf_pos][dim]);
4801                 }
4802             }
4803
4804             mc = -1;
4805             if (d < dd->ndim-1)
4806             {
4807                 /* Check which direction this cg should go */
4808                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4809                 {
4810                     if (dd->bGridJump)
4811                     {
4812                         /* The cell boundaries for dimension d2 are not equal
4813                          * for each cell row of the lower dimension(s),
4814                          * therefore we might need to redetermine where
4815                          * this cg should go.
4816                          */
4817                         dim2 = dd->dim[d2];
4818                         /* If this cg crosses the box boundary in dimension d2
4819                          * we can use the communicated flag, so we do not
4820                          * have to worry about pbc.
4821                          */
4822                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4823                                (flag & DD_FLAG_FW(d2))) ||
4824                               (dd->ci[dim2] == 0 &&
4825                                (flag & DD_FLAG_BW(d2)))))
4826                         {
4827                             /* Clear the two flags for this dimension */
4828                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4829                             /* Determine the location of this cg
4830                              * in lattice coordinates
4831                              */
4832                             pos_d = comm->vbuf.v[buf_pos][dim2];
4833                             if (tric_dir[dim2])
4834                             {
4835                                 for(d3=dim2+1; d3<DIM; d3++)
4836                                 {
4837                                     pos_d +=
4838                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4839                                 }
4840                             }
4841                             /* Check of we are not at the box edge.
4842                              * pbc is only handled in the first step above,
4843                              * but this check could move over pbc while
4844                              * the first step did not due to different rounding.
4845                              */
4846                             if (pos_d >= cell_x1[dim2] &&
4847                                 dd->ci[dim2] != dd->nc[dim2]-1)
4848                             {
4849                                 flag |= DD_FLAG_FW(d2);
4850                             }
4851                             else if (pos_d < cell_x0[dim2] &&
4852                                      dd->ci[dim2] != 0)
4853                             {
4854                                 flag |= DD_FLAG_BW(d2);
4855                             }
4856                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4857                         }
4858                     }
4859                     /* Set to which neighboring cell this cg should go */
4860                     if (flag & DD_FLAG_FW(d2))
4861                     {
4862                         mc = d2*2;
4863                     }
4864                     else if (flag & DD_FLAG_BW(d2))
4865                     {
4866                         if (dd->nc[dd->dim[d2]] > 2)
4867                         {
4868                             mc = d2*2+1;
4869                         }
4870                         else
4871                         {
4872                             mc = d2*2;
4873                         }
4874                     }
4875                 }
4876             }
4877             
4878             nrcg = flag & DD_FLAG_NRCG;
4879             if (mc == -1)
4880             {
4881                 if (home_pos_cg+1 > dd->cg_nalloc)
4882                 {
4883                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4884                     srenew(dd->index_gl,dd->cg_nalloc);
4885                     srenew(dd->cgindex,dd->cg_nalloc+1);
4886                 }
4887                 /* Set the global charge group index and size */
4888                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4889                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4890                 /* Copy the state from the buffer */
4891                 dd_check_alloc_ncg(fr,state,f,home_pos_cg+1);
4892                 if (fr->cutoff_scheme == ecutsGROUP)
4893                 {
4894                     cg_cm = fr->cg_cm;
4895                     copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]);
4896                 }
4897                 buf_pos++;
4898
4899                 /* Set the cginfo */
4900                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4901                                                    dd->index_gl[home_pos_cg]);
4902                 if (comm->bLocalCG)
4903                 {
4904                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4905                 }
4906
4907                 if (home_pos_at+nrcg > state->nalloc)
4908                 {
4909                     dd_realloc_state(state,f,home_pos_at+nrcg);
4910                 }
4911                 for(i=0; i<nrcg; i++)
4912                 {
4913                     copy_rvec(comm->vbuf.v[buf_pos++],
4914                               state->x[home_pos_at+i]);
4915                 }
4916                 if (bV)
4917                 {
4918                     for(i=0; i<nrcg; i++)
4919                     {
4920                         copy_rvec(comm->vbuf.v[buf_pos++],
4921                                   state->v[home_pos_at+i]);
4922                     }
4923                 }
4924                 if (bSDX)
4925                 {
4926                     for(i=0; i<nrcg; i++)
4927                     {
4928                         copy_rvec(comm->vbuf.v[buf_pos++],
4929                                   state->sd_X[home_pos_at+i]);
4930                     }
4931                 }
4932                 if (bCGP)
4933                 {
4934                     for(i=0; i<nrcg; i++)
4935                     {
4936                         copy_rvec(comm->vbuf.v[buf_pos++],
4937                                   state->cg_p[home_pos_at+i]);
4938                     }
4939                 }
4940                 home_pos_cg += 1;
4941                 home_pos_at += nrcg;
4942             }
4943             else
4944             {
4945                 /* Reallocate the buffers if necessary  */
4946                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4947                 {
4948                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4949                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4950                 }
4951                 nvr = ncg[mc] + nat[mc]*nvec;
4952                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4953                 {
4954                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4955                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4956                 }
4957                 /* Copy from the receive to the send buffers */
4958                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4959                        comm->buf_int + cg*DD_CGIBS,
4960                        DD_CGIBS*sizeof(int));
4961                 memcpy(comm->cgcm_state[mc][nvr],
4962                        comm->vbuf.v[buf_pos],
4963                        (1+nrcg*nvec)*sizeof(rvec));
4964                 buf_pos += 1 + nrcg*nvec;
4965                 ncg[mc] += 1;
4966                 nat[mc] += nrcg;
4967             }
4968         }
4969     }
4970     
4971     /* With sorting (!bCompact) the indices are now only partially up to date
4972      * and ncg_home and nat_home are not the real count, since there are
4973      * "holes" in the arrays for the charge groups that moved to neighbors.
4974      */
4975     if (fr->cutoff_scheme == ecutsVERLET)
4976     {
4977         moved = get_moved(comm,home_pos_cg);
4978
4979         for(i=dd->ncg_home; i<home_pos_cg; i++)
4980         {
4981             moved[i] = 0;
4982         }
4983     }
4984     dd->ncg_home = home_pos_cg;
4985     dd->nat_home = home_pos_at;
4986
4987     if (debug)
4988     {
4989         fprintf(debug,
4990                 "Finished repartitioning: cgs moved out %d, new home %d\n",
4991                 *ncg_moved,dd->ncg_home-*ncg_moved);
4992                 
4993     }
4994 }
4995
4996 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4997 {
4998     dd->comm->cycl[ddCycl] += cycles;
4999     dd->comm->cycl_n[ddCycl]++;
5000     if (cycles > dd->comm->cycl_max[ddCycl])
5001     {
5002         dd->comm->cycl_max[ddCycl] = cycles;
5003     }
5004 }
5005
5006 static double force_flop_count(t_nrnb *nrnb)
5007 {
5008     int i;
5009     double sum;
5010     const char *name;
5011
5012     sum = 0;
5013     for(i=0; i<eNR_NBKERNEL_FREE_ENERGY; i++)
5014     {
5015         /* To get closer to the real timings, we half the count
5016          * for the normal loops and again half it for water loops.
5017          */
5018         name = nrnb_str(i);
5019         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5020         {
5021             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5022         }
5023         else
5024         {
5025             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5026         }
5027     }
5028     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
5029     {
5030         name = nrnb_str(i);
5031         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5032         sum += nrnb->n[i]*cost_nrnb(i);
5033     }
5034     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
5035     {
5036         sum += nrnb->n[i]*cost_nrnb(i);
5037     }
5038
5039     return sum;
5040 }
5041
5042 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
5043 {
5044     if (dd->comm->eFlop)
5045     {
5046         dd->comm->flop -= force_flop_count(nrnb);
5047     }
5048 }
5049 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
5050 {
5051     if (dd->comm->eFlop)
5052     {
5053         dd->comm->flop += force_flop_count(nrnb);
5054         dd->comm->flop_n++;
5055     }
5056 }  
5057
5058 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5059 {
5060     int i;
5061     
5062     for(i=0; i<ddCyclNr; i++)
5063     {
5064         dd->comm->cycl[i] = 0;
5065         dd->comm->cycl_n[i] = 0;
5066         dd->comm->cycl_max[i] = 0;
5067     }
5068     dd->comm->flop = 0;
5069     dd->comm->flop_n = 0;
5070 }
5071
5072 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
5073 {
5074     gmx_domdec_comm_t *comm;
5075     gmx_domdec_load_t *load;
5076     gmx_domdec_root_t *root=NULL;
5077     int  d,dim,cid,i,pos;
5078     float cell_frac=0,sbuf[DD_NLOAD_MAX];
5079     gmx_bool bSepPME;
5080     
5081     if (debug)
5082     {
5083         fprintf(debug,"get_load_distribution start\n");
5084     }
5085
5086     wallcycle_start(wcycle,ewcDDCOMMLOAD);
5087     
5088     comm = dd->comm;
5089     
5090     bSepPME = (dd->pme_nodeid >= 0);
5091     
5092     for(d=dd->ndim-1; d>=0; d--)
5093     {
5094         dim = dd->dim[d];
5095         /* Check if we participate in the communication in this dimension */
5096         if (d == dd->ndim-1 || 
5097             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
5098         {
5099             load = &comm->load[d];
5100             if (dd->bGridJump)
5101             {
5102                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5103             }
5104             pos = 0;
5105             if (d == dd->ndim-1)
5106             {
5107                 sbuf[pos++] = dd_force_load(comm);
5108                 sbuf[pos++] = sbuf[0];
5109                 if (dd->bGridJump)
5110                 {
5111                     sbuf[pos++] = sbuf[0];
5112                     sbuf[pos++] = cell_frac;
5113                     if (d > 0)
5114                     {
5115                         sbuf[pos++] = comm->cell_f_max0[d];
5116                         sbuf[pos++] = comm->cell_f_min1[d];
5117                     }
5118                 }
5119                 if (bSepPME)
5120                 {
5121                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5122                     sbuf[pos++] = comm->cycl[ddCyclPME];
5123                 }
5124             }
5125             else
5126             {
5127                 sbuf[pos++] = comm->load[d+1].sum;
5128                 sbuf[pos++] = comm->load[d+1].max;
5129                 if (dd->bGridJump)
5130                 {
5131                     sbuf[pos++] = comm->load[d+1].sum_m;
5132                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5133                     sbuf[pos++] = comm->load[d+1].flags;
5134                     if (d > 0)
5135                     {
5136                         sbuf[pos++] = comm->cell_f_max0[d];
5137                         sbuf[pos++] = comm->cell_f_min1[d];
5138                     }
5139                 }
5140                 if (bSepPME)
5141                 {
5142                     sbuf[pos++] = comm->load[d+1].mdf;
5143                     sbuf[pos++] = comm->load[d+1].pme;
5144                 }
5145             }
5146             load->nload = pos;
5147             /* Communicate a row in DD direction d.
5148              * The communicators are setup such that the root always has rank 0.
5149              */
5150 #ifdef GMX_MPI
5151             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
5152                        load->load,load->nload*sizeof(float),MPI_BYTE,
5153                        0,comm->mpi_comm_load[d]);
5154 #endif
5155             if (dd->ci[dim] == dd->master_ci[dim])
5156             {
5157                 /* We are the root, process this row */
5158                 if (comm->bDynLoadBal)
5159                 {
5160                     root = comm->root[d];
5161                 }
5162                 load->sum = 0;
5163                 load->max = 0;
5164                 load->sum_m = 0;
5165                 load->cvol_min = 1;
5166                 load->flags = 0;
5167                 load->mdf = 0;
5168                 load->pme = 0;
5169                 pos = 0;
5170                 for(i=0; i<dd->nc[dim]; i++)
5171                 {
5172                     load->sum += load->load[pos++];
5173                     load->max = max(load->max,load->load[pos]);
5174                     pos++;
5175                     if (dd->bGridJump)
5176                     {
5177                         if (root->bLimited)
5178                         {
5179                             /* This direction could not be load balanced properly,
5180                              * therefore we need to use the maximum iso the average load.
5181                              */
5182                             load->sum_m = max(load->sum_m,load->load[pos]);
5183                         }
5184                         else
5185                         {
5186                             load->sum_m += load->load[pos];
5187                         }
5188                         pos++;
5189                         load->cvol_min = min(load->cvol_min,load->load[pos]);
5190                         pos++;
5191                         if (d < dd->ndim-1)
5192                         {
5193                             load->flags = (int)(load->load[pos++] + 0.5);
5194                         }
5195                         if (d > 0)
5196                         {
5197                             root->cell_f_max0[i] = load->load[pos++];
5198                             root->cell_f_min1[i] = load->load[pos++];
5199                         }
5200                     }
5201                     if (bSepPME)
5202                     {
5203                         load->mdf = max(load->mdf,load->load[pos]);
5204                         pos++;
5205                         load->pme = max(load->pme,load->load[pos]);
5206                         pos++;
5207                     }
5208                 }
5209                 if (comm->bDynLoadBal && root->bLimited)
5210                 {
5211                     load->sum_m *= dd->nc[dim];
5212                     load->flags |= (1<<d);
5213                 }
5214             }
5215         }
5216     }
5217
5218     if (DDMASTER(dd))
5219     {
5220         comm->nload      += dd_load_count(comm);
5221         comm->load_step  += comm->cycl[ddCyclStep];
5222         comm->load_sum   += comm->load[0].sum;
5223         comm->load_max   += comm->load[0].max;
5224         if (comm->bDynLoadBal)
5225         {
5226             for(d=0; d<dd->ndim; d++)
5227             {
5228                 if (comm->load[0].flags & (1<<d))
5229                 {
5230                     comm->load_lim[d]++;
5231                 }
5232             }
5233         }
5234         if (bSepPME)
5235         {
5236             comm->load_mdf += comm->load[0].mdf;
5237             comm->load_pme += comm->load[0].pme;
5238         }
5239     }
5240
5241     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5242     
5243     if (debug)
5244     {
5245         fprintf(debug,"get_load_distribution finished\n");
5246     }
5247 }
5248
5249 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5250 {
5251     /* Return the relative performance loss on the total run time
5252      * due to the force calculation load imbalance.
5253      */
5254     if (dd->comm->nload > 0)
5255     {
5256         return
5257             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5258             (dd->comm->load_step*dd->nnodes);
5259     }
5260     else
5261     {
5262         return 0;
5263     }
5264 }
5265
5266 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5267 {
5268     char  buf[STRLEN];
5269     int   npp,npme,nnodes,d,limp;
5270     float imbal,pme_f_ratio,lossf,lossp=0;
5271     gmx_bool  bLim;
5272     gmx_domdec_comm_t *comm;
5273
5274     comm = dd->comm;
5275     if (DDMASTER(dd) && comm->nload > 0)
5276     {
5277         npp    = dd->nnodes;
5278         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5279         nnodes = npp + npme;
5280         imbal = comm->load_max*npp/comm->load_sum - 1;
5281         lossf = dd_force_imb_perf_loss(dd);
5282         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5283         fprintf(fplog,"%s",buf);
5284         fprintf(stderr,"\n");
5285         fprintf(stderr,"%s",buf);
5286         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5287         fprintf(fplog,"%s",buf);
5288         fprintf(stderr,"%s",buf);
5289         bLim = FALSE;
5290         if (comm->bDynLoadBal)
5291         {
5292             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5293             for(d=0; d<dd->ndim; d++)
5294             {
5295                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5296                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5297                 if (limp >= 50)
5298                 {
5299                     bLim = TRUE;
5300                 }
5301             }
5302             sprintf(buf+strlen(buf),"\n");
5303             fprintf(fplog,"%s",buf);
5304             fprintf(stderr,"%s",buf);
5305         }
5306         if (npme > 0)
5307         {
5308             pme_f_ratio = comm->load_pme/comm->load_mdf;
5309             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5310             if (lossp <= 0)
5311             {
5312                 lossp *= (float)npme/(float)nnodes;
5313             }
5314             else
5315             {
5316                 lossp *= (float)npp/(float)nnodes;
5317             }
5318             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5319             fprintf(fplog,"%s",buf);
5320             fprintf(stderr,"%s",buf);
5321             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5322             fprintf(fplog,"%s",buf);
5323             fprintf(stderr,"%s",buf);
5324         }
5325         fprintf(fplog,"\n");
5326         fprintf(stderr,"\n");
5327         
5328         if (lossf >= DD_PERF_LOSS)
5329         {
5330             sprintf(buf,
5331                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
5332                     "      in the domain decomposition.\n",lossf*100);
5333             if (!comm->bDynLoadBal)
5334             {
5335                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
5336             }
5337             else if (bLim)
5338             {
5339                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5340             }
5341             fprintf(fplog,"%s\n",buf);
5342             fprintf(stderr,"%s\n",buf);
5343         }
5344         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5345         {
5346             sprintf(buf,
5347                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5348                     "      had %s work to do than the PP nodes.\n"
5349                     "      You might want to %s the number of PME nodes\n"
5350                     "      or %s the cut-off and the grid spacing.\n",
5351                     fabs(lossp*100),
5352                     (lossp < 0) ? "less"     : "more",
5353                     (lossp < 0) ? "decrease" : "increase",
5354                     (lossp < 0) ? "decrease" : "increase");
5355             fprintf(fplog,"%s\n",buf);
5356             fprintf(stderr,"%s\n",buf);
5357         }
5358     }
5359 }
5360
5361 static float dd_vol_min(gmx_domdec_t *dd)
5362 {
5363     return dd->comm->load[0].cvol_min*dd->nnodes;
5364 }
5365
5366 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5367 {
5368     return dd->comm->load[0].flags;
5369 }
5370
5371 static float dd_f_imbal(gmx_domdec_t *dd)
5372 {
5373     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5374 }
5375
5376 float dd_pme_f_ratio(gmx_domdec_t *dd)
5377 {
5378     if (dd->comm->cycl_n[ddCyclPME] > 0)
5379     {
5380         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5381     }
5382     else
5383     {
5384         return -1.0;
5385     }
5386 }
5387
5388 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5389 {
5390     int flags,d;
5391     char buf[22];
5392     
5393     flags = dd_load_flags(dd);
5394     if (flags)
5395     {
5396         fprintf(fplog,
5397                 "DD  load balancing is limited by minimum cell size in dimension");
5398         for(d=0; d<dd->ndim; d++)
5399         {
5400             if (flags & (1<<d))
5401             {
5402                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5403             }
5404         }
5405         fprintf(fplog,"\n");
5406     }
5407     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5408     if (dd->comm->bDynLoadBal)
5409     {
5410         fprintf(fplog,"  vol min/aver %5.3f%c",
5411                 dd_vol_min(dd),flags ? '!' : ' ');
5412     }
5413     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5414     if (dd->comm->cycl_n[ddCyclPME])
5415     {
5416         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5417     }
5418     fprintf(fplog,"\n\n");
5419 }
5420
5421 static void dd_print_load_verbose(gmx_domdec_t *dd)
5422 {
5423     if (dd->comm->bDynLoadBal)
5424     {
5425         fprintf(stderr,"vol %4.2f%c ",
5426                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5427     }
5428     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5429     if (dd->comm->cycl_n[ddCyclPME])
5430     {
5431         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5432     }
5433 }
5434
5435 #ifdef GMX_MPI
5436 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
5437 {
5438     MPI_Comm  c_row;
5439     int  dim, i, rank;
5440     ivec loc_c;
5441     gmx_domdec_root_t *root;
5442     gmx_bool bPartOfGroup = FALSE;
5443     
5444     dim = dd->dim[dim_ind];
5445     copy_ivec(loc,loc_c);
5446     for(i=0; i<dd->nc[dim]; i++)
5447     {
5448         loc_c[dim] = i;
5449         rank = dd_index(dd->nc,loc_c);
5450         if (rank == dd->rank)
5451         {
5452             /* This process is part of the group */
5453             bPartOfGroup = TRUE;
5454         }
5455     }
5456     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup?0:MPI_UNDEFINED, dd->rank,
5457                    &c_row);
5458     if (bPartOfGroup)
5459     {
5460         dd->comm->mpi_comm_load[dim_ind] = c_row;
5461         if (dd->comm->eDLB != edlbNO)
5462         {
5463             if (dd->ci[dim] == dd->master_ci[dim])
5464             {
5465                 /* This is the root process of this row */
5466                 snew(dd->comm->root[dim_ind],1);
5467                 root = dd->comm->root[dim_ind];
5468                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5469                 snew(root->old_cell_f,dd->nc[dim]+1);
5470                 snew(root->bCellMin,dd->nc[dim]);
5471                 if (dim_ind > 0)
5472                 {
5473                     snew(root->cell_f_max0,dd->nc[dim]);
5474                     snew(root->cell_f_min1,dd->nc[dim]);
5475                     snew(root->bound_min,dd->nc[dim]);
5476                     snew(root->bound_max,dd->nc[dim]);
5477                 }
5478                 snew(root->buf_ncd,dd->nc[dim]);
5479             }
5480             else
5481             {
5482                 /* This is not a root process, we only need to receive cell_f */
5483                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5484             }
5485         }
5486         if (dd->ci[dim] == dd->master_ci[dim])
5487         {
5488             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5489         }
5490     }
5491 }
5492 #endif
5493
5494 static void make_load_communicators(gmx_domdec_t *dd)
5495 {
5496 #ifdef GMX_MPI
5497   int  dim0,dim1,i,j;
5498   ivec loc;
5499
5500   if (debug)
5501     fprintf(debug,"Making load communicators\n");
5502
5503   snew(dd->comm->load,dd->ndim);
5504   snew(dd->comm->mpi_comm_load,dd->ndim);
5505   
5506   clear_ivec(loc);
5507   make_load_communicator(dd,0,loc);
5508   if (dd->ndim > 1) {
5509     dim0 = dd->dim[0];
5510     for(i=0; i<dd->nc[dim0]; i++) {
5511       loc[dim0] = i;
5512       make_load_communicator(dd,1,loc);
5513     }
5514   }
5515   if (dd->ndim > 2) {
5516     dim0 = dd->dim[0];
5517     for(i=0; i<dd->nc[dim0]; i++) {
5518       loc[dim0] = i;
5519       dim1 = dd->dim[1];
5520       for(j=0; j<dd->nc[dim1]; j++) {
5521           loc[dim1] = j;
5522           make_load_communicator(dd,2,loc);
5523       }
5524     }
5525   }
5526
5527   if (debug)
5528     fprintf(debug,"Finished making load communicators\n");
5529 #endif
5530 }
5531
5532 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5533 {
5534     gmx_bool bZYX;
5535     int  d,dim,i,j,m;
5536     ivec tmp,s;
5537     int  nzone,nzonep;
5538     ivec dd_zp[DD_MAXIZONE];
5539     gmx_domdec_zones_t *zones;
5540     gmx_domdec_ns_ranges_t *izone;
5541     
5542     for(d=0; d<dd->ndim; d++)
5543     {
5544         dim = dd->dim[d];
5545         copy_ivec(dd->ci,tmp);
5546         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5547         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5548         copy_ivec(dd->ci,tmp);
5549         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5550         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5551         if (debug)
5552         {
5553             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5554                     dd->rank,dim,
5555                     dd->neighbor[d][0],
5556                     dd->neighbor[d][1]);
5557         }
5558     }
5559     
5560     if (DDMASTER(dd))
5561     {
5562         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5563             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5564     }
5565     if (fplog)
5566     {
5567         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5568                 dd->ndim,
5569                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5570                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5571     }
5572     switch (dd->ndim)
5573     {
5574     case 3:
5575         nzone  = dd_z3n;
5576         nzonep = dd_zp3n;
5577         for(i=0; i<nzonep; i++)
5578         {
5579             copy_ivec(dd_zp3[i],dd_zp[i]);
5580         }
5581         break;
5582     case 2:
5583         nzone  = dd_z2n;
5584         nzonep = dd_zp2n;
5585         for(i=0; i<nzonep; i++)
5586         {
5587             copy_ivec(dd_zp2[i],dd_zp[i]);
5588         }
5589         break;
5590     case 1:
5591         nzone  = dd_z1n;
5592         nzonep = dd_zp1n;
5593         for(i=0; i<nzonep; i++)
5594         {
5595             copy_ivec(dd_zp1[i],dd_zp[i]);
5596         }
5597         break;
5598     default:
5599         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5600         nzone = 0;
5601         nzonep = 0;
5602     }
5603
5604     zones = &dd->comm->zones;
5605
5606     for(i=0; i<nzone; i++)
5607     {
5608         m = 0;
5609         clear_ivec(zones->shift[i]);
5610         for(d=0; d<dd->ndim; d++)
5611         {
5612             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5613         }
5614     }
5615     
5616     zones->n = nzone;
5617     for(i=0; i<nzone; i++)
5618     {
5619         for(d=0; d<DIM; d++)
5620         {
5621             s[d] = dd->ci[d] - zones->shift[i][d];
5622             if (s[d] < 0)
5623             {
5624                 s[d] += dd->nc[d];
5625             }
5626             else if (s[d] >= dd->nc[d])
5627             {
5628                 s[d] -= dd->nc[d];
5629             }
5630         }
5631     }
5632     zones->nizone = nzonep;
5633     for(i=0; i<zones->nizone; i++)
5634     {
5635         if (dd_zp[i][0] != i)
5636         {
5637             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5638         }
5639         izone = &zones->izone[i];
5640         izone->j0 = dd_zp[i][1];
5641         izone->j1 = dd_zp[i][2];
5642         for(dim=0; dim<DIM; dim++)
5643         {
5644             if (dd->nc[dim] == 1)
5645             {
5646                 /* All shifts should be allowed */
5647                 izone->shift0[dim] = -1;
5648                 izone->shift1[dim] = 1;
5649             }
5650             else
5651             {
5652                 /*
5653                   izone->shift0[d] = 0;
5654                   izone->shift1[d] = 0;
5655                   for(j=izone->j0; j<izone->j1; j++) {
5656                   if (dd->shift[j][d] > dd->shift[i][d])
5657                   izone->shift0[d] = -1;
5658                   if (dd->shift[j][d] < dd->shift[i][d])
5659                   izone->shift1[d] = 1;
5660                   }
5661                 */
5662                 
5663                 int shift_diff;
5664                 
5665                 /* Assume the shift are not more than 1 cell */
5666                 izone->shift0[dim] = 1;
5667                 izone->shift1[dim] = -1;
5668                 for(j=izone->j0; j<izone->j1; j++)
5669                 {
5670                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5671                     if (shift_diff < izone->shift0[dim])
5672                     {
5673                         izone->shift0[dim] = shift_diff;
5674                     }
5675                     if (shift_diff > izone->shift1[dim])
5676                     {
5677                         izone->shift1[dim] = shift_diff;
5678                     }
5679                 }
5680             }
5681         }
5682     }
5683     
5684     if (dd->comm->eDLB != edlbNO)
5685     {
5686         snew(dd->comm->root,dd->ndim);
5687     }
5688     
5689     if (dd->comm->bRecordLoad)
5690     {
5691         make_load_communicators(dd);
5692     }
5693 }
5694
5695 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5696 {
5697     gmx_domdec_t *dd;
5698     gmx_domdec_comm_t *comm;
5699     int  i,rank,*buf;
5700     ivec periods;
5701 #ifdef GMX_MPI
5702     MPI_Comm comm_cart;
5703 #endif
5704     
5705     dd = cr->dd;
5706     comm = dd->comm;
5707     
5708 #ifdef GMX_MPI
5709     if (comm->bCartesianPP)
5710     {
5711         /* Set up cartesian communication for the particle-particle part */
5712         if (fplog)
5713         {
5714             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5715                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5716         }
5717         
5718         for(i=0; i<DIM; i++)
5719         {
5720             periods[i] = TRUE;
5721         }
5722         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5723                         &comm_cart);
5724         /* We overwrite the old communicator with the new cartesian one */
5725         cr->mpi_comm_mygroup = comm_cart;
5726     }
5727     
5728     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5729     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5730     
5731     if (comm->bCartesianPP_PME)
5732     {
5733         /* Since we want to use the original cartesian setup for sim,
5734          * and not the one after split, we need to make an index.
5735          */
5736         snew(comm->ddindex2ddnodeid,dd->nnodes);
5737         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5738         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5739         /* Get the rank of the DD master,
5740          * above we made sure that the master node is a PP node.
5741          */
5742         if (MASTER(cr))
5743         {
5744             rank = dd->rank;
5745         }
5746         else
5747         {
5748             rank = 0;
5749         }
5750         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5751     }
5752     else if (comm->bCartesianPP)
5753     {
5754         if (cr->npmenodes == 0)
5755         {
5756             /* The PP communicator is also
5757              * the communicator for this simulation
5758              */
5759             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5760         }
5761         cr->nodeid = dd->rank;
5762         
5763         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5764         
5765         /* We need to make an index to go from the coordinates
5766          * to the nodeid of this simulation.
5767          */
5768         snew(comm->ddindex2simnodeid,dd->nnodes);
5769         snew(buf,dd->nnodes);
5770         if (cr->duty & DUTY_PP)
5771         {
5772             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5773         }
5774         /* Communicate the ddindex to simulation nodeid index */
5775         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5776                       cr->mpi_comm_mysim);
5777         sfree(buf);
5778         
5779         /* Determine the master coordinates and rank.
5780          * The DD master should be the same node as the master of this sim.
5781          */
5782         for(i=0; i<dd->nnodes; i++)
5783         {
5784             if (comm->ddindex2simnodeid[i] == 0)
5785             {
5786                 ddindex2xyz(dd->nc,i,dd->master_ci);
5787                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5788             }
5789         }
5790         if (debug)
5791         {
5792             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5793         }
5794     }
5795     else
5796     {
5797         /* No Cartesian communicators */
5798         /* We use the rank in dd->comm->all as DD index */
5799         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5800         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5801         dd->masterrank = 0;
5802         clear_ivec(dd->master_ci);
5803     }
5804 #endif
5805   
5806     if (fplog)
5807     {
5808         fprintf(fplog,
5809                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5810                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5811     }
5812     if (debug)
5813     {
5814         fprintf(debug,
5815                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5816                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5817     }
5818 }
5819
5820 static void receive_ddindex2simnodeid(t_commrec *cr)
5821 {
5822     gmx_domdec_t *dd;
5823     
5824     gmx_domdec_comm_t *comm;
5825     int  *buf;
5826     
5827     dd = cr->dd;
5828     comm = dd->comm;
5829     
5830 #ifdef GMX_MPI
5831     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5832     {
5833         snew(comm->ddindex2simnodeid,dd->nnodes);
5834         snew(buf,dd->nnodes);
5835         if (cr->duty & DUTY_PP)
5836         {
5837             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5838         }
5839 #ifdef GMX_MPI
5840         /* Communicate the ddindex to simulation nodeid index */
5841         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5842                       cr->mpi_comm_mysim);
5843 #endif
5844         sfree(buf);
5845     }
5846 #endif
5847 }
5848
5849 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5850                                                      int ncg,int natoms)
5851 {
5852     gmx_domdec_master_t *ma;
5853     int i;
5854
5855     snew(ma,1);
5856     
5857     snew(ma->ncg,dd->nnodes);
5858     snew(ma->index,dd->nnodes+1);
5859     snew(ma->cg,ncg);
5860     snew(ma->nat,dd->nnodes);
5861     snew(ma->ibuf,dd->nnodes*2);
5862     snew(ma->cell_x,DIM);
5863     for(i=0; i<DIM; i++)
5864     {
5865         snew(ma->cell_x[i],dd->nc[i]+1);
5866     }
5867
5868     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5869     {
5870         ma->vbuf = NULL;
5871     }
5872     else
5873     {
5874         snew(ma->vbuf,natoms);
5875     }
5876
5877     return ma;
5878 }
5879
5880 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5881                                int reorder)
5882 {
5883     gmx_domdec_t *dd;
5884     gmx_domdec_comm_t *comm;
5885     int  i,rank;
5886     gmx_bool bDiv[DIM];
5887     ivec periods;
5888 #ifdef GMX_MPI
5889     MPI_Comm comm_cart;
5890 #endif
5891     
5892     dd = cr->dd;
5893     comm = dd->comm;
5894     
5895     if (comm->bCartesianPP)
5896     {
5897         for(i=1; i<DIM; i++)
5898         {
5899             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5900         }
5901         if (bDiv[YY] || bDiv[ZZ])
5902         {
5903             comm->bCartesianPP_PME = TRUE;
5904             /* If we have 2D PME decomposition, which is always in x+y,
5905              * we stack the PME only nodes in z.
5906              * Otherwise we choose the direction that provides the thinnest slab
5907              * of PME only nodes as this will have the least effect
5908              * on the PP communication.
5909              * But for the PME communication the opposite might be better.
5910              */
5911             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5912                              !bDiv[YY] ||
5913                              dd->nc[YY] > dd->nc[ZZ]))
5914             {
5915                 comm->cartpmedim = ZZ;
5916             }
5917             else
5918             {
5919                 comm->cartpmedim = YY;
5920             }
5921             comm->ntot[comm->cartpmedim]
5922                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5923         }
5924         else if (fplog)
5925         {
5926             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5927             fprintf(fplog,
5928                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5929         }
5930     }
5931     
5932 #ifdef GMX_MPI
5933     if (comm->bCartesianPP_PME)
5934     {
5935         if (fplog)
5936         {
5937             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5938         }
5939         
5940         for(i=0; i<DIM; i++)
5941         {
5942             periods[i] = TRUE;
5943         }
5944         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5945                         &comm_cart);
5946         
5947         MPI_Comm_rank(comm_cart,&rank);
5948         if (MASTERNODE(cr) && rank != 0)
5949         {
5950             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5951         }
5952         
5953         /* With this assigment we loose the link to the original communicator
5954          * which will usually be MPI_COMM_WORLD, unless have multisim.
5955          */
5956         cr->mpi_comm_mysim = comm_cart;
5957         cr->sim_nodeid = rank;
5958         
5959         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5960         
5961         if (fplog)
5962         {
5963             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5964                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5965         }
5966         
5967         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5968         {
5969             cr->duty = DUTY_PP;
5970         }
5971         if (cr->npmenodes == 0 ||
5972             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5973         {
5974             cr->duty = DUTY_PME;
5975         }
5976         
5977         /* Split the sim communicator into PP and PME only nodes */
5978         MPI_Comm_split(cr->mpi_comm_mysim,
5979                        cr->duty,
5980                        dd_index(comm->ntot,dd->ci),
5981                        &cr->mpi_comm_mygroup);
5982     }
5983     else
5984     {
5985         switch (dd_node_order)
5986         {
5987         case ddnoPP_PME:
5988             if (fplog)
5989             {
5990                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5991             }
5992             break;
5993         case ddnoINTERLEAVE:
5994             /* Interleave the PP-only and PME-only nodes,
5995              * as on clusters with dual-core machines this will double
5996              * the communication bandwidth of the PME processes
5997              * and thus speed up the PP <-> PME and inter PME communication.
5998              */
5999             if (fplog)
6000             {
6001                 fprintf(fplog,"Interleaving PP and PME nodes\n");
6002             }
6003             comm->pmenodes = dd_pmenodes(cr);
6004             break;
6005         case ddnoCARTESIAN:
6006             break;
6007         default:
6008             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
6009         }
6010     
6011         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
6012         {
6013             cr->duty = DUTY_PME;
6014         }
6015         else
6016         {
6017             cr->duty = DUTY_PP;
6018         }
6019         
6020         /* Split the sim communicator into PP and PME only nodes */
6021         MPI_Comm_split(cr->mpi_comm_mysim,
6022                        cr->duty,
6023                        cr->nodeid,
6024                        &cr->mpi_comm_mygroup);
6025         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
6026     }
6027 #endif
6028
6029     if (fplog)
6030     {
6031         fprintf(fplog,"This is a %s only node\n\n",
6032                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6033     }
6034 }
6035
6036 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
6037 {
6038     gmx_domdec_t *dd;
6039     gmx_domdec_comm_t *comm;
6040     int CartReorder;
6041     
6042     dd = cr->dd;
6043     comm = dd->comm;
6044     
6045     copy_ivec(dd->nc,comm->ntot);
6046     
6047     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
6048     comm->bCartesianPP_PME = FALSE;
6049     
6050     /* Reorder the nodes by default. This might change the MPI ranks.
6051      * Real reordering is only supported on very few architectures,
6052      * Blue Gene is one of them.
6053      */
6054     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6055     
6056     if (cr->npmenodes > 0)
6057     {
6058         /* Split the communicator into a PP and PME part */
6059         split_communicator(fplog,cr,dd_node_order,CartReorder);
6060         if (comm->bCartesianPP_PME)
6061         {
6062             /* We (possibly) reordered the nodes in split_communicator,
6063              * so it is no longer required in make_pp_communicator.
6064              */
6065             CartReorder = FALSE;
6066         }
6067     }
6068     else
6069     {
6070         /* All nodes do PP and PME */
6071 #ifdef GMX_MPI    
6072         /* We do not require separate communicators */
6073         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6074 #endif
6075     }
6076     
6077     if (cr->duty & DUTY_PP)
6078     {
6079         /* Copy or make a new PP communicator */
6080         make_pp_communicator(fplog,cr,CartReorder);
6081     }
6082     else
6083     {
6084         receive_ddindex2simnodeid(cr);
6085     }
6086     
6087     if (!(cr->duty & DUTY_PME))
6088     {
6089         /* Set up the commnuication to our PME node */
6090         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
6091         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6092         if (debug)
6093         {
6094             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
6095                     dd->pme_nodeid,dd->pme_receive_vir_ener);
6096         }
6097     }
6098     else
6099     {
6100         dd->pme_nodeid = -1;
6101     }
6102
6103     if (DDMASTER(dd))
6104     {
6105         dd->ma = init_gmx_domdec_master_t(dd,
6106                                           comm->cgs_gl.nr,
6107                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6108     }
6109 }
6110
6111 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
6112 {
6113     real *slb_frac,tot;
6114     int  i,n;
6115     double dbl;
6116     
6117     slb_frac = NULL;
6118     if (nc > 1 && size_string != NULL)
6119     {
6120         if (fplog)
6121         {
6122             fprintf(fplog,"Using static load balancing for the %s direction\n",
6123                     dir);
6124         }
6125         snew(slb_frac,nc);
6126         tot = 0;
6127         for (i=0; i<nc; i++)
6128         {
6129             dbl = 0;
6130             sscanf(size_string,"%lf%n",&dbl,&n);
6131             if (dbl == 0)
6132             {
6133                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
6134             }
6135             slb_frac[i] = dbl;
6136             size_string += n;
6137             tot += slb_frac[i];
6138         }
6139         /* Normalize */
6140         if (fplog)
6141         {
6142             fprintf(fplog,"Relative cell sizes:");
6143         }
6144         for (i=0; i<nc; i++)
6145         {
6146             slb_frac[i] /= tot;
6147             if (fplog)
6148             {
6149                 fprintf(fplog," %5.3f",slb_frac[i]);
6150             }
6151         }
6152         if (fplog)
6153         {
6154             fprintf(fplog,"\n");
6155         }
6156     }
6157     
6158     return slb_frac;
6159 }
6160
6161 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6162 {
6163     int n,nmol,ftype;
6164     gmx_mtop_ilistloop_t iloop;
6165     t_ilist *il;
6166     
6167     n = 0;
6168     iloop = gmx_mtop_ilistloop_init(mtop);
6169     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
6170     {
6171         for(ftype=0; ftype<F_NRE; ftype++)
6172         {
6173             if ((interaction_function[ftype].flags & IF_BOND) &&
6174                 NRAL(ftype) >  2)
6175             {
6176                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6177             }
6178         }
6179   }
6180
6181   return n;
6182 }
6183
6184 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
6185 {
6186     char *val;
6187     int  nst;
6188     
6189     nst = def;
6190     val = getenv(env_var);
6191     if (val)
6192     {
6193         if (sscanf(val,"%d",&nst) <= 0)
6194         {
6195             nst = 1;
6196         }
6197         if (fplog)
6198         {
6199             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
6200                     env_var,val,nst);
6201         }
6202     }
6203     
6204     return nst;
6205 }
6206
6207 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
6208 {
6209     if (MASTER(cr))
6210     {
6211         fprintf(stderr,"\n%s\n",warn_string);
6212     }
6213     if (fplog)
6214     {
6215         fprintf(fplog,"\n%s\n",warn_string);
6216     }
6217 }
6218
6219 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
6220                                   t_inputrec *ir,FILE *fplog)
6221 {
6222     if (ir->ePBC == epbcSCREW &&
6223         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6224     {
6225         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6226     }
6227
6228     if (ir->ns_type == ensSIMPLE)
6229     {
6230         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6231     }
6232
6233     if (ir->nstlist == 0)
6234     {
6235         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6236     }
6237
6238     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6239     {
6240         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6241     }
6242 }
6243
6244 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6245 {
6246     int  di,d;
6247     real r;
6248
6249     r = ddbox->box_size[XX];
6250     for(di=0; di<dd->ndim; di++)
6251     {
6252         d = dd->dim[di];
6253         /* Check using the initial average cell size */
6254         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6255     }
6256
6257     return r;
6258 }
6259
6260 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6261                              const char *dlb_opt,gmx_bool bRecordLoad,
6262                              unsigned long Flags,t_inputrec *ir)
6263 {
6264     gmx_domdec_t *dd;
6265     int  eDLB=-1;
6266     char buf[STRLEN];
6267
6268     switch (dlb_opt[0])
6269     {
6270     case 'a': eDLB = edlbAUTO; break;
6271     case 'n': eDLB = edlbNO;   break;
6272     case 'y': eDLB = edlbYES;  break;
6273     default: gmx_incons("Unknown dlb_opt");
6274     }
6275
6276     if (Flags & MD_RERUN)
6277     {
6278         return edlbNO;
6279     }
6280
6281     if (!EI_DYNAMICS(ir->eI))
6282     {
6283         if (eDLB == edlbYES)
6284         {
6285             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6286             dd_warning(cr,fplog,buf);
6287         }
6288             
6289         return edlbNO;
6290     }
6291
6292     if (!bRecordLoad)
6293     {
6294         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6295
6296         return edlbNO;
6297     }
6298
6299     if (Flags & MD_REPRODUCIBLE)
6300     {
6301         switch (eDLB)
6302         {
6303                         case edlbNO: 
6304                                 break;
6305                         case edlbAUTO:
6306                                 dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6307                                 eDLB = edlbNO;
6308                                 break;
6309                         case edlbYES:
6310                                 dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6311                                 break;
6312                         default:
6313                                 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6314                                 break;
6315         }
6316     }
6317
6318     return eDLB;
6319 }
6320
6321 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6322 {
6323     int dim;
6324
6325     dd->ndim = 0;
6326     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6327     {
6328         /* Decomposition order z,y,x */
6329         if (fplog)
6330         {
6331             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6332         }
6333         for(dim=DIM-1; dim>=0; dim--)
6334         {
6335             if (dd->nc[dim] > 1)
6336             {
6337                 dd->dim[dd->ndim++] = dim;
6338             }
6339         }
6340     }
6341     else
6342     {
6343         /* Decomposition order x,y,z */
6344         for(dim=0; dim<DIM; dim++)
6345         {
6346             if (dd->nc[dim] > 1)
6347             {
6348                 dd->dim[dd->ndim++] = dim;
6349             }
6350         }
6351     }
6352 }
6353
6354 static gmx_domdec_comm_t *init_dd_comm()
6355 {
6356     gmx_domdec_comm_t *comm;
6357     int  i;
6358
6359     snew(comm,1);
6360     snew(comm->cggl_flag,DIM*2);
6361     snew(comm->cgcm_state,DIM*2);
6362     for(i=0; i<DIM*2; i++)
6363     {
6364         comm->cggl_flag_nalloc[i]  = 0;
6365         comm->cgcm_state_nalloc[i] = 0;
6366     }
6367     
6368     comm->nalloc_int = 0;
6369     comm->buf_int    = NULL;
6370
6371     vec_rvec_init(&comm->vbuf);
6372
6373     comm->n_load_have    = 0;
6374     comm->n_load_collect = 0;
6375
6376     for(i=0; i<ddnatNR-ddnatZONE; i++)
6377     {
6378         comm->sum_nat[i] = 0;
6379     }
6380     comm->ndecomp = 0;
6381     comm->nload   = 0;
6382     comm->load_step = 0;
6383     comm->load_sum  = 0;
6384     comm->load_max  = 0;
6385     clear_ivec(comm->load_lim);
6386     comm->load_mdf  = 0;
6387     comm->load_pme  = 0;
6388
6389     return comm;
6390 }
6391
6392 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6393                                         unsigned long Flags,
6394                                         ivec nc,
6395                                         real comm_distance_min,real rconstr,
6396                                         const char *dlb_opt,real dlb_scale,
6397                                         const char *sizex,const char *sizey,const char *sizez,
6398                                         gmx_mtop_t *mtop,t_inputrec *ir,
6399                                         matrix box,rvec *x,
6400                                         gmx_ddbox_t *ddbox,
6401                                         int *npme_x,int *npme_y)
6402 {
6403     gmx_domdec_t *dd;
6404     gmx_domdec_comm_t *comm;
6405     int  recload;
6406     int  d,i,j;
6407     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6408     gmx_bool bC;
6409     char buf[STRLEN];
6410     
6411     if (fplog)
6412     {
6413         fprintf(fplog,
6414                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6415     }
6416     
6417     snew(dd,1);
6418
6419     dd->comm = init_dd_comm();
6420     comm = dd->comm;
6421     snew(comm->cggl_flag,DIM*2);
6422     snew(comm->cgcm_state,DIM*2);
6423
6424     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6425     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6426     
6427     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6428     comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6429     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6430     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6431     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6432     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6433     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6434     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6435
6436     dd->pme_recv_f_alloc = 0;
6437     dd->pme_recv_f_buf = NULL;
6438
6439     if (dd->bSendRecv2 && fplog)
6440     {
6441         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6442     }
6443     if (comm->eFlop)
6444     {
6445         if (fplog)
6446         {
6447             fprintf(fplog,"Will load balance based on FLOP count\n");
6448         }
6449         if (comm->eFlop > 1)
6450         {
6451             srand(1+cr->nodeid);
6452         }
6453         comm->bRecordLoad = TRUE;
6454     }
6455     else
6456     {
6457         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6458                              
6459     }
6460     
6461     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6462     
6463     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6464     if (fplog)
6465     {
6466         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6467     }
6468     dd->bGridJump = comm->bDynLoadBal;
6469     
6470     if (comm->nstSortCG)
6471     {
6472         if (fplog)
6473         {
6474             if (comm->nstSortCG == 1)
6475             {
6476                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6477             }
6478             else
6479             {
6480                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6481                         comm->nstSortCG);
6482             }
6483         }
6484         snew(comm->sort,1);
6485     }
6486     else
6487     {
6488         if (fplog)
6489         {
6490             fprintf(fplog,"Will not sort the charge groups\n");
6491         }
6492     }
6493
6494     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6495     
6496     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6497     if (comm->bInterCGBondeds)
6498     {
6499         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6500     }
6501     else
6502     {
6503         comm->bInterCGMultiBody = FALSE;
6504     }
6505     
6506     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6507     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6508
6509     if (ir->rlistlong == 0)
6510     {
6511         /* Set the cut-off to some very large value,
6512          * so we don't need if statements everywhere in the code.
6513          * We use sqrt, since the cut-off is squared in some places.
6514          */
6515         comm->cutoff   = GMX_CUTOFF_INF;
6516     }
6517     else
6518     {
6519         comm->cutoff   = ir->rlistlong;
6520     }
6521     comm->cutoff_mbody = 0;
6522     
6523     comm->cellsize_limit = 0;
6524     comm->bBondComm = FALSE;
6525
6526     if (comm->bInterCGBondeds)
6527     {
6528         if (comm_distance_min > 0)
6529         {
6530             comm->cutoff_mbody = comm_distance_min;
6531             if (Flags & MD_DDBONDCOMM)
6532             {
6533                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6534             }
6535             else
6536             {
6537                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6538             }
6539             r_bonded_limit = comm->cutoff_mbody;
6540         }
6541         else if (ir->bPeriodicMols)
6542         {
6543             /* Can not easily determine the required cut-off */
6544             dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6545             comm->cutoff_mbody = comm->cutoff/2;
6546             r_bonded_limit = comm->cutoff_mbody;
6547         }
6548         else
6549         {
6550             if (MASTER(cr))
6551             {
6552                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6553                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6554             }
6555             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6556             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6557
6558             /* We use an initial margin of 10% for the minimum cell size,
6559              * except when we are just below the non-bonded cut-off.
6560              */
6561             if (Flags & MD_DDBONDCOMM)
6562             {
6563                 if (max(r_2b,r_mb) > comm->cutoff)
6564                 {
6565                     r_bonded       = max(r_2b,r_mb);
6566                     r_bonded_limit = 1.1*r_bonded;
6567                     comm->bBondComm = TRUE;
6568                 }
6569                 else
6570                 {
6571                     r_bonded       = r_mb;
6572                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6573                 }
6574                 /* We determine cutoff_mbody later */
6575             }
6576             else
6577             {
6578                 /* No special bonded communication,
6579                  * simply increase the DD cut-off.
6580                  */
6581                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6582                 comm->cutoff_mbody = r_bonded_limit;
6583                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6584             }
6585         }
6586         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6587         if (fplog)
6588         {
6589             fprintf(fplog,
6590                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6591                     comm->cellsize_limit);
6592         }
6593     }
6594
6595     if (dd->bInterCGcons && rconstr <= 0)
6596     {
6597         /* There is a cell size limit due to the constraints (P-LINCS) */
6598         rconstr = constr_r_max(fplog,mtop,ir);
6599         if (fplog)
6600         {
6601             fprintf(fplog,
6602                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6603                     rconstr);
6604             if (rconstr > comm->cellsize_limit)
6605             {
6606                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6607             }
6608         }
6609     }
6610     else if (rconstr > 0 && fplog)
6611     {
6612         /* Here we do not check for dd->bInterCGcons,
6613          * because one can also set a cell size limit for virtual sites only
6614          * and at this point we don't know yet if there are intercg v-sites.
6615          */
6616         fprintf(fplog,
6617                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6618                 rconstr);
6619     }
6620     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6621
6622     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6623
6624     if (nc[XX] > 0)
6625     {
6626         copy_ivec(nc,dd->nc);
6627         set_dd_dim(fplog,dd);
6628         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6629
6630         if (cr->npmenodes == -1)
6631         {
6632             cr->npmenodes = 0;
6633         }
6634         acs = average_cellsize_min(dd,ddbox);
6635         if (acs < comm->cellsize_limit)
6636         {
6637             if (fplog)
6638             {
6639                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6640             }
6641             gmx_fatal_collective(FARGS,cr,NULL,
6642                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6643                                  acs,comm->cellsize_limit);
6644         }
6645     }
6646     else
6647     {
6648         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6649
6650         /* We need to choose the optimal DD grid and possibly PME nodes */
6651         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6652                                comm->eDLB!=edlbNO,dlb_scale,
6653                                comm->cellsize_limit,comm->cutoff,
6654                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6655         
6656         if (dd->nc[XX] == 0)
6657         {
6658             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6659             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6660                     !bC ? "-rdd" : "-rcon",
6661                     comm->eDLB!=edlbNO ? " or -dds" : "",
6662                     bC ? " or your LINCS settings" : "");
6663
6664             gmx_fatal_collective(FARGS,cr,NULL,
6665                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6666                                  "%s\n"
6667                                  "Look in the log file for details on the domain decomposition",
6668                                  cr->nnodes-cr->npmenodes,limit,buf);
6669         }
6670         set_dd_dim(fplog,dd);
6671     }
6672
6673     if (fplog)
6674     {
6675         fprintf(fplog,
6676                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6677                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6678     }
6679     
6680     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6681     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6682     {
6683         gmx_fatal_collective(FARGS,cr,NULL,
6684                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6685                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6686     }
6687     if (cr->npmenodes > dd->nnodes)
6688     {
6689         gmx_fatal_collective(FARGS,cr,NULL,
6690                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6691     }
6692     if (cr->npmenodes > 0)
6693     {
6694         comm->npmenodes = cr->npmenodes;
6695     }
6696     else
6697     {
6698         comm->npmenodes = dd->nnodes;
6699     }
6700
6701     if (EEL_PME(ir->coulombtype))
6702     {
6703         /* The following choices should match those
6704          * in comm_cost_est in domdec_setup.c.
6705          * Note that here the checks have to take into account
6706          * that the decomposition might occur in a different order than xyz
6707          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6708          * in which case they will not match those in comm_cost_est,
6709          * but since that is mainly for testing purposes that's fine.
6710          */
6711         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6712             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6713             getenv("GMX_PMEONEDD") == NULL)
6714         {
6715             comm->npmedecompdim = 2;
6716             comm->npmenodes_x   = dd->nc[XX];
6717             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6718         }
6719         else
6720         {
6721             /* In case nc is 1 in both x and y we could still choose to
6722              * decompose pme in y instead of x, but we use x for simplicity.
6723              */
6724             comm->npmedecompdim = 1;
6725             if (dd->dim[0] == YY)
6726             {
6727                 comm->npmenodes_x = 1;
6728                 comm->npmenodes_y = comm->npmenodes;
6729             }
6730             else
6731             {
6732                 comm->npmenodes_x = comm->npmenodes;
6733                 comm->npmenodes_y = 1;
6734             }
6735         }    
6736         if (fplog)
6737         {
6738             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6739                     comm->npmenodes_x,comm->npmenodes_y,1);
6740         }
6741     }
6742     else
6743     {
6744         comm->npmedecompdim = 0;
6745         comm->npmenodes_x   = 0;
6746         comm->npmenodes_y   = 0;
6747     }
6748     
6749     /* Technically we don't need both of these,
6750      * but it simplifies code not having to recalculate it.
6751      */
6752     *npme_x = comm->npmenodes_x;
6753     *npme_y = comm->npmenodes_y;
6754         
6755     snew(comm->slb_frac,DIM);
6756     if (comm->eDLB == edlbNO)
6757     {
6758         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6759         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6760         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6761     }
6762
6763     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6764     {
6765         if (comm->bBondComm || comm->eDLB != edlbNO)
6766         {
6767             /* Set the bonded communication distance to halfway
6768              * the minimum and the maximum,
6769              * since the extra communication cost is nearly zero.
6770              */
6771             acs = average_cellsize_min(dd,ddbox);
6772             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6773             if (comm->eDLB != edlbNO)
6774             {
6775                 /* Check if this does not limit the scaling */
6776                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6777             }
6778             if (!comm->bBondComm)
6779             {
6780                 /* Without bBondComm do not go beyond the n.b. cut-off */
6781                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6782                 if (comm->cellsize_limit >= comm->cutoff)
6783                 {
6784                     /* We don't loose a lot of efficieny
6785                      * when increasing it to the n.b. cut-off.
6786                      * It can even be slightly faster, because we need
6787                      * less checks for the communication setup.
6788                      */
6789                     comm->cutoff_mbody = comm->cutoff;
6790                 }
6791             }
6792             /* Check if we did not end up below our original limit */
6793             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6794
6795             if (comm->cutoff_mbody > comm->cellsize_limit)
6796             {
6797                 comm->cellsize_limit = comm->cutoff_mbody;
6798             }
6799         }
6800         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6801     }
6802
6803     if (debug)
6804     {
6805         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6806                 "cellsize limit %f\n",
6807                 comm->bBondComm,comm->cellsize_limit);
6808     }
6809     
6810     if (MASTER(cr))
6811     {
6812         check_dd_restrictions(cr,dd,ir,fplog);
6813     }
6814
6815     comm->partition_step = INT_MIN;
6816     dd->ddp_count = 0;
6817
6818     clear_dd_cycle_counts(dd);
6819
6820     return dd;
6821 }
6822
6823 static void set_dlb_limits(gmx_domdec_t *dd)
6824
6825 {
6826     int d;
6827
6828     for(d=0; d<dd->ndim; d++)
6829     {
6830         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6831         dd->comm->cellsize_min[dd->dim[d]] =
6832             dd->comm->cellsize_min_dlb[dd->dim[d]];
6833     }
6834 }
6835
6836
6837 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6838 {
6839     gmx_domdec_t *dd;
6840     gmx_domdec_comm_t *comm;
6841     real cellsize_min;
6842     int  d,nc,i;
6843     char buf[STRLEN];
6844     
6845     dd = cr->dd;
6846     comm = dd->comm;
6847     
6848     if (fplog)
6849     {
6850         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6851     }
6852
6853     cellsize_min = comm->cellsize_min[dd->dim[0]];
6854     for(d=1; d<dd->ndim; d++)
6855     {
6856         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6857     }
6858
6859     if (cellsize_min < comm->cellsize_limit*1.05)
6860     {
6861         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6862
6863         /* Change DLB from "auto" to "no". */
6864         comm->eDLB = edlbNO;
6865
6866         return;
6867     }
6868
6869     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6870     comm->bDynLoadBal = TRUE;
6871     dd->bGridJump = TRUE;
6872     
6873     set_dlb_limits(dd);
6874
6875     /* We can set the required cell size info here,
6876      * so we do not need to communicate this.
6877      * The grid is completely uniform.
6878      */
6879     for(d=0; d<dd->ndim; d++)
6880     {
6881         if (comm->root[d])
6882         {
6883             comm->load[d].sum_m = comm->load[d].sum;
6884
6885             nc = dd->nc[dd->dim[d]];
6886             for(i=0; i<nc; i++)
6887             {
6888                 comm->root[d]->cell_f[i]    = i/(real)nc;
6889                 if (d > 0)
6890                 {
6891                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6892                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6893                 }
6894             }
6895             comm->root[d]->cell_f[nc] = 1.0;
6896         }
6897     }
6898 }
6899
6900 static char *init_bLocalCG(gmx_mtop_t *mtop)
6901 {
6902     int  ncg,cg;
6903     char *bLocalCG;
6904     
6905     ncg = ncg_mtop(mtop);
6906     snew(bLocalCG,ncg);
6907     for(cg=0; cg<ncg; cg++)
6908     {
6909         bLocalCG[cg] = FALSE;
6910     }
6911
6912     return bLocalCG;
6913 }
6914
6915 void dd_init_bondeds(FILE *fplog,
6916                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6917                      gmx_vsite_t *vsite,gmx_constr_t constr,
6918                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6919 {
6920     gmx_domdec_comm_t *comm;
6921     gmx_bool bBondComm;
6922     int  d;
6923
6924     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6925
6926     comm = dd->comm;
6927
6928     if (comm->bBondComm)
6929     {
6930         /* Communicate atoms beyond the cut-off for bonded interactions */
6931         comm = dd->comm;
6932
6933         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6934
6935         comm->bLocalCG = init_bLocalCG(mtop);
6936     }
6937     else
6938     {
6939         /* Only communicate atoms based on cut-off */
6940         comm->cglink   = NULL;
6941         comm->bLocalCG = NULL;
6942     }
6943 }
6944
6945 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6946                               t_inputrec *ir,
6947                               gmx_bool bDynLoadBal,real dlb_scale,
6948                               gmx_ddbox_t *ddbox)
6949 {
6950     gmx_domdec_comm_t *comm;
6951     int  d;
6952     ivec np;
6953     real limit,shrink;
6954     char buf[64];
6955
6956     if (fplog == NULL)
6957     {
6958         return;
6959     }
6960
6961     comm = dd->comm;
6962
6963     if (bDynLoadBal)
6964     {
6965         fprintf(fplog,"The maximum number of communication pulses is:");
6966         for(d=0; d<dd->ndim; d++)
6967         {
6968             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6969         }
6970         fprintf(fplog,"\n");
6971         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6972         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6973         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6974         for(d=0; d<DIM; d++)
6975         {
6976             if (dd->nc[d] > 1)
6977             {
6978                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6979                 {
6980                     shrink = 0;
6981                 }
6982                 else
6983                 {
6984                     shrink =
6985                         comm->cellsize_min_dlb[d]/
6986                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6987                 }
6988                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6989             }
6990         }
6991         fprintf(fplog,"\n");
6992     }
6993     else
6994     {
6995         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6996         fprintf(fplog,"The initial number of communication pulses is:");
6997         for(d=0; d<dd->ndim; d++)
6998         {
6999             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
7000         }
7001         fprintf(fplog,"\n");
7002         fprintf(fplog,"The initial domain decomposition cell size is:");
7003         for(d=0; d<DIM; d++) {
7004             if (dd->nc[d] > 1)
7005             {
7006                 fprintf(fplog," %c %.2f nm",
7007                         dim2char(d),dd->comm->cellsize_min[d]);
7008             }
7009         }
7010         fprintf(fplog,"\n\n");
7011     }
7012     
7013     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7014     {
7015         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
7016         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7017                 "non-bonded interactions","",comm->cutoff);
7018
7019         if (bDynLoadBal)
7020         {
7021             limit = dd->comm->cellsize_limit;
7022         }
7023         else
7024         {
7025             if (dynamic_dd_box(ddbox,ir))
7026             {
7027                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
7028             }
7029             limit = dd->comm->cellsize_min[XX];
7030             for(d=1; d<DIM; d++)
7031             {
7032                 limit = min(limit,dd->comm->cellsize_min[d]);
7033             }
7034         }
7035
7036         if (comm->bInterCGBondeds)
7037         {
7038             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7039                     "two-body bonded interactions","(-rdd)",
7040                     max(comm->cutoff,comm->cutoff_mbody));
7041             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7042                     "multi-body bonded interactions","(-rdd)",
7043                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
7044         }
7045         if (dd->vsite_comm)
7046         {
7047             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7048                     "virtual site constructions","(-rcon)",limit);
7049         }
7050         if (dd->constraint_comm)
7051         {
7052             sprintf(buf,"atoms separated by up to %d constraints",
7053                     1+ir->nProjOrder);
7054             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7055                     buf,"(-rcon)",limit);
7056         }
7057         fprintf(fplog,"\n");
7058     }
7059     
7060     fflush(fplog);
7061 }
7062
7063 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7064                                 real dlb_scale,
7065                                 const t_inputrec *ir,
7066                                 const gmx_ddbox_t *ddbox)
7067 {
7068     gmx_domdec_comm_t *comm;
7069     int  d,dim,npulse,npulse_d_max,npulse_d;
7070     gmx_bool bNoCutOff;
7071
7072     comm = dd->comm;
7073
7074     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7075
7076     /* Determine the maximum number of comm. pulses in one dimension */
7077         
7078     comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7079         
7080     /* Determine the maximum required number of grid pulses */
7081     if (comm->cellsize_limit >= comm->cutoff)
7082     {
7083         /* Only a single pulse is required */
7084         npulse = 1;
7085     }
7086     else if (!bNoCutOff && comm->cellsize_limit > 0)
7087     {
7088         /* We round down slightly here to avoid overhead due to the latency
7089          * of extra communication calls when the cut-off
7090          * would be only slightly longer than the cell size.
7091          * Later cellsize_limit is redetermined,
7092          * so we can not miss interactions due to this rounding.
7093          */
7094         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7095     }
7096     else
7097     {
7098         /* There is no cell size limit */
7099         npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
7100     }
7101
7102     if (!bNoCutOff && npulse > 1)
7103     {
7104         /* See if we can do with less pulses, based on dlb_scale */
7105         npulse_d_max = 0;
7106         for(d=0; d<dd->ndim; d++)
7107         {
7108             dim = dd->dim[d];
7109             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7110                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7111             npulse_d_max = max(npulse_d_max,npulse_d);
7112         }
7113         npulse = min(npulse,npulse_d_max);
7114     }
7115
7116     /* This env var can override npulse */
7117     d = dd_nst_env(debug,"GMX_DD_NPULSE",0);
7118     if (d > 0)
7119     {
7120         npulse = d;
7121     }
7122
7123     comm->maxpulse = 1;
7124     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7125     for(d=0; d<dd->ndim; d++)
7126     {
7127         comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
7128         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7129         snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
7130         comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
7131         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7132         {
7133             comm->bVacDLBNoLimit = FALSE;
7134         }
7135     }
7136
7137     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7138     if (!comm->bVacDLBNoLimit)
7139     {
7140         comm->cellsize_limit = max(comm->cellsize_limit,
7141                                    comm->cutoff/comm->maxpulse);
7142     }
7143     comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7144     /* Set the minimum cell size for each DD dimension */
7145     for(d=0; d<dd->ndim; d++)
7146     {
7147         if (comm->bVacDLBNoLimit ||
7148             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7149         {
7150             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7151         }
7152         else
7153         {
7154             comm->cellsize_min_dlb[dd->dim[d]] =
7155                 comm->cutoff/comm->cd[d].np_dlb;
7156         }
7157     }
7158     if (comm->cutoff_mbody <= 0)
7159     {
7160         comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
7161     }
7162     if (comm->bDynLoadBal)
7163     {
7164         set_dlb_limits(dd);
7165     }
7166 }
7167
7168 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC)
7169 {
7170     /* If each molecule is a single charge group
7171      * or we use domain decomposition for each periodic dimension,
7172      * we do not need to take pbc into account for the bonded interactions.
7173      */
7174     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7175             !(dd->nc[XX]>1 &&
7176               dd->nc[YY]>1 &&
7177               (dd->nc[ZZ]>1 || ePBC==epbcXY)));
7178 }
7179
7180 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
7181                        t_inputrec *ir,t_forcerec *fr,
7182                        gmx_ddbox_t *ddbox)
7183 {
7184     gmx_domdec_comm_t *comm;
7185     int  natoms_tot;
7186     real vol_frac;
7187
7188     comm = dd->comm;
7189
7190     /* Initialize the thread data.
7191      * This can not be done in init_domain_decomposition,
7192      * as the numbers of threads is determined later.
7193      */
7194     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7195     if (comm->nth > 1)
7196     {
7197         snew(comm->dth,comm->nth);
7198     }
7199
7200     if (EEL_PME(ir->coulombtype))
7201     {
7202         init_ddpme(dd,&comm->ddpme[0],0);
7203         if (comm->npmedecompdim >= 2)
7204         {
7205             init_ddpme(dd,&comm->ddpme[1],1);
7206         }
7207     }
7208     else
7209     {
7210         comm->npmenodes = 0;
7211         if (dd->pme_nodeid >= 0)
7212         {
7213             gmx_fatal_collective(FARGS,NULL,dd,
7214                                  "Can not have separate PME nodes without PME electrostatics");
7215         }
7216     }
7217         
7218     if (debug)
7219     {
7220         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
7221     }
7222     if (comm->eDLB != edlbNO)
7223     {
7224         set_cell_limits_dlb(dd,dlb_scale,ir,ddbox);
7225     }
7226     
7227     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
7228     if (comm->eDLB == edlbAUTO)
7229     {
7230         if (fplog)
7231         {
7232             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
7233         }
7234         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
7235     }
7236
7237     if (ir->ePBC == epbcNONE)
7238     {
7239         vol_frac = 1 - 1/(double)dd->nnodes;
7240     }
7241     else
7242     {
7243         vol_frac =
7244             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7245     }
7246     if (debug)
7247     {
7248         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7249     }
7250     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7251    
7252     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7253 }
7254
7255 gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
7256                           real cutoff_req)
7257 {
7258     gmx_domdec_t *dd;
7259     gmx_ddbox_t ddbox;
7260     int d,dim,np;
7261     real inv_cell_size;
7262     int LocallyLimited;
7263
7264     dd = cr->dd;
7265
7266     set_ddbox(dd,FALSE,cr,ir,state->box,
7267               TRUE,&dd->comm->cgs_gl,state->x,&ddbox);
7268
7269     LocallyLimited = 0;
7270
7271     for(d=0; d<dd->ndim; d++)
7272     {
7273         dim = dd->dim[d];
7274
7275         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7276         if (dynamic_dd_box(&ddbox,ir))
7277         {
7278             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7279         }
7280
7281         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7282
7283         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7284             dd->comm->cd[d].np_dlb > 0)
7285         {
7286             if (np > dd->comm->cd[d].np_dlb)
7287             {
7288                 return FALSE;
7289             }
7290
7291             /* If a current local cell size is smaller than the requested
7292              * cut-off, we could still fix it, but this gets very complicated.
7293              * Without fixing here, we might actually need more checks.
7294              */
7295             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7296             {
7297                 LocallyLimited = 1;
7298             }
7299         }
7300     }
7301
7302     if (dd->comm->eDLB != edlbNO)
7303     {
7304         /* If DLB is not active yet, we don't need to check the grid jumps.
7305          * Actually we shouldn't, because then the grid jump data is not set.
7306          */
7307         if (dd->comm->bDynLoadBal &&
7308             check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE))
7309         {
7310             LocallyLimited = 1; 
7311         }
7312
7313         gmx_sumi(1,&LocallyLimited,cr);
7314
7315         if (LocallyLimited > 0)
7316         {
7317             return FALSE;
7318         }
7319     }
7320
7321     dd->comm->cutoff = cutoff_req;
7322
7323     return TRUE;
7324 }
7325
7326 static void merge_cg_buffers(int ncell,
7327                              gmx_domdec_comm_dim_t *cd, int pulse,
7328                              int  *ncg_cell,
7329                              int  *index_gl, int  *recv_i,
7330                              rvec *cg_cm,    rvec *recv_vr,
7331                              int *cgindex,
7332                              cginfo_mb_t *cginfo_mb,int *cginfo)
7333 {
7334     gmx_domdec_ind_t *ind,*ind_p;
7335     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7336     int shift,shift_at;
7337     
7338     ind = &cd->ind[pulse];
7339     
7340     /* First correct the already stored data */
7341     shift = ind->nrecv[ncell];
7342     for(cell=ncell-1; cell>=0; cell--)
7343     {
7344         shift -= ind->nrecv[cell];
7345         if (shift > 0)
7346         {
7347             /* Move the cg's present from previous grid pulses */
7348             cg0 = ncg_cell[ncell+cell];
7349             cg1 = ncg_cell[ncell+cell+1];
7350             cgindex[cg1+shift] = cgindex[cg1];
7351             for(cg=cg1-1; cg>=cg0; cg--)
7352             {
7353                 index_gl[cg+shift] = index_gl[cg];
7354                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7355                 cgindex[cg+shift] = cgindex[cg];
7356                 cginfo[cg+shift] = cginfo[cg];
7357             }
7358             /* Correct the already stored send indices for the shift */
7359             for(p=1; p<=pulse; p++)
7360             {
7361                 ind_p = &cd->ind[p];
7362                 cg0 = 0;
7363                 for(c=0; c<cell; c++)
7364                 {
7365                     cg0 += ind_p->nsend[c];
7366                 }
7367                 cg1 = cg0 + ind_p->nsend[cell];
7368                 for(cg=cg0; cg<cg1; cg++)
7369                 {
7370                     ind_p->index[cg] += shift;
7371                 }
7372             }
7373         }
7374     }
7375
7376     /* Merge in the communicated buffers */
7377     shift = 0;
7378     shift_at = 0;
7379     cg0 = 0;
7380     for(cell=0; cell<ncell; cell++)
7381     {
7382         cg1 = ncg_cell[ncell+cell+1] + shift;
7383         if (shift_at > 0)
7384         {
7385             /* Correct the old cg indices */
7386             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7387             {
7388                 cgindex[cg+1] += shift_at;
7389             }
7390         }
7391         for(cg=0; cg<ind->nrecv[cell]; cg++)
7392         {
7393             /* Copy this charge group from the buffer */
7394             index_gl[cg1] = recv_i[cg0];
7395             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7396             /* Add it to the cgindex */
7397             cg_gl = index_gl[cg1];
7398             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7399             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7400             cgindex[cg1+1] = cgindex[cg1] + nat;
7401             cg0++;
7402             cg1++;
7403             shift_at += nat;
7404         }
7405         shift += ind->nrecv[cell];
7406         ncg_cell[ncell+cell+1] = cg1;
7407     }
7408 }
7409
7410 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7411                                int nzone,int cg0,const int *cgindex)
7412 {
7413     int cg,zone,p;
7414     
7415     /* Store the atom block boundaries for easy copying of communication buffers
7416      */
7417     cg = cg0;
7418     for(zone=0; zone<nzone; zone++)
7419     {
7420         for(p=0; p<cd->np; p++) {
7421             cd->ind[p].cell2at0[zone] = cgindex[cg];
7422             cg += cd->ind[p].nrecv[zone];
7423             cd->ind[p].cell2at1[zone] = cgindex[cg];
7424         }
7425     }
7426 }
7427
7428 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7429 {
7430     int  i;
7431     gmx_bool bMiss;
7432
7433     bMiss = FALSE;
7434     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7435     {
7436         if (!bLocalCG[link->a[i]])
7437         {
7438             bMiss = TRUE;
7439         }
7440     }
7441
7442     return bMiss;
7443 }
7444
7445 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7446 typedef struct {
7447     real c[DIM][4]; /* the corners for the non-bonded communication */
7448     real cr0;       /* corner for rounding */
7449     real cr1[4];    /* corners for rounding */
7450     real bc[DIM];   /* corners for bounded communication */
7451     real bcr1;      /* corner for rounding for bonded communication */
7452 } dd_corners_t;
7453
7454 /* Determine the corners of the domain(s) we are communicating with */
7455 static void
7456 set_dd_corners(const gmx_domdec_t *dd,
7457                int dim0, int dim1, int dim2,
7458                gmx_bool bDistMB,
7459                dd_corners_t *c)
7460 {
7461     const gmx_domdec_comm_t *comm;
7462     const gmx_domdec_zones_t *zones;
7463     int i,j;
7464
7465     comm = dd->comm;
7466
7467     zones = &comm->zones;
7468
7469     /* Keep the compiler happy */
7470     c->cr0  = 0;
7471     c->bcr1 = 0;
7472
7473     /* The first dimension is equal for all cells */
7474     c->c[0][0] = comm->cell_x0[dim0];
7475     if (bDistMB)
7476     {
7477         c->bc[0] = c->c[0][0];
7478     }
7479     if (dd->ndim >= 2)
7480     {
7481         dim1 = dd->dim[1];
7482         /* This cell row is only seen from the first row */
7483         c->c[1][0] = comm->cell_x0[dim1];
7484         /* All rows can see this row */
7485         c->c[1][1] = comm->cell_x0[dim1];
7486         if (dd->bGridJump)
7487         {
7488             c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7489             if (bDistMB)
7490             {
7491                 /* For the multi-body distance we need the maximum */
7492                 c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7493             }
7494         }
7495         /* Set the upper-right corner for rounding */
7496         c->cr0 = comm->cell_x1[dim0];
7497         
7498         if (dd->ndim >= 3)
7499         {
7500             dim2 = dd->dim[2];
7501             for(j=0; j<4; j++)
7502             {
7503                 c->c[2][j] = comm->cell_x0[dim2];
7504             }
7505             if (dd->bGridJump)
7506             {
7507                 /* Use the maximum of the i-cells that see a j-cell */
7508                 for(i=0; i<zones->nizone; i++)
7509                 {
7510                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7511                     {
7512                         if (j >= 4)
7513                         {
7514                             c->c[2][j-4] =
7515                                 max(c->c[2][j-4],
7516                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7517                         }
7518                     }
7519                 }
7520                 if (bDistMB)
7521                 {
7522                     /* For the multi-body distance we need the maximum */
7523                     c->bc[2] = comm->cell_x0[dim2];
7524                     for(i=0; i<2; i++)
7525                     {
7526                         for(j=0; j<2; j++)
7527                         {
7528                             c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0);
7529                         }
7530                     }
7531                 }
7532             }
7533             
7534             /* Set the upper-right corner for rounding */
7535             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7536              * Only cell (0,0,0) can see cell 7 (1,1,1)
7537              */
7538             c->cr1[0] = comm->cell_x1[dim1];
7539             c->cr1[3] = comm->cell_x1[dim1];
7540             if (dd->bGridJump)
7541             {
7542                 c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1);
7543                 if (bDistMB)
7544                 {
7545                     /* For the multi-body distance we need the maximum */
7546                     c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1);
7547                 }
7548             }
7549         }
7550     }
7551 }
7552
7553 /* Determine which cg's we need to send in this pulse from this zone */
7554 static void
7555 get_zone_pulse_cgs(gmx_domdec_t *dd,
7556                    int zonei, int zone,
7557                    int cg0, int cg1,
7558                    const int *index_gl,
7559                    const int *cgindex,
7560                    int dim, int dim_ind,
7561                    int dim0, int dim1, int dim2,
7562                    real r_comm2, real r_bcomm2,
7563                    matrix box,
7564                    ivec tric_dist,
7565                    rvec *normal,
7566                    real skew_fac2_d, real skew_fac_01,
7567                    rvec *v_d, rvec *v_0, rvec *v_1,
7568                    const dd_corners_t *c,
7569                    rvec sf2_round,
7570                    gmx_bool bDistBonded,
7571                    gmx_bool bBondComm,
7572                    gmx_bool bDist2B,
7573                    gmx_bool bDistMB,
7574                    rvec *cg_cm,
7575                    int *cginfo,
7576                    gmx_domdec_ind_t *ind,
7577                    int **ibuf, int *ibuf_nalloc,
7578                    vec_rvec_t *vbuf,
7579                    int *nsend_ptr,
7580                    int *nat_ptr,
7581                    int *nsend_z_ptr)
7582 {
7583     gmx_domdec_comm_t *comm;
7584     gmx_bool bScrew;
7585     gmx_bool bDistMB_pulse;
7586     int  cg,i;
7587     real r2,rb2,r,tric_sh;
7588     rvec rn,rb;
7589     int  dimd;
7590     int  nsend_z,nsend,nat;
7591
7592     comm = dd->comm;
7593
7594     bScrew = (dd->bScrewPBC && dim == XX);
7595
7596     bDistMB_pulse = (bDistMB && bDistBonded);
7597
7598     nsend_z = 0;
7599     nsend   = *nsend_ptr;
7600     nat     = *nat_ptr;
7601
7602     for(cg=cg0; cg<cg1; cg++)
7603     {
7604         r2  = 0;
7605         rb2 = 0;
7606         if (tric_dist[dim_ind] == 0)
7607         {
7608             /* Rectangular direction, easy */
7609             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7610             if (r > 0)
7611             {
7612                 r2 += r*r;
7613             }
7614             if (bDistMB_pulse)
7615             {
7616                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7617                 if (r > 0)
7618                 {
7619                     rb2 += r*r;
7620                 }
7621             }
7622             /* Rounding gives at most a 16% reduction
7623              * in communicated atoms
7624              */
7625             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7626             {
7627                 r = cg_cm[cg][dim0] - c->cr0;
7628                 /* This is the first dimension, so always r >= 0 */
7629                 r2 += r*r;
7630                 if (bDistMB_pulse)
7631                 {
7632                     rb2 += r*r;
7633                 }
7634             }
7635             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7636             {
7637                 r = cg_cm[cg][dim1] - c->cr1[zone];
7638                 if (r > 0)
7639                 {
7640                     r2 += r*r;
7641                 }
7642                 if (bDistMB_pulse)
7643                 {
7644                     r = cg_cm[cg][dim1] - c->bcr1;
7645                     if (r > 0)
7646                     {
7647                         rb2 += r*r;
7648                     }
7649                 }
7650             }
7651         }
7652         else
7653         {
7654             /* Triclinic direction, more complicated */
7655             clear_rvec(rn);
7656             clear_rvec(rb);
7657             /* Rounding, conservative as the skew_fac multiplication
7658              * will slightly underestimate the distance.
7659              */
7660             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7661             {
7662                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7663                 for(i=dim0+1; i<DIM; i++)
7664                 {
7665                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7666                 }
7667                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7668                 if (bDistMB_pulse)
7669                 {
7670                     rb[dim0] = rn[dim0];
7671                     rb2 = r2;
7672                 }
7673                 /* Take care that the cell planes along dim0 might not
7674                  * be orthogonal to those along dim1 and dim2.
7675                  */
7676                 for(i=1; i<=dim_ind; i++)
7677                 {
7678                     dimd = dd->dim[i];
7679                     if (normal[dim0][dimd] > 0)
7680                     {
7681                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7682                         if (bDistMB_pulse)
7683                         {
7684                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7685                         }
7686                     }
7687                 }
7688             }
7689             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7690             {
7691                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7692                 tric_sh = 0;
7693                 for(i=dim1+1; i<DIM; i++)
7694                 {
7695                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7696                 }
7697                 rn[dim1] += tric_sh;
7698                 if (rn[dim1] > 0)
7699                 {
7700                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7701                     /* Take care of coupling of the distances
7702                      * to the planes along dim0 and dim1 through dim2.
7703                      */
7704                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7705                     /* Take care that the cell planes along dim1
7706                      * might not be orthogonal to that along dim2.
7707                      */
7708                     if (normal[dim1][dim2] > 0)
7709                     {
7710                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7711                     }
7712                 }
7713                 if (bDistMB_pulse)
7714                 {
7715                     rb[dim1] +=
7716                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7717                     if (rb[dim1] > 0)
7718                     {
7719                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7720                         /* Take care of coupling of the distances
7721                          * to the planes along dim0 and dim1 through dim2.
7722                          */
7723                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7724                         /* Take care that the cell planes along dim1
7725                          * might not be orthogonal to that along dim2.
7726                          */
7727                         if (normal[dim1][dim2] > 0)
7728                         {
7729                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7730                         }
7731                     }
7732                 }
7733             }
7734             /* The distance along the communication direction */
7735             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7736             tric_sh = 0;
7737             for(i=dim+1; i<DIM; i++)
7738             {
7739                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7740             }
7741             rn[dim] += tric_sh;
7742             if (rn[dim] > 0)
7743             {
7744                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7745                 /* Take care of coupling of the distances
7746                  * to the planes along dim0 and dim1 through dim2.
7747                  */
7748                 if (dim_ind == 1 && zonei == 1)
7749                 {
7750                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7751                 }
7752             }
7753             if (bDistMB_pulse)
7754             {
7755                 clear_rvec(rb);
7756                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7757                 if (rb[dim] > 0)
7758                 {
7759                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7760                     /* Take care of coupling of the distances
7761                      * to the planes along dim0 and dim1 through dim2.
7762                      */
7763                     if (dim_ind == 1 && zonei == 1)
7764                     {
7765                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7766                     }
7767                 }
7768             }
7769         }
7770         
7771         if (r2 < r_comm2 ||
7772             (bDistBonded &&
7773              ((bDistMB && rb2 < r_bcomm2) ||
7774               (bDist2B && r2  < r_bcomm2)) &&
7775              (!bBondComm ||
7776               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7777                missing_link(comm->cglink,index_gl[cg],
7778                             comm->bLocalCG)))))
7779         {
7780             /* Make an index to the local charge groups */
7781             if (nsend+1 > ind->nalloc)
7782             {
7783                 ind->nalloc = over_alloc_large(nsend+1);
7784                 srenew(ind->index,ind->nalloc);
7785             }
7786             if (nsend+1 > *ibuf_nalloc)
7787             {
7788                 *ibuf_nalloc = over_alloc_large(nsend+1);
7789                 srenew(*ibuf,*ibuf_nalloc);
7790             }
7791             ind->index[nsend] = cg;
7792             (*ibuf)[nsend] = index_gl[cg];
7793             nsend_z++;
7794             vec_rvec_check_alloc(vbuf,nsend+1);
7795             
7796             if (dd->ci[dim] == 0)
7797             {
7798                 /* Correct cg_cm for pbc */
7799                 rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]);
7800                 if (bScrew)
7801                 {
7802                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7803                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7804                 }
7805             }
7806             else
7807             {
7808                 copy_rvec(cg_cm[cg],vbuf->v[nsend]);
7809             }
7810             nsend++;
7811             nat += cgindex[cg+1] - cgindex[cg];
7812         }
7813     }
7814
7815     *nsend_ptr   = nsend;
7816     *nat_ptr     = nat;
7817     *nsend_z_ptr = nsend_z;
7818 }
7819
7820 static void setup_dd_communication(gmx_domdec_t *dd,
7821                                    matrix box,gmx_ddbox_t *ddbox,
7822                                    t_forcerec *fr,t_state *state,rvec **f)
7823 {
7824     int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot;
7825     int nzone,nzone_send,zone,zonei,cg0,cg1;
7826     int c,i,j,cg,cg_gl,nrcg;
7827     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7828     gmx_domdec_comm_t *comm;
7829     gmx_domdec_zones_t *zones;
7830     gmx_domdec_comm_dim_t *cd;
7831     gmx_domdec_ind_t *ind;
7832     cginfo_mb_t *cginfo_mb;
7833     gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded;
7834     real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg;
7835     dd_corners_t corners;
7836     ivec tric_dist;
7837     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7838     real skew_fac2_d,skew_fac_01;
7839     rvec sf2_round;
7840     int  nsend,nat;
7841     int  th;
7842     
7843     if (debug)
7844     {
7845         fprintf(debug,"Setting up DD communication\n");
7846     }
7847     
7848     comm  = dd->comm;
7849
7850     switch (fr->cutoff_scheme)
7851     {
7852     case ecutsGROUP:
7853         cg_cm = fr->cg_cm;
7854         break;
7855     case ecutsVERLET:
7856         cg_cm = state->x;
7857         break;
7858     default:
7859         gmx_incons("unimplemented");
7860         cg_cm = NULL;
7861     }
7862
7863     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7864     {
7865         dim = dd->dim[dim_ind];
7866
7867         /* Check if we need to use triclinic distances */
7868         tric_dist[dim_ind] = 0;
7869         for(i=0; i<=dim_ind; i++)
7870         {
7871             if (ddbox->tric_dir[dd->dim[i]])
7872             {
7873                 tric_dist[dim_ind] = 1;
7874             }
7875         }
7876     }
7877
7878     bBondComm = comm->bBondComm;
7879
7880     /* Do we need to determine extra distances for multi-body bondeds? */
7881     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7882     
7883     /* Do we need to determine extra distances for only two-body bondeds? */
7884     bDist2B = (bBondComm && !bDistMB);
7885
7886     r_comm2  = sqr(comm->cutoff);
7887     r_bcomm2 = sqr(comm->cutoff_mbody);
7888
7889     if (debug)
7890     {
7891         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7892     }
7893
7894     zones = &comm->zones;
7895     
7896     dim0 = dd->dim[0];
7897     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
7898     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
7899
7900     set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners);
7901     
7902     /* Triclinic stuff */
7903     normal = ddbox->normal;
7904     skew_fac_01 = 0;
7905     if (dd->ndim >= 2)
7906     {
7907         v_0 = ddbox->v[dim0];
7908         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7909         {
7910             /* Determine the coupling coefficient for the distances
7911              * to the cell planes along dim0 and dim1 through dim2.
7912              * This is required for correct rounding.
7913              */
7914             skew_fac_01 =
7915                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7916             if (debug)
7917             {
7918                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7919             }
7920         }
7921     }
7922     if (dd->ndim >= 3)
7923     {
7924         v_1 = ddbox->v[dim1];
7925     }
7926     
7927     zone_cg_range = zones->cg_range;
7928     index_gl = dd->index_gl;
7929     cgindex  = dd->cgindex;
7930     cginfo_mb = fr->cginfo_mb;
7931     
7932     zone_cg_range[0]   = 0;
7933     zone_cg_range[1]   = dd->ncg_home;
7934     comm->zone_ncg1[0] = dd->ncg_home;
7935     pos_cg             = dd->ncg_home;
7936     
7937     nat_tot = dd->nat_home;
7938     nzone = 1;
7939     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7940     {
7941         dim = dd->dim[dim_ind];
7942         cd = &comm->cd[dim_ind];
7943         
7944         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7945         {
7946             /* No pbc in this dimension, the first node should not comm. */
7947             nzone_send = 0;
7948         }
7949         else
7950         {
7951             nzone_send = nzone;
7952         }
7953
7954         v_d = ddbox->v[dim];
7955         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7956
7957         cd->bInPlace = TRUE;
7958         for(p=0; p<cd->np; p++)
7959         {
7960             /* Only atoms communicated in the first pulse are used
7961              * for multi-body bonded interactions or for bBondComm.
7962              */
7963             bDistBonded = ((bDistMB || bDist2B) && p == 0);
7964
7965             ind = &cd->ind[p];
7966             nsend = 0;
7967             nat = 0;
7968             for(zone=0; zone<nzone_send; zone++)
7969             {
7970                 if (tric_dist[dim_ind] && dim_ind > 0)
7971                 {
7972                     /* Determine slightly more optimized skew_fac's
7973                      * for rounding.
7974                      * This reduces the number of communicated atoms
7975                      * by about 10% for 3D DD of rhombic dodecahedra.
7976                      */
7977                     for(dimd=0; dimd<dim; dimd++)
7978                     {
7979                         sf2_round[dimd] = 1;
7980                         if (ddbox->tric_dir[dimd])
7981                         {
7982                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7983                             {
7984                                 /* If we are shifted in dimension i
7985                                  * and the cell plane is tilted forward
7986                                  * in dimension i, skip this coupling.
7987                                  */
7988                                 if (!(zones->shift[nzone+zone][i] &&
7989                                       ddbox->v[dimd][i][dimd] >= 0))
7990                                 {
7991                                     sf2_round[dimd] +=
7992                                         sqr(ddbox->v[dimd][i][dimd]);
7993                                 }
7994                             }
7995                             sf2_round[dimd] = 1/sf2_round[dimd];
7996                         }
7997                     }
7998                 }
7999
8000                 zonei = zone_perm[dim_ind][zone];
8001                 if (p == 0)
8002                 {
8003                     /* Here we permutate the zones to obtain a convenient order
8004                      * for neighbor searching
8005                      */
8006                     cg0 = zone_cg_range[zonei];
8007                     cg1 = zone_cg_range[zonei+1];
8008                 }
8009                 else
8010                 {
8011                     /* Look only at the cg's received in the previous grid pulse
8012                      */
8013                     cg1 = zone_cg_range[nzone+zone+1];
8014                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8015                 }
8016
8017 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8018                 for(th=0; th<comm->nth; th++)
8019                 {
8020                     gmx_domdec_ind_t *ind_p;
8021                     int **ibuf_p,*ibuf_nalloc_p;
8022                     vec_rvec_t *vbuf_p;
8023                     int *nsend_p,*nat_p;
8024                     int *nsend_zone_p;
8025                     int cg0_th,cg1_th;
8026
8027                     if (th == 0)
8028                     {
8029                         /* Thread 0 writes in the comm buffers */
8030                         ind_p         = ind;
8031                         ibuf_p        = &comm->buf_int;
8032                         ibuf_nalloc_p = &comm->nalloc_int;
8033                         vbuf_p        = &comm->vbuf;
8034                         nsend_p       = &nsend;
8035                         nat_p         = &nat;
8036                         nsend_zone_p  = &ind->nsend[zone];
8037                     }
8038                     else
8039                     {
8040                         /* Other threads write into temp buffers */
8041                         ind_p         = &comm->dth[th].ind;
8042                         ibuf_p        = &comm->dth[th].ibuf;
8043                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8044                         vbuf_p        = &comm->dth[th].vbuf;
8045                         nsend_p       = &comm->dth[th].nsend;
8046                         nat_p         = &comm->dth[th].nat;
8047                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8048
8049                         comm->dth[th].nsend      = 0;
8050                         comm->dth[th].nat        = 0;
8051                         comm->dth[th].nsend_zone = 0;
8052                     }
8053
8054                     if (comm->nth == 1)
8055                     {
8056                         cg0_th = cg0;
8057                         cg1_th = cg1;
8058                     }
8059                     else
8060                     {
8061                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8062                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8063                     }
8064                     
8065                     /* Get the cg's for this pulse in this zone */
8066                     get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th,
8067                                        index_gl,cgindex,
8068                                        dim,dim_ind,dim0,dim1,dim2,
8069                                        r_comm2,r_bcomm2,
8070                                        box,tric_dist,
8071                                        normal,skew_fac2_d,skew_fac_01,
8072                                        v_d,v_0,v_1,&corners,sf2_round,
8073                                        bDistBonded,bBondComm,
8074                                        bDist2B,bDistMB,
8075                                        cg_cm,fr->cginfo,
8076                                        ind_p,
8077                                        ibuf_p,ibuf_nalloc_p,
8078                                        vbuf_p,
8079                                        nsend_p,nat_p,
8080                                        nsend_zone_p);
8081                 }
8082
8083                 /* Append data of threads>=1 to the communication buffers */
8084                 for(th=1; th<comm->nth; th++)
8085                 {
8086                     dd_comm_setup_work_t *dth;
8087                     int i,ns1;
8088
8089                     dth = &comm->dth[th];
8090
8091                     ns1 = nsend + dth->nsend_zone;
8092                     if (ns1 > ind->nalloc)
8093                     {
8094                         ind->nalloc = over_alloc_dd(ns1);
8095                         srenew(ind->index,ind->nalloc);
8096                     }
8097                     if (ns1 > comm->nalloc_int)
8098                     {
8099                         comm->nalloc_int = over_alloc_dd(ns1);
8100                         srenew(comm->buf_int,comm->nalloc_int);
8101                     }
8102                     if (ns1 > comm->vbuf.nalloc)
8103                     {
8104                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8105                         srenew(comm->vbuf.v,comm->vbuf.nalloc);
8106                     }
8107
8108                     for(i=0; i<dth->nsend_zone; i++)
8109                     {
8110                         ind->index[nsend] = dth->ind.index[i];
8111                         comm->buf_int[nsend] = dth->ibuf[i];
8112                         copy_rvec(dth->vbuf.v[i],
8113                                   comm->vbuf.v[nsend]);
8114                         nsend++;
8115                     }
8116                     nat              += dth->nat;
8117                     ind->nsend[zone] += dth->nsend_zone;
8118                 }
8119             }
8120             /* Clear the counts in case we do not have pbc */
8121             for(zone=nzone_send; zone<nzone; zone++)
8122             {
8123                 ind->nsend[zone] = 0;
8124             }
8125             ind->nsend[nzone]   = nsend;
8126             ind->nsend[nzone+1] = nat;
8127             /* Communicate the number of cg's and atoms to receive */
8128             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8129                             ind->nsend, nzone+2,
8130                             ind->nrecv, nzone+2);
8131             
8132             /* The rvec buffer is also required for atom buffers of size nsend
8133              * in dd_move_x and dd_move_f.
8134              */
8135             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
8136
8137             if (p > 0)
8138             {
8139                 /* We can receive in place if only the last zone is not empty */
8140                 for(zone=0; zone<nzone-1; zone++)
8141                 {
8142                     if (ind->nrecv[zone] > 0)
8143                     {
8144                         cd->bInPlace = FALSE;
8145                     }
8146                 }
8147                 if (!cd->bInPlace)
8148                 {
8149                     /* The int buffer is only required here for the cg indices */
8150                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8151                     {
8152                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8153                         srenew(comm->buf_int2,comm->nalloc_int2);
8154                     }
8155                     /* The rvec buffer is also required for atom buffers
8156                      * of size nrecv in dd_move_x and dd_move_f.
8157                      */
8158                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
8159                     vec_rvec_check_alloc(&comm->vbuf2,i);
8160                 }
8161             }
8162             
8163             /* Make space for the global cg indices */
8164             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8165                 || dd->cg_nalloc == 0)
8166             {
8167                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8168                 srenew(index_gl,dd->cg_nalloc);
8169                 srenew(cgindex,dd->cg_nalloc+1);
8170             }
8171             /* Communicate the global cg indices */
8172             if (cd->bInPlace)
8173             {
8174                 recv_i = index_gl + pos_cg;
8175             }
8176             else
8177             {
8178                 recv_i = comm->buf_int2;
8179             }
8180             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8181                             comm->buf_int, nsend,
8182                             recv_i,        ind->nrecv[nzone]);
8183
8184             /* Make space for cg_cm */
8185             dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]);
8186             if (fr->cutoff_scheme == ecutsGROUP)
8187             {
8188                 cg_cm = fr->cg_cm;
8189             }
8190             else
8191             {
8192                 cg_cm = state->x;
8193             }
8194             /* Communicate cg_cm */
8195             if (cd->bInPlace)
8196             {
8197                 recv_vr = cg_cm + pos_cg;
8198             }
8199             else
8200             {
8201                 recv_vr = comm->vbuf2.v;
8202             }
8203             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8204                              comm->vbuf.v, nsend,
8205                              recv_vr,      ind->nrecv[nzone]);
8206             
8207             /* Make the charge group index */
8208             if (cd->bInPlace)
8209             {
8210                 zone = (p == 0 ? 0 : nzone - 1);
8211                 while (zone < nzone)
8212                 {
8213                     for(cg=0; cg<ind->nrecv[zone]; cg++)
8214                     {
8215                         cg_gl = index_gl[pos_cg];
8216                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
8217                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8218                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8219                         if (bBondComm)
8220                         {
8221                             /* Update the charge group presence,
8222                              * so we can use it in the next pass of the loop.
8223                              */
8224                             comm->bLocalCG[cg_gl] = TRUE;
8225                         }
8226                         pos_cg++;
8227                     }
8228                     if (p == 0)
8229                     {
8230                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8231                     }
8232                     zone++;
8233                     zone_cg_range[nzone+zone] = pos_cg;
8234                 }
8235             }
8236             else
8237             {
8238                 /* This part of the code is never executed with bBondComm. */
8239                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
8240                                  index_gl,recv_i,cg_cm,recv_vr,
8241                                  cgindex,fr->cginfo_mb,fr->cginfo);
8242                 pos_cg += ind->nrecv[nzone];
8243             }
8244             nat_tot += ind->nrecv[nzone+1];
8245         }
8246         if (!cd->bInPlace)
8247         {
8248             /* Store the atom block for easy copying of communication buffers */
8249             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
8250         }
8251         nzone += nzone;
8252     }
8253     dd->index_gl = index_gl;
8254     dd->cgindex  = cgindex;
8255     
8256     dd->ncg_tot = zone_cg_range[zones->n];
8257     dd->nat_tot = nat_tot;
8258     comm->nat[ddnatHOME] = dd->nat_home;
8259     for(i=ddnatZONE; i<ddnatNR; i++)
8260     {
8261         comm->nat[i] = dd->nat_tot;
8262     }
8263
8264     if (!bBondComm)
8265     {
8266         /* We don't need to update cginfo, since that was alrady done above.
8267          * So we pass NULL for the forcerec.
8268          */
8269         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
8270                       NULL,comm->bLocalCG);
8271     }
8272
8273     if (debug)
8274     {
8275         fprintf(debug,"Finished setting up DD communication, zones:");
8276         for(c=0; c<zones->n; c++)
8277         {
8278             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
8279         }
8280         fprintf(debug,"\n");
8281     }
8282 }
8283
8284 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8285 {
8286     int c;
8287     
8288     for(c=0; c<zones->nizone; c++)
8289     {
8290         zones->izone[c].cg1  = zones->cg_range[c+1];
8291         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8292         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8293     }
8294 }
8295
8296 static void set_zones_size(gmx_domdec_t *dd,
8297                            matrix box,const gmx_ddbox_t *ddbox,
8298                            int zone_start,int zone_end)
8299 {
8300     gmx_domdec_comm_t *comm;
8301     gmx_domdec_zones_t *zones;
8302     gmx_bool bDistMB;
8303     int  z,zi,zj0,zj1,d,dim;
8304     real rcs,rcmbs;
8305     int  i,j;
8306     real size_j,add_tric;
8307     real vol;
8308
8309     comm = dd->comm;
8310
8311     zones = &comm->zones;
8312
8313     /* Do we need to determine extra distances for multi-body bondeds? */
8314     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8315
8316     for(z=zone_start; z<zone_end; z++)
8317     {
8318         /* Copy cell limits to zone limits.
8319          * Valid for non-DD dims and non-shifted dims.
8320          */
8321         copy_rvec(comm->cell_x0,zones->size[z].x0);
8322         copy_rvec(comm->cell_x1,zones->size[z].x1);
8323     }
8324
8325     for(d=0; d<dd->ndim; d++)
8326     {
8327         dim = dd->dim[d];
8328
8329         for(z=0; z<zones->n; z++)
8330         {
8331             /* With a staggered grid we have different sizes
8332              * for non-shifted dimensions.
8333              */
8334             if (dd->bGridJump && zones->shift[z][dim] == 0)
8335             {
8336                 if (d == 1)
8337                 {
8338                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8339                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8340                 }
8341                 else if (d == 2)
8342                 {
8343                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8344                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8345                 }
8346             }
8347         }
8348
8349         rcs   = comm->cutoff;
8350         rcmbs = comm->cutoff_mbody;
8351         if (ddbox->tric_dir[dim])
8352         {
8353             rcs   /= ddbox->skew_fac[dim];
8354             rcmbs /= ddbox->skew_fac[dim];
8355         }
8356
8357         /* Set the lower limit for the shifted zone dimensions */
8358         for(z=zone_start; z<zone_end; z++)
8359         {
8360             if (zones->shift[z][dim] > 0)
8361             {
8362                 dim = dd->dim[d];
8363                 if (!dd->bGridJump || d == 0)
8364                 {
8365                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8366                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8367                 }
8368                 else
8369                 {
8370                     /* Here we take the lower limit of the zone from
8371                      * the lowest domain of the zone below.
8372                      */
8373                     if (z < 4)
8374                     {
8375                         zones->size[z].x0[dim] =
8376                              comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8377                     }
8378                     else
8379                     {
8380                         if (d == 1)
8381                         {
8382                             zones->size[z].x0[dim] =
8383                                 zones->size[zone_perm[2][z-4]].x0[dim];
8384                         }
8385                         else
8386                         {
8387                             zones->size[z].x0[dim] =
8388                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8389                         }
8390                     }
8391                     /* A temporary limit, is updated below */
8392                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8393
8394                     if (bDistMB)
8395                     {
8396                         for(zi=0; zi<zones->nizone; zi++)
8397                         {
8398                             if (zones->shift[zi][dim] == 0)
8399                             {
8400                                 /* This takes the whole zone into account.
8401                                  * With multiple pulses this will lead
8402                                  * to a larger zone then strictly necessary.
8403                                  */
8404                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8405                                                              zones->size[zi].x1[dim]+rcmbs);
8406                             }
8407                         }
8408                     }
8409                 }
8410             }
8411         }
8412
8413         /* Loop over the i-zones to set the upper limit of each
8414          * j-zone they see.
8415          */
8416         for(zi=0; zi<zones->nizone; zi++)
8417         {
8418             if (zones->shift[zi][dim] == 0)
8419             {
8420                 for(z=zones->izone[zi].j0; z<zones->izone[zi].j1; z++)
8421                 {
8422                     if (zones->shift[z][dim] > 0)
8423                     {
8424                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8425                                                      zones->size[zi].x1[dim]+rcs);
8426                     }
8427                 }
8428             }
8429         }
8430     }
8431
8432     for(z=zone_start; z<zone_end; z++)
8433     {
8434         for(i=0; i<DIM; i++)
8435         {
8436             zones->size[z].bb_x0[i] = zones->size[z].x0[i];
8437             zones->size[z].bb_x1[i] = zones->size[z].x1[i];
8438
8439             for(j=i+1; j<ddbox->npbcdim; j++)
8440             {
8441                 /* With 1D domain decomposition the cg's are not in
8442                  * the triclinic box, but trilinic x-y and rectangular y-z.
8443                  */
8444                 if (box[j][i] != 0 &&
8445                     !(dd->ndim == 1 && i == YY && j == ZZ))
8446                 {
8447                     /* Correct for triclinic offset of the lower corner */
8448                     add_tric = zones->size[z].x0[j]*box[j][i]/box[j][j];
8449                     zones->size[z].bb_x0[i] += add_tric;
8450                     zones->size[z].bb_x1[i] += add_tric;
8451
8452                     /* Correct for triclinic offset of the upper corner */
8453                     size_j = zones->size[z].x1[j] - zones->size[z].x0[j];
8454                     add_tric = size_j*box[j][i]/box[j][j];
8455
8456                     if (box[j][i] < 0)
8457                     {
8458                         zones->size[z].bb_x0[i] += add_tric;
8459                     }
8460                     else
8461                     {
8462                         zones->size[z].bb_x1[i] += add_tric;
8463                     }
8464                 }
8465             }
8466         }
8467     }
8468
8469     if (zone_start == 0)
8470     {
8471         vol = 1;
8472         for(dim=0; dim<DIM; dim++)
8473         {
8474             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8475         }
8476         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8477     }
8478
8479     if (debug)
8480     {
8481         for(z=zone_start; z<zone_end; z++)
8482         {
8483             fprintf(debug,"zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8484                     z,
8485                     zones->size[z].x0[XX],zones->size[z].x1[XX],
8486                     zones->size[z].x0[YY],zones->size[z].x1[YY],
8487                     zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]);
8488             fprintf(debug,"zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8489                     z,
8490                     zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX],
8491                     zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY],
8492                     zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]);
8493         }
8494     }
8495 }
8496
8497 static int comp_cgsort(const void *a,const void *b)
8498 {
8499     int comp;
8500     
8501     gmx_cgsort_t *cga,*cgb;
8502     cga = (gmx_cgsort_t *)a;
8503     cgb = (gmx_cgsort_t *)b;
8504     
8505     comp = cga->nsc - cgb->nsc;
8506     if (comp == 0)
8507     {
8508         comp = cga->ind_gl - cgb->ind_gl;
8509     }
8510     
8511     return comp;
8512 }
8513
8514 static void order_int_cg(int n,const gmx_cgsort_t *sort,
8515                          int *a,int *buf)
8516 {
8517     int i;
8518     
8519     /* Order the data */
8520     for(i=0; i<n; i++)
8521     {
8522         buf[i] = a[sort[i].ind];
8523     }
8524     
8525     /* Copy back to the original array */
8526     for(i=0; i<n; i++)
8527     {
8528         a[i] = buf[i];
8529     }
8530 }
8531
8532 static void order_vec_cg(int n,const gmx_cgsort_t *sort,
8533                          rvec *v,rvec *buf)
8534 {
8535     int i;
8536     
8537     /* Order the data */
8538     for(i=0; i<n; i++)
8539     {
8540         copy_rvec(v[sort[i].ind],buf[i]);
8541     }
8542     
8543     /* Copy back to the original array */
8544     for(i=0; i<n; i++)
8545     {
8546         copy_rvec(buf[i],v[i]);
8547     }
8548 }
8549
8550 static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort,
8551                            rvec *v,rvec *buf)
8552 {
8553     int a,atot,cg,cg0,cg1,i;
8554     
8555     if (cgindex == NULL)
8556     {
8557         /* Avoid the useless loop of the atoms within a cg */
8558         order_vec_cg(ncg,sort,v,buf);
8559
8560         return;
8561     }
8562
8563     /* Order the data */
8564     a = 0;
8565     for(cg=0; cg<ncg; cg++)
8566     {
8567         cg0 = cgindex[sort[cg].ind];
8568         cg1 = cgindex[sort[cg].ind+1];
8569         for(i=cg0; i<cg1; i++)
8570         {
8571             copy_rvec(v[i],buf[a]);
8572             a++;
8573         }
8574     }
8575     atot = a;
8576     
8577     /* Copy back to the original array */
8578     for(a=0; a<atot; a++)
8579     {
8580         copy_rvec(buf[a],v[a]);
8581     }
8582 }
8583
8584 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
8585                          int nsort_new,gmx_cgsort_t *sort_new,
8586                          gmx_cgsort_t *sort1)
8587 {
8588     int i1,i2,i_new;
8589     
8590     /* The new indices are not very ordered, so we qsort them */
8591     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
8592     
8593     /* sort2 is already ordered, so now we can merge the two arrays */
8594     i1 = 0;
8595     i2 = 0;
8596     i_new = 0;
8597     while(i2 < nsort2 || i_new < nsort_new)
8598     {
8599         if (i2 == nsort2)
8600         {
8601             sort1[i1++] = sort_new[i_new++];
8602         }
8603         else if (i_new == nsort_new)
8604         {
8605             sort1[i1++] = sort2[i2++];
8606         }
8607         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8608                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8609                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8610         {
8611             sort1[i1++] = sort2[i2++];
8612         }
8613         else
8614         {
8615             sort1[i1++] = sort_new[i_new++];
8616         }
8617     }
8618 }
8619
8620 static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
8621 {
8622     gmx_domdec_sort_t *sort;
8623     gmx_cgsort_t *cgsort,*sort_i;
8624     int  ncg_new,nsort2,nsort_new,i,*a,moved,*ibuf;
8625     int  sort_last,sort_skip;
8626
8627     sort = dd->comm->sort;
8628
8629     a = fr->ns.grid->cell_index;
8630
8631     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8632
8633     if (ncg_home_old >= 0)
8634     {
8635         /* The charge groups that remained in the same ns grid cell
8636          * are completely ordered. So we can sort efficiently by sorting
8637          * the charge groups that did move into the stationary list.
8638          */
8639         ncg_new = 0;
8640         nsort2 = 0;
8641         nsort_new = 0;
8642         for(i=0; i<dd->ncg_home; i++)
8643         {
8644             /* Check if this cg did not move to another node */
8645             if (a[i] < moved)
8646             {
8647                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8648                 {
8649                     /* This cg is new on this node or moved ns grid cell */
8650                     if (nsort_new >= sort->sort_new_nalloc)
8651                     {
8652                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8653                         srenew(sort->sort_new,sort->sort_new_nalloc);
8654                     }
8655                     sort_i = &(sort->sort_new[nsort_new++]);
8656                 }
8657                 else
8658                 {
8659                     /* This cg did not move */
8660                     sort_i = &(sort->sort2[nsort2++]);
8661                 }
8662                 /* Sort on the ns grid cell indices
8663                  * and the global topology index.
8664                  * index_gl is irrelevant with cell ns,
8665                  * but we set it here anyhow to avoid a conditional.
8666                  */
8667                 sort_i->nsc    = a[i];
8668                 sort_i->ind_gl = dd->index_gl[i];
8669                 sort_i->ind    = i;
8670                 ncg_new++;
8671             }
8672         }
8673         if (debug)
8674         {
8675             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
8676                     nsort2,nsort_new);
8677         }
8678         /* Sort efficiently */
8679         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,
8680                      sort->sort);
8681     }
8682     else
8683     {
8684         cgsort = sort->sort;
8685         ncg_new = 0;
8686         for(i=0; i<dd->ncg_home; i++)
8687         {
8688             /* Sort on the ns grid cell indices
8689              * and the global topology index
8690              */
8691             cgsort[i].nsc    = a[i];
8692             cgsort[i].ind_gl = dd->index_gl[i];
8693             cgsort[i].ind    = i;
8694             if (cgsort[i].nsc < moved)
8695             {
8696                 ncg_new++;
8697             }
8698         }
8699         if (debug)
8700         {
8701             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
8702         }
8703         /* Determine the order of the charge groups using qsort */
8704         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
8705     }
8706
8707     return ncg_new;
8708 }
8709
8710 static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
8711 {
8712     gmx_cgsort_t *sort;
8713     int  ncg_new,i,*a,na;
8714
8715     sort = dd->comm->sort->sort;
8716
8717     nbnxn_get_atomorder(fr->nbv->nbs,&a,&na);
8718
8719     ncg_new = 0;
8720     for(i=0; i<na; i++)
8721     {
8722         if (a[i] >= 0)
8723         {
8724             sort[ncg_new].ind = a[i];
8725             ncg_new++;
8726         }
8727     }
8728
8729     return ncg_new;
8730 }
8731
8732 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
8733                           rvec *cgcm,t_forcerec *fr,t_state *state,
8734                           int ncg_home_old)
8735 {
8736     gmx_domdec_sort_t *sort;
8737     gmx_cgsort_t *cgsort,*sort_i;
8738     int  *cgindex;
8739     int  ncg_new,i,*ibuf,cgsize;
8740     rvec *vbuf;
8741     
8742     sort = dd->comm->sort;
8743     
8744     if (dd->ncg_home > sort->sort_nalloc)
8745     {
8746         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8747         srenew(sort->sort,sort->sort_nalloc);
8748         srenew(sort->sort2,sort->sort_nalloc);
8749     }
8750     cgsort = sort->sort;
8751
8752     switch (fr->cutoff_scheme)
8753     {
8754     case ecutsGROUP:
8755         ncg_new = dd_sort_order(dd,fr,ncg_home_old);
8756         break;
8757     case ecutsVERLET:
8758         ncg_new = dd_sort_order_nbnxn(dd,fr);
8759         break;
8760     default:
8761         gmx_incons("unimplemented");
8762         ncg_new = 0;
8763     }
8764
8765     /* We alloc with the old size, since cgindex is still old */
8766     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
8767     vbuf = dd->comm->vbuf.v;
8768     
8769     if (dd->comm->bCGs)
8770     {
8771         cgindex = dd->cgindex;
8772     }
8773     else
8774     {
8775         cgindex = NULL;
8776     }
8777
8778     /* Remove the charge groups which are no longer at home here */
8779     dd->ncg_home = ncg_new;
8780     if (debug)
8781     {
8782         fprintf(debug,"Set the new home charge group count to %d\n",
8783                 dd->ncg_home);
8784     }
8785     
8786     /* Reorder the state */
8787     for(i=0; i<estNR; i++)
8788     {
8789         if (EST_DISTR(i) && (state->flags & (1<<i)))
8790         {
8791             switch (i)
8792             {
8793             case estX:
8794                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf);
8795                 break;
8796             case estV:
8797                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf);
8798                 break;
8799             case estSDX:
8800                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf);
8801                 break;
8802             case estCGP:
8803                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf);
8804                 break;
8805             case estLD_RNG:
8806             case estLD_RNGI:
8807             case estDISRE_INITF:
8808             case estDISRE_RM3TAV:
8809             case estORIRE_INITF:
8810             case estORIRE_DTAV:
8811                 /* No ordering required */
8812                 break;
8813             default:
8814                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8815                 break;
8816             }
8817         }
8818     }
8819     if (fr->cutoff_scheme == ecutsGROUP)
8820     {
8821         /* Reorder cgcm */
8822         order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8823     }
8824     
8825     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8826     {
8827         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8828         srenew(sort->ibuf,sort->ibuf_nalloc);
8829     }
8830     ibuf = sort->ibuf;
8831     /* Reorder the global cg index */
8832     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8833     /* Reorder the cginfo */
8834     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8835     /* Rebuild the local cg index */
8836     if (dd->comm->bCGs)
8837     {
8838         ibuf[0] = 0;
8839         for(i=0; i<dd->ncg_home; i++)
8840         {
8841             cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8842             ibuf[i+1] = ibuf[i] + cgsize;
8843         }
8844         for(i=0; i<dd->ncg_home+1; i++)
8845         {
8846             dd->cgindex[i] = ibuf[i];
8847         }
8848     }
8849     else
8850     {
8851         for(i=0; i<dd->ncg_home+1; i++)
8852         {
8853             dd->cgindex[i] = i;
8854         }
8855     }
8856     /* Set the home atom number */
8857     dd->nat_home = dd->cgindex[dd->ncg_home];
8858
8859     if (fr->cutoff_scheme == ecutsVERLET)
8860     {
8861         /* The atoms are now exactly in grid order, update the grid order */
8862         nbnxn_set_atomorder(fr->nbv->nbs);
8863     }
8864     else
8865     {
8866         /* Copy the sorted ns cell indices back to the ns grid struct */
8867         for(i=0; i<dd->ncg_home; i++)
8868         {
8869             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8870         }
8871         fr->ns.grid->nr = dd->ncg_home;
8872     }
8873 }
8874
8875 static void add_dd_statistics(gmx_domdec_t *dd)
8876 {
8877     gmx_domdec_comm_t *comm;
8878     int ddnat;
8879     
8880     comm = dd->comm;
8881     
8882     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8883     {
8884         comm->sum_nat[ddnat-ddnatZONE] +=
8885             comm->nat[ddnat] - comm->nat[ddnat-1];
8886     }
8887     comm->ndecomp++;
8888 }
8889
8890 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8891 {
8892     gmx_domdec_comm_t *comm;
8893     int ddnat;
8894     
8895     comm = dd->comm;
8896
8897     /* Reset all the statistics and counters for total run counting */
8898     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8899     {
8900         comm->sum_nat[ddnat-ddnatZONE] = 0;
8901     }
8902     comm->ndecomp = 0;
8903     comm->nload = 0;
8904     comm->load_step = 0;
8905     comm->load_sum = 0;
8906     comm->load_max = 0;
8907     clear_ivec(comm->load_lim);
8908     comm->load_mdf = 0;
8909     comm->load_pme = 0;
8910 }
8911
8912 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8913 {
8914     gmx_domdec_comm_t *comm;
8915     int ddnat;
8916     double av;
8917    
8918     comm = cr->dd->comm;
8919     
8920     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8921     
8922     if (fplog == NULL)
8923     {
8924         return;
8925     }
8926     
8927     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8928             
8929     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8930     {
8931         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8932         switch(ddnat)
8933         {
8934         case ddnatZONE:
8935             fprintf(fplog,
8936                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8937                     2,av);
8938             break;
8939         case ddnatVSITE:
8940             if (cr->dd->vsite_comm)
8941             {
8942                 fprintf(fplog,
8943                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8944                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8945                         av);
8946             }
8947             break;
8948         case ddnatCON:
8949             if (cr->dd->constraint_comm)
8950             {
8951                 fprintf(fplog,
8952                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
8953                         1 + ir->nLincsIter,av);
8954             }
8955             break;
8956         default:
8957             gmx_incons(" Unknown type for DD statistics");
8958         }
8959     }
8960     fprintf(fplog,"\n");
8961     
8962     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8963     {
8964         print_dd_load_av(fplog,cr->dd);
8965     }
8966 }
8967
8968 void dd_partition_system(FILE            *fplog,
8969                          gmx_large_int_t      step,
8970                          t_commrec       *cr,
8971                          gmx_bool            bMasterState,
8972                          int             nstglobalcomm,
8973                          t_state         *state_global,
8974                          gmx_mtop_t      *top_global,
8975                          t_inputrec      *ir,
8976                          t_state         *state_local,
8977                          rvec            **f,
8978                          t_mdatoms       *mdatoms,
8979                          gmx_localtop_t  *top_local,
8980                          t_forcerec      *fr,
8981                          gmx_vsite_t     *vsite,
8982                          gmx_shellfc_t   shellfc,
8983                          gmx_constr_t    constr,
8984                          t_nrnb          *nrnb,
8985                          gmx_wallcycle_t wcycle,
8986                          gmx_bool            bVerbose)
8987 {
8988     gmx_domdec_t *dd;
8989     gmx_domdec_comm_t *comm;
8990     gmx_ddbox_t ddbox={0};
8991     t_block *cgs_gl;
8992     gmx_large_int_t step_pcoupl;
8993     rvec cell_ns_x0,cell_ns_x1;
8994     int  i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum;
8995     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8996     gmx_bool bRedist,bSortCG,bResortAll;
8997     ivec ncells_old={0,0,0},ncells_new={0,0,0},np;
8998     real grid_density;
8999     char sbuf[22];
9000         
9001     dd = cr->dd;
9002     comm = dd->comm;
9003
9004     bBoxChanged = (bMasterState || DEFORM(*ir));
9005     if (ir->epc != epcNO)
9006     {
9007         /* With nstpcouple > 1 pressure coupling happens.
9008          * one step after calculating the pressure.
9009          * Box scaling happens at the end of the MD step,
9010          * after the DD partitioning.
9011          * We therefore have to do DLB in the first partitioning
9012          * after an MD step where P-coupling occured.
9013          * We need to determine the last step in which p-coupling occurred.
9014          * MRS -- need to validate this for vv?
9015          */
9016         n = ir->nstpcouple;
9017         if (n == 1)
9018         {
9019             step_pcoupl = step - 1;
9020         }
9021         else
9022         {
9023             step_pcoupl = ((step - 1)/n)*n + 1;
9024         }
9025         if (step_pcoupl >= comm->partition_step)
9026         {
9027             bBoxChanged = TRUE;
9028         }
9029     }
9030
9031     bNStGlobalComm = (step % nstglobalcomm == 0);
9032
9033     if (!comm->bDynLoadBal)
9034     {
9035         bDoDLB = FALSE;
9036     }
9037     else
9038     {
9039         /* Should we do dynamic load balacing this step?
9040          * Since it requires (possibly expensive) global communication,
9041          * we might want to do DLB less frequently.
9042          */
9043         if (bBoxChanged || ir->epc != epcNO)
9044         {
9045             bDoDLB = bBoxChanged;
9046         }
9047         else
9048         {
9049             bDoDLB = bNStGlobalComm;
9050         }
9051     }
9052
9053     /* Check if we have recorded loads on the nodes */
9054     if (comm->bRecordLoad && dd_load_count(comm))
9055     {
9056         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9057         {
9058             /* Check if we should use DLB at the second partitioning
9059              * and every 100 partitionings,
9060              * so the extra communication cost is negligible.
9061              */
9062             n = max(100,nstglobalcomm);
9063             bCheckDLB = (comm->n_load_collect == 0 ||
9064                          comm->n_load_have % n == n-1);
9065         }
9066         else
9067         {
9068             bCheckDLB = FALSE;
9069         }
9070         
9071         /* Print load every nstlog, first and last step to the log file */
9072         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9073                     comm->n_load_collect == 0 ||
9074                     (ir->nsteps >= 0 &&
9075                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9076
9077         /* Avoid extra communication due to verbose screen output
9078          * when nstglobalcomm is set.
9079          */
9080         if (bDoDLB || bLogLoad || bCheckDLB ||
9081             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9082         {
9083             get_load_distribution(dd,wcycle);
9084             if (DDMASTER(dd))
9085             {
9086                 if (bLogLoad)
9087                 {
9088                     dd_print_load(fplog,dd,step-1);
9089                 }
9090                 if (bVerbose)
9091                 {
9092                     dd_print_load_verbose(dd);
9093                 }
9094             }
9095             comm->n_load_collect++;
9096
9097             if (bCheckDLB) {
9098                 /* Since the timings are node dependent, the master decides */
9099                 if (DDMASTER(dd))
9100                 {
9101                     bTurnOnDLB =
9102                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9103                     if (debug)
9104                     {
9105                         fprintf(debug,"step %s, imb loss %f\n",
9106                                 gmx_step_str(step,sbuf),
9107                                 dd_force_imb_perf_loss(dd));
9108                     }
9109                 }
9110                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
9111                 if (bTurnOnDLB)
9112                 {
9113                     turn_on_dlb(fplog,cr,step);
9114                     bDoDLB = TRUE;
9115                 }
9116             }
9117         }
9118         comm->n_load_have++;
9119     }
9120
9121     cgs_gl = &comm->cgs_gl;
9122
9123     bRedist = FALSE;
9124     if (bMasterState)
9125     {
9126         /* Clear the old state */
9127         clear_dd_indices(dd,0,0);
9128
9129         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
9130                   TRUE,cgs_gl,state_global->x,&ddbox);
9131     
9132         get_cg_distribution(fplog,step,dd,cgs_gl,
9133                             state_global->box,&ddbox,state_global->x);
9134         
9135         dd_distribute_state(dd,cgs_gl,
9136                             state_global,state_local,f);
9137         
9138         dd_make_local_cgs(dd,&top_local->cgs);
9139         
9140         /* Ensure that we have space for the new distribution */
9141         dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home);
9142
9143         if (fr->cutoff_scheme == ecutsGROUP)
9144         {
9145             calc_cgcm(fplog,0,dd->ncg_home,
9146                       &top_local->cgs,state_local->x,fr->cg_cm);
9147         }
9148         
9149         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9150         
9151         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9152
9153         cg0 = 0;
9154     }
9155     else if (state_local->ddp_count != dd->ddp_count)
9156     {
9157         if (state_local->ddp_count > dd->ddp_count)
9158         {
9159             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
9160         }
9161         
9162         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9163         {
9164             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
9165         }
9166         
9167         /* Clear the old state */
9168         clear_dd_indices(dd,0,0);
9169         
9170         /* Build the new indices */
9171         rebuild_cgindex(dd,cgs_gl->index,state_local);
9172         make_dd_indices(dd,cgs_gl->index,0);
9173
9174         if (fr->cutoff_scheme == ecutsGROUP)
9175         {
9176             /* Redetermine the cg COMs */
9177             calc_cgcm(fplog,0,dd->ncg_home,
9178                       &top_local->cgs,state_local->x,fr->cg_cm);
9179         }
9180         
9181         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9182
9183         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9184
9185         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9186                   TRUE,&top_local->cgs,state_local->x,&ddbox);
9187
9188         bRedist = comm->bDynLoadBal;
9189     }
9190     else
9191     {
9192         /* We have the full state, only redistribute the cgs */
9193
9194         /* Clear the non-home indices */
9195         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
9196
9197         /* Avoid global communication for dim's without pbc and -gcom */
9198         if (!bNStGlobalComm)
9199         {
9200             copy_rvec(comm->box0    ,ddbox.box0    );
9201             copy_rvec(comm->box_size,ddbox.box_size);
9202         }
9203         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9204                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
9205
9206         bBoxChanged = TRUE;
9207         bRedist = TRUE;
9208     }
9209     /* For dim's without pbc and -gcom */
9210     copy_rvec(ddbox.box0    ,comm->box0    );
9211     copy_rvec(ddbox.box_size,comm->box_size);
9212     
9213     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
9214                       step,wcycle);
9215     
9216     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9217     {
9218         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
9219     }
9220     
9221     /* Check if we should sort the charge groups */
9222     if (comm->nstSortCG > 0)
9223     {
9224         bSortCG = (bMasterState ||
9225                    (bRedist && (step % comm->nstSortCG == 0)));
9226     }
9227     else
9228     {
9229         bSortCG = FALSE;
9230     }
9231
9232     ncg_home_old = dd->ncg_home;
9233
9234     ncg_moved = 0;
9235     if (bRedist)
9236     {
9237         wallcycle_sub_start(wcycle,ewcsDD_REDIST);
9238
9239         dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
9240                            state_local,f,fr,mdatoms,
9241                            !bSortCG,nrnb,&cg0,&ncg_moved);
9242
9243         wallcycle_sub_stop(wcycle,ewcsDD_REDIST);
9244     }
9245     
9246     get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box,
9247                           dd,&ddbox,
9248                           &comm->cell_x0,&comm->cell_x1,
9249                           dd->ncg_home,fr->cg_cm,
9250                           cell_ns_x0,cell_ns_x1,&grid_density);
9251
9252     if (bBoxChanged)
9253     {
9254         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
9255     }
9256
9257     switch (fr->cutoff_scheme)
9258     {
9259     case ecutsGROUP:
9260         copy_ivec(fr->ns.grid->n,ncells_old);
9261         grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
9262                    state_local->box,cell_ns_x0,cell_ns_x1,
9263                    fr->rlistlong,grid_density);
9264         break;
9265     case ecutsVERLET:
9266         nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]);
9267         break;
9268     default:
9269         gmx_incons("unimplemented");
9270     }
9271     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9272     copy_ivec(ddbox.tric_dir,comm->tric_dir);
9273
9274     if (bSortCG)
9275     {
9276         wallcycle_sub_start(wcycle,ewcsDD_GRID);
9277
9278         /* Sort the state on charge group position.
9279          * This enables exact restarts from this step.
9280          * It also improves performance by about 15% with larger numbers
9281          * of atoms per node.
9282          */
9283         
9284         /* Fill the ns grid with the home cell,
9285          * so we can sort with the indices.
9286          */
9287         set_zones_ncg_home(dd);
9288
9289         switch (fr->cutoff_scheme)
9290         {
9291         case ecutsVERLET:
9292             set_zones_size(dd,state_local->box,&ddbox,0,1);
9293
9294             nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box,
9295                               0,
9296                               comm->zones.size[0].bb_x0,
9297                               comm->zones.size[0].bb_x1,
9298                               0,dd->ncg_home,
9299                               comm->zones.dens_zone0,
9300                               fr->cginfo,
9301                               state_local->x,
9302                               ncg_moved,comm->moved,
9303                               fr->nbv->grp[eintLocal].kernel_type,
9304                               fr->nbv->grp[eintLocal].nbat);
9305
9306             nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]);
9307             break;
9308         case ecutsGROUP:
9309             fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
9310                       0,dd->ncg_home,fr->cg_cm);
9311             
9312             copy_ivec(fr->ns.grid->n,ncells_new);
9313             break;
9314         default:
9315             gmx_incons("unimplemented");
9316         }
9317
9318         bResortAll = bMasterState;
9319    
9320         /* Check if we can user the old order and ns grid cell indices
9321          * of the charge groups to sort the charge groups efficiently.
9322          */
9323         if (ncells_new[XX] != ncells_old[XX] ||
9324             ncells_new[YY] != ncells_old[YY] ||
9325             ncells_new[ZZ] != ncells_old[ZZ])
9326         {
9327             bResortAll = TRUE;
9328         }
9329
9330         if (debug)
9331         {
9332             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
9333                     gmx_step_str(step,sbuf),dd->ncg_home);
9334         }
9335         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
9336                       bResortAll ? -1 : ncg_home_old);
9337         /* Rebuild all the indices */
9338         cg0 = 0;
9339         ga2la_clear(dd->ga2la);
9340
9341         wallcycle_sub_stop(wcycle,ewcsDD_GRID);
9342     }
9343
9344     wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM);
9345     
9346     /* Setup up the communication and communicate the coordinates */
9347     setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f);
9348     
9349     /* Set the indices */
9350     make_dd_indices(dd,cgs_gl->index,cg0);
9351
9352     /* Set the charge group boundaries for neighbor searching */
9353     set_cg_boundaries(&comm->zones);
9354
9355     if (fr->cutoff_scheme == ecutsVERLET)
9356     {
9357         set_zones_size(dd,state_local->box,&ddbox,
9358                        bSortCG ? 1 : 0,comm->zones.n);
9359     }
9360
9361     wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM);
9362
9363     /*
9364     write_dd_pdb("dd_home",step,"dump",top_global,cr,
9365                  -1,state_local->x,state_local->box);
9366     */
9367
9368     wallcycle_sub_start(wcycle,ewcsDD_MAKETOP);
9369     
9370     /* Extract a local topology from the global topology */
9371     for(i=0; i<dd->ndim; i++)
9372     {
9373         np[dd->dim[i]] = comm->cd[i].np;
9374     }
9375     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
9376                       comm->cellsize_min,np,
9377                       fr,
9378                       fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x,
9379                       vsite,top_global,top_local);
9380
9381     wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP);
9382
9383     wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR);
9384     
9385     /* Set up the special atom communication */
9386     n = comm->nat[ddnatZONE];
9387     for(i=ddnatZONE+1; i<ddnatNR; i++)
9388     {
9389         switch(i)
9390         {
9391         case ddnatVSITE:
9392             if (vsite && vsite->n_intercg_vsite)
9393             {
9394                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
9395             }
9396             break;
9397         case ddnatCON:
9398             if (dd->bInterCGcons || dd->bInterCGsettles)
9399             {
9400                 /* Only for inter-cg constraints we need special code */
9401                 n = dd_make_local_constraints(dd,n,top_global,fr->cginfo,
9402                                               constr,ir->nProjOrder,
9403                                               top_local->idef.il);
9404             }
9405             break;
9406         default:
9407             gmx_incons("Unknown special atom type setup");
9408         }
9409         comm->nat[i] = n;
9410     }
9411
9412     wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR);
9413
9414     wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER);
9415
9416     /* Make space for the extra coordinates for virtual site
9417      * or constraint communication.
9418      */
9419     state_local->natoms = comm->nat[ddnatNR-1];
9420     if (state_local->natoms > state_local->nalloc)
9421     {
9422         dd_realloc_state(state_local,f,state_local->natoms);
9423     }
9424
9425     if (fr->bF_NoVirSum)
9426     {
9427         if (vsite && vsite->n_intercg_vsite)
9428         {
9429             nat_f_novirsum = comm->nat[ddnatVSITE];
9430         }
9431         else
9432         {
9433             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9434             {
9435                 nat_f_novirsum = dd->nat_tot;
9436             }
9437             else
9438             {
9439                 nat_f_novirsum = dd->nat_home;
9440             }
9441         }
9442     }
9443     else
9444     {
9445         nat_f_novirsum = 0;
9446     }
9447
9448     /* Set the number of atoms required for the force calculation.
9449      * Forces need to be constrained when using a twin-range setup
9450      * or with energy minimization. For simple simulations we could
9451      * avoid some allocation, zeroing and copying, but this is
9452      * probably not worth the complications ande checking.
9453      */
9454     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
9455                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
9456
9457     /* We make the all mdatoms up to nat_tot_con.
9458      * We could save some work by only setting invmass
9459      * between nat_tot and nat_tot_con.
9460      */
9461     /* This call also sets the new number of home particles to dd->nat_home */
9462     atoms2md(top_global,ir,
9463              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
9464
9465     /* Now we have the charges we can sort the FE interactions */
9466     dd_sort_local_top(dd,mdatoms,top_local);
9467
9468     if (vsite != NULL)
9469     {
9470         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9471         split_vsites_over_threads(top_local->idef.il,mdatoms,FALSE,vsite);
9472     }
9473
9474     if (shellfc)
9475     {
9476         /* Make the local shell stuff, currently no communication is done */
9477         make_local_shells(cr,mdatoms,shellfc);
9478     }
9479     
9480         if (ir->implicit_solvent)
9481     {
9482         make_local_gb(cr,fr->born,ir->gb_algorithm);
9483     }
9484
9485     init_bonded_thread_force_reduction(fr,&top_local->idef);
9486
9487     if (!(cr->duty & DUTY_PME))
9488     {
9489         /* Send the charges to our PME only node */
9490         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
9491                        mdatoms->chargeA,mdatoms->chargeB,
9492                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
9493     }
9494     
9495     if (constr)
9496     {
9497         set_constraints(constr,top_local,ir,mdatoms,cr);
9498     }
9499     
9500     if (ir->ePull != epullNO)
9501     {
9502         /* Update the local pull groups */
9503         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
9504     }
9505     
9506     if (ir->bRot)
9507     {
9508         /* Update the local rotation groups */
9509         dd_make_local_rotation_groups(dd,ir->rot);
9510     }
9511
9512
9513     add_dd_statistics(dd);
9514     
9515     /* Make sure we only count the cycles for this DD partitioning */
9516     clear_dd_cycle_counts(dd);
9517     
9518     /* Because the order of the atoms might have changed since
9519      * the last vsite construction, we need to communicate the constructing
9520      * atom coordinates again (for spreading the forces this MD step).
9521      */
9522     dd_move_x_vsites(dd,state_local->box,state_local->x);
9523
9524     wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER);
9525     
9526     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9527     {
9528         dd_move_x(dd,state_local->box,state_local->x);
9529         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
9530                      -1,state_local->x,state_local->box);
9531     }
9532
9533     /* Store the partitioning step */
9534     comm->partition_step = step;
9535     
9536     /* Increase the DD partitioning counter */
9537     dd->ddp_count++;
9538     /* The state currently matches this DD partitioning count, store it */
9539     state_local->ddp_count = dd->ddp_count;
9540     if (bMasterState)
9541     {
9542         /* The DD master node knows the complete cg distribution,
9543          * store the count so we can possibly skip the cg info communication.
9544          */
9545         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9546     }
9547
9548     if (comm->DD_debug > 0)
9549     {
9550         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9551         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
9552                                 "after partitioning");
9553     }
9554 }