Merge branch 'release-4-6', adds the nbnxn functionality
[alexxy/gromacs.git] / src / gromacs / mdlib / domdec.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  * 
4  * This file is part of Gromacs        Copyright (c) 1991-2008
5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * To help us fund GROMACS development, we humbly ask that you cite
13  * the research papers on the package. Check out http://www.gromacs.org
14  * 
15  * And Hey:
16  * Gnomes, ROck Monsters And Chili Sauce
17  */
18
19 #ifdef HAVE_CONFIG_H
20 #include <config.h>
21 #endif
22
23 #include <stdio.h>
24 #include <time.h>
25 #include <math.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include "typedefs.h"
29 #include "smalloc.h"
30 #include "gmx_fatal.h"
31 #include "gmx_fatal_collective.h"
32 #include "vec.h"
33 #include "domdec.h"
34 #include "domdec_network.h"
35 #include "nrnb.h"
36 #include "pbc.h"
37 #include "chargegroup.h"
38 #include "constr.h"
39 #include "mdatoms.h"
40 #include "names.h"
41 #include "pdbio.h"
42 #include "futil.h"
43 #include "force.h"
44 #include "pme.h"
45 #include "pull.h"
46 #include "pull_rotation.h"
47 #include "gmx_wallcycle.h"
48 #include "mdrun.h"
49 #include "nsgrid.h"
50 #include "shellfc.h"
51 #include "mtop_util.h"
52 #include "gmxfio.h"
53 #include "gmx_ga2la.h"
54 #include "gmx_sort.h"
55 #include "macros.h"
56 #include "nbnxn_search.h"
57 #include "bondf.h"
58 #include "gmx_omp_nthreads.h"
59
60 #ifdef GMX_LIB_MPI
61 #include <mpi.h>
62 #endif
63 #ifdef GMX_THREAD_MPI
64 #include "tmpi.h"
65 #endif
66
/* Expands to rank unchanged; the dd argument is not used. */
#define DDRANK(dd,rank)    (rank)
/* The masterrank member of the struct dd points to.
 * The argument is parenthesized so that non-trivial argument
 * expressions (e.g. &dd_struct) expand correctly.
 */
#define DDMASTERRANK(dd)   ((dd)->masterrank)
69
70 typedef struct gmx_domdec_master
71 {
72     /* The cell boundaries */
73     real **cell_x;
74     /* The global charge group division */
75     int  *ncg;     /* Number of home charge groups for each node */
76     int  *index;   /* Index of nnodes+1 into cg */
77     int  *cg;      /* Global charge group index */
78     int  *nat;     /* Number of home atoms for each node. */
79     int  *ibuf;    /* Buffer for communication */
80     rvec *vbuf;    /* Buffer for state scattering and gathering */
81 } gmx_domdec_master_t;
82
83 typedef struct
84 {
85     /* The numbers of charge groups to send and receive for each cell
86      * that requires communication, the last entry contains the total
87      * number of atoms that needs to be communicated.
88      */
89     int nsend[DD_MAXIZONE+2];
90     int nrecv[DD_MAXIZONE+2];
91     /* The charge groups to send */
92     int *index;
93     int nalloc;
94     /* The atom range for non-in-place communication */
95     int cell2at0[DD_MAXIZONE];
96     int cell2at1[DD_MAXIZONE];
97 } gmx_domdec_ind_t;
98
99 typedef struct
100 {
101     int  np;                   /* Number of grid pulses in this dimension */
102     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
103     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
104     int  np_nalloc;
105     gmx_bool bInPlace;             /* Can we communicate in place?            */
106 } gmx_domdec_comm_dim_t;
107
108 typedef struct
109 {
110     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
111     real *cell_f;      /* State var.: cell boundaries, box relative      */
112     real *old_cell_f;  /* Temp. var.: old cell size                      */
113     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
114     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
115     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
116     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
117     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
118     real *buf_ncd;     /* Temp. var.                                     */
119 } gmx_domdec_root_t;
120
#define DD_NLOAD_MAX 9

/* Load measurement data.
 * Single precision is sufficient here, since these variables
 * only influence the load balancing, not the actual MD results.
 */
typedef struct
{
    int    nload;
    float *load;
    float  sum;
    float  max;
    float  sum_m;
    float  cvol_min;
    float  mdf;
    float  pme;
    int    flags;
} gmx_domdec_load_t;
138
/* One charge group sorting record.
 * NOTE(review): the member meanings are not documented here; from the
 * names, ind_gl is presumably a global and ind a local index - confirm.
 */
typedef struct
{
    int nsc;
    int ind_gl;
    int ind;
} gmx_cgsort_t;
145
146 typedef struct
147 {
148     gmx_cgsort_t *sort;
149     gmx_cgsort_t *sort2;
150     int  sort_nalloc;
151     gmx_cgsort_t *sort_new;
152     int  sort_new_nalloc;
153     int  *ibuf;
154     int  ibuf_nalloc;
155 } gmx_domdec_sort_t;
156
157 typedef struct
158 {
159     rvec *v;
160     int  nalloc;
161 } vec_rvec_t;
162
/* The order of the coordinates is determined by this enum.
 * ddnatHOME and ddnatZONE have to be the first and second entry,
 * the order of the remaining entries is free.
 */
enum {
    ddnatHOME,
    ddnatZONE,
    ddnatVSITE,
    ddnatCON,
    ddnatNR
};

/* The dynamic load balancing options, with their names in edlb_names */
enum {
    edlbAUTO,
    edlbNO,
    edlbYES,
    edlbNR
};
const char *edlb_names[edlbNR] = {
    "auto",
    "no",
    "yes"
};
171
172 typedef struct
173 {
174     int  dim;      /* The dimension                                          */
175     gmx_bool dim_match;/* Tells if DD and PME dims match                         */
176     int  nslab;    /* The number of PME slabs in this dimension              */
177     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
178     int  *pp_min;  /* The minimum pp node location, size nslab               */
179     int  *pp_max;  /* The maximum pp node location,size nslab                */
180     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
181 } gmx_ddpme_t;
182
183 typedef struct
184 {
185     real min0;    /* The minimum bottom of this zone                        */
186     real max1;    /* The maximum top of this zone                           */
187     real min1;    /* The minimum top of this zone                           */
188     real mch0;    /* The maximum bottom communicaton height for this zone   */
189     real mch1;    /* The maximum top communicaton height for this zone      */
190     real p1_0;    /* The bottom value of the first cell in this zone        */
191     real p1_1;    /* The top value of the first cell in this zone           */
192 } gmx_ddzone_t;
193
194 typedef struct
195 {
196     gmx_domdec_ind_t ind;
197     int *ibuf;
198     int ibuf_nalloc;
199     vec_rvec_t vbuf;
200     int nsend;
201     int nat;
202     int nsend_zone;
203 } dd_comm_setup_work_t;
204
205 typedef struct gmx_domdec_comm
206 {
207     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
208      * unless stated otherwise.
209      */
210
211     /* The number of decomposition dimensions for PME, 0: no PME */
212     int  npmedecompdim;
213     /* The number of nodes doing PME (PP/PME or only PME) */
214     int  npmenodes;
215     int  npmenodes_x;
216     int  npmenodes_y;
217     /* The communication setup including the PME only nodes */
218     gmx_bool bCartesianPP_PME;
219     ivec ntot;
220     int  cartpmedim;
221     int  *pmenodes;          /* size npmenodes                         */
222     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
223                               * but with bCartesianPP_PME              */
224     gmx_ddpme_t ddpme[2];
225     
226     /* The DD particle-particle nodes only */
227     gmx_bool bCartesianPP;
228     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
229     
230     /* The global charge groups */
231     t_block cgs_gl;
232
233     /* Should we sort the cgs */
234     int  nstSortCG;
235     gmx_domdec_sort_t *sort;
236     
237     /* Are there charge groups? */
238     gmx_bool bCGs;
239
240     /* Are there bonded and multi-body interactions between charge groups? */
241     gmx_bool bInterCGBondeds;
242     gmx_bool bInterCGMultiBody;
243
244     /* Data for the optional bonded interaction atom communication range */
245     gmx_bool bBondComm;
246     t_blocka *cglink;
247     char *bLocalCG;
248
249     /* The DLB option */
250     int  eDLB;
251     /* Are we actually using DLB? */
252     gmx_bool bDynLoadBal;
253
254     /* Cell sizes for static load balancing, first index cartesian */
255     real **slb_frac;
256     
257     /* The width of the communicated boundaries */
258     real cutoff_mbody;
259     real cutoff;
260     /* The minimum cell size (including triclinic correction) */
261     rvec cellsize_min;
262     /* For dlb, for use with edlbAUTO */
263     rvec cellsize_min_dlb;
264     /* The lower limit for the DD cell size with DLB */
265     real cellsize_limit;
266     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
267     gmx_bool bVacDLBNoLimit;
268
269     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
270     ivec tric_dir;
271     /* box0 and box_size are required with dim's without pbc and -gcom */
272     rvec box0;
273     rvec box_size;
274     
275     /* The cell boundaries */
276     rvec cell_x0;
277     rvec cell_x1;
278
279     /* The old location of the cell boundaries, to check cg displacements */
280     rvec old_cell_x0;
281     rvec old_cell_x1;
282
283     /* The communication setup and charge group boundaries for the zones */
284     gmx_domdec_zones_t zones;
285     
286     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
287      * cell boundaries of neighboring cells for dynamic load balancing.
288      */
289     gmx_ddzone_t zone_d1[2];
290     gmx_ddzone_t zone_d2[2][2];
291     
292     /* The coordinate/force communication setup and indices */
293     gmx_domdec_comm_dim_t cd[DIM];
294     /* The maximum number of cells to communicate with in one dimension */
295     int  maxpulse;
296     
297     /* Which cg distribution is stored on the master node */
298     int master_cg_ddp_count;
299     
300     /* The number of cg's received from the direct neighbors */
301     int  zone_ncg1[DD_MAXZONE];
302     
303     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
304     int  nat[ddnatNR];
305
306     /* Array for signalling if atoms have moved to another domain */
307     int  *moved;
308     int  moved_nalloc;
309     
310     /* Communication buffer for general use */
311     int  *buf_int;
312     int  nalloc_int;
313
314     /* Communication buffer for general use */
315     vec_rvec_t vbuf;
316
317     /* Temporary storage for thread parallel communication setup */
318     int nth;
319     dd_comm_setup_work_t *dth;
320
321     /* Communication buffers only used with multiple grid pulses */
322     int  *buf_int2;
323     int  nalloc_int2;
324     vec_rvec_t vbuf2;
325     
326     /* Communication buffers for local redistribution */
327     int  **cggl_flag;
328     int  cggl_flag_nalloc[DIM*2];
329     rvec **cgcm_state;
330     int  cgcm_state_nalloc[DIM*2];
331     
332     /* Cell sizes for dynamic load balancing */
333     gmx_domdec_root_t **root;
334     real *cell_f_row;
335     real cell_f0[DIM];
336     real cell_f1[DIM];
337     real cell_f_max0[DIM];
338     real cell_f_min1[DIM];
339     
340     /* Stuff for load communication */
341     gmx_bool bRecordLoad;
342     gmx_domdec_load_t *load;
343 #ifdef GMX_MPI
344     MPI_Comm *mpi_comm_load;
345 #endif
346
347     /* Maximum DLB scaling per load balancing step in percent */
348     int dlb_scale_lim;
349
350     /* Cycle counters */
351     float cycl[ddCyclNr];
352     int   cycl_n[ddCyclNr];
353     float cycl_max[ddCyclNr];
354     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
355     int eFlop;
356     double flop;
357     int    flop_n;
358     /* Have often have did we have load measurements */
359     int    n_load_have;
360     /* Have often have we collected the load measurements */
361     int    n_load_collect;
362     
363     /* Statistics */
364     double sum_nat[ddnatNR-ddnatZONE];
365     int    ndecomp;
366     int    nload;
367     double load_step;
368     double load_sum;
369     double load_max;
370     ivec   load_lim;
371     double load_mdf;
372     double load_pme;
373
374     /* The last partition step */
375     gmx_large_int_t partition_step;
376
377     /* Debugging */
378     int  nstDDDump;
379     int  nstDDDumpGrid;
380     int  DD_debug;
381 } gmx_domdec_comm_t;
382
/* The number of ints per charge group in the cggl_flag buffer
 * of gmx_domdec_comm_t.
 */
#define DD_CGIBS 2

/* The flags for the cggl_flag buffer in gmx_domdec_comm_t:
 * the low 16 bits are covered by the DD_FLAG_NRCG mask, above that
 * each dimension d has a forward bit followed by a backward bit.
 */
#define DD_FLAG_NRCG  65535
#define DD_FLAG_FW(d) (1 << (16 + 2*(d)))
#define DD_FLAG_BW(d) (1 << (17 + 2*(d)))
390
391 /* Zone permutation required to obtain consecutive charge groups
392  * for neighbor searching.
393  */
394 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
395
396 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
397  * components see only j zones with that component 0.
398  */
399
400 /* The DD zone order */
401 static const ivec dd_zo[DD_MAXZONE] =
402   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
403
404 /* The 3D setup */
405 #define dd_z3n  8
406 #define dd_zp3n 4
407 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
408
409 /* The 2D setup */
410 #define dd_z2n  4
411 #define dd_zp2n 2
412 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
413
414 /* The 1D setup */
415 #define dd_z1n  2
416 #define dd_zp1n 1
417 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
418
/* Safety factors that avoid problems caused by rounding */
#define DD_CELL_MARGIN       1.0001
#define DD_CELL_MARGIN2      1.00005
/* Safety factor accounting for pressure scaling during nstlist steps */
#define DD_PRES_SCALE_MARGIN 1.02

/* The performance loss we allow before we DLB or warn */
#define DD_PERF_LOSS 0.05

/* Size expression built from the number of cells along DD dimension
 * index di: nc + 1, plus 2*di, plus 1 + di.
 */
#define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]] + 1 + (di)*2 + 1 + (di))

/* With nnodes <= GMX_DD_NNODES_SENDRECV separate MPI send and receive
 * commands are used.
 * This saves memory (and some copying for small nnodes).
 * For high parallelization scatter and gather calls are used.
 */
#define GMX_DD_NNODES_SENDRECV 4
436
437
438 /*
439 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
440
441 static void index2xyz(ivec nc,int ind,ivec xyz)
442 {
443   xyz[XX] = ind % nc[XX];
444   xyz[YY] = (ind / nc[XX]) % nc[YY];
445   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
446 }
447 */
448
449 /* This order is required to minimize the coordinate communication in PME
450  * which uses decomposition in the x direction.
451  */
452 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
453
454 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
455 {
456     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
457     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
458     xyz[ZZ] = ind % nc[ZZ];
459 }
460
461 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
462 {
463     int ddindex;
464     int ddnodeid=-1;
465     
466     ddindex = dd_index(dd->nc,c);
467     if (dd->comm->bCartesianPP_PME)
468     {
469         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
470     }
471     else if (dd->comm->bCartesianPP)
472     {
473 #ifdef GMX_MPI
474         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
475 #endif
476     }
477     else
478     {
479         ddnodeid = ddindex;
480     }
481     
482     return ddnodeid;
483 }
484
485 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
486 {
487     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
488 }
489
490 int ddglatnr(gmx_domdec_t *dd,int i)
491 {
492     int atnr;
493     
494     if (dd == NULL)
495     {
496         atnr = i + 1;
497     }
498     else
499     {
500         if (i >= dd->comm->nat[ddnatNR-1])
501         {
502             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
503         }
504         atnr = dd->gatindex[i] + 1;
505     }
506     
507     return atnr;
508 }
509
510 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
511 {
512     return &dd->comm->cgs_gl;
513 }
514
515 static void vec_rvec_init(vec_rvec_t *v)
516 {
517     v->nalloc = 0;
518     v->v      = NULL;
519 }
520
521 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
522 {
523     if (n > v->nalloc)
524     {
525         v->nalloc = over_alloc_dd(n);
526         srenew(v->v,v->nalloc);
527     }
528 }
529
530 void dd_store_state(gmx_domdec_t *dd,t_state *state)
531 {
532     int i;
533     
534     if (state->ddp_count != dd->ddp_count)
535     {
536         gmx_incons("The state does not the domain decomposition state");
537     }
538     
539     state->ncg_gl = dd->ncg_home;
540     if (state->ncg_gl > state->cg_gl_nalloc)
541     {
542         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
543         srenew(state->cg_gl,state->cg_gl_nalloc);
544     }
545     for(i=0; i<state->ncg_gl; i++)
546     {
547         state->cg_gl[i] = dd->index_gl[i];
548     }
549     
550     state->ddp_count_cg_gl = dd->ddp_count;
551 }
552
553 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
554 {
555     return &dd->comm->zones;
556 }
557
558 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
559                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
560 {
561     gmx_domdec_zones_t *zones;
562     int izone,d,dim;
563
564     zones = &dd->comm->zones;
565
566     izone = 0;
567     while (icg >= zones->izone[izone].cg1)
568     {
569         izone++;
570     }
571     
572     if (izone == 0)
573     {
574         *jcg0 = icg;
575     }
576     else if (izone < zones->nizone)
577     {
578         *jcg0 = zones->izone[izone].jcg0;
579     }
580     else
581     {
582         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
583                   icg,izone,zones->nizone);
584     }
585         
586     *jcg1 = zones->izone[izone].jcg1;
587     
588     for(d=0; d<dd->ndim; d++)
589     {
590         dim = dd->dim[d];
591         shift0[dim] = zones->izone[izone].shift0[dim];
592         shift1[dim] = zones->izone[izone].shift1[dim];
593         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
594         {
595             /* A conservative approach, this can be optimized */
596             shift0[dim] -= 1;
597             shift1[dim] += 1;
598         }
599     }
600 }
601
602 int dd_natoms_vsite(gmx_domdec_t *dd)
603 {
604     return dd->comm->nat[ddnatVSITE];
605 }
606
607 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
608 {
609     *at_start = dd->comm->nat[ddnatCON-1];
610     *at_end   = dd->comm->nat[ddnatCON];
611 }
612
613 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
614 {
615     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
616     int  *index,*cgindex;
617     gmx_domdec_comm_t *comm;
618     gmx_domdec_comm_dim_t *cd;
619     gmx_domdec_ind_t *ind;
620     rvec shift={0,0,0},*buf,*rbuf;
621     gmx_bool bPBC,bScrew;
622     
623     comm = dd->comm;
624     
625     cgindex = dd->cgindex;
626     
627     buf = comm->vbuf.v;
628
629     nzone = 1;
630     nat_tot = dd->nat_home;
631     for(d=0; d<dd->ndim; d++)
632     {
633         bPBC   = (dd->ci[dd->dim[d]] == 0);
634         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
635         if (bPBC)
636         {
637             copy_rvec(box[dd->dim[d]],shift);
638         }
639         cd = &comm->cd[d];
640         for(p=0; p<cd->np; p++)
641         {
642             ind = &cd->ind[p];
643             index = ind->index;
644             n = 0;
645             if (!bPBC)
646             {
647                 for(i=0; i<ind->nsend[nzone]; i++)
648                 {
649                     at0 = cgindex[index[i]];
650                     at1 = cgindex[index[i]+1];
651                     for(j=at0; j<at1; j++)
652                     {
653                         copy_rvec(x[j],buf[n]);
654                         n++;
655                     }
656                 }
657             }
658             else if (!bScrew)
659             {
660                 for(i=0; i<ind->nsend[nzone]; i++)
661                 {
662                     at0 = cgindex[index[i]];
663                     at1 = cgindex[index[i]+1];
664                     for(j=at0; j<at1; j++)
665                     {
666                         /* We need to shift the coordinates */
667                         rvec_add(x[j],shift,buf[n]);
668                         n++;
669                     }
670                 }
671             }
672             else
673             {
674                 for(i=0; i<ind->nsend[nzone]; i++)
675                 {
676                     at0 = cgindex[index[i]];
677                     at1 = cgindex[index[i]+1];
678                     for(j=at0; j<at1; j++)
679                     {
680                         /* Shift x */
681                         buf[n][XX] = x[j][XX] + shift[XX];
682                         /* Rotate y and z.
683                          * This operation requires a special shift force
684                          * treatment, which is performed in calc_vir.
685                          */
686                         buf[n][YY] = box[YY][YY] - x[j][YY];
687                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
688                         n++;
689                     }
690                 }
691             }
692             
693             if (cd->bInPlace)
694             {
695                 rbuf = x + nat_tot;
696             }
697             else
698             {
699                 rbuf = comm->vbuf2.v;
700             }
701             /* Send and receive the coordinates */
702             dd_sendrecv_rvec(dd, d, dddirBackward,
703                              buf,  ind->nsend[nzone+1],
704                              rbuf, ind->nrecv[nzone+1]);
705             if (!cd->bInPlace)
706             {
707                 j = 0;
708                 for(zone=0; zone<nzone; zone++)
709                 {
710                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
711                     {
712                         copy_rvec(rbuf[j],x[i]);
713                         j++;
714                     }
715                 }
716             }
717             nat_tot += ind->nrecv[nzone+1];
718         }
719         nzone += nzone;
720     }
721 }
722
723 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
724 {
725     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
726     int  *index,*cgindex;
727     gmx_domdec_comm_t *comm;
728     gmx_domdec_comm_dim_t *cd;
729     gmx_domdec_ind_t *ind;
730     rvec *buf,*sbuf;
731     ivec vis;
732     int  is;
733     gmx_bool bPBC,bScrew;
734     
735     comm = dd->comm;
736     
737     cgindex = dd->cgindex;
738
739     buf = comm->vbuf.v;
740
741     n = 0;
742     nzone = comm->zones.n/2;
743     nat_tot = dd->nat_tot;
744     for(d=dd->ndim-1; d>=0; d--)
745     {
746         bPBC   = (dd->ci[dd->dim[d]] == 0);
747         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
748         if (fshift == NULL && !bScrew)
749         {
750             bPBC = FALSE;
751         }
752         /* Determine which shift vector we need */
753         clear_ivec(vis);
754         vis[dd->dim[d]] = 1;
755         is = IVEC2IS(vis);
756         
757         cd = &comm->cd[d];
758         for(p=cd->np-1; p>=0; p--) {
759             ind = &cd->ind[p];
760             nat_tot -= ind->nrecv[nzone+1];
761             if (cd->bInPlace)
762             {
763                 sbuf = f + nat_tot;
764             }
765             else
766             {
767                 sbuf = comm->vbuf2.v;
768                 j = 0;
769                 for(zone=0; zone<nzone; zone++)
770                 {
771                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
772                     {
773                         copy_rvec(f[i],sbuf[j]);
774                         j++;
775                     }
776                 }
777             }
778             /* Communicate the forces */
779             dd_sendrecv_rvec(dd, d, dddirForward,
780                              sbuf, ind->nrecv[nzone+1],
781                              buf,  ind->nsend[nzone+1]);
782             index = ind->index;
783             /* Add the received forces */
784             n = 0;
785             if (!bPBC)
786             {
787                 for(i=0; i<ind->nsend[nzone]; i++)
788                 {
789                     at0 = cgindex[index[i]];
790                     at1 = cgindex[index[i]+1];
791                     for(j=at0; j<at1; j++)
792                     {
793                         rvec_inc(f[j],buf[n]);
794                         n++;
795                     }
796                 } 
797             }
798             else if (!bScrew)
799             {
800                 for(i=0; i<ind->nsend[nzone]; i++)
801                 {
802                     at0 = cgindex[index[i]];
803                     at1 = cgindex[index[i]+1];
804                     for(j=at0; j<at1; j++)
805                     {
806                         rvec_inc(f[j],buf[n]);
807                         /* Add this force to the shift force */
808                         rvec_inc(fshift[is],buf[n]);
809                         n++;
810                     }
811                 }
812             }
813             else
814             {
815                 for(i=0; i<ind->nsend[nzone]; i++)
816                 {
817                     at0 = cgindex[index[i]];
818                     at1 = cgindex[index[i]+1];
819                     for(j=at0; j<at1; j++)
820                     {
821                         /* Rotate the force */
822                         f[j][XX] += buf[n][XX];
823                         f[j][YY] -= buf[n][YY];
824                         f[j][ZZ] -= buf[n][ZZ];
825                         if (fshift)
826                         {
827                             /* Add this force to the shift force */
828                             rvec_inc(fshift[is],buf[n]);
829                         }
830                         n++;
831                     }
832                 }
833             }
834         }
835         nzone /= 2;
836     }
837 }
838
839 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
840 {
841     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
842     int  *index,*cgindex;
843     gmx_domdec_comm_t *comm;
844     gmx_domdec_comm_dim_t *cd;
845     gmx_domdec_ind_t *ind;
846     real *buf,*rbuf;
847     
848     comm = dd->comm;
849     
850     cgindex = dd->cgindex;
851     
852     buf = &comm->vbuf.v[0][0];
853
854     nzone = 1;
855     nat_tot = dd->nat_home;
856     for(d=0; d<dd->ndim; d++)
857     {
858         cd = &comm->cd[d];
859         for(p=0; p<cd->np; p++)
860         {
861             ind = &cd->ind[p];
862             index = ind->index;
863             n = 0;
864             for(i=0; i<ind->nsend[nzone]; i++)
865             {
866                 at0 = cgindex[index[i]];
867                 at1 = cgindex[index[i]+1];
868                 for(j=at0; j<at1; j++)
869                 {
870                     buf[n] = v[j];
871                     n++;
872                 }
873             }
874             
875             if (cd->bInPlace)
876             {
877                 rbuf = v + nat_tot;
878             }
879             else
880             {
881                 rbuf = &comm->vbuf2.v[0][0];
882             }
883             /* Send and receive the coordinates */
884             dd_sendrecv_real(dd, d, dddirBackward,
885                              buf,  ind->nsend[nzone+1],
886                              rbuf, ind->nrecv[nzone+1]);
887             if (!cd->bInPlace)
888             {
889                 j = 0;
890                 for(zone=0; zone<nzone; zone++)
891                 {
892                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
893                     {
894                         v[i] = rbuf[j];
895                         j++;
896                     }
897                 }
898             }
899             nat_tot += ind->nrecv[nzone+1];
900         }
901         nzone += nzone;
902     }
903 }
904
905 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
906 {
907     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
908     int  *index,*cgindex;
909     gmx_domdec_comm_t *comm;
910     gmx_domdec_comm_dim_t *cd;
911     gmx_domdec_ind_t *ind;
912     real *buf,*sbuf;
913     
914     comm = dd->comm;
915     
916     cgindex = dd->cgindex;
917
918     buf = &comm->vbuf.v[0][0];
919
920     n = 0;
921     nzone = comm->zones.n/2;
922     nat_tot = dd->nat_tot;
923     for(d=dd->ndim-1; d>=0; d--)
924     {
925         cd = &comm->cd[d];
926         for(p=cd->np-1; p>=0; p--) {
927             ind = &cd->ind[p];
928             nat_tot -= ind->nrecv[nzone+1];
929             if (cd->bInPlace)
930             {
931                 sbuf = v + nat_tot;
932             }
933             else
934             {
935                 sbuf = &comm->vbuf2.v[0][0];
936                 j = 0;
937                 for(zone=0; zone<nzone; zone++)
938                 {
939                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
940                     {
941                         sbuf[j] = v[i];
942                         j++;
943                     }
944                 }
945             }
946             /* Communicate the forces */
947             dd_sendrecv_real(dd, d, dddirForward,
948                              sbuf, ind->nrecv[nzone+1],
949                              buf,  ind->nsend[nzone+1]);
950             index = ind->index;
951             /* Add the received forces */
952             n = 0;
953             for(i=0; i<ind->nsend[nzone]; i++)
954             {
955                 at0 = cgindex[index[i]];
956                 at1 = cgindex[index[i]+1];
957                 for(j=at0; j<at1; j++)
958                 {
959                     v[j] += buf[n];
960                     n++;
961                 }
962             } 
963         }
964         nzone /= 2;
965     }
966 }
967
968 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
969 {
970     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
971             d,i,j,
972             zone->min0,zone->max1,
973             zone->mch0,zone->mch0,
974             zone->p1_0,zone->p1_1);
975 }
976
977
978 #define DDZONECOMM_MAXZONE  5
979 #define DDZONECOMM_BUFSIZE  3
980
981 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
982                                int ddimind,int direction,
983                                gmx_ddzone_t *buf_s,int n_s,
984                                gmx_ddzone_t *buf_r,int n_r)
985 {
986 #define ZBS  DDZONECOMM_BUFSIZE
987     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
988     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
989     int i;
990
991     for(i=0; i<n_s; i++)
992     {
993         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
994         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
995         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
996         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
997         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
998         vbuf_s[i*ZBS+1][2] = 0;
999         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1000         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1001         vbuf_s[i*ZBS+2][2] = 0;
1002     }
1003
1004     dd_sendrecv_rvec(dd, ddimind, direction,
1005                      vbuf_s, n_s*ZBS,
1006                      vbuf_r, n_r*ZBS);
1007
1008     for(i=0; i<n_r; i++)
1009     {
1010         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1011         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1012         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1013         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1014         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1015         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1016         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1017     }
1018
1019 #undef ZBS
1020 }
1021
1022 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
1023                           rvec cell_ns_x0,rvec cell_ns_x1)
1024 {
1025     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
1026     gmx_ddzone_t *zp;
1027     gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
1028     gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
1029     gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
1030     rvec extr_s[2],extr_r[2];
1031     rvec dh;
1032     real dist_d,c=0,det;
1033     gmx_domdec_comm_t *comm;
1034     gmx_bool bPBC,bUse;
1035
1036     comm = dd->comm;
1037
1038     for(d=1; d<dd->ndim; d++)
1039     {
1040         dim = dd->dim[d];
1041         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1042         zp->min0 = cell_ns_x0[dim];
1043         zp->max1 = cell_ns_x1[dim];
1044         zp->min1 = cell_ns_x1[dim];
1045         zp->mch0 = cell_ns_x0[dim];
1046         zp->mch1 = cell_ns_x1[dim];
1047         zp->p1_0 = cell_ns_x0[dim];
1048         zp->p1_1 = cell_ns_x1[dim];
1049     }
1050     
1051     for(d=dd->ndim-2; d>=0; d--)
1052     {
1053         dim  = dd->dim[d];
1054         bPBC = (dim < ddbox->npbcdim);
1055
1056         /* Use an rvec to store two reals */
1057         extr_s[d][0] = comm->cell_f0[d+1];
1058         extr_s[d][1] = comm->cell_f1[d+1];
1059         extr_s[d][2] = comm->cell_f1[d+1];
1060
1061         pos = 0;
1062         /* Store the extremes in the backward sending buffer,
1063          * so the get updated separately from the forward communication.
1064          */
1065         for(d1=d; d1<dd->ndim-1; d1++)
1066         {
1067             /* We invert the order to be able to use the same loop for buf_e */
1068             buf_s[pos].min0 = extr_s[d1][1];
1069             buf_s[pos].max1 = extr_s[d1][0];
1070             buf_s[pos].min1 = extr_s[d1][2];
1071             buf_s[pos].mch0 = 0;
1072             buf_s[pos].mch1 = 0;
1073             /* Store the cell corner of the dimension we communicate along */
1074             buf_s[pos].p1_0 = comm->cell_x0[dim];
1075             buf_s[pos].p1_1 = 0;
1076             pos++;
1077         }
1078
1079         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1080         pos++;
1081
1082         if (dd->ndim == 3 && d == 0)
1083         {
1084             buf_s[pos] = comm->zone_d2[0][1];
1085             pos++;
1086             buf_s[pos] = comm->zone_d1[0];
1087             pos++;
1088         }
1089
1090         /* We only need to communicate the extremes
1091          * in the forward direction
1092          */
1093         npulse = comm->cd[d].np;
1094         if (bPBC)
1095         {
1096             /* Take the minimum to avoid double communication */
1097             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1098         }
1099         else
1100         {
1101             /* Without PBC we should really not communicate over
1102              * the boundaries, but implementing that complicates
1103              * the communication setup and therefore we simply
1104              * do all communication, but ignore some data.
1105              */
1106             npulse_min = npulse;
1107         }
1108         for(p=0; p<npulse_min; p++)
1109         {
1110             /* Communicate the extremes forward */
1111             bUse = (bPBC || dd->ci[dim] > 0);
1112
1113             dd_sendrecv_rvec(dd, d, dddirForward,
1114                              extr_s+d, dd->ndim-d-1,
1115                              extr_r+d, dd->ndim-d-1);
1116
1117             if (bUse)
1118             {
1119                 for(d1=d; d1<dd->ndim-1; d1++)
1120                 {
1121                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1122                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1123                     extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]);
1124                 }
1125             }
1126         }
1127
1128         buf_size = pos;
1129         for(p=0; p<npulse; p++)
1130         {
1131             /* Communicate all the zone information backward */
1132             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1133
1134             dd_sendrecv_ddzone(dd, d, dddirBackward,
1135                                buf_s, buf_size,
1136                                buf_r, buf_size);
1137
1138             clear_rvec(dh);
1139             if (p > 0)
1140             {
1141                 for(d1=d+1; d1<dd->ndim; d1++)
1142                 {
1143                     /* Determine the decrease of maximum required
1144                      * communication height along d1 due to the distance along d,
1145                      * this avoids a lot of useless atom communication.
1146                      */
1147                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1148
1149                     if (ddbox->tric_dir[dim])
1150                     {
1151                         /* c is the off-diagonal coupling between the cell planes
1152                          * along directions d and d1.
1153                          */
1154                         c = ddbox->v[dim][dd->dim[d1]][dim];
1155                     }
1156                     else
1157                     {
1158                         c = 0;
1159                     }
1160                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1161                     if (det > 0)
1162                     {
1163                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1164                     }
1165                     else
1166                     {
1167                         /* A negative value signals out of range */
1168                         dh[d1] = -1;
1169                     }
1170                 }
1171             }
1172
1173             /* Accumulate the extremes over all pulses */
1174             for(i=0; i<buf_size; i++)
1175             {
1176                 if (p == 0)
1177                 {
1178                     buf_e[i] = buf_r[i];
1179                 }
1180                 else
1181                 {
1182                     if (bUse)
1183                     {
1184                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1185                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1186                         buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1);
1187                     }
1188
1189                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1190                     {
1191                         d1 = 1;
1192                     }
1193                     else
1194                     {
1195                         d1 = d + 1;
1196                     }
1197                     if (bUse && dh[d1] >= 0)
1198                     {
1199                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1200                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1201                     }
1202                 }
1203                 /* Copy the received buffer to the send buffer,
1204                  * to pass the data through with the next pulse.
1205                  */
1206                 buf_s[i] = buf_r[i];
1207             }
1208             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1209                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1210             {
1211                 /* Store the extremes */ 
1212                 pos = 0;
1213
1214                 for(d1=d; d1<dd->ndim-1; d1++)
1215                 {
1216                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1217                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1218                     extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1);
1219                     pos++;
1220                 }
1221
1222                 if (d == 1 || (d == 0 && dd->ndim == 3))
1223                 {
1224                     for(i=d; i<2; i++)
1225                     {
1226                         comm->zone_d2[1-d][i] = buf_e[pos];
1227                         pos++;
1228                     }
1229                 }
1230                 if (d == 0)
1231                 {
1232                     comm->zone_d1[1] = buf_e[pos];
1233                     pos++;
1234                 }
1235             }
1236         }
1237     }
1238     
1239     if (dd->ndim >= 2)
1240     {
1241         dim = dd->dim[1];
1242         for(i=0; i<2; i++)
1243         {
1244             if (debug)
1245             {
1246                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1247             }
1248             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1249             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1250         }
1251     }
1252     if (dd->ndim >= 3)
1253     {
1254         dim = dd->dim[2];
1255         for(i=0; i<2; i++)
1256         {
1257             for(j=0; j<2; j++)
1258             {
1259                 if (debug)
1260                 {
1261                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1262                 }
1263                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1264                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1265             }
1266         }
1267     }
1268     for(d=1; d<dd->ndim; d++)
1269     {
1270         comm->cell_f_max0[d] = extr_s[d-1][0];
1271         comm->cell_f_min1[d] = extr_s[d-1][1];
1272         if (debug)
1273         {
1274             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1275                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1276         }
1277     }
1278 }
1279
1280 static void dd_collect_cg(gmx_domdec_t *dd,
1281                           t_state *state_local)
1282 {
1283     gmx_domdec_master_t *ma=NULL;
1284     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1285     t_block *cgs_gl;
1286
1287     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1288     {
1289         /* The master has the correct distribution */
1290         return;
1291     }
1292     
1293     if (state_local->ddp_count == dd->ddp_count)
1294     {
1295         ncg_home = dd->ncg_home;
1296         cg       = dd->index_gl;
1297         nat_home = dd->nat_home;
1298     } 
1299     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1300     {
1301         cgs_gl = &dd->comm->cgs_gl;
1302
1303         ncg_home = state_local->ncg_gl;
1304         cg       = state_local->cg_gl;
1305         nat_home = 0;
1306         for(i=0; i<ncg_home; i++)
1307         {
1308             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1309         }
1310     }
1311     else
1312     {
1313         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1314     }
1315     
1316     buf2[0] = dd->ncg_home;
1317     buf2[1] = dd->nat_home;
1318     if (DDMASTER(dd))
1319     {
1320         ma = dd->ma;
1321         ibuf = ma->ibuf;
1322     }
1323     else
1324     {
1325         ibuf = NULL;
1326     }
1327     /* Collect the charge group and atom counts on the master */
1328     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1329     
1330     if (DDMASTER(dd))
1331     {
1332         ma->index[0] = 0;
1333         for(i=0; i<dd->nnodes; i++)
1334         {
1335             ma->ncg[i] = ma->ibuf[2*i];
1336             ma->nat[i] = ma->ibuf[2*i+1];
1337             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1338             
1339         }
1340         /* Make byte counts and indices */
1341         for(i=0; i<dd->nnodes; i++)
1342         {
1343             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1344             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1345         }
1346         if (debug)
1347         {
1348             fprintf(debug,"Initial charge group distribution: ");
1349             for(i=0; i<dd->nnodes; i++)
1350                 fprintf(debug," %d",ma->ncg[i]);
1351             fprintf(debug,"\n");
1352         }
1353     }
1354     
1355     /* Collect the charge group indices on the master */
1356     dd_gatherv(dd,
1357                dd->ncg_home*sizeof(int),dd->index_gl,
1358                DDMASTER(dd) ? ma->ibuf : NULL,
1359                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1360                DDMASTER(dd) ? ma->cg : NULL);
1361     
1362     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1363 }
1364
1365 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1366                                     rvec *lv,rvec *v)
1367 {
1368     gmx_domdec_master_t *ma;
1369     int  n,i,c,a,nalloc=0;
1370     rvec *buf=NULL;
1371     t_block *cgs_gl;
1372
1373     ma = dd->ma;
1374     
1375     if (!DDMASTER(dd))
1376     {
1377 #ifdef GMX_MPI
1378         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1379                  dd->rank,dd->mpi_comm_all);
1380 #endif
1381     } else {
1382         /* Copy the master coordinates to the global array */
1383         cgs_gl = &dd->comm->cgs_gl;
1384
1385         n = DDMASTERRANK(dd);
1386         a = 0;
1387         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1388         {
1389             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1390             {
1391                 copy_rvec(lv[a++],v[c]);
1392             }
1393         }
1394         
1395         for(n=0; n<dd->nnodes; n++)
1396         {
1397             if (n != dd->rank)
1398             {
1399                 if (ma->nat[n] > nalloc)
1400                 {
1401                     nalloc = over_alloc_dd(ma->nat[n]);
1402                     srenew(buf,nalloc);
1403                 }
1404 #ifdef GMX_MPI
1405                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1406                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1407 #endif
1408                 a = 0;
1409                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1410                 {
1411                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1412                     {
1413                         copy_rvec(buf[a++],v[c]);
1414                     }
1415                 }
1416             }
1417         }
1418         sfree(buf);
1419     }
1420 }
1421
1422 static void get_commbuffer_counts(gmx_domdec_t *dd,
1423                                   int **counts,int **disps)
1424 {
1425     gmx_domdec_master_t *ma;
1426     int n;
1427
1428     ma = dd->ma;
1429     
1430     /* Make the rvec count and displacment arrays */
1431     *counts  = ma->ibuf;
1432     *disps   = ma->ibuf + dd->nnodes;
1433     for(n=0; n<dd->nnodes; n++)
1434     {
1435         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1436         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1437     }
1438 }
1439
1440 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1441                                    rvec *lv,rvec *v)
1442 {
1443     gmx_domdec_master_t *ma;
1444     int  *rcounts=NULL,*disps=NULL;
1445     int  n,i,c,a;
1446     rvec *buf=NULL;
1447     t_block *cgs_gl;
1448     
1449     ma = dd->ma;
1450     
1451     if (DDMASTER(dd))
1452     {
1453         get_commbuffer_counts(dd,&rcounts,&disps);
1454
1455         buf = ma->vbuf;
1456     }
1457     
1458     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1459
1460     if (DDMASTER(dd))
1461     {
1462         cgs_gl = &dd->comm->cgs_gl;
1463
1464         a = 0;
1465         for(n=0; n<dd->nnodes; n++)
1466         {
1467             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1468             {
1469                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1470                 {
1471                     copy_rvec(buf[a++],v[c]);
1472                 }
1473             }
1474         }
1475     }
1476 }
1477
1478 void dd_collect_vec(gmx_domdec_t *dd,
1479                     t_state *state_local,rvec *lv,rvec *v)
1480 {
1481     gmx_domdec_master_t *ma;
1482     int  n,i,c,a,nalloc=0;
1483     rvec *buf=NULL;
1484     
1485     dd_collect_cg(dd,state_local);
1486
1487     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1488     {
1489         dd_collect_vec_sendrecv(dd,lv,v);
1490     }
1491     else
1492     {
1493         dd_collect_vec_gatherv(dd,lv,v);
1494     }
1495 }
1496
1497
1498 void dd_collect_state(gmx_domdec_t *dd,
1499                       t_state *state_local,t_state *state)
1500 {
1501     int est,i,j,nh;
1502
1503     nh = state->nhchainlength;
1504
1505     if (DDMASTER(dd))
1506     {
1507         for (i=0;i<efptNR;i++) {
1508             state->lambda[i] = state_local->lambda[i];
1509         }
1510         state->fep_state = state_local->fep_state;
1511         state->veta = state_local->veta;
1512         state->vol0 = state_local->vol0;
1513         copy_mat(state_local->box,state->box);
1514         copy_mat(state_local->boxv,state->boxv);
1515         copy_mat(state_local->svir_prev,state->svir_prev);
1516         copy_mat(state_local->fvir_prev,state->fvir_prev);
1517         copy_mat(state_local->pres_prev,state->pres_prev);
1518
1519
1520         for(i=0; i<state_local->ngtc; i++)
1521         {
1522             for(j=0; j<nh; j++) {
1523                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1524                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1525             }
1526             state->therm_integral[i] = state_local->therm_integral[i];            
1527         }
1528         for(i=0; i<state_local->nnhpres; i++) 
1529         {
1530             for(j=0; j<nh; j++) {
1531                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1532                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1533             }
1534         }
1535     }
1536     for(est=0; est<estNR; est++)
1537     {
1538         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1539         {
1540             switch (est) {
1541             case estX:
1542                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1543                 break;
1544             case estV:
1545                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1546                 break;
1547             case estSDX:
1548                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1549                 break;
1550             case estCGP:
1551                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1552                 break;
1553             case estLD_RNG:
1554                 if (state->nrngi == 1)
1555                 {
1556                     if (DDMASTER(dd))
1557                     {
1558                         for(i=0; i<state_local->nrng; i++)
1559                         {
1560                             state->ld_rng[i] = state_local->ld_rng[i];
1561                         }
1562                     }
1563                 }
1564                 else
1565                 {
1566                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1567                               state_local->ld_rng,state->ld_rng);
1568                 }
1569                 break;
1570             case estLD_RNGI:
1571                 if (state->nrngi == 1)
1572                 {
1573                    if (DDMASTER(dd))
1574                     {
1575                         state->ld_rngi[0] = state_local->ld_rngi[0];
1576                     } 
1577                 }
1578                 else
1579                 {
1580                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1581                               state_local->ld_rngi,state->ld_rngi);
1582                 }
1583                 break;
1584             case estDISRE_INITF:
1585             case estDISRE_RM3TAV:
1586             case estORIRE_INITF:
1587             case estORIRE_DTAV:
1588                 break;
1589             default:
1590                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1591             }
1592         }
1593     }
1594 }
1595
1596 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1597 {
1598     int est;
1599
1600     if (debug)
1601     {
1602         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1603     }
1604
1605     state->nalloc = over_alloc_dd(nalloc);
1606     
1607     for(est=0; est<estNR; est++)
1608     {
1609         if (EST_DISTR(est) && (state->flags & (1<<est)))
1610         {
1611             switch(est) {
1612             case estX:
1613                 srenew(state->x,state->nalloc);
1614                 break;
1615             case estV:
1616                 srenew(state->v,state->nalloc);
1617                 break;
1618             case estSDX:
1619                 srenew(state->sd_X,state->nalloc);
1620                 break;
1621             case estCGP:
1622                 srenew(state->cg_p,state->nalloc);
1623                 break;
1624             case estLD_RNG:
1625             case estLD_RNGI:
1626             case estDISRE_INITF:
1627             case estDISRE_RM3TAV:
1628             case estORIRE_INITF:
1629             case estORIRE_DTAV:
1630                 /* No reallocation required */
1631                 break;
1632             default:
1633                 gmx_incons("Unknown state entry encountered in dd_realloc_state");            
1634             }
1635         }
1636     }
1637     
1638     if (f != NULL)
1639     {
1640         srenew(*f,state->nalloc);
1641     }
1642 }
1643
1644 static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
1645                                int nalloc)
1646 {
1647     if (nalloc > fr->cg_nalloc)
1648     {
1649         if (debug)
1650         {
1651             fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1652         }
1653         fr->cg_nalloc = over_alloc_dd(nalloc);
1654         srenew(fr->cginfo,fr->cg_nalloc);
1655         if (fr->cutoff_scheme == ecutsGROUP)
1656         {
1657             srenew(fr->cg_cm,fr->cg_nalloc);
1658         }
1659     }
1660     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1661     {
1662         /* We don't use charge groups, we use x in state to set up
1663          * the atom communication.
1664          */
1665         dd_realloc_state(state,f,nalloc);
1666     }
1667 }
1668
1669 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1670                                        rvec *v,rvec *lv)
1671 {
1672     gmx_domdec_master_t *ma;
1673     int  n,i,c,a,nalloc=0;
1674     rvec *buf=NULL;
1675     
1676     if (DDMASTER(dd))
1677     {
1678         ma  = dd->ma;
1679         
1680         for(n=0; n<dd->nnodes; n++)
1681         {
1682             if (n != dd->rank)
1683             {
1684                 if (ma->nat[n] > nalloc)
1685                 {
1686                     nalloc = over_alloc_dd(ma->nat[n]);
1687                     srenew(buf,nalloc);
1688                 }
1689                 /* Use lv as a temporary buffer */
1690                 a = 0;
1691                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1692                 {
1693                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1694                     {
1695                         copy_rvec(v[c],buf[a++]);
1696                     }
1697                 }
1698                 if (a != ma->nat[n])
1699                 {
1700                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1701                               a,ma->nat[n]);
1702                 }
1703                 
1704 #ifdef GMX_MPI
1705                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1706                          DDRANK(dd,n),n,dd->mpi_comm_all);
1707 #endif
1708             }
1709         }
1710         sfree(buf);
1711         n = DDMASTERRANK(dd);
1712         a = 0;
1713         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1714         {
1715             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1716             {
1717                 copy_rvec(v[c],lv[a++]);
1718             }
1719         }
1720     }
1721     else
1722     {
1723 #ifdef GMX_MPI
1724         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1725                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1726 #endif
1727     }
1728 }
1729
1730 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1731                                        rvec *v,rvec *lv)
1732 {
1733     gmx_domdec_master_t *ma;
1734     int  *scounts=NULL,*disps=NULL;
1735     int  n,i,c,a,nalloc=0;
1736     rvec *buf=NULL;
1737     
1738     if (DDMASTER(dd))
1739     {
1740         ma  = dd->ma;
1741      
1742         get_commbuffer_counts(dd,&scounts,&disps);
1743
1744         buf = ma->vbuf;
1745         a = 0;
1746         for(n=0; n<dd->nnodes; n++)
1747         {
1748             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1749             {
1750                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1751                 {
1752                     copy_rvec(v[c],buf[a++]);
1753                 }
1754             }
1755         }
1756     }
1757
1758     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1759 }
1760
1761 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1762 {
1763     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1764     {
1765         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1766     }
1767     else
1768     {
1769         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1770     }
1771 }
1772
1773 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1774                                 t_state *state,t_state *state_local,
1775                                 rvec **f)
1776 {
1777     int  i,j,nh;
1778
1779     nh = state->nhchainlength;
1780
1781     if (DDMASTER(dd))
1782     {
1783         for(i=0;i<efptNR;i++)
1784         {
1785             state_local->lambda[i] = state->lambda[i];
1786         }
1787         state_local->fep_state = state->fep_state;
1788         state_local->veta   = state->veta;
1789         state_local->vol0   = state->vol0;
1790         copy_mat(state->box,state_local->box);
1791         copy_mat(state->box_rel,state_local->box_rel);
1792         copy_mat(state->boxv,state_local->boxv);
1793         copy_mat(state->svir_prev,state_local->svir_prev);
1794         copy_mat(state->fvir_prev,state_local->fvir_prev);
1795         for(i=0; i<state_local->ngtc; i++)
1796         {
1797             for(j=0; j<nh; j++) {
1798                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1799                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1800             }
1801             state_local->therm_integral[i] = state->therm_integral[i];
1802         }
1803         for(i=0; i<state_local->nnhpres; i++)
1804         {
1805             for(j=0; j<nh; j++) {
1806                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1807                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1808             }
1809         }
1810     }
1811     dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
1812     dd_bcast(dd,sizeof(int),&state_local->fep_state);
1813     dd_bcast(dd,sizeof(real),&state_local->veta);
1814     dd_bcast(dd,sizeof(real),&state_local->vol0);
1815     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1816     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1817     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1818     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1819     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1820     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1821     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1822     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1823     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1824     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1825
1826     if (dd->nat_home > state_local->nalloc)
1827     {
1828         dd_realloc_state(state_local,f,dd->nat_home);
1829     }
1830     for(i=0; i<estNR; i++)
1831     {
1832         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1833         {
1834             switch (i) {
1835             case estX:
1836                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1837                 break;
1838             case estV:
1839                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1840                 break;
1841             case estSDX:
1842                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1843                 break;
1844             case estCGP:
1845                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1846                 break;
1847             case estLD_RNG:
1848                 if (state->nrngi == 1)
1849                 {
1850                     dd_bcastc(dd,
1851                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1852                               state->ld_rng,state_local->ld_rng);
1853                 }
1854                 else
1855                 {
1856                     dd_scatter(dd,
1857                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1858                                state->ld_rng,state_local->ld_rng);
1859                 }
1860                 break;
1861             case estLD_RNGI:
1862                 if (state->nrngi == 1)
1863                 {
1864                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1865                               state->ld_rngi,state_local->ld_rngi);
1866                 }
1867                 else
1868                 {
1869                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1870                                state->ld_rngi,state_local->ld_rngi);
1871                 }   
1872                 break;
1873             case estDISRE_INITF:
1874             case estDISRE_RM3TAV:
1875             case estORIRE_INITF:
1876             case estORIRE_DTAV:
1877                 /* Not implemented yet */
1878                 break;
1879             default:
1880                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1881             }
1882         }
1883     }
1884 }
1885
1886 static char dim2char(int dim)
1887 {
1888     char c='?';
1889     
1890     switch (dim)
1891     {
1892     case XX: c = 'X'; break;
1893     case YY: c = 'Y'; break;
1894     case ZZ: c = 'Z'; break;
1895     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1896     }
1897     
1898     return c;
1899 }
1900
1901 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1902                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1903 {
1904     rvec grid_s[2],*grid_r=NULL,cx,r;
1905     char fname[STRLEN],format[STRLEN],buf[22];
1906     FILE *out;
1907     int  a,i,d,z,y,x;
1908     matrix tric;
1909     real vol;
1910
1911     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1912     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1913     
1914     if (DDMASTER(dd))
1915     {
1916         snew(grid_r,2*dd->nnodes);
1917     }
1918     
1919     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1920     
1921     if (DDMASTER(dd))
1922     {
1923         for(d=0; d<DIM; d++)
1924         {
1925             for(i=0; i<DIM; i++)
1926             {
1927                 if (d == i)
1928                 {
1929                     tric[d][i] = 1;
1930                 }
1931                 else
1932                 {
1933                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1934                     {
1935                         tric[d][i] = box[i][d]/box[i][i];
1936                     }
1937                     else
1938                     {
1939                         tric[d][i] = 0;
1940                     }
1941                 }
1942             }
1943         }
1944         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1945         sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
1946         out = gmx_fio_fopen(fname,"w");
1947         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1948         a = 1;
1949         for(i=0; i<dd->nnodes; i++)
1950         {
1951             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1952             for(d=0; d<DIM; d++)
1953             {
1954                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1955             }
1956             for(z=0; z<2; z++)
1957             {
1958                 for(y=0; y<2; y++)
1959                 {
1960                     for(x=0; x<2; x++)
1961                     {
1962                         cx[XX] = grid_r[i*2+x][XX];
1963                         cx[YY] = grid_r[i*2+y][YY];
1964                         cx[ZZ] = grid_r[i*2+z][ZZ];
1965                         mvmul(tric,cx,r);
1966                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1967                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1968                     }
1969                 }
1970             }
1971             for(d=0; d<DIM; d++)
1972             {
1973                 for(x=0; x<4; x++)
1974                 {
1975                     switch(d)
1976                     {
1977                     case 0: y = 1 + i*8 + 2*x; break;
1978                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1979                     case 2: y = 1 + i*8 + x; break;
1980                     }
1981                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1982                 }
1983             }
1984         }
1985         gmx_fio_fclose(out);
1986         sfree(grid_r);
1987     }
1988 }
1989
1990 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1991                   gmx_mtop_t *mtop,t_commrec *cr,
1992                   int natoms,rvec x[],matrix box)
1993 {
1994     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1995     FILE *out;
1996     int  i,ii,resnr,c;
1997     char *atomname,*resname;
1998     real b;
1999     gmx_domdec_t *dd;
2000     
2001     dd = cr->dd;
2002     if (natoms == -1)
2003     {
2004         natoms = dd->comm->nat[ddnatVSITE];
2005     }
2006     
2007     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
2008     
2009     sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
2010     sprintf(format4,"%s%s\n",get_pdbformat4(),"%6.2f%6.2f");
2011     
2012     out = gmx_fio_fopen(fname,"w");
2013     
2014     fprintf(out,"TITLE     %s\n",title);
2015     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
2016     for(i=0; i<natoms; i++)
2017     {
2018         ii = dd->gatindex[i];
2019         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
2020         if (i < dd->comm->nat[ddnatZONE])
2021         {
2022             c = 0;
2023             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2024             {
2025                 c++;
2026             }
2027             b = c;
2028         }
2029         else if (i < dd->comm->nat[ddnatVSITE])
2030         {
2031             b = dd->comm->zones.n;
2032         }
2033         else
2034         {
2035             b = dd->comm->zones.n + 1;
2036         }
2037         fprintf(out,strlen(atomname)<4 ? format : format4,
2038                 "ATOM",(ii+1)%100000,
2039                 atomname,resname,' ',resnr%10000,' ',
2040                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
2041     }
2042     fprintf(out,"TER\n");
2043     
2044     gmx_fio_fclose(out);
2045 }
2046
2047 real dd_cutoff_mbody(gmx_domdec_t *dd)
2048 {
2049     gmx_domdec_comm_t *comm;
2050     int  di;
2051     real r;
2052
2053     comm = dd->comm;
2054
2055     r = -1;
2056     if (comm->bInterCGBondeds)
2057     {
2058         if (comm->cutoff_mbody > 0)
2059         {
2060             r = comm->cutoff_mbody;
2061         }
2062         else
2063         {
2064             /* cutoff_mbody=0 means we do not have DLB */
2065             r = comm->cellsize_min[dd->dim[0]];
2066             for(di=1; di<dd->ndim; di++)
2067             {
2068                 r = min(r,comm->cellsize_min[dd->dim[di]]);
2069             }
2070             if (comm->bBondComm)
2071             {
2072                 r = max(r,comm->cutoff_mbody);
2073             }
2074             else
2075             {
2076                 r = min(r,comm->cutoff);
2077             }
2078         }
2079     }
2080
2081     return r;
2082 }
2083
2084 real dd_cutoff_twobody(gmx_domdec_t *dd)
2085 {
2086     real r_mb;
2087
2088     r_mb = dd_cutoff_mbody(dd);
2089
2090     return max(dd->comm->cutoff,r_mb);
2091 }
2092
2093
2094 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2095 {
2096     int nc,ntot;
2097     
2098     nc   = dd->nc[dd->comm->cartpmedim];
2099     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2100     copy_ivec(coord,coord_pme);
2101     coord_pme[dd->comm->cartpmedim] =
2102         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2103 }
2104
/* Map DD index ddindex (out of ndd) onto a PME index (out of npme).
 *
 * The mapping assumes that x is the major index for both sets of
 * nodes. npme/2 is added before the integer division by ndd to
 * spread the DD nodes evenly over the PME nodes.
 */
static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
{
    int scaled;

    scaled = ddindex*npme + npme/2;

    return scaled/ndd;
}
2113
2114 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2115 {
2116     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2117 }
2118
2119 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2120 {
2121     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2122 }
2123
2124 static int *dd_pmenodes(t_commrec *cr)
2125 {
2126     int *pmenodes;
2127     int n,i,p0,p1;
2128     
2129     snew(pmenodes,cr->npmenodes);
2130     n = 0;
2131     for(i=0; i<cr->dd->nnodes; i++) {
2132         p0 = cr_ddindex2pmeindex(cr,i);
2133         p1 = cr_ddindex2pmeindex(cr,i+1);
2134         if (i+1 == cr->dd->nnodes || p1 > p0) {
2135             if (debug)
2136                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2137             pmenodes[n] = i + 1 + n;
2138             n++;
2139         }
2140     }
2141
2142     return pmenodes;
2143 }
2144
2145 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2146 {
2147     gmx_domdec_t *dd;
2148     ivec coords,coords_pme,nc;
2149     int  slab;
2150     
2151     dd = cr->dd;
2152     /*
2153       if (dd->comm->bCartesian) {
2154       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2155       dd_coords2pmecoords(dd,coords,coords_pme);
2156       copy_ivec(dd->ntot,nc);
2157       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2158       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2159       
2160       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2161       } else {
2162       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2163       }
2164     */
2165     coords[XX] = x;
2166     coords[YY] = y;
2167     coords[ZZ] = z;
2168     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2169     
2170     return slab;
2171 }
2172
2173 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2174 {
2175     gmx_domdec_comm_t *comm;
2176     ivec coords;
2177     int  ddindex,nodeid=-1;
2178     
2179     comm = cr->dd->comm;
2180     
2181     coords[XX] = x;
2182     coords[YY] = y;
2183     coords[ZZ] = z;
2184     if (comm->bCartesianPP_PME)
2185     {
2186 #ifdef GMX_MPI
2187         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2188 #endif
2189     }
2190     else
2191     {
2192         ddindex = dd_index(cr->dd->nc,coords);
2193         if (comm->bCartesianPP)
2194         {
2195             nodeid = comm->ddindex2simnodeid[ddindex];
2196         }
2197         else
2198         {
2199             if (comm->pmenodes)
2200             {
2201                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2202             }
2203             else
2204             {
2205                 nodeid = ddindex;
2206             }
2207         }
2208     }
2209   
2210     return nodeid;
2211 }
2212
2213 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2214 {
2215     gmx_domdec_t *dd;
2216     gmx_domdec_comm_t *comm;
2217     ivec coord,coord_pme;
2218     int  i;
2219     int  pmenode=-1;
2220     
2221     dd = cr->dd;
2222     comm = dd->comm;
2223     
2224     /* This assumes a uniform x domain decomposition grid cell size */
2225     if (comm->bCartesianPP_PME)
2226     {
2227 #ifdef GMX_MPI
2228         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2229         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2230         {
2231             /* This is a PP node */
2232             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2233             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2234         }
2235 #endif
2236     }
2237     else if (comm->bCartesianPP)
2238     {
2239         if (sim_nodeid < dd->nnodes)
2240         {
2241             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2242         }
2243     }
2244     else
2245     {
2246         /* This assumes DD cells with identical x coordinates
2247          * are numbered sequentially.
2248          */
2249         if (dd->comm->pmenodes == NULL)
2250         {
2251             if (sim_nodeid < dd->nnodes)
2252             {
2253                 /* The DD index equals the nodeid */
2254                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2255             }
2256         }
2257         else
2258         {
2259             i = 0;
2260             while (sim_nodeid > dd->comm->pmenodes[i])
2261             {
2262                 i++;
2263             }
2264             if (sim_nodeid < dd->comm->pmenodes[i])
2265             {
2266                 pmenode = dd->comm->pmenodes[i];
2267             }
2268         }
2269     }
2270     
2271     return pmenode;
2272 }
2273
2274 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2275 {
2276     gmx_bool bPMEOnlyNode;
2277     
2278     if (DOMAINDECOMP(cr))
2279     {
2280         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2281     }
2282     else
2283     {
2284         bPMEOnlyNode = FALSE;
2285     }
2286     
2287     return bPMEOnlyNode;
2288 }
2289
2290 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2291                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2292 {
2293     gmx_domdec_t *dd;
2294     int x,y,z;
2295     ivec coord,coord_pme;
2296     
2297     dd = cr->dd;
2298     
2299     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2300     
2301     *nmy_ddnodes = 0;
2302     for(x=0; x<dd->nc[XX]; x++)
2303     {
2304         for(y=0; y<dd->nc[YY]; y++)
2305         {
2306             for(z=0; z<dd->nc[ZZ]; z++)
2307             {
2308                 if (dd->comm->bCartesianPP_PME)
2309                 {
2310                     coord[XX] = x;
2311                     coord[YY] = y;
2312                     coord[ZZ] = z;
2313                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2314                     if (dd->ci[XX] == coord_pme[XX] &&
2315                         dd->ci[YY] == coord_pme[YY] &&
2316                         dd->ci[ZZ] == coord_pme[ZZ])
2317                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2318                 }
2319                 else
2320                 {
2321                     /* The slab corresponds to the nodeid in the PME group */
2322                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2323                     {
2324                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2325                     }
2326                 }
2327             }
2328         }
2329     }
2330     
2331     /* The last PP-only node is the peer node */
2332     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2333     
2334     if (debug)
2335     {
2336         fprintf(debug,"Receive coordinates from PP nodes:");
2337         for(x=0; x<*nmy_ddnodes; x++)
2338         {
2339             fprintf(debug," %d",(*my_ddnodes)[x]);
2340         }
2341         fprintf(debug,"\n");
2342     }
2343 }
2344
2345 static gmx_bool receive_vir_ener(t_commrec *cr)
2346 {
2347     gmx_domdec_comm_t *comm;
2348     int  pmenode,coords[DIM],rank;
2349     gmx_bool bReceive;
2350     
2351     bReceive = TRUE;
2352     if (cr->npmenodes < cr->dd->nnodes)
2353     {
2354         comm = cr->dd->comm;
2355         if (comm->bCartesianPP_PME)
2356         {
2357             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2358 #ifdef GMX_MPI
2359             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2360             coords[comm->cartpmedim]++;
2361             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2362             {
2363                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2364                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2365                 {
2366                     /* This is not the last PP node for pmenode */
2367                     bReceive = FALSE;
2368                 }
2369             }
2370 #endif  
2371         }
2372         else
2373         {
2374             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2375             if (cr->sim_nodeid+1 < cr->nnodes &&
2376                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2377             {
2378                 /* This is not the last PP node for pmenode */
2379                 bReceive = FALSE;
2380             }
2381         }
2382     }
2383     
2384     return bReceive;
2385 }
2386
2387 static void set_zones_ncg_home(gmx_domdec_t *dd)
2388 {
2389     gmx_domdec_zones_t *zones;
2390     int i;
2391
2392     zones = &dd->comm->zones;
2393
2394     zones->cg_range[0] = 0;
2395     for(i=1; i<zones->n+1; i++)
2396     {
2397         zones->cg_range[i] = dd->ncg_home;
2398     }
2399 }
2400
2401 static void rebuild_cgindex(gmx_domdec_t *dd,
2402                             const int *gcgs_index,t_state *state)
2403 {
2404     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2405     
2406     ind = state->cg_gl;
2407     dd_cg_gl = dd->index_gl;
2408     cgindex  = dd->cgindex;
2409     nat = 0;
2410     cgindex[0] = nat;
2411     for(i=0; i<state->ncg_gl; i++)
2412     {
2413         cgindex[i] = nat;
2414         cg_gl = ind[i];
2415         dd_cg_gl[i] = cg_gl;
2416         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2417     }
2418     cgindex[i] = nat;
2419     
2420     dd->ncg_home = state->ncg_gl;
2421     dd->nat_home = nat;
2422
2423     set_zones_ncg_home(dd);
2424 }
2425
2426 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2427 {
2428     while (cg >= cginfo_mb->cg_end)
2429     {
2430         cginfo_mb++;
2431     }
2432
2433     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2434 }
2435
2436 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2437                           t_forcerec *fr,char *bLocalCG)
2438 {
2439     cginfo_mb_t *cginfo_mb;
2440     int *cginfo;
2441     int cg;
2442
2443     if (fr != NULL)
2444     {
2445         cginfo_mb = fr->cginfo_mb;
2446         cginfo    = fr->cginfo;
2447
2448         for(cg=cg0; cg<cg1; cg++)
2449         {
2450             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2451         }
2452     }
2453
2454     if (bLocalCG != NULL)
2455     {
2456         for(cg=cg0; cg<cg1; cg++)
2457         {
2458             bLocalCG[index_gl[cg]] = TRUE;
2459         }
2460     }
2461 }
2462
2463 static void make_dd_indices(gmx_domdec_t *dd,
2464                             const int *gcgs_index,int cg_start)
2465 {
2466     int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl;
2467     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2468     gmx_ga2la_t *ga2la;
2469     char *bLocalCG;
2470     gmx_bool bCGs;
2471
2472     bLocalCG = dd->comm->bLocalCG;
2473
2474     if (dd->nat_tot > dd->gatindex_nalloc)
2475     {
2476         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2477         srenew(dd->gatindex,dd->gatindex_nalloc);
2478     }
2479
2480     nzone      = dd->comm->zones.n;
2481     zone2cg    = dd->comm->zones.cg_range;
2482     zone_ncg1  = dd->comm->zone_ncg1;
2483     index_gl   = dd->index_gl;
2484     gatindex   = dd->gatindex;
2485     bCGs       = dd->comm->bCGs;
2486
2487     if (zone2cg[1] != dd->ncg_home)
2488     {
2489         gmx_incons("dd->ncg_zone is not up to date");
2490     }
2491     
2492     /* Make the local to global and global to local atom index */
2493     a = dd->cgindex[cg_start];
2494     for(zone=0; zone<nzone; zone++)
2495     {
2496         if (zone == 0)
2497         {
2498             cg0 = cg_start;
2499         }
2500         else
2501         {
2502             cg0 = zone2cg[zone];
2503         }
2504         cg1    = zone2cg[zone+1];
2505         cg1_p1 = cg0 + zone_ncg1[zone];
2506
2507         for(cg=cg0; cg<cg1; cg++)
2508         {
2509             zone1 = zone;
2510             if (cg >= cg1_p1)
2511             {
2512                 /* Signal that this cg is from more than one pulse away */
2513                 zone1 += nzone;
2514             }
2515             cg_gl = index_gl[cg];
2516             if (bCGs)
2517             {
2518                 for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2519                 {
2520                     gatindex[a] = a_gl;
2521                     ga2la_set(dd->ga2la,a_gl,a,zone1);
2522                     a++;
2523                 }
2524             }
2525             else
2526             {
2527                 gatindex[a] = cg_gl;
2528                 ga2la_set(dd->ga2la,cg_gl,a,zone1);
2529                 a++;
2530             }
2531         }
2532     }
2533 }
2534
2535 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2536                           const char *where)
2537 {
2538     int ncg,i,ngl,nerr;
2539
2540     nerr = 0;
2541     if (bLocalCG == NULL)
2542     {
2543         return nerr;
2544     }
2545     for(i=0; i<dd->ncg_tot; i++)
2546     {
2547         if (!bLocalCG[dd->index_gl[i]])
2548         {
2549             fprintf(stderr,
2550                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2551             nerr++;
2552         }
2553     }
2554     ngl = 0;
2555     for(i=0; i<ncg_sys; i++)
2556     {
2557         if (bLocalCG[i])
2558         {
2559             ngl++;
2560         }
2561     }
2562     if (ngl != dd->ncg_tot)
2563     {
2564         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2565         nerr++;
2566     }
2567
2568     return nerr;
2569 }
2570
/* Cross-check the local/global atom and charge-group bookkeeping of dd
 * and print every problem found to stderr; the messages carry the
 * caller-supplied tag where. Ends in gmx_fatal when at least one
 * counted inconsistency was found.
 *
 * dd          domain decomposition data
 * natoms_sys  number of global atom indices that are scanned
 * ncg_sys     number of global charge groups, passed on to
 *             check_bLocalCG
 * where       tag identifying the call site in the output
 *
 * NOTE(review): the duplicate-atom message, the "global atom indices /
 * local atoms" count mismatch and the "has no global index" message are
 * printed without incrementing nerr, so on their own they do not lead
 * to the fatal error - confirm this is intended.
 */
2571 static void check_index_consistency(gmx_domdec_t *dd,
2572                                     int natoms_sys,int ncg_sys,
2573                                     const char *where)
2574 {
2575     int  nerr,ngl,i,a,cell;
2576     int  *have;
2577
2578     nerr = 0;
2579
    /* Extra check at debug level above 1: report global atoms that
     * occur more than once in gatindex. have[] stores local index + 1,
     * so 0 means "not seen yet".
     */
2580     if (dd->comm->DD_debug > 1)
2581     {
2582         snew(have,natoms_sys);
2583         for(a=0; a<dd->nat_tot; a++)
2584         {
2585             if (have[dd->gatindex[a]] > 0)
2586             {
2587                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2588             }
2589             else
2590             {
2591                 have[dd->gatindex[a]] = a + 1;
2592             }
2593         }
2594         sfree(have);
2595     }
2596
    /* From here on have[] is indexed by local atom and marks the local
     * atoms that were found through the global-to-local lookup.
     */
2597     snew(have,dd->nat_tot);
2598
2599     ngl  = 0;
2600     for(i=0; i<natoms_sys; i++)
2601     {
        /* presumably ga2la_get returns non-zero when global atom i has
         * a local entry and then fills in a and cell - verify against
         * gmx_ga2la.h
         */
2602         if (ga2la_get(dd->ga2la,i,&a,&cell))
2603         {
2604             if (a >= dd->nat_tot)
2605             {
2606                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2607                 nerr++;
2608             }
2609             else
2610             {
2611                 have[a] = 1;
                /* The reverse mapping must lead back to the same atom */
2612                 if (dd->gatindex[a] != i)
2613                 {
2614                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2615                     nerr++;
2616                 }
2617             }
2618             ngl++;
2619         }
2620     }
2621     if (ngl != dd->nat_tot)
2622     {
2623         fprintf(stderr,
2624                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2625                 dd->rank,where,ngl,dd->nat_tot);
2626     }
    /* Report local atoms that no global index pointed at */
2627     for(a=0; a<dd->nat_tot; a++)
2628     {
2629         if (have[a] == 0)
2630         {
2631             fprintf(stderr,
2632                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2633                     dd->rank,where,a+1,dd->gatindex[a]+1);
2634         }
2635     }
2636     sfree(have);
2637
2638     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2639
2640     if (nerr > 0) {
2641         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2642                   dd->rank,where,nerr);
2643     }
2644 }
2645
2646 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2647 {
2648     int  i;
2649     char *bLocalCG;
2650
2651     if (a_start == 0)
2652     {
2653         /* Clear the whole list without searching */
2654         ga2la_clear(dd->ga2la);
2655     }
2656     else
2657     {
2658         for(i=a_start; i<dd->nat_tot; i++)
2659         {
2660             ga2la_del(dd->ga2la,dd->gatindex[i]);
2661         }
2662     }
2663
2664     bLocalCG = dd->comm->bLocalCG;
2665     if (bLocalCG)
2666     {
2667         for(i=cg_start; i<dd->ncg_tot; i++)
2668         {
2669             bLocalCG[dd->index_gl[i]] = FALSE;
2670         }
2671     }
2672
2673     dd_clear_local_vsite_indices(dd);
2674     
2675     if (dd->constraints)
2676     {
2677         dd_clear_local_constraint_indices(dd);
2678     }
2679 }
2680
2681 static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
2682                             int dim_ind)
2683 {
2684     real grid_jump_limit;
2685
2686     /* The distance between the boundaries of cells at distance
2687      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2688      * and by the fact that cells should not be shifted by more than
2689      * half their size, such that cg's only shift by one cell
2690      * at redecomposition.
2691      */
2692     grid_jump_limit = comm->cellsize_limit;
2693     if (!comm->bVacDLBNoLimit)
2694     {
2695         grid_jump_limit = max(grid_jump_limit,
2696                               cutoff/comm->cd[dim_ind].np);
2697     }
2698
2699     return grid_jump_limit;
2700 }
2701
2702 static gmx_bool check_grid_jump(gmx_large_int_t step,
2703                                 gmx_domdec_t *dd,
2704                                 real cutoff,
2705                                 gmx_ddbox_t *ddbox,
2706                                 gmx_bool bFatal)
2707 {
2708     gmx_domdec_comm_t *comm;
2709     int  d,dim;
2710     real limit,bfac;
2711     gmx_bool bInvalid;
2712
2713     bInvalid = FALSE;
2714
2715     comm = dd->comm;
2716     
2717     for(d=1; d<dd->ndim; d++)
2718     {
2719         dim = dd->dim[d];
2720         limit = grid_jump_limit(comm,cutoff,d);
2721         bfac = ddbox->box_size[dim];
2722         if (ddbox->tric_dir[dim])
2723         {
2724             bfac *= ddbox->skew_fac[dim];
2725         }
2726         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2727             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2728         {
2729             bInvalid = TRUE;
2730
2731             if (bFatal)
2732             {
2733                 char buf[22];
2734
2735                 /* This error should never be triggered under normal
2736                  * circumstances, but you never know ...
2737                  */
2738                 gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2739                           gmx_step_str(step,buf),
2740                           dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2741             }
2742         }
2743     }
2744
2745     return bInvalid;
2746 }
2747
2748 static int dd_load_count(gmx_domdec_comm_t *comm)
2749 {
2750     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2751 }
2752
2753 static float dd_force_load(gmx_domdec_comm_t *comm)
2754 {
2755     float load;
2756     
2757     if (comm->eFlop)
2758     {
2759         load = comm->flop;
2760         if (comm->eFlop > 1)
2761         {
2762             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2763         }
2764     } 
2765     else
2766     {
2767         load = comm->cycl[ddCyclF];
2768         if (comm->cycl_n[ddCyclF] > 1)
2769         {
2770             /* Subtract the maximum of the last n cycle counts
2771              * to get rid of possible high counts due to other soures,
2772              * for instance system activity, that would otherwise
2773              * affect the dynamic load balancing.
2774              */
2775             load -= comm->cycl_max[ddCyclF];
2776         }
2777     }
2778     
2779     return load;
2780 }
2781
2782 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2783 {
2784     gmx_domdec_comm_t *comm;
2785     int i;
2786     
2787     comm = dd->comm;
2788     
2789     snew(*dim_f,dd->nc[dim]+1);
2790     (*dim_f)[0] = 0;
2791     for(i=1; i<dd->nc[dim]; i++)
2792     {
2793         if (comm->slb_frac[dim])
2794         {
2795             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2796         }
2797         else
2798         {
2799             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2800         }
2801     }
2802     (*dim_f)[dd->nc[dim]] = 1;
2803 }
2804
/* Initialise the PME decomposition data ddpme for PME decomposition
 * index dimind.
 *
 * Sets ddpme->dim, ddpme->dim_match and ddpme->nslab. When there is
 * more than one slab it also allocates and fills ddpme->pp_min and
 * ddpme->pp_max (per slab, the range of PP cell coordinates mapped to
 * that slab) and ddpme->slb_dim_f (through set_slb_pme_dim_f).
 * With nslab <= 1 the function returns before touching those fields.
 */
2805 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2806 {
2807     int  pmeindex,slab,nso,i;
2808     ivec xyz;
2809     
    /* The first PME dimension is y instead of x when the first DD
     * dimension is y and there is a single PME node along x.
     */
2810     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2811     {
2812         ddpme->dim = YY;
2813     }
2814     else
2815     {
2816         ddpme->dim = dimind;
2817     }
2818     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2819     
2820     ddpme->nslab = (ddpme->dim == 0 ?
2821                     dd->comm->npmenodes_x :
2822                     dd->comm->npmenodes_y);
2823
2824     if (ddpme->nslab <= 1)
2825     {
2826         return;
2827     }
2828
    /* Number of PME nodes per slab; used below to turn a PME index
     * into a slab index for dimind == 0.
     */
2829     nso = dd->comm->npmenodes/ddpme->nslab;
2830     /* Determine for each PME slab the PP location range for dimension dim */
2831     snew(ddpme->pp_min,ddpme->nslab);
2832     snew(ddpme->pp_max,ddpme->nslab);
    /* Start from an inverted range so the min/max below can shrink it.
     * NOTE(review): the range is seeded with dd->nc[dd->dim[dimind]]
     * but accumulated over xyz[dimind]; these refer to different
     * dimensions when dd->dim[dimind] != dimind - confirm intended.
     */
2833     for(slab=0; slab<ddpme->nslab; slab++) {
2834         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2835         ddpme->pp_max[slab] = 0;
2836     }
2837     for(i=0; i<dd->nnodes; i++) {
2838         ddindex2xyz(dd->nc,i,xyz);
2839         /* For y only use our y/z slab.
2840          * This assumes that the PME x grid size matches the DD grid size.
2841          */
2842         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2843             pmeindex = ddindex2pmeindex(dd,i);
2844             if (dimind == 0) {
2845                 slab = pmeindex/nso;
2846             } else {
2847                 slab = pmeindex % ddpme->nslab;
2848             }
2849             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2850             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2851         }
2852     }
2853
2854     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2855 }
2856
2857 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2858 {
2859     if (dd->comm->ddpme[0].dim == XX)
2860     {
2861         return dd->comm->ddpme[0].maxshift;
2862     }
2863     else
2864     {
2865         return 0;
2866     }
2867 }
2868
2869 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2870 {
2871     if (dd->comm->ddpme[0].dim == YY)
2872     {
2873         return dd->comm->ddpme[0].maxshift;
2874     }
2875     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2876     {
2877         return dd->comm->ddpme[1].maxshift;
2878     }
2879     else
2880     {
2881         return 0;
2882     }
2883 }
2884
/* Determine ddpme->maxshift, the PME slab communication range for the
 * PME decomposition dimension described by ddpme.
 *
 * dd        domain decomposition data
 * ddpme     PME decomposition data; dim, dim_match, nslab, pp_min and
 *           pp_max are read, maxshift is written
 * bUniform  enables the shortcut for ns == nc
 * ddbox     DD box description, only box_size is read
 * cell_f    indexed with PP cell coordinates (and coordinate + 1);
 *           presumably the DD cell boundaries along ddpme->dim as box
 *           fractions - TODO confirm against the callers
 */
2885 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2886                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2887 {
2888     gmx_domdec_comm_t *comm;
2889     int  nc,ns,s;
2890     int  *xmin,*xmax;
2891     real range,pme_boundary;
2892     int  sh;
2893     
2894     comm = dd->comm;
2895     nc  = dd->nc[ddpme->dim];
2896     ns  = ddpme->nslab;
2897     
2898     if (!ddpme->dim_match)
2899     {
2900         /* PP decomposition is not along dim: the worst situation */
2901         sh = ns/2;
2902     }
2903     else if (ns <= 3 || (bUniform && ns == nc))
2904     {
2905         /* The optimal situation */
2906         sh = 1;
2907     }
2908     else
2909     {
2910         /* We need to check for all pme nodes which nodes they
2911          * could possibly need to communicate with.
2912          */
2913         xmin = ddpme->pp_min;
2914         xmax = ddpme->pp_max;
2915         /* Allow for atoms to be maximally 2/3 times the cut-off
2916          * out of their DD cell. This is a reasonable balance between
2917          * performance and support for most charge-group/cut-off
2918          * combinations.
2919          */
2920         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2921         /* Avoid extra communication when we are exactly at a boundary */
2922         range *= 0.999;
2923         
        /* sh only ever grows: it ends up as the largest shift needed
         * by any slab, capped at ns-1 by the sh+1 < ns conditions.
         */
2924         sh = 1;
2925         for(s=0; s<ns; s++)
2926         {
2927             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2928             pme_boundary = (real)s/ns;
            /* Lower side: slabs s-(sh+1); a negative slab index wraps
             * around by ns with the boundary shifted by -1.
             */
2929             while (sh+1 < ns &&
2930                    ((s-(sh+1) >= 0 &&
2931                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2932                     (s-(sh+1) <  0 &&
2933                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2934             {
2935                 sh++;
2936             }
2937             pme_boundary = (real)(s+1)/ns;
            /* Upper side: slabs s+(sh+1); an index of ns or more wraps
             * around by -ns with the boundary shifted by +1.
             */
2938             while (sh+1 < ns &&
2939                    ((s+(sh+1) <  ns &&
2940                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2941                     (s+(sh+1) >= ns &&
2942                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2943             {
2944                 sh++;
2945             }
2946         }
2947     }
2948     
2949     ddpme->maxshift = sh;
2950     
2951     if (debug)
2952     {
2953         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2954                 ddpme->dim,ddpme->maxshift);
2955     }
2956 }
2957
2958 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2959 {
2960     int d,dim;
2961     
2962     for(d=0; d<dd->ndim; d++)
2963     {
2964         dim = dd->dim[d];
2965         if (dim < ddbox->nboundeddim &&
2966             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2967             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2968         {
2969             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2970                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2971                       dd->nc[dim],dd->comm->cellsize_limit);
2972         }
2973     }
2974 }
2975
2976 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2977                                   gmx_bool bMaster,ivec npulse)
2978 {
2979     gmx_domdec_comm_t *comm;
2980     int  d,j;
2981     rvec cellsize_min;
2982     real *cell_x,cell_dx,cellsize;
2983     
2984     comm = dd->comm;
2985     
2986     for(d=0; d<DIM; d++)
2987     {
2988         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2989         npulse[d] = 1;
2990         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2991         {
2992             /* Uniform grid */
2993             cell_dx = ddbox->box_size[d]/dd->nc[d];
2994             if (bMaster)
2995             {
2996                 for(j=0; j<dd->nc[d]+1; j++)
2997                 {
2998                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2999                 }
3000             }
3001             else
3002             {
3003                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3004                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3005             }
3006             cellsize = cell_dx*ddbox->skew_fac[d];
3007             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3008             {
3009                 npulse[d]++;
3010             }
3011             cellsize_min[d] = cellsize;
3012         }
3013         else
3014         {
3015             /* Statically load balanced grid */
3016             /* Also when we are not doing a master distribution we determine
3017              * all cell borders in a loop to obtain identical values
3018              * to the master distribution case and to determine npulse.
3019              */
3020             if (bMaster)
3021             {
3022                 cell_x = dd->ma->cell_x[d];
3023             }
3024             else
3025             {
3026                 snew(cell_x,dd->nc[d]+1);
3027             }
3028             cell_x[0] = ddbox->box0[d];
3029             for(j=0; j<dd->nc[d]; j++)
3030             {
3031                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
3032                 cell_x[j+1] = cell_x[j] + cell_dx;
3033                 cellsize = cell_dx*ddbox->skew_fac[d];
3034                 while (cellsize*npulse[d] < comm->cutoff &&
3035                        npulse[d] < dd->nc[d]-1)
3036                 {
3037                     npulse[d]++;
3038                 }
3039                 cellsize_min[d] = min(cellsize_min[d],cellsize);
3040             }
3041             if (!bMaster)
3042             {
3043                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3044                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3045                 sfree(cell_x);
3046             }
3047         }
3048         /* The following limitation is to avoid that a cell would receive
3049          * some of its own home charge groups back over the periodic boundary.
3050          * Double charge groups cause trouble with the global indices.
3051          */
3052         if (d < ddbox->npbcdim &&
3053             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3054         {
3055             gmx_fatal_collective(FARGS,NULL,dd,
3056                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3057                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
3058                                  comm->cutoff,
3059                                  dd->nc[d],dd->nc[d],
3060                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
3061         }
3062     }
3063     
3064     if (!comm->bDynLoadBal)
3065     {
3066         copy_rvec(cellsize_min,comm->cellsize_min);
3067     }
3068    
3069     for(d=0; d<comm->npmedecompdim; d++)
3070     {
3071         set_pme_maxshift(dd,&comm->ddpme[d],
3072                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
3073                          comm->ddpme[d].slb_dim_f);
3074     }
3075 }
3076
3077
3078 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3079                                        int d,int dim,gmx_domdec_root_t *root,
3080                                        gmx_ddbox_t *ddbox,
3081                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
3082 {
3083     gmx_domdec_comm_t *comm;
3084     int  ncd,i,j,nmin,nmin_old;
3085     gmx_bool bLimLo,bLimHi;
3086     real *cell_size;
3087     real fac,halfway,cellsize_limit_f_i,region_size;
3088     gmx_bool bPBC,bLastHi=FALSE;
3089     int nrange[]={range[0],range[1]};
3090
3091     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
3092
3093     comm = dd->comm;
3094
3095     ncd = dd->nc[dim];
3096
3097     bPBC = (dim < ddbox->npbcdim);
3098
3099     cell_size = root->buf_ncd;
3100
3101     if (debug) 
3102     {
3103         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
3104     }
3105
3106     /* First we need to check if the scaling does not make cells
3107      * smaller than the smallest allowed size.
3108      * We need to do this iteratively, since if a cell is too small,
3109      * it needs to be enlarged, which makes all the other cells smaller,
3110      * which could in turn make another cell smaller than allowed.
3111      */
3112     for(i=range[0]; i<range[1]; i++)
3113     {
3114         root->bCellMin[i] = FALSE;
3115     }
3116     nmin = 0;
3117     do
3118     {
3119         nmin_old = nmin;
3120         /* We need the total for normalization */
3121         fac = 0;
3122         for(i=range[0]; i<range[1]; i++)
3123         {
3124             if (root->bCellMin[i] == FALSE)
3125             {
3126                 fac += cell_size[i];
3127             }
3128         }
3129         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3130         /* Determine the cell boundaries */
3131         for(i=range[0]; i<range[1]; i++)
3132         {
3133             if (root->bCellMin[i] == FALSE)
3134             {
3135                 cell_size[i] *= fac;
3136                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3137                 {
3138                     cellsize_limit_f_i = 0;
3139                 }
3140                 else
3141                 {
3142                     cellsize_limit_f_i = cellsize_limit_f;
3143                 }
3144                 if (cell_size[i] < cellsize_limit_f_i)
3145                 {
3146                     root->bCellMin[i] = TRUE;
3147                     cell_size[i] = cellsize_limit_f_i;
3148                     nmin++;
3149                 }
3150             }
3151             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3152         }
3153     }
3154     while (nmin > nmin_old);
3155     
3156     i=range[1]-1;
3157     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3158     /* For this check we should not use DD_CELL_MARGIN,
3159      * but a slightly smaller factor,
3160      * since rounding could get use below the limit.
3161      */
3162     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3163     {
3164         char buf[22];
3165         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3166                   gmx_step_str(step,buf),
3167                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3168                   ncd,comm->cellsize_min[dim]);
3169     }
3170     
3171     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3172     
3173     if (!bUniform)
3174     {
3175         /* Check if the boundary did not displace more than halfway
3176          * each of the cells it bounds, as this could cause problems,
3177          * especially when the differences between cell sizes are large.
3178          * If changes are applied, they will not make cells smaller
3179          * than the cut-off, as we check all the boundaries which
3180          * might be affected by a change and if the old state was ok,
3181          * the cells will at most be shrunk back to their old size.
3182          */
3183         for(i=range[0]+1; i<range[1]; i++)
3184         {
3185             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3186             if (root->cell_f[i] < halfway)
3187             {
3188                 root->cell_f[i] = halfway;
3189                 /* Check if the change also causes shifts of the next boundaries */
3190                 for(j=i+1; j<range[1]; j++)
3191                 {
3192                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3193                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3194                 }
3195             }
3196             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3197             if (root->cell_f[i] > halfway)
3198             {
3199                 root->cell_f[i] = halfway;
3200                 /* Check if the change also causes shifts of the next boundaries */
3201                 for(j=i-1; j>=range[0]+1; j--)
3202                 {
3203                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3204                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3205                 }
3206             }
3207         }
3208     }
3209     
3210     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3211     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3212      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3213      * for a and b nrange is used */
3214     if (d > 0)
3215     {
3216         /* Take care of the staggering of the cell boundaries */
3217         if (bUniform)
3218         {
3219             for(i=range[0]; i<range[1]; i++)
3220             {
3221                 root->cell_f_max0[i] = root->cell_f[i];
3222                 root->cell_f_min1[i] = root->cell_f[i+1];
3223             }
3224         }
3225         else
3226         {
3227             for(i=range[0]+1; i<range[1]; i++)
3228             {
3229                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3230                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3231                 if (bLimLo && bLimHi)
3232                 {
3233                     /* Both limits violated, try the best we can */
3234                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3235                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3236                     nrange[0]=range[0];
3237                     nrange[1]=i;
3238                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3239
3240                     nrange[0]=i;
3241                     nrange[1]=range[1];
3242                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3243
3244                     return;
3245                 }
3246                 else if (bLimLo)
3247                 {
3248                     /* root->cell_f[i] = root->bound_min[i]; */
3249                     nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3250                     bLastHi=FALSE;
3251                 }
3252                 else if (bLimHi && !bLastHi)
3253                 {
3254                     bLastHi=TRUE;
3255                     if (nrange[1] < range[1])   /* found a LimLo before */
3256                     {
3257                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3258                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3259                         nrange[0]=nrange[1];
3260                     }
3261                     root->cell_f[i] = root->bound_max[i];
3262                     nrange[1]=i; 
3263                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3264                     nrange[0]=i;
3265                     nrange[1]=range[1];
3266                 }
3267             }
3268             if (nrange[1] < range[1])   /* found last a LimLo */
3269             {
3270                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3271                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3272                 nrange[0]=nrange[1];
3273                 nrange[1]=range[1];
3274                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3275             } 
3276             else if (nrange[0] > range[0]) /* found at least one LimHi */
3277             {
3278                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3279             }
3280         }
3281     }
3282 }
3283
3284
3285 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3286                                        int d,int dim,gmx_domdec_root_t *root,
3287                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3288                                        gmx_bool bUniform,gmx_large_int_t step)
3289 {
3290     gmx_domdec_comm_t *comm;
3291     int  ncd,d1,i,j,pos;
3292     real *cell_size;
3293     real load_aver,load_i,imbalance,change,change_max,sc;
3294     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3295     real change_limit;
3296     real relax = 0.5;
3297     gmx_bool bPBC;
3298     int range[] = { 0, 0 };
3299
3300     comm = dd->comm;
3301
3302     /* Convert the maximum change from the input percentage to a fraction */
3303     change_limit = comm->dlb_scale_lim*0.01;
3304
3305     ncd = dd->nc[dim];
3306
3307     bPBC = (dim < ddbox->npbcdim);
3308
3309     cell_size = root->buf_ncd;
3310
3311     /* Store the original boundaries */
3312     for(i=0; i<ncd+1; i++)
3313     {
3314         root->old_cell_f[i] = root->cell_f[i];
3315     }
3316     if (bUniform) {
3317         for(i=0; i<ncd; i++)
3318         {
3319             cell_size[i] = 1.0/ncd;
3320         }
3321     }
3322     else if (dd_load_count(comm))
3323     {
3324         load_aver = comm->load[d].sum_m/ncd;
3325         change_max = 0;
3326         for(i=0; i<ncd; i++)
3327         {
3328             /* Determine the relative imbalance of cell i */
3329             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3330             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3331             /* Determine the change of the cell size using underrelaxation */
3332             change = -relax*imbalance;
3333             change_max = max(change_max,max(change,-change));
3334         }
3335         /* Limit the amount of scaling.
3336          * We need to use the same rescaling for all cells in one row,
3337          * otherwise the load balancing might not converge.
3338          */
3339         sc = relax;
3340         if (change_max > change_limit)
3341         {
3342             sc *= change_limit/change_max;
3343         }
3344         for(i=0; i<ncd; i++)
3345         {
3346             /* Determine the relative imbalance of cell i */
3347             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3348             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3349             /* Determine the change of the cell size using underrelaxation */
3350             change = -sc*imbalance;
3351             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3352         }
3353     }
3354     
3355     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3356     cellsize_limit_f *= DD_CELL_MARGIN;
3357     dist_min_f_hard   = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim];
3358     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3359     if (ddbox->tric_dir[dim])
3360     {
3361         cellsize_limit_f /= ddbox->skew_fac[dim];
3362         dist_min_f       /= ddbox->skew_fac[dim];
3363     }
3364     if (bDynamicBox && d > 0)
3365     {
3366         dist_min_f *= DD_PRES_SCALE_MARGIN;
3367     }
3368     if (d > 0 && !bUniform)
3369     {
3370         /* Make sure that the grid is not shifted too much */
3371         for(i=1; i<ncd; i++) {
3372             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
3373             {
3374                 gmx_incons("Inconsistent DD boundary staggering limits!");
3375             }
3376             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3377             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3378             if (space > 0) {
3379                 root->bound_min[i] += 0.5*space;
3380             }
3381             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3382             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3383             if (space < 0) {
3384                 root->bound_max[i] += 0.5*space;
3385             }
3386             if (debug)
3387             {
3388                 fprintf(debug,
3389                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3390                         d,i,
3391                         root->cell_f_max0[i-1] + dist_min_f,
3392                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3393                         root->cell_f_min1[i] - dist_min_f);
3394             }
3395         }
3396     }
3397     range[1]=ncd;
3398     root->cell_f[0] = 0;
3399     root->cell_f[ncd] = 1;
3400     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3401
3402
3403     /* After the checks above, the cells should obey the cut-off
3404      * restrictions, but it does not hurt to check.
3405      */
3406     for(i=0; i<ncd; i++)
3407     {
3408         if (debug)
3409         {
3410             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3411                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3412         }
3413
3414         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3415             root->cell_f[i+1] - root->cell_f[i] <
3416             cellsize_limit_f/DD_CELL_MARGIN)
3417         {
3418             char buf[22];
3419             fprintf(stderr,
3420                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3421                     gmx_step_str(step,buf),dim2char(dim),i,
3422                     (root->cell_f[i+1] - root->cell_f[i])
3423                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3424         }
3425     }
3426     
3427     pos = ncd + 1;
3428     /* Store the cell boundaries of the lower dimensions at the end */
3429     for(d1=0; d1<d; d1++)
3430     {
3431         root->cell_f[pos++] = comm->cell_f0[d1];
3432         root->cell_f[pos++] = comm->cell_f1[d1];
3433     }
3434     
3435     if (d < comm->npmedecompdim)
3436     {
3437         /* The master determines the maximum shift for
3438          * the coordinate communication between separate PME nodes.
3439          */
3440         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3441     }
3442     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3443     if (d >= 1)
3444     {
3445         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3446     }
3447 }    
3448
3449 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3450                                              gmx_ddbox_t *ddbox,int dimind)
3451 {
3452     gmx_domdec_comm_t *comm;
3453     int dim;
3454
3455     comm = dd->comm;
3456
3457     /* Set the cell dimensions */
3458     dim = dd->dim[dimind];
3459     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3460     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3461     if (dim >= ddbox->nboundeddim)
3462     {
3463         comm->cell_x0[dim] += ddbox->box0[dim];
3464         comm->cell_x1[dim] += ddbox->box0[dim];
3465     }
3466 }
3467
3468 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3469                                          int d,int dim,real *cell_f_row,
3470                                          gmx_ddbox_t *ddbox)
3471 {
3472     gmx_domdec_comm_t *comm;
3473     int d1,dim1,pos;
3474
3475     comm = dd->comm;
3476
3477 #ifdef GMX_MPI
3478     /* Each node would only need to know two fractions,
3479      * but it is probably cheaper to broadcast the whole array.
3480      */
3481     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3482               0,comm->mpi_comm_load[d]);
3483 #endif
3484     /* Copy the fractions for this dimension from the buffer */
3485     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3486     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3487     /* The whole array was communicated, so set the buffer position */
3488     pos = dd->nc[dim] + 1;
3489     for(d1=0; d1<=d; d1++)
3490     {
3491         if (d1 < d)
3492         {
3493             /* Copy the cell fractions of the lower dimensions */
3494             comm->cell_f0[d1] = cell_f_row[pos++];
3495             comm->cell_f1[d1] = cell_f_row[pos++];
3496         }
3497         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3498     }
3499     /* Convert the communicated shift from float to int */
3500     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3501     if (d >= 1)
3502     {
3503         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3504     }
3505 }
3506
3507 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3508                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3509                                          gmx_bool bUniform,gmx_large_int_t step)
3510 {
3511     gmx_domdec_comm_t *comm;
3512     int d,dim,d1;
3513     gmx_bool bRowMember,bRowRoot;
3514     real *cell_f_row;
3515     
3516     comm = dd->comm;
3517
3518     for(d=0; d<dd->ndim; d++)
3519     {
3520         dim = dd->dim[d];
3521         bRowMember = TRUE;
3522         bRowRoot = TRUE;
3523         for(d1=d; d1<dd->ndim; d1++)
3524         {
3525             if (dd->ci[dd->dim[d1]] > 0)
3526             {
3527                 if (d1 > d)
3528                 {
3529                     bRowMember = FALSE;
3530                 }
3531                 bRowRoot = FALSE;
3532             }
3533         }
3534         if (bRowMember)
3535         {
3536             if (bRowRoot)
3537             {
3538                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3539                                            ddbox,bDynamicBox,bUniform,step);
3540                 cell_f_row = comm->root[d]->cell_f;
3541             }
3542             else
3543             {
3544                 cell_f_row = comm->cell_f_row;
3545             }
3546             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3547         }
3548     }
3549 }    
3550
3551 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3552 {
3553     int d;
3554
3555     /* This function assumes the box is static and should therefore
3556      * not be called when the box has changed since the last
3557      * call to dd_partition_system.
3558      */
3559     for(d=0; d<dd->ndim; d++)
3560     {
3561         relative_to_absolute_cell_bounds(dd,ddbox,d); 
3562     }
3563 }
3564
3565
3566
3567 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3568                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3569                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3570                                   gmx_wallcycle_t wcycle)
3571 {
3572     gmx_domdec_comm_t *comm;
3573     int dim;
3574
3575     comm = dd->comm;
3576     
3577     if (bDoDLB)
3578     {
3579         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3580         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3581         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3582     }
3583     else if (bDynamicBox)
3584     {
3585         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3586     }
3587     
3588     /* Set the dimensions for which no DD is used */
3589     for(dim=0; dim<DIM; dim++) {
3590         if (dd->nc[dim] == 1) {
3591             comm->cell_x0[dim] = 0;
3592             comm->cell_x1[dim] = ddbox->box_size[dim];
3593             if (dim >= ddbox->nboundeddim)
3594             {
3595                 comm->cell_x0[dim] += ddbox->box0[dim];
3596                 comm->cell_x1[dim] += ddbox->box0[dim];
3597             }
3598         }
3599     }
3600 }
3601
3602 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3603 {
3604     int d,np,i;
3605     gmx_domdec_comm_dim_t *cd;
3606     
3607     for(d=0; d<dd->ndim; d++)
3608     {
3609         cd = &dd->comm->cd[d];
3610         np = npulse[dd->dim[d]];
3611         if (np > cd->np_nalloc)
3612         {
3613             if (debug)
3614             {
3615                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3616                         dim2char(dd->dim[d]),np);
3617             }
3618             if (DDMASTER(dd) && cd->np_nalloc > 0)
3619             {
3620                 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3621             }
3622             srenew(cd->ind,np);
3623             for(i=cd->np_nalloc; i<np; i++)
3624             {
3625                 cd->ind[i].index  = NULL;
3626                 cd->ind[i].nalloc = 0;
3627             }
3628             cd->np_nalloc = np;
3629         }
3630         cd->np = np;
3631     }
3632 }
3633
3634
3635 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3636                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3637                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3638                               gmx_wallcycle_t wcycle)
3639 {
3640     gmx_domdec_comm_t *comm;
3641     int  d;
3642     ivec npulse;
3643     
3644     comm = dd->comm;
3645
3646     /* Copy the old cell boundaries for the cg displacement check */
3647     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3648     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3649     
3650     if (comm->bDynLoadBal)
3651     {
3652         if (DDMASTER(dd))
3653         {
3654             check_box_size(dd,ddbox);
3655         }
3656         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3657     }
3658     else
3659     {
3660         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3661         realloc_comm_ind(dd,npulse);
3662     }
3663     
3664     if (debug)
3665     {
3666         for(d=0; d<DIM; d++)
3667         {
3668             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3669                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3670         }
3671     }
3672 }
3673
3674 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3675                                   gmx_ddbox_t *ddbox,
3676                                   rvec cell_ns_x0,rvec cell_ns_x1,
3677                                   gmx_large_int_t step)
3678 {
3679     gmx_domdec_comm_t *comm;
3680     int dim_ind,dim;
3681     
3682     comm = dd->comm;
3683
3684     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3685     {
3686         dim = dd->dim[dim_ind];
3687         
3688         /* Without PBC we don't have restrictions on the outer cells */
3689         if (!(dim >= ddbox->npbcdim && 
3690               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3691             comm->bDynLoadBal &&
3692             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3693             comm->cellsize_min[dim])
3694         {
3695             char buf[22];
3696             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3697                       gmx_step_str(step,buf),dim2char(dim),
3698                       comm->cell_x1[dim] - comm->cell_x0[dim],
3699                       ddbox->skew_fac[dim],
3700                       dd->comm->cellsize_min[dim],
3701                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3702         }
3703     }
3704     
3705     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3706     {
3707         /* Communicate the boundaries and update cell_ns_x0/1 */
3708         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3709         if (dd->bGridJump && dd->ndim > 1)
3710         {
3711             check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE);
3712         }
3713     }
3714 }
3715
3716 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3717 {
3718     if (YY < npbcdim)
3719     {
3720         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3721     }
3722     else
3723     {
3724         tcm[YY][XX] = 0;
3725     }
3726     if (ZZ < npbcdim)
3727     {
3728         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3729         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3730     }
3731     else
3732     {
3733         tcm[ZZ][XX] = 0;
3734         tcm[ZZ][YY] = 0;
3735     }
3736 }
3737
3738 static void check_screw_box(matrix box)
3739 {
3740     /* Mathematical limitation */
3741     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3742     {
3743         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3744     }
3745     
3746     /* Limitation due to the asymmetry of the eighth shell method */
3747     if (box[ZZ][YY] != 0)
3748     {
3749         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3750     }
3751 }
3752
3753 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3754                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3755                           gmx_domdec_t *dd)
3756 {
3757     gmx_domdec_master_t *ma;
3758     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3759     int  i,icg,j,k,k0,k1,d,npbcdim;
3760     matrix tcm;
3761     rvec box_size,cg_cm;
3762     ivec ind;
3763     real nrcg,inv_ncg,pos_d;
3764     atom_id *cgindex;
3765     gmx_bool bUnbounded,bScrew;
3766
3767     ma = dd->ma;
3768     
3769     if (tmp_ind == NULL)
3770     {
3771         snew(tmp_nalloc,dd->nnodes);
3772         snew(tmp_ind,dd->nnodes);
3773         for(i=0; i<dd->nnodes; i++)
3774         {
3775             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3776             snew(tmp_ind[i],tmp_nalloc[i]);
3777         }
3778     }
3779     
3780     /* Clear the count */
3781     for(i=0; i<dd->nnodes; i++)
3782     {
3783         ma->ncg[i] = 0;
3784         ma->nat[i] = 0;
3785     }
3786     
3787     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3788     
3789     cgindex = cgs->index;
3790     
3791     /* Compute the center of geometry for all charge groups */
3792     for(icg=0; icg<cgs->nr; icg++)
3793     {
3794         k0      = cgindex[icg];
3795         k1      = cgindex[icg+1];
3796         nrcg    = k1 - k0;
3797         if (nrcg == 1)
3798         {
3799             copy_rvec(pos[k0],cg_cm);
3800         }
3801         else
3802         {
3803             inv_ncg = 1.0/nrcg;
3804             
3805             clear_rvec(cg_cm);
3806             for(k=k0; (k<k1); k++)
3807             {
3808                 rvec_inc(cg_cm,pos[k]);
3809             }
3810             for(d=0; (d<DIM); d++)
3811             {
3812                 cg_cm[d] *= inv_ncg;
3813             }
3814         }
3815         /* Put the charge group in the box and determine the cell index */
3816         for(d=DIM-1; d>=0; d--) {
3817             pos_d = cg_cm[d];
3818             if (d < dd->npbcdim)
3819             {
3820                 bScrew = (dd->bScrewPBC && d == XX);
3821                 if (tric_dir[d] && dd->nc[d] > 1)
3822                 {
3823                     /* Use triclinic coordintates for this dimension */
3824                     for(j=d+1; j<DIM; j++)
3825                     {
3826                         pos_d += cg_cm[j]*tcm[j][d];
3827                     }
3828                 }
3829                 while(pos_d >= box[d][d])
3830                 {
3831                     pos_d -= box[d][d];
3832                     rvec_dec(cg_cm,box[d]);
3833                     if (bScrew)
3834                     {
3835                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3836                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3837                     }
3838                     for(k=k0; (k<k1); k++)
3839                     {
3840                         rvec_dec(pos[k],box[d]);
3841                         if (bScrew)
3842                         {
3843                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3844                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3845                         }
3846                     }
3847                 }
3848                 while(pos_d < 0)
3849                 {
3850                     pos_d += box[d][d];
3851                     rvec_inc(cg_cm,box[d]);
3852                     if (bScrew)
3853                     {
3854                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3855                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3856                     }
3857                     for(k=k0; (k<k1); k++)
3858                     {
3859                         rvec_inc(pos[k],box[d]);
3860                         if (bScrew) {
3861                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3862                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3863                         }
3864                     }
3865                 }
3866             }
3867             /* This could be done more efficiently */
3868             ind[d] = 0;
3869             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3870             {
3871                 ind[d]++;
3872             }
3873         }
3874         i = dd_index(dd->nc,ind);
3875         if (ma->ncg[i] == tmp_nalloc[i])
3876         {
3877             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3878             srenew(tmp_ind[i],tmp_nalloc[i]);
3879         }
3880         tmp_ind[i][ma->ncg[i]] = icg;
3881         ma->ncg[i]++;
3882         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3883     }
3884     
3885     k1 = 0;
3886     for(i=0; i<dd->nnodes; i++)
3887     {
3888         ma->index[i] = k1;
3889         for(k=0; k<ma->ncg[i]; k++)
3890         {
3891             ma->cg[k1++] = tmp_ind[i][k];
3892         }
3893     }
3894     ma->index[dd->nnodes] = k1;
3895     
3896     for(i=0; i<dd->nnodes; i++)
3897     {
3898         sfree(tmp_ind[i]);
3899     }
3900     sfree(tmp_ind);
3901     sfree(tmp_nalloc);
3902     
3903     if (fplog)
3904     {
3905         char buf[22];
3906         fprintf(fplog,"Charge group distribution at step %s:",
3907                 gmx_step_str(step,buf));
3908         for(i=0; i<dd->nnodes; i++)
3909         {
3910             fprintf(fplog," %d",ma->ncg[i]);
3911         }
3912         fprintf(fplog,"\n");
3913     }
3914 }
3915
3916 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3917                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3918                                 rvec pos[])
3919 {
3920     gmx_domdec_master_t *ma=NULL;
3921     ivec npulse;
3922     int  i,cg_gl;
3923     int  *ibuf,buf2[2] = { 0, 0 };
3924     gmx_bool bMaster = DDMASTER(dd);
3925     if (bMaster)
3926     {
3927         ma = dd->ma;
3928         
3929         if (dd->bScrewPBC)
3930         {
3931             check_screw_box(box);
3932         }
3933     
3934         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3935     
3936         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3937         for(i=0; i<dd->nnodes; i++)
3938         {
3939             ma->ibuf[2*i]   = ma->ncg[i];
3940             ma->ibuf[2*i+1] = ma->nat[i];
3941         }
3942         ibuf = ma->ibuf;
3943     }
3944     else
3945     {
3946         ibuf = NULL;
3947     }
3948     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3949     
3950     dd->ncg_home = buf2[0];
3951     dd->nat_home = buf2[1];
3952     dd->ncg_tot  = dd->ncg_home;
3953     dd->nat_tot  = dd->nat_home;
3954     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3955     {
3956         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3957         srenew(dd->index_gl,dd->cg_nalloc);
3958         srenew(dd->cgindex,dd->cg_nalloc+1);
3959     }
3960     if (bMaster)
3961     {
3962         for(i=0; i<dd->nnodes; i++)
3963         {
3964             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3965             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3966         }
3967     }
3968     
3969     dd_scatterv(dd,
3970                 DDMASTER(dd) ? ma->ibuf : NULL,
3971                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3972                 DDMASTER(dd) ? ma->cg : NULL,
3973                 dd->ncg_home*sizeof(int),dd->index_gl);
3974     
3975     /* Determine the home charge group sizes */
3976     dd->cgindex[0] = 0;
3977     for(i=0; i<dd->ncg_home; i++)
3978     {
3979         cg_gl = dd->index_gl[i];
3980         dd->cgindex[i+1] =
3981             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3982     }
3983     
3984     if (debug)
3985     {
3986         fprintf(debug,"Home charge groups:\n");
3987         for(i=0; i<dd->ncg_home; i++)
3988         {
3989             fprintf(debug," %d",dd->index_gl[i]);
3990             if (i % 10 == 9) 
3991                 fprintf(debug,"\n");
3992         }
3993         fprintf(debug,"\n");
3994     }
3995 }
3996
3997 static int compact_and_copy_vec_at(int ncg,int *move,
3998                                    int *cgindex,
3999                                    int nvec,int vec,
4000                                    rvec *src,gmx_domdec_comm_t *comm,
4001                                    gmx_bool bCompact)
4002 {
4003     int m,icg,i,i0,i1,nrcg;
4004     int home_pos;
4005     int pos_vec[DIM*2];
4006     
4007     home_pos = 0;
4008
4009     for(m=0; m<DIM*2; m++)
4010     {
4011         pos_vec[m] = 0;
4012     }
4013     
4014     i0 = 0;
4015     for(icg=0; icg<ncg; icg++)
4016     {
4017         i1 = cgindex[icg+1];
4018         m = move[icg];
4019         if (m == -1)
4020         {
4021             if (bCompact)
4022             {
4023                 /* Compact the home array in place */
4024                 for(i=i0; i<i1; i++)
4025                 {
4026                     copy_rvec(src[i],src[home_pos++]);
4027                 }
4028             }
4029         }
4030         else
4031         {
4032             /* Copy to the communication buffer */
4033             nrcg = i1 - i0;
4034             pos_vec[m] += 1 + vec*nrcg;
4035             for(i=i0; i<i1; i++)
4036             {
4037                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
4038             }
4039             pos_vec[m] += (nvec - vec - 1)*nrcg;
4040         }
4041         if (!bCompact)
4042         {
4043             home_pos += i1 - i0;
4044         }
4045         i0 = i1;
4046     }
4047     
4048     return home_pos;
4049 }
4050
4051 static int compact_and_copy_vec_cg(int ncg,int *move,
4052                                    int *cgindex,
4053                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
4054                                    gmx_bool bCompact)
4055 {
4056     int m,icg,i0,i1,nrcg;
4057     int home_pos;
4058     int pos_vec[DIM*2];
4059     
4060     home_pos = 0;
4061     
4062     for(m=0; m<DIM*2; m++)
4063     {
4064         pos_vec[m] = 0;
4065     }
4066     
4067     i0 = 0;
4068     for(icg=0; icg<ncg; icg++)
4069     {
4070         i1 = cgindex[icg+1];
4071         m = move[icg];
4072         if (m == -1)
4073         {
4074             if (bCompact)
4075             {
4076                 /* Compact the home array in place */
4077                 copy_rvec(src[icg],src[home_pos++]);
4078             }
4079         }
4080         else
4081         {
4082             nrcg = i1 - i0;
4083             /* Copy to the communication buffer */
4084             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
4085             pos_vec[m] += 1 + nrcg*nvec;
4086         }
4087         i0 = i1;
4088     }
4089     if (!bCompact)
4090     {
4091         home_pos = ncg;
4092     }
4093     
4094     return home_pos;
4095 }
4096
4097 static int compact_ind(int ncg,int *move,
4098                        int *index_gl,int *cgindex,
4099                        int *gatindex,
4100                        gmx_ga2la_t ga2la,char *bLocalCG,
4101                        int *cginfo)
4102 {
4103     int cg,nat,a0,a1,a,a_gl;
4104     int home_pos;
4105
4106     home_pos = 0;
4107     nat = 0;
4108     for(cg=0; cg<ncg; cg++)
4109     {
4110         a0 = cgindex[cg];
4111         a1 = cgindex[cg+1];
4112         if (move[cg] == -1)
4113         {
4114             /* Compact the home arrays in place.
4115              * Anything that can be done here avoids access to global arrays.
4116              */
4117             cgindex[home_pos] = nat;
4118             for(a=a0; a<a1; a++)
4119             {
4120                 a_gl = gatindex[a];
4121                 gatindex[nat] = a_gl;
4122                 /* The cell number stays 0, so we don't need to set it */
4123                 ga2la_change_la(ga2la,a_gl,nat);
4124                 nat++;
4125             }
4126             index_gl[home_pos] = index_gl[cg];
4127             cginfo[home_pos]   = cginfo[cg];
4128             /* The charge group remains local, so bLocalCG does not change */
4129             home_pos++;
4130         }
4131         else
4132         {
4133             /* Clear the global indices */
4134             for(a=a0; a<a1; a++)
4135             {
4136                 ga2la_del(ga2la,gatindex[a]);
4137             }
4138             if (bLocalCG)
4139             {
4140                 bLocalCG[index_gl[cg]] = FALSE;
4141             }
4142         }
4143     }
4144     cgindex[home_pos] = nat;
4145     
4146     return home_pos;
4147 }
4148
4149 static void clear_and_mark_ind(int ncg,int *move,
4150                                int *index_gl,int *cgindex,int *gatindex,
4151                                gmx_ga2la_t ga2la,char *bLocalCG,
4152                                int *cell_index)
4153 {
4154     int cg,a0,a1,a;
4155     
4156     for(cg=0; cg<ncg; cg++)
4157     {
4158         if (move[cg] >= 0)
4159         {
4160             a0 = cgindex[cg];
4161             a1 = cgindex[cg+1];
4162             /* Clear the global indices */
4163             for(a=a0; a<a1; a++)
4164             {
4165                 ga2la_del(ga2la,gatindex[a]);
4166             }
4167             if (bLocalCG)
4168             {
4169                 bLocalCG[index_gl[cg]] = FALSE;
4170             }
4171             /* Signal that this cg has moved using the ns cell index.
4172              * Here we set it to -1. fill_grid will change it
4173              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4174              */
4175             cell_index[cg] = -1;
4176         }
4177     }
4178 }
4179
4180 static void print_cg_move(FILE *fplog,
4181                           gmx_domdec_t *dd,
4182                           gmx_large_int_t step,int cg,int dim,int dir,
4183                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4184                           rvec cm_old,rvec cm_new,real pos_d)
4185 {
4186     gmx_domdec_comm_t *comm;
4187     char buf[22];
4188
4189     comm = dd->comm;
4190
4191     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4192     if (bHaveLimitdAndCMOld)
4193     {
4194         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4195                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4196     }
4197     else
4198     {
4199         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4200                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4201     }
4202     fprintf(fplog,"distance out of cell %f\n",
4203             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4204     if (bHaveLimitdAndCMOld)
4205     {
4206         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4207                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4208     }
4209     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4210             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4211     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4212             dim2char(dim),
4213             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4214     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4215             dim2char(dim),
4216             comm->cell_x0[dim],comm->cell_x1[dim]);
4217 }
4218
4219 static void cg_move_error(FILE *fplog,
4220                           gmx_domdec_t *dd,
4221                           gmx_large_int_t step,int cg,int dim,int dir,
4222                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4223                           rvec cm_old,rvec cm_new,real pos_d)
4224 {
4225     if (fplog)
4226     {
4227         print_cg_move(fplog, dd,step,cg,dim,dir,
4228                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4229     }
4230     print_cg_move(stderr,dd,step,cg,dim,dir,
4231                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4232     gmx_fatal(FARGS,
4233               "A charge group moved too far between two domain decomposition steps\n"
4234               "This usually means that your system is not well equilibrated");
4235 }
4236
4237 static void rotate_state_atom(t_state *state,int a)
4238 {
4239     int est;
4240
4241     for(est=0; est<estNR; est++)
4242     {
4243         if (EST_DISTR(est) && (state->flags & (1<<est))) {
4244             switch (est) {
4245             case estX:
4246                 /* Rotate the complete state; for a rectangular box only */
4247                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4248                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4249                 break;
4250             case estV:
4251                 state->v[a][YY] = -state->v[a][YY];
4252                 state->v[a][ZZ] = -state->v[a][ZZ];
4253                 break;
4254             case estSDX:
4255                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4256                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4257                 break;
4258             case estCGP:
4259                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4260                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4261                 break;
4262             case estDISRE_INITF:
4263             case estDISRE_RM3TAV:
4264             case estORIRE_INITF:
4265             case estORIRE_DTAV:
4266                 /* These are distances, so not affected by rotation */
4267                 break;
4268             default:
4269                 gmx_incons("Unknown state entry encountered in rotate_state_atom");            
4270             }
4271         }
4272     }
4273 }
4274
4275 static int *get_moved(gmx_domdec_comm_t *comm,int natoms)
4276 {
4277     if (natoms > comm->moved_nalloc)
4278     {
4279         /* Contents should be preserved here */
4280         comm->moved_nalloc = over_alloc_dd(natoms);
4281         srenew(comm->moved,comm->moved_nalloc);
4282     }
4283
4284     return comm->moved;
4285 }
4286
4287 static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
4288                          gmx_domdec_t *dd,
4289                          t_state *state,
4290                          ivec tric_dir,matrix tcm,
4291                          rvec cell_x0,rvec cell_x1,
4292                          rvec limitd,rvec limit0,rvec limit1,
4293                          const int *cgindex,
4294                          int cg_start,int cg_end,
4295                          rvec *cg_cm,
4296                          int *move)
4297 {
4298     int  npbcdim;
4299     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4300     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4301     int  flag;
4302     gmx_bool bScrew;
4303     ivec dev;
4304     real inv_ncg,pos_d;
4305     rvec cm_new;
4306
4307     npbcdim = dd->npbcdim;
4308
4309     for(cg=cg_start; cg<cg_end; cg++)
4310     {
4311         k0   = cgindex[cg];
4312         k1   = cgindex[cg+1];
4313         nrcg = k1 - k0;
4314         if (nrcg == 1)
4315         {
4316             copy_rvec(state->x[k0],cm_new);
4317         }
4318         else
4319         {
4320             inv_ncg = 1.0/nrcg;
4321             
4322             clear_rvec(cm_new);
4323             for(k=k0; (k<k1); k++)
4324             {
4325                 rvec_inc(cm_new,state->x[k]);
4326             }
4327             for(d=0; (d<DIM); d++)
4328             {
4329                 cm_new[d] = inv_ncg*cm_new[d];
4330             }
4331         }
4332         
4333         clear_ivec(dev);
4334         /* Do pbc and check DD cell boundary crossings */
4335         for(d=DIM-1; d>=0; d--)
4336         {
4337             if (dd->nc[d] > 1)
4338             {
4339                 bScrew = (dd->bScrewPBC && d == XX);
4340                 /* Determine the location of this cg in lattice coordinates */
4341                 pos_d = cm_new[d];
4342                 if (tric_dir[d])
4343                 {
4344                     for(d2=d+1; d2<DIM; d2++)
4345                     {
4346                         pos_d += cm_new[d2]*tcm[d2][d];
4347                     }
4348                 }
4349                 /* Put the charge group in the triclinic unit-cell */
4350                 if (pos_d >= cell_x1[d])
4351                 {
4352                     if (pos_d >= limit1[d])
4353                     {
4354                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4355                                       cg_cm[cg],cm_new,pos_d);
4356                     }
4357                     dev[d] = 1;
4358                     if (dd->ci[d] == dd->nc[d] - 1)
4359                     {
4360                         rvec_dec(cm_new,state->box[d]);
4361                         if (bScrew)
4362                         {
4363                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4364                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4365                         }
4366                         for(k=k0; (k<k1); k++)
4367                         {
4368                             rvec_dec(state->x[k],state->box[d]);
4369                             if (bScrew)
4370                             {
4371                                 rotate_state_atom(state,k);
4372                             }
4373                         }
4374                     }
4375                 }
4376                 else if (pos_d < cell_x0[d])
4377                 {
4378                     if (pos_d < limit0[d])
4379                     {
4380                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4381                                       cg_cm[cg],cm_new,pos_d);
4382                     }
4383                     dev[d] = -1;
4384                     if (dd->ci[d] == 0)
4385                     {
4386                         rvec_inc(cm_new,state->box[d]);
4387                         if (bScrew)
4388                         {
4389                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4390                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4391                         }
4392                         for(k=k0; (k<k1); k++)
4393                         {
4394                             rvec_inc(state->x[k],state->box[d]);
4395                             if (bScrew)
4396                             {
4397                                 rotate_state_atom(state,k);
4398                             }
4399                         }
4400                     }
4401                 }
4402             }
4403             else if (d < npbcdim)
4404             {
4405                 /* Put the charge group in the rectangular unit-cell */
4406                 while (cm_new[d] >= state->box[d][d])
4407                 {
4408                     rvec_dec(cm_new,state->box[d]);
4409                     for(k=k0; (k<k1); k++)
4410                     {
4411                         rvec_dec(state->x[k],state->box[d]);
4412                     }
4413                 }
4414                 while (cm_new[d] < 0)
4415                 {
4416                     rvec_inc(cm_new,state->box[d]);
4417                     for(k=k0; (k<k1); k++)
4418                     {
4419                         rvec_inc(state->x[k],state->box[d]);
4420                     }
4421                 }
4422             }
4423         }
4424     
4425         copy_rvec(cm_new,cg_cm[cg]);
4426         
4427         /* Determine where this cg should go */
4428         flag = 0;
4429         mc = -1;
4430         for(d=0; d<dd->ndim; d++)
4431         {
4432             dim = dd->dim[d];
4433             if (dev[dim] == 1)
4434             {
4435                 flag |= DD_FLAG_FW(d);
4436                 if (mc == -1)
4437                 {
4438                     mc = d*2;
4439                 }
4440             }
4441             else if (dev[dim] == -1)
4442             {
4443                 flag |= DD_FLAG_BW(d);
4444                 if (mc == -1) {
4445                     if (dd->nc[dim] > 2)
4446                     {
4447                         mc = d*2 + 1;
4448                     }
4449                     else
4450                     {
4451                         mc = d*2;
4452                     }
4453                 }
4454             }
4455         }
4456         /* Temporarily store the flag in move */
4457         move[cg] = mc + flag;
4458     }
4459 }
4460
4461 static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4462                                gmx_domdec_t *dd,ivec tric_dir,
4463                                t_state *state,rvec **f,
4464                                t_forcerec *fr,t_mdatoms *md,
4465                                gmx_bool bCompact,
4466                                t_nrnb *nrnb,
4467                                int *ncg_stay_home,
4468                                int *ncg_moved)
4469 {
4470     int  *move;
4471     int  npbcdim;
4472     int  ncg[DIM*2],nat[DIM*2];
4473     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4474     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4475     int  sbuf[2],rbuf[2];
4476     int  home_pos_cg,home_pos_at,buf_pos;
4477     int  flag;
4478     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4479     gmx_bool bScrew;
4480     ivec dev;
4481     real inv_ncg,pos_d;
4482     matrix tcm;
4483     rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4484     atom_id *cgindex;
4485     cginfo_mb_t *cginfo_mb;
4486     gmx_domdec_comm_t *comm;
4487     int  *moved;
4488     int  nthread,thread;
4489     
4490     if (dd->bScrewPBC)
4491     {
4492         check_screw_box(state->box);
4493     }
4494     
4495     comm  = dd->comm;
4496     if (fr->cutoff_scheme == ecutsGROUP)
4497     {
4498         cg_cm = fr->cg_cm;
4499     }
4500     
4501     for(i=0; i<estNR; i++)
4502     {
4503         if (EST_DISTR(i))
4504         {
4505             switch (i)
4506             {
4507             case estX:   /* Always present */            break;
4508             case estV:   bV   = (state->flags & (1<<i)); break;
4509             case estSDX: bSDX = (state->flags & (1<<i)); break;
4510             case estCGP: bCGP = (state->flags & (1<<i)); break;
4511             case estLD_RNG:
4512             case estLD_RNGI:
4513             case estDISRE_INITF:
4514             case estDISRE_RM3TAV:
4515             case estORIRE_INITF:
4516             case estORIRE_DTAV:
4517                 /* No processing required */
4518                 break;
4519             default:
4520             gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4521             }
4522         }
4523     }
4524     
4525     if (dd->ncg_tot > comm->nalloc_int)
4526     {
4527         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4528         srenew(comm->buf_int,comm->nalloc_int);
4529     }
4530     move = comm->buf_int;
4531     
4532     /* Clear the count */
4533     for(c=0; c<dd->ndim*2; c++)
4534     {
4535         ncg[c] = 0;
4536         nat[c] = 0;
4537     }
4538
4539     npbcdim = dd->npbcdim;
4540
4541     for(d=0; (d<DIM); d++)
4542     {
4543         limitd[d] = dd->comm->cellsize_min[d];
4544         if (d >= npbcdim && dd->ci[d] == 0)
4545         {
4546             cell_x0[d] = -GMX_FLOAT_MAX;
4547         }
4548         else
4549         {
4550             cell_x0[d] = comm->cell_x0[d];
4551         }
4552         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4553         {
4554             cell_x1[d] = GMX_FLOAT_MAX;
4555         }
4556         else
4557         {
4558             cell_x1[d] = comm->cell_x1[d];
4559         }
4560         if (d < npbcdim)
4561         {
4562             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4563             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4564         }
4565         else
4566         {
4567             /* We check after communication if a charge group moved
4568              * more than one cell. Set the pre-comm check limit to float_max.
4569              */
4570             limit0[d] = -GMX_FLOAT_MAX;
4571             limit1[d] =  GMX_FLOAT_MAX;
4572         }
4573     }
4574     
4575     make_tric_corr_matrix(npbcdim,state->box,tcm);
4576     
4577     cgindex = dd->cgindex;
4578
4579     nthread = gmx_omp_nthreads_get(emntDomdec);
4580
4581     /* Compute the center of geometry for all home charge groups
4582      * and put them in the box and determine where they should go.
4583      */
4584 #pragma omp parallel for num_threads(nthread) schedule(static)
4585     for(thread=0; thread<nthread; thread++)
4586     {
4587         calc_cg_move(fplog,step,dd,state,tric_dir,tcm,
4588                      cell_x0,cell_x1,limitd,limit0,limit1,
4589                      cgindex,
4590                      ( thread   *dd->ncg_home)/nthread,
4591                      ((thread+1)*dd->ncg_home)/nthread,
4592                      fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x,
4593                      move);
4594     }
4595
4596     for(cg=0; cg<dd->ncg_home; cg++)
4597     {
4598         if (move[cg] >= 0)
4599         {
4600             mc = move[cg];
4601             flag     = mc & ~DD_FLAG_NRCG;
4602             mc       = mc & DD_FLAG_NRCG;
4603             move[cg] = mc;
4604
4605             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4606             {
4607                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4608                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4609             }
4610             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4611             /* We store the cg size in the lower 16 bits
4612              * and the place where the charge group should go
4613              * in the next 6 bits. This saves some communication volume.
4614              */
4615             nrcg = cgindex[cg+1] - cgindex[cg];
4616             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4617             ncg[mc] += 1;
4618             nat[mc] += nrcg;
4619         }
4620     }
4621     
4622     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4623     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4624
4625     *ncg_moved = 0;
4626     for(i=0; i<dd->ndim*2; i++)
4627     {
4628         *ncg_moved += ncg[i];
4629     }
4630     
4631     nvec = 1;
4632     if (bV)
4633     {
4634         nvec++;
4635     }
4636     if (bSDX)
4637     {
4638         nvec++;
4639     }
4640     if (bCGP)
4641     {
4642         nvec++;
4643     }
4644     
4645     /* Make sure the communication buffers are large enough */
4646     for(mc=0; mc<dd->ndim*2; mc++)
4647     {
4648         nvr = ncg[mc] + nat[mc]*nvec;
4649         if (nvr > comm->cgcm_state_nalloc[mc])
4650         {
4651             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4652             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4653         }
4654     }
4655     
4656     switch (fr->cutoff_scheme)
4657     {
4658     case ecutsGROUP:
4659         /* Recalculating cg_cm might be cheaper than communicating,
4660          * but that could give rise to rounding issues.
4661          */
4662         home_pos_cg =
4663             compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4664                                     nvec,cg_cm,comm,bCompact);
4665     break;
4666     case ecutsVERLET:
4667         /* Without charge groups we send the moved atom coordinates
4668          * over twice. This is so the code below can be used without
4669          * many conditionals for both for with and without charge groups.
4670          */
4671         home_pos_cg =
4672             compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4673                                     nvec,state->x,comm,FALSE);
4674         if (bCompact)
4675         {
4676             home_pos_cg -= *ncg_moved;
4677         }
4678         break;
4679     default:
4680         gmx_incons("unimplemented");
4681         home_pos_cg = 0;
4682     }
4683     
4684     vec = 0;
4685     home_pos_at =
4686         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4687                                 nvec,vec++,state->x,comm,bCompact);
4688     if (bV)
4689     {
4690         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4691                                 nvec,vec++,state->v,comm,bCompact);
4692     }
4693     if (bSDX)
4694     {
4695         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4696                                 nvec,vec++,state->sd_X,comm,bCompact);
4697     }
4698     if (bCGP)
4699     {
4700         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4701                                 nvec,vec++,state->cg_p,comm,bCompact);
4702     }
4703     
4704     if (bCompact)
4705     {
4706         compact_ind(dd->ncg_home,move,
4707                     dd->index_gl,dd->cgindex,dd->gatindex,
4708                     dd->ga2la,comm->bLocalCG,
4709                     fr->cginfo);
4710     }
4711     else
4712     {
4713         if (fr->cutoff_scheme == ecutsVERLET)
4714         {
4715             moved = get_moved(comm,dd->ncg_home);
4716
4717             for(k=0; k<dd->ncg_home; k++)
4718             {
4719                 moved[k] = 0;
4720             }
4721         }
4722         else
4723         {
4724             moved = fr->ns.grid->cell_index;
4725         }
4726
4727         clear_and_mark_ind(dd->ncg_home,move,
4728                            dd->index_gl,dd->cgindex,dd->gatindex,
4729                            dd->ga2la,comm->bLocalCG,
4730                            moved);
4731     }
4732     
4733     cginfo_mb = fr->cginfo_mb;
4734
4735     *ncg_stay_home = home_pos_cg;
4736     for(d=0; d<dd->ndim; d++)
4737     {
4738         dim = dd->dim[d];
4739         ncg_recv = 0;
4740         nat_recv = 0;
4741         nvr      = 0;
4742         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4743         {
4744             cdd = d*2 + dir;
4745             /* Communicate the cg and atom counts */
4746             sbuf[0] = ncg[cdd];
4747             sbuf[1] = nat[cdd];
4748             if (debug)
4749             {
4750                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4751                         d,dir,sbuf[0],sbuf[1]);
4752             }
4753             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4754             
4755             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4756             {
4757                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4758                 srenew(comm->buf_int,comm->nalloc_int);
4759             }
4760             
4761             /* Communicate the charge group indices, sizes and flags */
4762             dd_sendrecv_int(dd, d, dir,
4763                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4764                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4765             
4766             nvs = ncg[cdd] + nat[cdd]*nvec;
4767             i   = rbuf[0]  + rbuf[1] *nvec;
4768             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4769             
4770             /* Communicate cgcm and state */
4771             dd_sendrecv_rvec(dd, d, dir,
4772                              comm->cgcm_state[cdd], nvs,
4773                              comm->vbuf.v+nvr, i);
4774             ncg_recv += rbuf[0];
4775             nat_recv += rbuf[1];
4776             nvr      += i;
4777         }
4778         
4779         /* Process the received charge groups */
4780         buf_pos = 0;
4781         for(cg=0; cg<ncg_recv; cg++)
4782         {
4783             flag = comm->buf_int[cg*DD_CGIBS+1];
4784
4785             if (dim >= npbcdim && dd->nc[dim] > 2)
4786             {
4787                 /* No pbc in this dim and more than one domain boundary.
4788                  * We do a separate check if a charge group didn't move too far.
4789                  */
4790                 if (((flag & DD_FLAG_FW(d)) &&
4791                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4792                     ((flag & DD_FLAG_BW(d)) &&
4793                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4794                 {
4795                     cg_move_error(fplog,dd,step,cg,dim,
4796                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4797                                    FALSE,0,
4798                                    comm->vbuf.v[buf_pos],
4799                                    comm->vbuf.v[buf_pos],
4800                                    comm->vbuf.v[buf_pos][dim]);
4801                 }
4802             }
4803
4804             mc = -1;
4805             if (d < dd->ndim-1)
4806             {
4807                 /* Check which direction this cg should go */
4808                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4809                 {
4810                     if (dd->bGridJump)
4811                     {
4812                         /* The cell boundaries for dimension d2 are not equal
4813                          * for each cell row of the lower dimension(s),
4814                          * therefore we might need to redetermine where
4815                          * this cg should go.
4816                          */
4817                         dim2 = dd->dim[d2];
4818                         /* If this cg crosses the box boundary in dimension d2
4819                          * we can use the communicated flag, so we do not
4820                          * have to worry about pbc.
4821                          */
4822                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4823                                (flag & DD_FLAG_FW(d2))) ||
4824                               (dd->ci[dim2] == 0 &&
4825                                (flag & DD_FLAG_BW(d2)))))
4826                         {
4827                             /* Clear the two flags for this dimension */
4828                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4829                             /* Determine the location of this cg
4830                              * in lattice coordinates
4831                              */
4832                             pos_d = comm->vbuf.v[buf_pos][dim2];
4833                             if (tric_dir[dim2])
4834                             {
4835                                 for(d3=dim2+1; d3<DIM; d3++)
4836                                 {
4837                                     pos_d +=
4838                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4839                                 }
4840                             }
4841                             /* Check of we are not at the box edge.
4842                              * pbc is only handled in the first step above,
4843                              * but this check could move over pbc while
4844                              * the first step did not due to different rounding.
4845                              */
4846                             if (pos_d >= cell_x1[dim2] &&
4847                                 dd->ci[dim2] != dd->nc[dim2]-1)
4848                             {
4849                                 flag |= DD_FLAG_FW(d2);
4850                             }
4851                             else if (pos_d < cell_x0[dim2] &&
4852                                      dd->ci[dim2] != 0)
4853                             {
4854                                 flag |= DD_FLAG_BW(d2);
4855                             }
4856                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4857                         }
4858                     }
4859                     /* Set to which neighboring cell this cg should go */
4860                     if (flag & DD_FLAG_FW(d2))
4861                     {
4862                         mc = d2*2;
4863                     }
4864                     else if (flag & DD_FLAG_BW(d2))
4865                     {
4866                         if (dd->nc[dd->dim[d2]] > 2)
4867                         {
4868                             mc = d2*2+1;
4869                         }
4870                         else
4871                         {
4872                             mc = d2*2;
4873                         }
4874                     }
4875                 }
4876             }
4877             
4878             nrcg = flag & DD_FLAG_NRCG;
4879             if (mc == -1)
4880             {
4881                 if (home_pos_cg+1 > dd->cg_nalloc)
4882                 {
4883                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4884                     srenew(dd->index_gl,dd->cg_nalloc);
4885                     srenew(dd->cgindex,dd->cg_nalloc+1);
4886                 }
4887                 /* Set the global charge group index and size */
4888                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4889                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4890                 /* Copy the state from the buffer */
4891                 dd_check_alloc_ncg(fr,state,f,home_pos_cg+1);
4892                 if (fr->cutoff_scheme == ecutsGROUP)
4893                 {
4894                     cg_cm = fr->cg_cm;
4895                     copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]);
4896                 }
4897                 buf_pos++;
4898
4899                 /* Set the cginfo */
4900                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4901                                                    dd->index_gl[home_pos_cg]);
4902                 if (comm->bLocalCG)
4903                 {
4904                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4905                 }
4906
4907                 if (home_pos_at+nrcg > state->nalloc)
4908                 {
4909                     dd_realloc_state(state,f,home_pos_at+nrcg);
4910                 }
4911                 for(i=0; i<nrcg; i++)
4912                 {
4913                     copy_rvec(comm->vbuf.v[buf_pos++],
4914                               state->x[home_pos_at+i]);
4915                 }
4916                 if (bV)
4917                 {
4918                     for(i=0; i<nrcg; i++)
4919                     {
4920                         copy_rvec(comm->vbuf.v[buf_pos++],
4921                                   state->v[home_pos_at+i]);
4922                     }
4923                 }
4924                 if (bSDX)
4925                 {
4926                     for(i=0; i<nrcg; i++)
4927                     {
4928                         copy_rvec(comm->vbuf.v[buf_pos++],
4929                                   state->sd_X[home_pos_at+i]);
4930                     }
4931                 }
4932                 if (bCGP)
4933                 {
4934                     for(i=0; i<nrcg; i++)
4935                     {
4936                         copy_rvec(comm->vbuf.v[buf_pos++],
4937                                   state->cg_p[home_pos_at+i]);
4938                     }
4939                 }
4940                 home_pos_cg += 1;
4941                 home_pos_at += nrcg;
4942             }
4943             else
4944             {
4945                 /* Reallocate the buffers if necessary  */
4946                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4947                 {
4948                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4949                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4950                 }
4951                 nvr = ncg[mc] + nat[mc]*nvec;
4952                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4953                 {
4954                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4955                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4956                 }
4957                 /* Copy from the receive to the send buffers */
4958                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4959                        comm->buf_int + cg*DD_CGIBS,
4960                        DD_CGIBS*sizeof(int));
4961                 memcpy(comm->cgcm_state[mc][nvr],
4962                        comm->vbuf.v[buf_pos],
4963                        (1+nrcg*nvec)*sizeof(rvec));
4964                 buf_pos += 1 + nrcg*nvec;
4965                 ncg[mc] += 1;
4966                 nat[mc] += nrcg;
4967             }
4968         }
4969     }
4970     
4971     /* With sorting (!bCompact) the indices are now only partially up to date
4972      * and ncg_home and nat_home are not the real count, since there are
4973      * "holes" in the arrays for the charge groups that moved to neighbors.
4974      */
4975     if (fr->cutoff_scheme == ecutsVERLET)
4976     {
4977         moved = get_moved(comm,home_pos_cg);
4978
4979         for(i=dd->ncg_home; i<home_pos_cg; i++)
4980         {
4981             moved[i] = 0;
4982         }
4983     }
4984     dd->ncg_home = home_pos_cg;
4985     dd->nat_home = home_pos_at;
4986
4987     if (debug)
4988     {
4989         fprintf(debug,
4990                 "Finished repartitioning: cgs moved out %d, new home %d\n",
4991                 *ncg_moved,dd->ncg_home-*ncg_moved);
4992                 
4993     }
4994 }
4995
4996 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4997 {
4998     dd->comm->cycl[ddCycl] += cycles;
4999     dd->comm->cycl_n[ddCycl]++;
5000     if (cycles > dd->comm->cycl_max[ddCycl])
5001     {
5002         dd->comm->cycl_max[ddCycl] = cycles;
5003     }
5004 }
5005
5006 static double force_flop_count(t_nrnb *nrnb)
5007 {
5008     int i;
5009     double sum;
5010     const char *name;
5011
5012     sum = 0;
5013     for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
5014     {
5015         /* To get closer to the real timings, we half the count
5016          * for the normal loops and again half it for water loops.
5017          */
5018         name = nrnb_str(i);
5019         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5020         {
5021             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5022         }
5023         else
5024         {
5025             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5026         }
5027     }
5028     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
5029     {
5030         name = nrnb_str(i);
5031         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5032         sum += nrnb->n[i]*cost_nrnb(i);
5033     }
5034     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
5035     {
5036         sum += nrnb->n[i]*cost_nrnb(i);
5037     }
5038
5039     return sum;
5040 }
5041
5042 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
5043 {
5044     if (dd->comm->eFlop)
5045     {
5046         dd->comm->flop -= force_flop_count(nrnb);
5047     }
5048 }
5049 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
5050 {
5051     if (dd->comm->eFlop)
5052     {
5053         dd->comm->flop += force_flop_count(nrnb);
5054         dd->comm->flop_n++;
5055     }
5056 }  
5057
5058 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5059 {
5060     int i;
5061     
5062     for(i=0; i<ddCyclNr; i++)
5063     {
5064         dd->comm->cycl[i] = 0;
5065         dd->comm->cycl_n[i] = 0;
5066         dd->comm->cycl_max[i] = 0;
5067     }
5068     dd->comm->flop = 0;
5069     dd->comm->flop_n = 0;
5070 }
5071
5072 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
5073 {
5074     gmx_domdec_comm_t *comm;
5075     gmx_domdec_load_t *load;
5076     gmx_domdec_root_t *root=NULL;
5077     int  d,dim,cid,i,pos;
5078     float cell_frac=0,sbuf[DD_NLOAD_MAX];
5079     gmx_bool bSepPME;
5080     
5081     if (debug)
5082     {
5083         fprintf(debug,"get_load_distribution start\n");
5084     }
5085
5086     wallcycle_start(wcycle,ewcDDCOMMLOAD);
5087     
5088     comm = dd->comm;
5089     
5090     bSepPME = (dd->pme_nodeid >= 0);
5091     
5092     for(d=dd->ndim-1; d>=0; d--)
5093     {
5094         dim = dd->dim[d];
5095         /* Check if we participate in the communication in this dimension */
5096         if (d == dd->ndim-1 || 
5097             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
5098         {
5099             load = &comm->load[d];
5100             if (dd->bGridJump)
5101             {
5102                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5103             }
5104             pos = 0;
5105             if (d == dd->ndim-1)
5106             {
5107                 sbuf[pos++] = dd_force_load(comm);
5108                 sbuf[pos++] = sbuf[0];
5109                 if (dd->bGridJump)
5110                 {
5111                     sbuf[pos++] = sbuf[0];
5112                     sbuf[pos++] = cell_frac;
5113                     if (d > 0)
5114                     {
5115                         sbuf[pos++] = comm->cell_f_max0[d];
5116                         sbuf[pos++] = comm->cell_f_min1[d];
5117                     }
5118                 }
5119                 if (bSepPME)
5120                 {
5121                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5122                     sbuf[pos++] = comm->cycl[ddCyclPME];
5123                 }
5124             }
5125             else
5126             {
5127                 sbuf[pos++] = comm->load[d+1].sum;
5128                 sbuf[pos++] = comm->load[d+1].max;
5129                 if (dd->bGridJump)
5130                 {
5131                     sbuf[pos++] = comm->load[d+1].sum_m;
5132                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5133                     sbuf[pos++] = comm->load[d+1].flags;
5134                     if (d > 0)
5135                     {
5136                         sbuf[pos++] = comm->cell_f_max0[d];
5137                         sbuf[pos++] = comm->cell_f_min1[d];
5138                     }
5139                 }
5140                 if (bSepPME)
5141                 {
5142                     sbuf[pos++] = comm->load[d+1].mdf;
5143                     sbuf[pos++] = comm->load[d+1].pme;
5144                 }
5145             }
5146             load->nload = pos;
5147             /* Communicate a row in DD direction d.
5148              * The communicators are setup such that the root always has rank 0.
5149              */
5150 #ifdef GMX_MPI
5151             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
5152                        load->load,load->nload*sizeof(float),MPI_BYTE,
5153                        0,comm->mpi_comm_load[d]);
5154 #endif
5155             if (dd->ci[dim] == dd->master_ci[dim])
5156             {
5157                 /* We are the root, process this row */
5158                 if (comm->bDynLoadBal)
5159                 {
5160                     root = comm->root[d];
5161                 }
5162                 load->sum = 0;
5163                 load->max = 0;
5164                 load->sum_m = 0;
5165                 load->cvol_min = 1;
5166                 load->flags = 0;
5167                 load->mdf = 0;
5168                 load->pme = 0;
5169                 pos = 0;
5170                 for(i=0; i<dd->nc[dim]; i++)
5171                 {
5172                     load->sum += load->load[pos++];
5173                     load->max = max(load->max,load->load[pos]);
5174                     pos++;
5175                     if (dd->bGridJump)
5176                     {
5177                         if (root->bLimited)
5178                         {
5179                             /* This direction could not be load balanced properly,
5180                              * therefore we need to use the maximum iso the average load.
5181                              */
5182                             load->sum_m = max(load->sum_m,load->load[pos]);
5183                         }
5184                         else
5185                         {
5186                             load->sum_m += load->load[pos];
5187                         }
5188                         pos++;
5189                         load->cvol_min = min(load->cvol_min,load->load[pos]);
5190                         pos++;
5191                         if (d < dd->ndim-1)
5192                         {
5193                             load->flags = (int)(load->load[pos++] + 0.5);
5194                         }
5195                         if (d > 0)
5196                         {
5197                             root->cell_f_max0[i] = load->load[pos++];
5198                             root->cell_f_min1[i] = load->load[pos++];
5199                         }
5200                     }
5201                     if (bSepPME)
5202                     {
5203                         load->mdf = max(load->mdf,load->load[pos]);
5204                         pos++;
5205                         load->pme = max(load->pme,load->load[pos]);
5206                         pos++;
5207                     }
5208                 }
5209                 if (comm->bDynLoadBal && root->bLimited)
5210                 {
5211                     load->sum_m *= dd->nc[dim];
5212                     load->flags |= (1<<d);
5213                 }
5214             }
5215         }
5216     }
5217
5218     if (DDMASTER(dd))
5219     {
5220         comm->nload      += dd_load_count(comm);
5221         comm->load_step  += comm->cycl[ddCyclStep];
5222         comm->load_sum   += comm->load[0].sum;
5223         comm->load_max   += comm->load[0].max;
5224         if (comm->bDynLoadBal)
5225         {
5226             for(d=0; d<dd->ndim; d++)
5227             {
5228                 if (comm->load[0].flags & (1<<d))
5229                 {
5230                     comm->load_lim[d]++;
5231                 }
5232             }
5233         }
5234         if (bSepPME)
5235         {
5236             comm->load_mdf += comm->load[0].mdf;
5237             comm->load_pme += comm->load[0].pme;
5238         }
5239     }
5240
5241     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5242     
5243     if (debug)
5244     {
5245         fprintf(debug,"get_load_distribution finished\n");
5246     }
5247 }
5248
5249 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5250 {
5251     /* Return the relative performance loss on the total run time
5252      * due to the force calculation load imbalance.
5253      */
5254     if (dd->comm->nload > 0)
5255     {
5256         return
5257             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5258             (dd->comm->load_step*dd->nnodes);
5259     }
5260     else
5261     {
5262         return 0;
5263     }
5264 }
5265
5266 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5267 {
5268     char  buf[STRLEN];
5269     int   npp,npme,nnodes,d,limp;
5270     float imbal,pme_f_ratio,lossf,lossp=0;
5271     gmx_bool  bLim;
5272     gmx_domdec_comm_t *comm;
5273
5274     comm = dd->comm;
5275     if (DDMASTER(dd) && comm->nload > 0)
5276     {
5277         npp    = dd->nnodes;
5278         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5279         nnodes = npp + npme;
5280         imbal = comm->load_max*npp/comm->load_sum - 1;
5281         lossf = dd_force_imb_perf_loss(dd);
5282         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5283         fprintf(fplog,"%s",buf);
5284         fprintf(stderr,"\n");
5285         fprintf(stderr,"%s",buf);
5286         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5287         fprintf(fplog,"%s",buf);
5288         fprintf(stderr,"%s",buf);
5289         bLim = FALSE;
5290         if (comm->bDynLoadBal)
5291         {
5292             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5293             for(d=0; d<dd->ndim; d++)
5294             {
5295                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5296                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5297                 if (limp >= 50)
5298                 {
5299                     bLim = TRUE;
5300                 }
5301             }
5302             sprintf(buf+strlen(buf),"\n");
5303             fprintf(fplog,"%s",buf);
5304             fprintf(stderr,"%s",buf);
5305         }
5306         if (npme > 0)
5307         {
5308             pme_f_ratio = comm->load_pme/comm->load_mdf;
5309             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5310             if (lossp <= 0)
5311             {
5312                 lossp *= (float)npme/(float)nnodes;
5313             }
5314             else
5315             {
5316                 lossp *= (float)npp/(float)nnodes;
5317             }
5318             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5319             fprintf(fplog,"%s",buf);
5320             fprintf(stderr,"%s",buf);
5321             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5322             fprintf(fplog,"%s",buf);
5323             fprintf(stderr,"%s",buf);
5324         }
5325         fprintf(fplog,"\n");
5326         fprintf(stderr,"\n");
5327         
5328         if (lossf >= DD_PERF_LOSS)
5329         {
5330             sprintf(buf,
5331                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
5332                     "      in the domain decomposition.\n",lossf*100);
5333             if (!comm->bDynLoadBal)
5334             {
5335                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
5336             }
5337             else if (bLim)
5338             {
5339                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5340             }
5341             fprintf(fplog,"%s\n",buf);
5342             fprintf(stderr,"%s\n",buf);
5343         }
5344         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5345         {
5346             sprintf(buf,
5347                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5348                     "      had %s work to do than the PP nodes.\n"
5349                     "      You might want to %s the number of PME nodes\n"
5350                     "      or %s the cut-off and the grid spacing.\n",
5351                     fabs(lossp*100),
5352                     (lossp < 0) ? "less"     : "more",
5353                     (lossp < 0) ? "decrease" : "increase",
5354                     (lossp < 0) ? "decrease" : "increase");
5355             fprintf(fplog,"%s\n",buf);
5356             fprintf(stderr,"%s\n",buf);
5357         }
5358     }
5359 }
5360
5361 static float dd_vol_min(gmx_domdec_t *dd)
5362 {
5363     return dd->comm->load[0].cvol_min*dd->nnodes;
5364 }
5365
5366 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5367 {
5368     return dd->comm->load[0].flags;
5369 }
5370
5371 static float dd_f_imbal(gmx_domdec_t *dd)
5372 {
5373     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5374 }
5375
5376 float dd_pme_f_ratio(gmx_domdec_t *dd)
5377 {
5378     if (dd->comm->cycl_n[ddCyclPME] > 0)
5379     {
5380         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5381     }
5382     else
5383     {
5384         return -1.0;
5385     }
5386 }
5387
5388 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5389 {
5390     int flags,d;
5391     char buf[22];
5392     
5393     flags = dd_load_flags(dd);
5394     if (flags)
5395     {
5396         fprintf(fplog,
5397                 "DD  load balancing is limited by minimum cell size in dimension");
5398         for(d=0; d<dd->ndim; d++)
5399         {
5400             if (flags & (1<<d))
5401             {
5402                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5403             }
5404         }
5405         fprintf(fplog,"\n");
5406     }
5407     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5408     if (dd->comm->bDynLoadBal)
5409     {
5410         fprintf(fplog,"  vol min/aver %5.3f%c",
5411                 dd_vol_min(dd),flags ? '!' : ' ');
5412     }
5413     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5414     if (dd->comm->cycl_n[ddCyclPME])
5415     {
5416         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5417     }
5418     fprintf(fplog,"\n\n");
5419 }
5420
5421 static void dd_print_load_verbose(gmx_domdec_t *dd)
5422 {
5423     if (dd->comm->bDynLoadBal)
5424     {
5425         fprintf(stderr,"vol %4.2f%c ",
5426                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5427     }
5428     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5429     if (dd->comm->cycl_n[ddCyclPME])
5430     {
5431         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5432     }
5433 }
5434
#ifdef GMX_MPI
/* Create the load communicator for one row of cells along
 * DD dimension index dim_ind.
 *
 * The row consists of the cells at location loc with the index along
 * dd->dim[dim_ind] running over all cells. Every caller takes part in
 * the MPI_Comm_split call; callers that are not in the row pass
 * MPI_UNDEFINED as color. Members of the row store the communicator
 * and allocate the buffers they need: the row root (the cell with
 * ci[dim] == master_ci[dim]) gets the root data when DLB is not
 * switched off plus the gather buffer for the load; the other members
 * only get a buffer for cell_f when DLB is not switched off.
 */
static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
{
    MPI_Comm  row_comm;
    int  dim, cell, cell_rank, ncell;
    ivec row_loc;
    gmx_domdec_root_t *root;
    gmx_bool bInRow = FALSE, bRowRoot;

    dim   = dd->dim[dim_ind];
    ncell = dd->nc[dim];

    /* Determine if our rank is one of the cells in this row */
    copy_ivec(loc,row_loc);
    for(cell=0; cell<ncell; cell++)
    {
        row_loc[dim] = cell;
        cell_rank    = dd_index(dd->nc,row_loc);
        if (cell_rank == dd->rank)
        {
            /* This process is part of the group */
            bInRow = TRUE;
        }
    }

    MPI_Comm_split(dd->mpi_comm_all, bInRow?0:MPI_UNDEFINED, dd->rank,
                   &row_comm);
    if (!bInRow)
    {
        return;
    }

    dd->comm->mpi_comm_load[dim_ind] = row_comm;
    bRowRoot = (dd->ci[dim] == dd->master_ci[dim]);

    if (dd->comm->eDLB != edlbNO)
    {
        if (bRowRoot)
        {
            /* This is the root process of this row */
            snew(dd->comm->root[dim_ind],1);
            root = dd->comm->root[dim_ind];
            snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
            snew(root->old_cell_f,ncell+1);
            snew(root->bCellMin,ncell);
            if (dim_ind > 0)
            {
                snew(root->cell_f_max0,ncell);
                snew(root->cell_f_min1,ncell);
                snew(root->bound_min,ncell);
                snew(root->bound_max,ncell);
            }
            snew(root->buf_ncd,ncell);
        }
        else
        {
            /* This is not a root process, we only need to receive cell_f */
            snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
        }
    }

    if (bRowRoot)
    {
        /* Buffer for the load records gathered from all cells in the row */
        snew(dd->comm->load[dim_ind].load,ncell*DD_NLOAD_MAX);
    }
}
#endif
5493
5494 static void make_load_communicators(gmx_domdec_t *dd)
5495 {
5496 #ifdef GMX_MPI
5497   int  dim0,dim1,i,j;
5498   ivec loc;
5499
5500   if (debug)
5501     fprintf(debug,"Making load communicators\n");
5502
5503   snew(dd->comm->load,dd->ndim);
5504   snew(dd->comm->mpi_comm_load,dd->ndim);
5505   
5506   clear_ivec(loc);
5507   make_load_communicator(dd,0,loc);
5508   if (dd->ndim > 1) {
5509     dim0 = dd->dim[0];
5510     for(i=0; i<dd->nc[dim0]; i++) {
5511       loc[dim0] = i;
5512       make_load_communicator(dd,1,loc);
5513     }
5514   }
5515   if (dd->ndim > 2) {
5516     dim0 = dd->dim[0];
5517     for(i=0; i<dd->nc[dim0]; i++) {
5518       loc[dim0] = i;
5519       dim1 = dd->dim[1];
5520       for(j=0; j<dd->nc[dim1]; j++) {
5521           loc[dim1] = j;
5522           make_load_communicator(dd,2,loc);
5523       }
5524     }
5525   }
5526
5527   if (debug)
5528     fprintf(debug,"Finished making load communicators\n");
5529 #endif
5530 }
5531
5532 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5533 {
5534     gmx_bool bZYX;
5535     int  d,dim,i,j,m;
5536     ivec tmp,s;
5537     int  nzone,nzonep;
5538     ivec dd_zp[DD_MAXIZONE];
5539     gmx_domdec_zones_t *zones;
5540     gmx_domdec_ns_ranges_t *izone;
5541     
5542     for(d=0; d<dd->ndim; d++)
5543     {
5544         dim = dd->dim[d];
5545         copy_ivec(dd->ci,tmp);
5546         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5547         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5548         copy_ivec(dd->ci,tmp);
5549         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5550         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5551         if (debug)
5552         {
5553             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5554                     dd->rank,dim,
5555                     dd->neighbor[d][0],
5556                     dd->neighbor[d][1]);
5557         }
5558     }
5559     
5560     if (DDMASTER(dd))
5561     {
5562         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5563             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5564     }
5565     if (fplog)
5566     {
5567         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5568                 dd->ndim,
5569                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5570                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5571     }
5572     switch (dd->ndim)
5573     {
5574     case 3:
5575         nzone  = dd_z3n;
5576         nzonep = dd_zp3n;
5577         for(i=0; i<nzonep; i++)
5578         {
5579             copy_ivec(dd_zp3[i],dd_zp[i]);
5580         }
5581         break;
5582     case 2:
5583         nzone  = dd_z2n;
5584         nzonep = dd_zp2n;
5585         for(i=0; i<nzonep; i++)
5586         {
5587             copy_ivec(dd_zp2[i],dd_zp[i]);
5588         }
5589         break;
5590     case 1:
5591         nzone  = dd_z1n;
5592         nzonep = dd_zp1n;
5593         for(i=0; i<nzonep; i++)
5594         {
5595             copy_ivec(dd_zp1[i],dd_zp[i]);
5596         }
5597         break;
5598     default:
5599         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5600         nzone = 0;
5601         nzonep = 0;
5602     }
5603
5604     zones = &dd->comm->zones;
5605
5606     for(i=0; i<nzone; i++)
5607     {
5608         m = 0;
5609         clear_ivec(zones->shift[i]);
5610         for(d=0; d<dd->ndim; d++)
5611         {
5612             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5613         }
5614     }
5615     
5616     zones->n = nzone;
5617     for(i=0; i<nzone; i++)
5618     {
5619         for(d=0; d<DIM; d++)
5620         {
5621             s[d] = dd->ci[d] - zones->shift[i][d];
5622             if (s[d] < 0)
5623             {
5624                 s[d] += dd->nc[d];
5625             }
5626             else if (s[d] >= dd->nc[d])
5627             {
5628                 s[d] -= dd->nc[d];
5629             }
5630         }
5631     }
5632     zones->nizone = nzonep;
5633     for(i=0; i<zones->nizone; i++)
5634     {
5635         if (dd_zp[i][0] != i)
5636         {
5637             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5638         }
5639         izone = &zones->izone[i];
5640         izone->j0 = dd_zp[i][1];
5641         izone->j1 = dd_zp[i][2];
5642         for(dim=0; dim<DIM; dim++)
5643         {
5644             if (dd->nc[dim] == 1)
5645             {
5646                 /* All shifts should be allowed */
5647                 izone->shift0[dim] = -1;
5648                 izone->shift1[dim] = 1;
5649             }
5650             else
5651             {
5652                 /*
5653                   izone->shift0[d] = 0;
5654                   izone->shift1[d] = 0;
5655                   for(j=izone->j0; j<izone->j1; j++) {
5656                   if (dd->shift[j][d] > dd->shift[i][d])
5657                   izone->shift0[d] = -1;
5658                   if (dd->shift[j][d] < dd->shift[i][d])
5659                   izone->shift1[d] = 1;
5660                   }
5661                 */
5662                 
5663                 int shift_diff;
5664                 
5665                 /* Assume the shift are not more than 1 cell */
5666                 izone->shift0[dim] = 1;
5667                 izone->shift1[dim] = -1;
5668                 for(j=izone->j0; j<izone->j1; j++)
5669                 {
5670                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5671                     if (shift_diff < izone->shift0[dim])
5672                     {
5673                         izone->shift0[dim] = shift_diff;
5674                     }
5675                     if (shift_diff > izone->shift1[dim])
5676                     {
5677                         izone->shift1[dim] = shift_diff;
5678                     }
5679                 }
5680             }
5681         }
5682     }
5683     
5684     if (dd->comm->eDLB != edlbNO)
5685     {
5686         snew(dd->comm->root,dd->ndim);
5687     }
5688     
5689     if (dd->comm->bRecordLoad)
5690     {
5691         make_load_communicators(dd);
5692     }
5693 }
5694
/* Set up the particle-particle (PP) communicator and this rank's DD
 * rank, cell coordinates and master rank.
 *
 * With GMX_MPI there are three cases:
 *  - comm->bCartesianPP_PME: dd->ci is used as already set; an index from
 *    DD cell index to DD rank is built and the master rank is determined.
 *  - comm->bCartesianPP only: cr->mpi_comm_mygroup is replaced by a new
 *    Cartesian communicator, dd->ci is taken from it and an index from
 *    DD cell index to simulation node id is built.
 *  - otherwise: the DD rank itself is used as cell index, master rank is 0.
 * Without GMX_MPI only the log/debug output at the end is produced.
 *
 * fplog    log file, may be NULL
 * cr       communication record; cr->dd and cr->dd->comm must be set
 * reorder  passed to MPI_Cart_create as its reorder argument
 */
static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    int  i,rank,*buf;
    ivec periods;
#ifdef GMX_MPI
    MPI_Comm comm_cart;
#endif

    dd = cr->dd;
    comm = dd->comm;

#ifdef GMX_MPI
    if (comm->bCartesianPP)
    {
        /* Set up cartesian communication for the particle-particle part */
        if (fplog)
        {
            fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
                    dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
        }

        /* The Cartesian grid is periodic in all DIM dimensions */
        for(i=0; i<DIM; i++)
        {
            periods[i] = TRUE;
        }
        MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
                        &comm_cart);
        /* We overwrite the old communicator with the new cartesian one */
        cr->mpi_comm_mygroup = comm_cart;
    }

    /* The DD rank is the rank in the (possibly new) PP communicator */
    dd->mpi_comm_all = cr->mpi_comm_mygroup;
    MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);

    if (comm->bCartesianPP_PME)
    {
        /* Since we want to use the original cartesian setup for sim,
         * and not the one after split, we need to make an index.
         */
        snew(comm->ddindex2ddnodeid,dd->nnodes);
        /* Each rank fills in only its own entry before the gmx_sumi call;
         * NOTE(review): this presumably relies on snew returning zeroed
         * memory and on gmx_sumi summing over all PP ranks - confirm.
         */
        comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
        gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
        /* Get the rank of the DD master,
         * above we made sure that the master node is a PP node.
         */
        if (MASTER(cr))
        {
            rank = dd->rank;
        }
        else
        {
            rank = 0;
        }
        /* Only the master contributes a non-zero value, so the sum over
         * all ranks is the DD rank of the master.
         */
        MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
    }
    else if (comm->bCartesianPP)
    {
        if (cr->npmenodes == 0)
        {
            /* The PP communicator is also
             * the communicator for this simulation
             */
            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
        }
        cr->nodeid = dd->rank;

        /* Take our cell coordinates from the Cartesian communicator */
        MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);

        /* We need to make an index to go from the coordinates
         * to the nodeid of this simulation.
         */
        snew(comm->ddindex2simnodeid,dd->nnodes);
        snew(buf,dd->nnodes);
        /* Only nodes with PP duty write an entry; all other entries of buf
         * keep the value snew gave them (presumably zero - confirm).
         */
        if (cr->duty & DUTY_PP)
        {
            buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
        }
        /* Communicate the ddindex to simulation nodeid index */
        MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
                      cr->mpi_comm_mysim);
        sfree(buf);

        /* Determine the master coordinates and rank.
         * The DD master should be the same node as the master of this sim.
         */
        for(i=0; i<dd->nnodes; i++)
        {
            /* The cell that maps to simulation node id 0 is the master */
            if (comm->ddindex2simnodeid[i] == 0)
            {
                ddindex2xyz(dd->nc,i,dd->master_ci);
                MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
            }
        }
        if (debug)
        {
            fprintf(debug,"The master rank is %d\n",dd->masterrank);
        }
    }
    else
    {
        /* No Cartesian communicators */
        /* We use the rank in dd->comm->all as DD index */
        ddindex2xyz(dd->nc,dd->rank,dd->ci);
        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
        dd->masterrank = 0;
        clear_ivec(dd->master_ci);
    }
#endif

    if (fplog)
    {
        fprintf(fplog,
                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
    }
    if (debug)
    {
        fprintf(debug,
                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
                dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
    }
}
5819
5820 static void receive_ddindex2simnodeid(t_commrec *cr)
5821 {
5822     gmx_domdec_t *dd;
5823     
5824     gmx_domdec_comm_t *comm;
5825     int  *buf;
5826     
5827     dd = cr->dd;
5828     comm = dd->comm;
5829     
5830 #ifdef GMX_MPI
5831     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5832     {
5833         snew(comm->ddindex2simnodeid,dd->nnodes);
5834         snew(buf,dd->nnodes);
5835         if (cr->duty & DUTY_PP)
5836         {
5837             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5838         }
5839 #ifdef GMX_MPI
5840         /* Communicate the ddindex to simulation nodeid index */
5841         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5842                       cr->mpi_comm_mysim);
5843 #endif
5844         sfree(buf);
5845     }
5846 #endif
5847 }
5848
5849 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5850                                                      int ncg,int natoms)
5851 {
5852     gmx_domdec_master_t *ma;
5853     int i;
5854
5855     snew(ma,1);
5856     
5857     snew(ma->ncg,dd->nnodes);
5858     snew(ma->index,dd->nnodes+1);
5859     snew(ma->cg,ncg);
5860     snew(ma->nat,dd->nnodes);
5861     snew(ma->ibuf,dd->nnodes*2);
5862     snew(ma->cell_x,DIM);
5863     for(i=0; i<DIM; i++)
5864     {
5865         snew(ma->cell_x[i],dd->nc[i]+1);
5866     }
5867
5868     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5869     {
5870         ma->vbuf = NULL;
5871     }
5872     else
5873     {
5874         snew(ma->vbuf,natoms);
5875     }
5876
5877     return ma;
5878 }
5879
/* Assign PP or PME duty to this node and split cr->mpi_comm_mysim into
 * a PP and a PME group communicator (cr->mpi_comm_mygroup).
 *
 * When a Cartesian PP setup is requested (comm->bCartesianPP) and the PME
 * nodes can be stacked as whole layers along y or z, a combined Cartesian
 * PP+PME communicator is created and comm->bCartesianPP_PME is set.
 * Otherwise the duty follows from dd_simnode2pmenode() according to
 * dd_node_order.
 *
 * fplog          log file, may be NULL
 * cr             communication record; cr->dd and cr->dd->comm must be set
 * dd_node_order  one of ddnoPP_PME, ddnoINTERLEAVE, ddnoCARTESIAN
 * reorder        passed to MPI_Cart_create as its reorder argument
 */
static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
                               int reorder)
{
    gmx_domdec_t *dd;
    gmx_domdec_comm_t *comm;
    int  i,rank;
    gmx_bool bDiv[DIM];
    ivec periods;
#ifdef GMX_MPI
    MPI_Comm comm_cart;
#endif

    dd = cr->dd;
    comm = dd->comm;

    if (comm->bCartesianPP)
    {
        /* bDiv[i] tells whether npmenodes*nc[i] is a multiple of nnodes.
         * The loop starts at 1: only YY and ZZ are evaluated and only
         * those two entries are read below.
         */
        for(i=1; i<DIM; i++)
        {
            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
        }
        if (bDiv[YY] || bDiv[ZZ])
        {
            comm->bCartesianPP_PME = TRUE;
            /* If we have 2D PME decomposition, which is always in x+y,
             * we stack the PME only nodes in z.
             * Otherwise we choose the direction that provides the thinnest slab
             * of PME only nodes as this will have the least effect
             * on the PP communication.
             * But for the PME communication the opposite might be better.
             */
            if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
                             !bDiv[YY] ||
                             dd->nc[YY] > dd->nc[ZZ]))
            {
                comm->cartpmedim = ZZ;
            }
            else
            {
                comm->cartpmedim = YY;
            }
            /* Extend the total grid along cartpmedim by the number of
             * PME-only layers.
             */
            comm->ntot[comm->cartpmedim]
                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
        }
        else if (fplog)
        {
            fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
            fprintf(fplog,
                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
        }
    }

#ifdef GMX_MPI
    if (comm->bCartesianPP_PME)
    {
        if (fplog)
        {
            fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
        }

        /* The Cartesian grid is periodic in all DIM dimensions */
        for(i=0; i<DIM; i++)
        {
            periods[i] = TRUE;
        }
        MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
                        &comm_cart);

        /* The master node must keep rank 0 in the new communicator */
        MPI_Comm_rank(comm_cart,&rank);
        if (MASTERNODE(cr) && rank != 0)
        {
            gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
        }

        /* With this assignment we lose the link to the original communicator
         * which will usually be MPI_COMM_WORLD, unless we have multisim.
         */
        cr->mpi_comm_mysim = comm_cart;
        cr->sim_nodeid = rank;

        MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);

        if (fplog)
        {
            fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
                    cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
        }

        /* Coordinates inside the PP grid along cartpmedim get PP duty,
         * coordinates in the added layers beyond it get PME duty.
         */
        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
        {
            cr->duty = DUTY_PP;
        }
        if (cr->npmenodes == 0 ||
            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
        {
            cr->duty = DUTY_PME;
        }

        /* Split the sim communicator into PP and PME only nodes.
         * The duty is the split color, the index in the total grid the key.
         */
        MPI_Comm_split(cr->mpi_comm_mysim,
                       cr->duty,
                       dd_index(comm->ntot,dd->ci),
                       &cr->mpi_comm_mygroup);
    }
    else
    {
        switch (dd_node_order)
        {
        case ddnoPP_PME:
            if (fplog)
            {
                fprintf(fplog,"Order of the nodes: PP first, PME last\n");
            }
            break;
        case ddnoINTERLEAVE:
            /* Interleave the PP-only and PME-only nodes,
             * as on clusters with dual-core machines this will double
             * the communication bandwidth of the PME processes
             * and thus speed up the PP <-> PME and inter PME communication.
             */
            if (fplog)
            {
                fprintf(fplog,"Interleaving PP and PME nodes\n");
            }
            comm->pmenodes = dd_pmenodes(cr);
            break;
        case ddnoCARTESIAN:
            break;
        default:
            gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
        }

        /* A result of -1 from dd_simnode2pmenode is treated as
         * "this node is a PME-only node".
         */
        if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
        {
            cr->duty = DUTY_PME;
        }
        else
        {
            cr->duty = DUTY_PP;
        }

        /* Split the sim communicator into PP and PME only nodes.
         * The duty is the split color, the current nodeid the key;
         * cr->nodeid is then replaced by the rank within the new group.
         */
        MPI_Comm_split(cr->mpi_comm_mysim,
                       cr->duty,
                       cr->nodeid,
                       &cr->mpi_comm_mygroup);
        MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
    }
#endif

    if (fplog)
    {
        fprintf(fplog,"This is a %s only node\n\n",
                (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
    }
}
6035
6036 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
6037 {
6038     gmx_domdec_t *dd;
6039     gmx_domdec_comm_t *comm;
6040     int CartReorder;
6041     
6042     dd = cr->dd;
6043     comm = dd->comm;
6044     
6045     copy_ivec(dd->nc,comm->ntot);
6046     
6047     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
6048     comm->bCartesianPP_PME = FALSE;
6049     
6050     /* Reorder the nodes by default. This might change the MPI ranks.
6051      * Real reordering is only supported on very few architectures,
6052      * Blue Gene is one of them.
6053      */
6054     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6055     
6056     if (cr->npmenodes > 0)
6057     {
6058         /* Split the communicator into a PP and PME part */
6059         split_communicator(fplog,cr,dd_node_order,CartReorder);
6060         if (comm->bCartesianPP_PME)
6061         {
6062             /* We (possibly) reordered the nodes in split_communicator,
6063              * so it is no longer required in make_pp_communicator.
6064              */
6065             CartReorder = FALSE;
6066         }
6067     }
6068     else
6069     {
6070         /* All nodes do PP and PME */
6071 #ifdef GMX_MPI    
6072         /* We do not require separate communicators */
6073         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6074 #endif
6075     }
6076     
6077     if (cr->duty & DUTY_PP)
6078     {
6079         /* Copy or make a new PP communicator */
6080         make_pp_communicator(fplog,cr,CartReorder);
6081     }
6082     else
6083     {
6084         receive_ddindex2simnodeid(cr);
6085     }
6086     
6087     if (!(cr->duty & DUTY_PME))
6088     {
6089         /* Set up the commnuication to our PME node */
6090         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
6091         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6092         if (debug)
6093         {
6094             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
6095                     dd->pme_nodeid,dd->pme_receive_vir_ener);
6096         }
6097     }
6098     else
6099     {
6100         dd->pme_nodeid = -1;
6101     }
6102
6103     if (DDMASTER(dd))
6104     {
6105         dd->ma = init_gmx_domdec_master_t(dd,
6106                                           comm->cgs_gl.nr,
6107                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6108     }
6109 }
6110
6111 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
6112 {
6113     real *slb_frac,tot;
6114     int  i,n;
6115     double dbl;
6116     
6117     slb_frac = NULL;
6118     if (nc > 1 && size_string != NULL)
6119     {
6120         if (fplog)
6121         {
6122             fprintf(fplog,"Using static load balancing for the %s direction\n",
6123                     dir);
6124         }
6125         snew(slb_frac,nc);
6126         tot = 0;
6127         for (i=0; i<nc; i++)
6128         {
6129             dbl = 0;
6130             sscanf(size_string,"%lf%n",&dbl,&n);
6131             if (dbl == 0)
6132             {
6133                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
6134             }
6135             slb_frac[i] = dbl;
6136             size_string += n;
6137             tot += slb_frac[i];
6138         }
6139         /* Normalize */
6140         if (fplog)
6141         {
6142             fprintf(fplog,"Relative cell sizes:");
6143         }
6144         for (i=0; i<nc; i++)
6145         {
6146             slb_frac[i] /= tot;
6147             if (fplog)
6148             {
6149                 fprintf(fplog," %5.3f",slb_frac[i]);
6150             }
6151         }
6152         if (fplog)
6153         {
6154             fprintf(fplog,"\n");
6155         }
6156     }
6157     
6158     return slb_frac;
6159 }
6160
6161 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6162 {
6163     int n,nmol,ftype;
6164     gmx_mtop_ilistloop_t iloop;
6165     t_ilist *il;
6166     
6167     n = 0;
6168     iloop = gmx_mtop_ilistloop_init(mtop);
6169     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
6170     {
6171         for(ftype=0; ftype<F_NRE; ftype++)
6172         {
6173             if ((interaction_function[ftype].flags & IF_BOND) &&
6174                 NRAL(ftype) >  2)
6175             {
6176                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6177             }
6178         }
6179   }
6180
6181   return n;
6182 }
6183
/* Read an integer setting from an environment variable.
 *
 * fplog    log file, may be NULL
 * env_var  name of the environment variable
 * def      value returned when the variable is not set
 *
 * Returns def when env_var is not set. When it is set, returns the integer
 * it starts with, or 1 when no integer could be read from it. The value
 * that is used is reported in fplog.
 */
static int dd_nst_env(FILE *fplog,const char *env_var,int def)
{
    char *env_value = getenv(env_var);
    int  result     = def;

    if (env_value == NULL)
    {
        /* Not set: silently use the default */
        return result;
    }

    if (sscanf(env_value,"%d",&result) < 1)
    {
        /* Set, but not to a number: treat it as a plain "on" switch */
        result = 1;
    }
    if (fplog != NULL)
    {
        fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
                env_var,env_value,result);
    }

    return result;
}
6206
6207 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
6208 {
6209     if (MASTER(cr))
6210     {
6211         fprintf(stderr,"\n%s\n",warn_string);
6212     }
6213     if (fplog)
6214     {
6215         fprintf(fplog,"\n%s\n",warn_string);
6216     }
6217 }
6218
6219 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
6220                                   t_inputrec *ir,FILE *fplog)
6221 {
6222     if (ir->ePBC == epbcSCREW &&
6223         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6224     {
6225         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6226     }
6227
6228     if (ir->ns_type == ensSIMPLE)
6229     {
6230         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6231     }
6232
6233     if (ir->nstlist == 0)
6234     {
6235         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6236     }
6237
6238     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6239     {
6240         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6241     }
6242 }
6243
6244 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6245 {
6246     int  di,d;
6247     real r;
6248
6249     r = ddbox->box_size[XX];
6250     for(di=0; di<dd->ndim; di++)
6251     {
6252         d = dd->dim[di];
6253         /* Check using the initial average cell size */
6254         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6255     }
6256
6257     return r;
6258 }
6259
6260 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6261                              const char *dlb_opt,gmx_bool bRecordLoad,
6262                              unsigned long Flags,t_inputrec *ir)
6263 {
6264     gmx_domdec_t *dd;
6265     int  eDLB=-1;
6266     char buf[STRLEN];
6267
6268     switch (dlb_opt[0])
6269     {
6270     case 'a': eDLB = edlbAUTO; break;
6271     case 'n': eDLB = edlbNO;   break;
6272     case 'y': eDLB = edlbYES;  break;
6273     default: gmx_incons("Unknown dlb_opt");
6274     }
6275
6276     if (Flags & MD_RERUN)
6277     {
6278         return edlbNO;
6279     }
6280
6281     if (!EI_DYNAMICS(ir->eI))
6282     {
6283         if (eDLB == edlbYES)
6284         {
6285             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6286             dd_warning(cr,fplog,buf);
6287         }
6288             
6289         return edlbNO;
6290     }
6291
6292     if (!bRecordLoad)
6293     {
6294         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6295
6296         return edlbNO;
6297     }
6298
6299     if (Flags & MD_REPRODUCIBLE)
6300     {
6301         switch (eDLB)
6302         {
6303                         case edlbNO: 
6304                                 break;
6305                         case edlbAUTO:
6306                                 dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6307                                 eDLB = edlbNO;
6308                                 break;
6309                         case edlbYES:
6310                                 dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6311                                 break;
6312                         default:
6313                                 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6314                                 break;
6315         }
6316     }
6317
6318     return eDLB;
6319 }
6320
6321 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6322 {
6323     int dim;
6324
6325     dd->ndim = 0;
6326     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6327     {
6328         /* Decomposition order z,y,x */
6329         if (fplog)
6330         {
6331             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6332         }
6333         for(dim=DIM-1; dim>=0; dim--)
6334         {
6335             if (dd->nc[dim] > 1)
6336             {
6337                 dd->dim[dd->ndim++] = dim;
6338             }
6339         }
6340     }
6341     else
6342     {
6343         /* Decomposition order x,y,z */
6344         for(dim=0; dim<DIM; dim++)
6345         {
6346             if (dd->nc[dim] > 1)
6347             {
6348                 dd->dim[dd->ndim++] = dim;
6349             }
6350         }
6351     }
6352 }
6353
6354 static gmx_domdec_comm_t *init_dd_comm()
6355 {
6356     gmx_domdec_comm_t *comm;
6357     int  i;
6358
6359     snew(comm,1);
6360     snew(comm->cggl_flag,DIM*2);
6361     snew(comm->cgcm_state,DIM*2);
6362     for(i=0; i<DIM*2; i++)
6363     {
6364         comm->cggl_flag_nalloc[i]  = 0;
6365         comm->cgcm_state_nalloc[i] = 0;
6366     }
6367     
6368     comm->nalloc_int = 0;
6369     comm->buf_int    = NULL;
6370
6371     vec_rvec_init(&comm->vbuf);
6372
6373     comm->n_load_have    = 0;
6374     comm->n_load_collect = 0;
6375
6376     for(i=0; i<ddnatNR-ddnatZONE; i++)
6377     {
6378         comm->sum_nat[i] = 0;
6379     }
6380     comm->ndecomp = 0;
6381     comm->nload   = 0;
6382     comm->load_step = 0;
6383     comm->load_sum  = 0;
6384     comm->load_max  = 0;
6385     clear_ivec(comm->load_lim);
6386     comm->load_mdf  = 0;
6387     comm->load_pme  = 0;
6388
6389     return comm;
6390 }
6391
6392 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6393                                         unsigned long Flags,
6394                                         ivec nc,
6395                                         real comm_distance_min,real rconstr,
6396                                         const char *dlb_opt,real dlb_scale,
6397                                         const char *sizex,const char *sizey,const char *sizez,
6398                                         gmx_mtop_t *mtop,t_inputrec *ir,
6399                                         matrix box,rvec *x,
6400                                         gmx_ddbox_t *ddbox,
6401                                         int *npme_x,int *npme_y)
6402 {
6403     gmx_domdec_t *dd;
6404     gmx_domdec_comm_t *comm;
6405     int  recload;
6406     int  d,i,j;
6407     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6408     gmx_bool bC;
6409     char buf[STRLEN];
6410     
6411     if (fplog)
6412     {
6413         fprintf(fplog,
6414                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6415     }
6416     
6417     snew(dd,1);
6418
6419     dd->comm = init_dd_comm();
6420     comm = dd->comm;
6421     snew(comm->cggl_flag,DIM*2);
6422     snew(comm->cgcm_state,DIM*2);
6423
6424     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6425     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6426     
6427     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6428     comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6429     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6430     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6431     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6432     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6433     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6434     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6435
6436     dd->pme_recv_f_alloc = 0;
6437     dd->pme_recv_f_buf = NULL;
6438
6439     if (dd->bSendRecv2 && fplog)
6440     {
6441         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6442     }
6443     if (comm->eFlop)
6444     {
6445         if (fplog)
6446         {
6447             fprintf(fplog,"Will load balance based on FLOP count\n");
6448         }
6449         if (comm->eFlop > 1)
6450         {
6451             srand(1+cr->nodeid);
6452         }
6453         comm->bRecordLoad = TRUE;
6454     }
6455     else
6456     {
6457         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6458                              
6459     }
6460     
6461     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6462     
6463     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6464     if (fplog)
6465     {
6466         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6467     }
6468     dd->bGridJump = comm->bDynLoadBal;
6469     
6470     if (comm->nstSortCG)
6471     {
6472         if (fplog)
6473         {
6474             if (comm->nstSortCG == 1)
6475             {
6476                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6477             }
6478             else
6479             {
6480                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6481                         comm->nstSortCG);
6482             }
6483         }
6484         snew(comm->sort,1);
6485     }
6486     else
6487     {
6488         if (fplog)
6489         {
6490             fprintf(fplog,"Will not sort the charge groups\n");
6491         }
6492     }
6493
6494     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6495     
6496     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6497     if (comm->bInterCGBondeds)
6498     {
6499         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6500     }
6501     else
6502     {
6503         comm->bInterCGMultiBody = FALSE;
6504     }
6505     
6506     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6507     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6508
6509     if (ir->rlistlong == 0)
6510     {
6511         /* Set the cut-off to some very large value,
6512          * so we don't need if statements everywhere in the code.
6513          * We use sqrt, since the cut-off is squared in some places.
6514          */
6515         comm->cutoff   = GMX_CUTOFF_INF;
6516     }
6517     else
6518     {
6519         comm->cutoff   = ir->rlistlong;
6520     }
6521     comm->cutoff_mbody = 0;
6522     
6523     comm->cellsize_limit = 0;
6524     comm->bBondComm = FALSE;
6525
6526     if (comm->bInterCGBondeds)
6527     {
6528         if (comm_distance_min > 0)
6529         {
6530             comm->cutoff_mbody = comm_distance_min;
6531             if (Flags & MD_DDBONDCOMM)
6532             {
6533                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6534             }
6535             else
6536             {
6537                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6538             }
6539             r_bonded_limit = comm->cutoff_mbody;
6540         }
6541         else if (ir->bPeriodicMols)
6542         {
6543             /* Can not easily determine the required cut-off */
6544             dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6545             comm->cutoff_mbody = comm->cutoff/2;
6546             r_bonded_limit = comm->cutoff_mbody;
6547         }
6548         else
6549         {
6550             if (MASTER(cr))
6551             {
6552                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6553                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6554             }
6555             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6556             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6557
6558             /* We use an initial margin of 10% for the minimum cell size,
6559              * except when we are just below the non-bonded cut-off.
6560              */
6561             if (Flags & MD_DDBONDCOMM)
6562             {
6563                 if (max(r_2b,r_mb) > comm->cutoff)
6564                 {
6565                     r_bonded       = max(r_2b,r_mb);
6566                     r_bonded_limit = 1.1*r_bonded;
6567                     comm->bBondComm = TRUE;
6568                 }
6569                 else
6570                 {
6571                     r_bonded       = r_mb;
6572                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6573                 }
6574                 /* We determine cutoff_mbody later */
6575             }
6576             else
6577             {
6578                 /* No special bonded communication,
6579                  * simply increase the DD cut-off.
6580                  */
6581                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6582                 comm->cutoff_mbody = r_bonded_limit;
6583                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6584             }
6585         }
6586         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6587         if (fplog)
6588         {
6589             fprintf(fplog,
6590                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6591                     comm->cellsize_limit);
6592         }
6593     }
6594
6595     if (dd->bInterCGcons && rconstr <= 0)
6596     {
6597         /* There is a cell size limit due to the constraints (P-LINCS) */
6598         rconstr = constr_r_max(fplog,mtop,ir);
6599         if (fplog)
6600         {
6601             fprintf(fplog,
6602                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6603                     rconstr);
6604             if (rconstr > comm->cellsize_limit)
6605             {
6606                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6607             }
6608         }
6609     }
6610     else if (rconstr > 0 && fplog)
6611     {
6612         /* Here we do not check for dd->bInterCGcons,
6613          * because one can also set a cell size limit for virtual sites only
6614          * and at this point we don't know yet if there are intercg v-sites.
6615          */
6616         fprintf(fplog,
6617                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6618                 rconstr);
6619     }
6620     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6621
6622     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6623
6624     if (nc[XX] > 0)
6625     {
6626         copy_ivec(nc,dd->nc);
6627         set_dd_dim(fplog,dd);
6628         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6629
6630         if (cr->npmenodes == -1)
6631         {
6632             cr->npmenodes = 0;
6633         }
6634         acs = average_cellsize_min(dd,ddbox);
6635         if (acs < comm->cellsize_limit)
6636         {
6637             if (fplog)
6638             {
6639                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6640             }
6641             gmx_fatal_collective(FARGS,cr,NULL,
6642                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6643                                  acs,comm->cellsize_limit);
6644         }
6645     }
6646     else
6647     {
6648         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6649
6650         /* We need to choose the optimal DD grid and possibly PME nodes */
6651         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6652                                comm->eDLB!=edlbNO,dlb_scale,
6653                                comm->cellsize_limit,comm->cutoff,
6654                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6655         
6656         if (dd->nc[XX] == 0)
6657         {
6658             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6659             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6660                     !bC ? "-rdd" : "-rcon",
6661                     comm->eDLB!=edlbNO ? " or -dds" : "",
6662                     bC ? " or your LINCS settings" : "");
6663
6664             gmx_fatal_collective(FARGS,cr,NULL,
6665                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6666                                  "%s\n"
6667                                  "Look in the log file for details on the domain decomposition",
6668                                  cr->nnodes-cr->npmenodes,limit,buf);
6669         }
6670         set_dd_dim(fplog,dd);
6671     }
6672
6673     if (fplog)
6674     {
6675         fprintf(fplog,
6676                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6677                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6678     }
6679     
6680     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6681     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6682     {
6683         gmx_fatal_collective(FARGS,cr,NULL,
6684                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6685                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6686     }
6687     if (cr->npmenodes > dd->nnodes)
6688     {
6689         gmx_fatal_collective(FARGS,cr,NULL,
6690                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6691     }
6692     if (cr->npmenodes > 0)
6693     {
6694         comm->npmenodes = cr->npmenodes;
6695     }
6696     else
6697     {
6698         comm->npmenodes = dd->nnodes;
6699     }
6700
6701     if (EEL_PME(ir->coulombtype))
6702     {
6703         /* The following choices should match those
6704          * in comm_cost_est in domdec_setup.c.
6705          * Note that here the checks have to take into account
6706          * that the decomposition might occur in a different order than xyz
6707          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6708          * in which case they will not match those in comm_cost_est,
6709          * but since that is mainly for testing purposes that's fine.
6710          */
6711         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6712             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6713             getenv("GMX_PMEONEDD") == NULL)
6714         {
6715             comm->npmedecompdim = 2;
6716             comm->npmenodes_x   = dd->nc[XX];
6717             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6718         }
6719         else
6720         {
6721             /* In case nc is 1 in both x and y we could still choose to
6722              * decompose pme in y instead of x, but we use x for simplicity.
6723              */
6724             comm->npmedecompdim = 1;
6725             if (dd->dim[0] == YY)
6726             {
6727                 comm->npmenodes_x = 1;
6728                 comm->npmenodes_y = comm->npmenodes;
6729             }
6730             else
6731             {
6732                 comm->npmenodes_x = comm->npmenodes;
6733                 comm->npmenodes_y = 1;
6734             }
6735         }    
6736         if (fplog)
6737         {
6738             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6739                     comm->npmenodes_x,comm->npmenodes_y,1);
6740         }
6741     }
6742     else
6743     {
6744         comm->npmedecompdim = 0;
6745         comm->npmenodes_x   = 0;
6746         comm->npmenodes_y   = 0;
6747     }
6748     
6749     /* Technically we don't need both of these,
6750      * but it simplifies code not having to recalculate it.
6751      */
6752     *npme_x = comm->npmenodes_x;
6753     *npme_y = comm->npmenodes_y;
6754         
6755     snew(comm->slb_frac,DIM);
6756     if (comm->eDLB == edlbNO)
6757     {
6758         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6759         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6760         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6761     }
6762
6763     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6764     {
6765         if (comm->bBondComm || comm->eDLB != edlbNO)
6766         {
6767             /* Set the bonded communication distance to halfway
6768              * the minimum and the maximum,
6769              * since the extra communication cost is nearly zero.
6770              */
6771             acs = average_cellsize_min(dd,ddbox);
6772             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6773             if (comm->eDLB != edlbNO)
6774             {
6775                 /* Check if this does not limit the scaling */
6776                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6777             }
6778             if (!comm->bBondComm)
6779             {
6780                 /* Without bBondComm do not go beyond the n.b. cut-off */
6781                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6782                 if (comm->cellsize_limit >= comm->cutoff)
6783                 {
6784                     /* We don't loose a lot of efficieny
6785                      * when increasing it to the n.b. cut-off.
6786                      * It can even be slightly faster, because we need
6787                      * less checks for the communication setup.
6788                      */
6789                     comm->cutoff_mbody = comm->cutoff;
6790                 }
6791             }
6792             /* Check if we did not end up below our original limit */
6793             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6794
6795             if (comm->cutoff_mbody > comm->cellsize_limit)
6796             {
6797                 comm->cellsize_limit = comm->cutoff_mbody;
6798             }
6799         }
6800         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6801     }
6802
6803     if (debug)
6804     {
6805         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6806                 "cellsize limit %f\n",
6807                 comm->bBondComm,comm->cellsize_limit);
6808     }
6809     
6810     if (MASTER(cr))
6811     {
6812         check_dd_restrictions(cr,dd,ir,fplog);
6813     }
6814
6815     comm->partition_step = INT_MIN;
6816     dd->ddp_count = 0;
6817
6818     clear_dd_cycle_counts(dd);
6819
6820     return dd;
6821 }
6822
6823 static void set_dlb_limits(gmx_domdec_t *dd)
6824
6825 {
6826     int d;
6827
6828     for(d=0; d<dd->ndim; d++)
6829     {
6830         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6831         dd->comm->cellsize_min[dd->dim[d]] =
6832             dd->comm->cellsize_min_dlb[dd->dim[d]];
6833     }
6834 }
6835
6836
6837 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6838 {
6839     gmx_domdec_t *dd;
6840     gmx_domdec_comm_t *comm;
6841     real cellsize_min;
6842     int  d,nc,i;
6843     char buf[STRLEN];
6844     
6845     dd = cr->dd;
6846     comm = dd->comm;
6847     
6848     if (fplog)
6849     {
6850         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6851     }
6852
6853     cellsize_min = comm->cellsize_min[dd->dim[0]];
6854     for(d=1; d<dd->ndim; d++)
6855     {
6856         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6857     }
6858
6859     if (cellsize_min < comm->cellsize_limit*1.05)
6860     {
6861         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6862
6863         /* Change DLB from "auto" to "no". */
6864         comm->eDLB = edlbNO;
6865
6866         return;
6867     }
6868
6869     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6870     comm->bDynLoadBal = TRUE;
6871     dd->bGridJump = TRUE;
6872     
6873     set_dlb_limits(dd);
6874
6875     /* We can set the required cell size info here,
6876      * so we do not need to communicate this.
6877      * The grid is completely uniform.
6878      */
6879     for(d=0; d<dd->ndim; d++)
6880     {
6881         if (comm->root[d])
6882         {
6883             comm->load[d].sum_m = comm->load[d].sum;
6884
6885             nc = dd->nc[dd->dim[d]];
6886             for(i=0; i<nc; i++)
6887             {
6888                 comm->root[d]->cell_f[i]    = i/(real)nc;
6889                 if (d > 0)
6890                 {
6891                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6892                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6893                 }
6894             }
6895             comm->root[d]->cell_f[nc] = 1.0;
6896         }
6897     }
6898 }
6899
6900 static char *init_bLocalCG(gmx_mtop_t *mtop)
6901 {
6902     int  ncg,cg;
6903     char *bLocalCG;
6904     
6905     ncg = ncg_mtop(mtop);
6906     snew(bLocalCG,ncg);
6907     for(cg=0; cg<ncg; cg++)
6908     {
6909         bLocalCG[cg] = FALSE;
6910     }
6911
6912     return bLocalCG;
6913 }
6914
6915 void dd_init_bondeds(FILE *fplog,
6916                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6917                      gmx_vsite_t *vsite,gmx_constr_t constr,
6918                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6919 {
6920     gmx_domdec_comm_t *comm;
6921     gmx_bool bBondComm;
6922     int  d;
6923
6924     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6925
6926     comm = dd->comm;
6927
6928     if (comm->bBondComm)
6929     {
6930         /* Communicate atoms beyond the cut-off for bonded interactions */
6931         comm = dd->comm;
6932
6933         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6934
6935         comm->bLocalCG = init_bLocalCG(mtop);
6936     }
6937     else
6938     {
6939         /* Only communicate atoms based on cut-off */
6940         comm->cglink   = NULL;
6941         comm->bLocalCG = NULL;
6942     }
6943 }
6944
6945 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6946                               t_inputrec *ir,
6947                               gmx_bool bDynLoadBal,real dlb_scale,
6948                               gmx_ddbox_t *ddbox)
6949 {
6950     gmx_domdec_comm_t *comm;
6951     int  d;
6952     ivec np;
6953     real limit,shrink;
6954     char buf[64];
6955
6956     if (fplog == NULL)
6957     {
6958         return;
6959     }
6960
6961     comm = dd->comm;
6962
6963     if (bDynLoadBal)
6964     {
6965         fprintf(fplog,"The maximum number of communication pulses is:");
6966         for(d=0; d<dd->ndim; d++)
6967         {
6968             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6969         }
6970         fprintf(fplog,"\n");
6971         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6972         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6973         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6974         for(d=0; d<DIM; d++)
6975         {
6976             if (dd->nc[d] > 1)
6977             {
6978                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6979                 {
6980                     shrink = 0;
6981                 }
6982                 else
6983                 {
6984                     shrink =
6985                         comm->cellsize_min_dlb[d]/
6986                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6987                 }
6988                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6989             }
6990         }
6991         fprintf(fplog,"\n");
6992     }
6993     else
6994     {
6995         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6996         fprintf(fplog,"The initial number of communication pulses is:");
6997         for(d=0; d<dd->ndim; d++)
6998         {
6999             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
7000         }
7001         fprintf(fplog,"\n");
7002         fprintf(fplog,"The initial domain decomposition cell size is:");
7003         for(d=0; d<DIM; d++) {
7004             if (dd->nc[d] > 1)
7005             {
7006                 fprintf(fplog," %c %.2f nm",
7007                         dim2char(d),dd->comm->cellsize_min[d]);
7008             }
7009         }
7010         fprintf(fplog,"\n\n");
7011     }
7012     
7013     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7014     {
7015         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
7016         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7017                 "non-bonded interactions","",comm->cutoff);
7018
7019         if (bDynLoadBal)
7020         {
7021             limit = dd->comm->cellsize_limit;
7022         }
7023         else
7024         {
7025             if (dynamic_dd_box(ddbox,ir))
7026             {
7027                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
7028             }
7029             limit = dd->comm->cellsize_min[XX];
7030             for(d=1; d<DIM; d++)
7031             {
7032                 limit = min(limit,dd->comm->cellsize_min[d]);
7033             }
7034         }
7035
7036         if (comm->bInterCGBondeds)
7037         {
7038             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7039                     "two-body bonded interactions","(-rdd)",
7040                     max(comm->cutoff,comm->cutoff_mbody));
7041             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7042                     "multi-body bonded interactions","(-rdd)",
7043                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
7044         }
7045         if (dd->vsite_comm)
7046         {
7047             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7048                     "virtual site constructions","(-rcon)",limit);
7049         }
7050         if (dd->constraint_comm)
7051         {
7052             sprintf(buf,"atoms separated by up to %d constraints",
7053                     1+ir->nProjOrder);
7054             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7055                     buf,"(-rcon)",limit);
7056         }
7057         fprintf(fplog,"\n");
7058     }
7059     
7060     fflush(fplog);
7061 }
7062
7063 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7064                                 real dlb_scale,
7065                                 const t_inputrec *ir,
7066                                 const gmx_ddbox_t *ddbox)
7067 {
7068     gmx_domdec_comm_t *comm;
7069     int  d,dim,npulse,npulse_d_max,npulse_d;
7070     gmx_bool bNoCutOff;
7071
7072     comm = dd->comm;
7073
7074     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7075
7076     /* Determine the maximum number of comm. pulses in one dimension */
7077         
7078     comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7079         
7080     /* Determine the maximum required number of grid pulses */
7081     if (comm->cellsize_limit >= comm->cutoff)
7082     {
7083         /* Only a single pulse is required */
7084         npulse = 1;
7085     }
7086     else if (!bNoCutOff && comm->cellsize_limit > 0)
7087     {
7088         /* We round down slightly here to avoid overhead due to the latency
7089          * of extra communication calls when the cut-off
7090          * would be only slightly longer than the cell size.
7091          * Later cellsize_limit is redetermined,
7092          * so we can not miss interactions due to this rounding.
7093          */
7094         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7095     }
7096     else
7097     {
7098         /* There is no cell size limit */
7099         npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
7100     }
7101
7102     if (!bNoCutOff && npulse > 1)
7103     {
7104         /* See if we can do with less pulses, based on dlb_scale */
7105         npulse_d_max = 0;
7106         for(d=0; d<dd->ndim; d++)
7107         {
7108             dim = dd->dim[d];
7109             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7110                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7111             npulse_d_max = max(npulse_d_max,npulse_d);
7112         }
7113         npulse = min(npulse,npulse_d_max);
7114     }
7115
7116     /* This env var can override npulse */
7117     d = dd_nst_env(debug,"GMX_DD_NPULSE",0);
7118     if (d > 0)
7119     {
7120         npulse = d;
7121     }
7122
7123     comm->maxpulse = 1;
7124     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7125     for(d=0; d<dd->ndim; d++)
7126     {
7127         comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
7128         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7129         snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
7130         comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
7131         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7132         {
7133             comm->bVacDLBNoLimit = FALSE;
7134         }
7135     }
7136
7137     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7138     if (!comm->bVacDLBNoLimit)
7139     {
7140         comm->cellsize_limit = max(comm->cellsize_limit,
7141                                    comm->cutoff/comm->maxpulse);
7142     }
7143     comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7144     /* Set the minimum cell size for each DD dimension */
7145     for(d=0; d<dd->ndim; d++)
7146     {
7147         if (comm->bVacDLBNoLimit ||
7148             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7149         {
7150             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7151         }
7152         else
7153         {
7154             comm->cellsize_min_dlb[dd->dim[d]] =
7155                 comm->cutoff/comm->cd[d].np_dlb;
7156         }
7157     }
7158     if (comm->cutoff_mbody <= 0)
7159     {
7160         comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
7161     }
7162     if (comm->bDynLoadBal)
7163     {
7164         set_dlb_limits(dd);
7165     }
7166 }
7167
7168 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC)
7169 {
7170     /* If each molecule is a single charge group
7171      * or we use domain decomposition for each periodic dimension,
7172      * we do not need to take pbc into account for the bonded interactions.
7173      */
7174     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7175             !(dd->nc[XX]>1 &&
7176               dd->nc[YY]>1 &&
7177               (dd->nc[ZZ]>1 || ePBC==epbcXY)));
7178 }
7179
7180 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
7181                        t_inputrec *ir,t_forcerec *fr,
7182                        gmx_ddbox_t *ddbox)
7183 {
7184     gmx_domdec_comm_t *comm;
7185     int  natoms_tot;
7186     real vol_frac;
7187
7188     comm = dd->comm;
7189
7190     /* Initialize the thread data.
7191      * This can not be done in init_domain_decomposition,
7192      * as the numbers of threads is determined later.
7193      */
7194     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7195     if (comm->nth > 1)
7196     {
7197         snew(comm->dth,comm->nth);
7198     }
7199
7200     if (EEL_PME(ir->coulombtype))
7201     {
7202         init_ddpme(dd,&comm->ddpme[0],0);
7203         if (comm->npmedecompdim >= 2)
7204         {
7205             init_ddpme(dd,&comm->ddpme[1],1);
7206         }
7207     }
7208     else
7209     {
7210         comm->npmenodes = 0;
7211         if (dd->pme_nodeid >= 0)
7212         {
7213             gmx_fatal_collective(FARGS,NULL,dd,
7214                                  "Can not have separate PME nodes without PME electrostatics");
7215         }
7216     }
7217         
7218     if (debug)
7219     {
7220         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
7221     }
7222     if (comm->eDLB != edlbNO)
7223     {
7224         set_cell_limits_dlb(dd,dlb_scale,ir,ddbox);
7225     }
7226     
7227     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
7228     if (comm->eDLB == edlbAUTO)
7229     {
7230         if (fplog)
7231         {
7232             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
7233         }
7234         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
7235     }
7236
7237     if (ir->ePBC == epbcNONE)
7238     {
7239         vol_frac = 1 - 1/(double)dd->nnodes;
7240     }
7241     else
7242     {
7243         vol_frac =
7244             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7245     }
7246     if (debug)
7247     {
7248         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7249     }
7250     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7251    
7252     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7253 }
7254
7255 gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
7256                           real cutoff_req)
7257 {
7258     gmx_domdec_t *dd;
7259     gmx_ddbox_t ddbox;
7260     int d,dim,np;
7261     real inv_cell_size;
7262     int LocallyLimited;
7263
7264     dd = cr->dd;
7265
7266     set_ddbox(dd,FALSE,cr,ir,state->box,
7267               TRUE,&dd->comm->cgs_gl,state->x,&ddbox);
7268
7269     LocallyLimited = 0;
7270
7271     for(d=0; d<dd->ndim; d++)
7272     {
7273         dim = dd->dim[d];
7274
7275         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7276         if (dynamic_dd_box(&ddbox,ir))
7277         {
7278             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7279         }
7280
7281         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7282
7283         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7284             dd->comm->cd[d].np_dlb > 0)
7285         {
7286             if (np > dd->comm->cd[d].np_dlb)
7287             {
7288                 return FALSE;
7289             }
7290
7291             /* If a current local cell size is smaller than the requested
7292              * cut-off, we could still fix it, but this gets very complicated.
7293              * Without fixing here, we might actually need more checks.
7294              */
7295             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7296             {
7297                 LocallyLimited = 1;
7298             }
7299         }
7300     }
7301
7302     if (dd->comm->eDLB != edlbNO)
7303     {
7304         if (check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE))
7305         {
7306             LocallyLimited = 1; 
7307         }
7308
7309         gmx_sumi(1,&LocallyLimited,cr);
7310
7311         if (LocallyLimited > 0)
7312         {
7313             return FALSE;
7314         }
7315     }
7316
7317     dd->comm->cutoff = cutoff_req;
7318
7319     return TRUE;
7320 }
7321
7322 static void merge_cg_buffers(int ncell,
7323                              gmx_domdec_comm_dim_t *cd, int pulse,
7324                              int  *ncg_cell,
7325                              int  *index_gl, int  *recv_i,
7326                              rvec *cg_cm,    rvec *recv_vr,
7327                              int *cgindex,
7328                              cginfo_mb_t *cginfo_mb,int *cginfo)
7329 {
7330     gmx_domdec_ind_t *ind,*ind_p;
7331     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7332     int shift,shift_at;
7333     
7334     ind = &cd->ind[pulse];
7335     
7336     /* First correct the already stored data */
7337     shift = ind->nrecv[ncell];
7338     for(cell=ncell-1; cell>=0; cell--)
7339     {
7340         shift -= ind->nrecv[cell];
7341         if (shift > 0)
7342         {
7343             /* Move the cg's present from previous grid pulses */
7344             cg0 = ncg_cell[ncell+cell];
7345             cg1 = ncg_cell[ncell+cell+1];
7346             cgindex[cg1+shift] = cgindex[cg1];
7347             for(cg=cg1-1; cg>=cg0; cg--)
7348             {
7349                 index_gl[cg+shift] = index_gl[cg];
7350                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7351                 cgindex[cg+shift] = cgindex[cg];
7352                 cginfo[cg+shift] = cginfo[cg];
7353             }
7354             /* Correct the already stored send indices for the shift */
7355             for(p=1; p<=pulse; p++)
7356             {
7357                 ind_p = &cd->ind[p];
7358                 cg0 = 0;
7359                 for(c=0; c<cell; c++)
7360                 {
7361                     cg0 += ind_p->nsend[c];
7362                 }
7363                 cg1 = cg0 + ind_p->nsend[cell];
7364                 for(cg=cg0; cg<cg1; cg++)
7365                 {
7366                     ind_p->index[cg] += shift;
7367                 }
7368             }
7369         }
7370     }
7371
7372     /* Merge in the communicated buffers */
7373     shift = 0;
7374     shift_at = 0;
7375     cg0 = 0;
7376     for(cell=0; cell<ncell; cell++)
7377     {
7378         cg1 = ncg_cell[ncell+cell+1] + shift;
7379         if (shift_at > 0)
7380         {
7381             /* Correct the old cg indices */
7382             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7383             {
7384                 cgindex[cg+1] += shift_at;
7385             }
7386         }
7387         for(cg=0; cg<ind->nrecv[cell]; cg++)
7388         {
7389             /* Copy this charge group from the buffer */
7390             index_gl[cg1] = recv_i[cg0];
7391             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7392             /* Add it to the cgindex */
7393             cg_gl = index_gl[cg1];
7394             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7395             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7396             cgindex[cg1+1] = cgindex[cg1] + nat;
7397             cg0++;
7398             cg1++;
7399             shift_at += nat;
7400         }
7401         shift += ind->nrecv[cell];
7402         ncg_cell[ncell+cell+1] = cg1;
7403     }
7404 }
7405
7406 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7407                                int nzone,int cg0,const int *cgindex)
7408 {
7409     int cg,zone,p;
7410     
7411     /* Store the atom block boundaries for easy copying of communication buffers
7412      */
7413     cg = cg0;
7414     for(zone=0; zone<nzone; zone++)
7415     {
7416         for(p=0; p<cd->np; p++) {
7417             cd->ind[p].cell2at0[zone] = cgindex[cg];
7418             cg += cd->ind[p].nrecv[zone];
7419             cd->ind[p].cell2at1[zone] = cgindex[cg];
7420         }
7421     }
7422 }
7423
7424 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7425 {
7426     int  i;
7427     gmx_bool bMiss;
7428
7429     bMiss = FALSE;
7430     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7431     {
7432         if (!bLocalCG[link->a[i]])
7433         {
7434             bMiss = TRUE;
7435         }
7436     }
7437
7438     return bMiss;
7439 }
7440
/* Domain corners for communication, a maximum of 4 i-zones see a j domain.
 * Filled by set_dd_corners() and read by get_zone_pulse_cgs(), which
 * measures charge-group distances relative to these coordinates.
 */
typedef struct {
    real c[DIM][4]; /* the corners for the non-bonded communication,
                     * indexed as c[dim_ind][zone] */
    real cr0;       /* corner for rounding, compared along dim0 */
    real cr1[4];    /* corners for rounding, compared along dim1,
                     * indexed by zone */
    real bc[DIM];   /* corners for bonded communication,
                     * indexed by dim_ind */
    real bcr1;      /* corner for rounding for bonded communication */
} dd_corners_t;
7449
7450 /* Determine the corners of the domain(s) we are communicating with */
7451 static void
7452 set_dd_corners(const gmx_domdec_t *dd,
7453                int dim0, int dim1, int dim2,
7454                gmx_bool bDistMB,
7455                dd_corners_t *c)
7456 {
7457     const gmx_domdec_comm_t *comm;
7458     const gmx_domdec_zones_t *zones;
7459     int i,j;
7460
7461     comm = dd->comm;
7462
7463     zones = &comm->zones;
7464
7465     /* Keep the compiler happy */
7466     c->cr0  = 0;
7467     c->bcr1 = 0;
7468
7469     /* The first dimension is equal for all cells */
7470     c->c[0][0] = comm->cell_x0[dim0];
7471     if (bDistMB)
7472     {
7473         c->bc[0] = c->c[0][0];
7474     }
7475     if (dd->ndim >= 2)
7476     {
7477         dim1 = dd->dim[1];
7478         /* This cell row is only seen from the first row */
7479         c->c[1][0] = comm->cell_x0[dim1];
7480         /* All rows can see this row */
7481         c->c[1][1] = comm->cell_x0[dim1];
7482         if (dd->bGridJump)
7483         {
7484             c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7485             if (bDistMB)
7486             {
7487                 /* For the multi-body distance we need the maximum */
7488                 c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7489             }
7490         }
7491         /* Set the upper-right corner for rounding */
7492         c->cr0 = comm->cell_x1[dim0];
7493         
7494         if (dd->ndim >= 3)
7495         {
7496             dim2 = dd->dim[2];
7497             for(j=0; j<4; j++)
7498             {
7499                 c->c[2][j] = comm->cell_x0[dim2];
7500             }
7501             if (dd->bGridJump)
7502             {
7503                 /* Use the maximum of the i-cells that see a j-cell */
7504                 for(i=0; i<zones->nizone; i++)
7505                 {
7506                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7507                     {
7508                         if (j >= 4)
7509                         {
7510                             c->c[2][j-4] =
7511                                 max(c->c[2][j-4],
7512                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7513                         }
7514                     }
7515                 }
7516                 if (bDistMB)
7517                 {
7518                     /* For the multi-body distance we need the maximum */
7519                     c->bc[2] = comm->cell_x0[dim2];
7520                     for(i=0; i<2; i++)
7521                     {
7522                         for(j=0; j<2; j++)
7523                         {
7524                             c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0);
7525                         }
7526                     }
7527                 }
7528             }
7529             
7530             /* Set the upper-right corner for rounding */
7531             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7532              * Only cell (0,0,0) can see cell 7 (1,1,1)
7533              */
7534             c->cr1[0] = comm->cell_x1[dim1];
7535             c->cr1[3] = comm->cell_x1[dim1];
7536             if (dd->bGridJump)
7537             {
7538                 c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1);
7539                 if (bDistMB)
7540                 {
7541                     /* For the multi-body distance we need the maximum */
7542                     c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1);
7543                 }
7544             }
7545         }
7546     }
7547 }
7548
/* Determine which cg's we need to send in this pulse from this zone.
 *
 * Scans the local charge groups cg0 <= cg < cg1. For each one a squared
 * distance r2 to the corners in *c is accumulated (and, when both bDistMB
 * and bDistBonded are set, a second squared distance rb2 using the bonded
 * corners c->bc / c->bcr1). A charge group is selected when r2 < r_comm2,
 * or when the bonded criterion in the selection test below holds.
 *
 * For every selected charge group:
 *   ind->index[n]  receives the local cg index,
 *   (*ibuf)[n]     receives index_gl[cg],
 *   vbuf->v[n]     receives cg_cm[cg], shifted by box[dim] when
 *                  dd->ci[dim] == 0 (with YY/ZZ mirrored for screw pbc),
 * where n is the running send count. The three buffers are grown on demand.
 *
 * zonei is used to decide which rounding terms apply; zone indexes the
 * corner arrays c->c[dim_ind][zone] and c->cr1[zone].
 * dim is the Cartesian communication dimension and dim_ind its index
 * in the decomposition; dim0, dim1, dim2 are the decomposition dimensions.
 * tric_dist[dim_ind] selects between the rectangular and triclinic
 * distance calculation; normal, v_d, v_0, v_1, skew_fac2_d, skew_fac_01
 * and sf2_round are only read in the triclinic branch.
 *
 * *nsend_ptr and *nat_ptr are running totals: they are read on entry and
 * written back with the selected charge groups and their atoms
 * (cgindex[cg+1] - cgindex[cg]) added. *nsend_z_ptr is overwritten with
 * the number of charge groups selected by this call alone.
 */
static void
get_zone_pulse_cgs(gmx_domdec_t *dd,
                   int zonei, int zone,
                   int cg0, int cg1,
                   const int *index_gl,
                   const int *cgindex,
                   int dim, int dim_ind,
                   int dim0, int dim1, int dim2,
                   real r_comm2, real r_bcomm2,
                   matrix box,
                   ivec tric_dist,
                   rvec *normal,
                   real skew_fac2_d, real skew_fac_01,
                   rvec *v_d, rvec *v_0, rvec *v_1,
                   const dd_corners_t *c,
                   rvec sf2_round,
                   gmx_bool bDistBonded,
                   gmx_bool bBondComm,
                   gmx_bool bDist2B,
                   gmx_bool bDistMB,
                   rvec *cg_cm,
                   int *cginfo,
                   gmx_domdec_ind_t *ind,
                   int **ibuf, int *ibuf_nalloc,
                   vec_rvec_t *vbuf,
                   int *nsend_ptr,
                   int *nat_ptr,
                   int *nsend_z_ptr)
{
    gmx_domdec_comm_t *comm;
    gmx_bool bScrew;
    gmx_bool bDistMB_pulse;
    int  cg,i;
    real r2,rb2,r,tric_sh;
    rvec rn,rb;
    int  dimd;
    int  nsend_z,nsend,nat;

    comm = dd->comm;

    /* The screw correction is only applied when communicating along XX */
    bScrew = (dd->bScrewPBC && dim == XX);

    /* The separate bonded distance rb2 is only computed when both flags hold */
    bDistMB_pulse = (bDistMB && bDistBonded);

    /* nsend_z counts this call only; nsend and nat continue the totals
     * handed in by the caller.
     */
    nsend_z = 0;
    nsend   = *nsend_ptr;
    nat     = *nat_ptr;

    for(cg=cg0; cg<cg1; cg++)
    {
        r2  = 0;
        rb2 = 0;
        if (tric_dist[dim_ind] == 0)
        {
            /* Rectangular direction, easy */
            r = cg_cm[cg][dim] - c->c[dim_ind][zone];
            if (r > 0)
            {
                r2 += r*r;
            }
            if (bDistMB_pulse)
            {
                r = cg_cm[cg][dim] - c->bc[dim_ind];
                if (r > 0)
                {
                    rb2 += r*r;
                }
            }
            /* Rounding gives at most a 16% reduction
             * in communicated atoms
             */
            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
            {
                r = cg_cm[cg][dim0] - c->cr0;
                /* This is the first dimension, so always r >= 0 */
                r2 += r*r;
                if (bDistMB_pulse)
                {
                    rb2 += r*r;
                }
            }
            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
            {
                r = cg_cm[cg][dim1] - c->cr1[zone];
                if (r > 0)
                {
                    r2 += r*r;
                }
                if (bDistMB_pulse)
                {
                    r = cg_cm[cg][dim1] - c->bcr1;
                    if (r > 0)
                    {
                        rb2 += r*r;
                    }
                }
            }
        }
        else
        {
            /* Triclinic direction, more complicated */
            clear_rvec(rn);
            clear_rvec(rb);
            /* Rounding, conservative as the skew_fac multiplication
             * will slightly underestimate the distance.
             */
            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
            {
                rn[dim0] = cg_cm[cg][dim0] - c->cr0;
                for(i=dim0+1; i<DIM; i++)
                {
                    rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
                }
                r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
                if (bDistMB_pulse)
                {
                    rb[dim0] = rn[dim0];
                    rb2 = r2;
                }
                /* Take care that the cell planes along dim0 might not
                 * be orthogonal to those along dim1 and dim2.
                 */
                for(i=1; i<=dim_ind; i++)
                {
                    dimd = dd->dim[i];
                    if (normal[dim0][dimd] > 0)
                    {
                        rn[dimd] -= rn[dim0]*normal[dim0][dimd];
                        if (bDistMB_pulse)
                        {
                            rb[dimd] -= rb[dim0]*normal[dim0][dimd];
                        }
                    }
                }
            }
            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
            {
                rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
                /* tric_sh is shared by the non-bonded and bonded distances */
                tric_sh = 0;
                for(i=dim1+1; i<DIM; i++)
                {
                    tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
                }
                rn[dim1] += tric_sh;
                if (rn[dim1] > 0)
                {
                    r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
                    /* Take care of coupling of the distances
                     * to the planes along dim0 and dim1 through dim2.
                     */
                    r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
                    /* Take care that the cell planes along dim1
                     * might not be orthogonal to that along dim2.
                     */
                    if (normal[dim1][dim2] > 0)
                    {
                        rn[dim2] -= rn[dim1]*normal[dim1][dim2];
                    }
                }
                if (bDistMB_pulse)
                {
                    rb[dim1] +=
                        cg_cm[cg][dim1] - c->bcr1 + tric_sh;
                    if (rb[dim1] > 0)
                    {
                        rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
                        /* Take care of coupling of the distances
                         * to the planes along dim0 and dim1 through dim2.
                         */
                        rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
                        /* Take care that the cell planes along dim1
                         * might not be orthogonal to that along dim2.
                         */
                        if (normal[dim1][dim2] > 0)
                        {
                            rb[dim2] -= rb[dim1]*normal[dim1][dim2];
                        }
                    }
                }
            }
            /* The distance along the communication direction */
            rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
            tric_sh = 0;
            for(i=dim+1; i<DIM; i++)
            {
                tric_sh -= cg_cm[cg][i]*v_d[i][dim];
            }
            rn[dim] += tric_sh;
            if (rn[dim] > 0)
            {
                r2 += rn[dim]*rn[dim]*skew_fac2_d;
                /* Take care of coupling of the distances
                 * to the planes along dim0 and dim1 through dim2.
                 */
                if (dim_ind == 1 && zonei == 1)
                {
                    r2 -= rn[dim0]*rn[dim]*skew_fac_01;
                }
            }
            if (bDistMB_pulse)
            {
                /* NOTE(review): rb is cleared again here, which presumably
                 * discards the rb components accumulated in the rounding
                 * sections above (rb2 itself is kept). With rb[dim0] reset,
                 * the coupling term below looks like it contributes nothing
                 * when dim != dim0 - confirm that this is intended.
                 */
                clear_rvec(rb);
                rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
                if (rb[dim] > 0)
                {
                    rb2 += rb[dim]*rb[dim]*skew_fac2_d;
                    /* Take care of coupling of the distances
                     * to the planes along dim0 and dim1 through dim2.
                     */
                    if (dim_ind == 1 && zonei == 1)
                    {
                        rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
                    }
                }
            }
        }

        /* Select the charge group when it is within the non-bonded cut-off,
         * or when bonded distances are checked in this call, it is within
         * r_bcomm2 (rb2 with bDistMB, r2 with bDist2B) and, with bBondComm,
         * it is flagged by GET_CGINFO_BOND_INTER and missing_link()
         * reports a linked charge group not marked in comm->bLocalCG.
         */
        if (r2 < r_comm2 ||
            (bDistBonded &&
             ((bDistMB && rb2 < r_bcomm2) ||
              (bDist2B && r2  < r_bcomm2)) &&
             (!bBondComm ||
              (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
               missing_link(comm->cglink,index_gl[cg],
                            comm->bLocalCG)))))
        {
            /* Make an index to the local charge groups */
            if (nsend+1 > ind->nalloc)
            {
                ind->nalloc = over_alloc_large(nsend+1);
                srenew(ind->index,ind->nalloc);
            }
            if (nsend+1 > *ibuf_nalloc)
            {
                *ibuf_nalloc = over_alloc_large(nsend+1);
                srenew(*ibuf,*ibuf_nalloc);
            }
            /* Local index for our own bookkeeping, global index for sending */
            ind->index[nsend] = cg;
            (*ibuf)[nsend] = index_gl[cg];
            nsend_z++;
            vec_rvec_check_alloc(vbuf,nsend+1);

            if (dd->ci[dim] == 0)
            {
                /* Correct cg_cm for pbc */
                rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]);
                if (bScrew)
                {
                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
                }
            }
            else
            {
                copy_rvec(cg_cm[cg],vbuf->v[nsend]);
            }
            nsend++;
            /* Count the atoms of this charge group */
            nat += cgindex[cg+1] - cgindex[cg];
        }
    }

    /* Hand the updated totals and the per-call count back to the caller */
    *nsend_ptr   = nsend;
    *nat_ptr     = nat;
    *nsend_z_ptr = nsend_z;
}
7815
7816 static void setup_dd_communication(gmx_domdec_t *dd,
7817                                    matrix box,gmx_ddbox_t *ddbox,
7818                                    t_forcerec *fr,t_state *state,rvec **f)
7819 {
7820     int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot;
7821     int nzone,nzone_send,zone,zonei,cg0,cg1;
7822     int c,i,j,cg,cg_gl,nrcg;
7823     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7824     gmx_domdec_comm_t *comm;
7825     gmx_domdec_zones_t *zones;
7826     gmx_domdec_comm_dim_t *cd;
7827     gmx_domdec_ind_t *ind;
7828     cginfo_mb_t *cginfo_mb;
7829     gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded;
7830     real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg;
7831     dd_corners_t corners;
7832     ivec tric_dist;
7833     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7834     real skew_fac2_d,skew_fac_01;
7835     rvec sf2_round;
7836     int  nsend,nat;
7837     int  th;
7838     
7839     if (debug)
7840     {
7841         fprintf(debug,"Setting up DD communication\n");
7842     }
7843     
7844     comm  = dd->comm;
7845
7846     switch (fr->cutoff_scheme)
7847     {
7848     case ecutsGROUP:
7849         cg_cm = fr->cg_cm;
7850         break;
7851     case ecutsVERLET:
7852         cg_cm = state->x;
7853         break;
7854     default:
7855         gmx_incons("unimplemented");
7856         cg_cm = NULL;
7857     }
7858
7859     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7860     {
7861         dim = dd->dim[dim_ind];
7862
7863         /* Check if we need to use triclinic distances */
7864         tric_dist[dim_ind] = 0;
7865         for(i=0; i<=dim_ind; i++)
7866         {
7867             if (ddbox->tric_dir[dd->dim[i]])
7868             {
7869                 tric_dist[dim_ind] = 1;
7870             }
7871         }
7872     }
7873
7874     bBondComm = comm->bBondComm;
7875
7876     /* Do we need to determine extra distances for multi-body bondeds? */
7877     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7878     
7879     /* Do we need to determine extra distances for only two-body bondeds? */
7880     bDist2B = (bBondComm && !bDistMB);
7881
7882     r_comm2  = sqr(comm->cutoff);
7883     r_bcomm2 = sqr(comm->cutoff_mbody);
7884
7885     if (debug)
7886     {
7887         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7888     }
7889
7890     zones = &comm->zones;
7891     
7892     dim0 = dd->dim[0];
7893     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
7894     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
7895
7896     set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners);
7897     
7898     /* Triclinic stuff */
7899     normal = ddbox->normal;
7900     skew_fac_01 = 0;
7901     if (dd->ndim >= 2)
7902     {
7903         v_0 = ddbox->v[dim0];
7904         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7905         {
7906             /* Determine the coupling coefficient for the distances
7907              * to the cell planes along dim0 and dim1 through dim2.
7908              * This is required for correct rounding.
7909              */
7910             skew_fac_01 =
7911                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7912             if (debug)
7913             {
7914                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7915             }
7916         }
7917     }
7918     if (dd->ndim >= 3)
7919     {
7920         v_1 = ddbox->v[dim1];
7921     }
7922     
7923     zone_cg_range = zones->cg_range;
7924     index_gl = dd->index_gl;
7925     cgindex  = dd->cgindex;
7926     cginfo_mb = fr->cginfo_mb;
7927     
7928     zone_cg_range[0]   = 0;
7929     zone_cg_range[1]   = dd->ncg_home;
7930     comm->zone_ncg1[0] = dd->ncg_home;
7931     pos_cg             = dd->ncg_home;
7932     
7933     nat_tot = dd->nat_home;
7934     nzone = 1;
7935     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7936     {
7937         dim = dd->dim[dim_ind];
7938         cd = &comm->cd[dim_ind];
7939         
7940         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7941         {
7942             /* No pbc in this dimension, the first node should not comm. */
7943             nzone_send = 0;
7944         }
7945         else
7946         {
7947             nzone_send = nzone;
7948         }
7949
7950         v_d = ddbox->v[dim];
7951         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7952
7953         cd->bInPlace = TRUE;
7954         for(p=0; p<cd->np; p++)
7955         {
7956             /* Only atoms communicated in the first pulse are used
7957              * for multi-body bonded interactions or for bBondComm.
7958              */
7959             bDistBonded = ((bDistMB || bDist2B) && p == 0);
7960
7961             ind = &cd->ind[p];
7962             nsend = 0;
7963             nat = 0;
7964             for(zone=0; zone<nzone_send; zone++)
7965             {
7966                 if (tric_dist[dim_ind] && dim_ind > 0)
7967                 {
7968                     /* Determine slightly more optimized skew_fac's
7969                      * for rounding.
7970                      * This reduces the number of communicated atoms
7971                      * by about 10% for 3D DD of rhombic dodecahedra.
7972                      */
7973                     for(dimd=0; dimd<dim; dimd++)
7974                     {
7975                         sf2_round[dimd] = 1;
7976                         if (ddbox->tric_dir[dimd])
7977                         {
7978                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7979                             {
7980                                 /* If we are shifted in dimension i
7981                                  * and the cell plane is tilted forward
7982                                  * in dimension i, skip this coupling.
7983                                  */
7984                                 if (!(zones->shift[nzone+zone][i] &&
7985                                       ddbox->v[dimd][i][dimd] >= 0))
7986                                 {
7987                                     sf2_round[dimd] +=
7988                                         sqr(ddbox->v[dimd][i][dimd]);
7989                                 }
7990                             }
7991                             sf2_round[dimd] = 1/sf2_round[dimd];
7992                         }
7993                     }
7994                 }
7995
7996                 zonei = zone_perm[dim_ind][zone];
7997                 if (p == 0)
7998                 {
7999                     /* Here we permutate the zones to obtain a convenient order
8000                      * for neighbor searching
8001                      */
8002                     cg0 = zone_cg_range[zonei];
8003                     cg1 = zone_cg_range[zonei+1];
8004                 }
8005                 else
8006                 {
8007                     /* Look only at the cg's received in the previous grid pulse
8008                      */
8009                     cg1 = zone_cg_range[nzone+zone+1];
8010                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8011                 }
8012
8013 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8014                 for(th=0; th<comm->nth; th++)
8015                 {
8016                     gmx_domdec_ind_t *ind_p;
8017                     int **ibuf_p,*ibuf_nalloc_p;
8018                     vec_rvec_t *vbuf_p;
8019                     int *nsend_p,*nat_p;
8020                     int *nsend_zone_p;
8021                     int cg0_th,cg1_th;
8022
8023                     if (th == 0)
8024                     {
8025                         /* Thread 0 writes in the comm buffers */
8026                         ind_p         = ind;
8027                         ibuf_p        = &comm->buf_int;
8028                         ibuf_nalloc_p = &comm->nalloc_int;
8029                         vbuf_p        = &comm->vbuf;
8030                         nsend_p       = &nsend;
8031                         nat_p         = &nat;
8032                         nsend_zone_p  = &ind->nsend[zone];
8033                     }
8034                     else
8035                     {
8036                         /* Other threads write into temp buffers */
8037                         ind_p         = &comm->dth[th].ind;
8038                         ibuf_p        = &comm->dth[th].ibuf;
8039                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8040                         vbuf_p        = &comm->dth[th].vbuf;
8041                         nsend_p       = &comm->dth[th].nsend;
8042                         nat_p         = &comm->dth[th].nat;
8043                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8044
8045                         comm->dth[th].nsend      = 0;
8046                         comm->dth[th].nat        = 0;
8047                         comm->dth[th].nsend_zone = 0;
8048                     }
8049
8050                     if (comm->nth == 1)
8051                     {
8052                         cg0_th = cg0;
8053                         cg1_th = cg1;
8054                     }
8055                     else
8056                     {
8057                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8058                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8059                     }
8060                     
8061                     /* Get the cg's for this pulse in this zone */
8062                     get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th,
8063                                        index_gl,cgindex,
8064                                        dim,dim_ind,dim0,dim1,dim2,
8065                                        r_comm2,r_bcomm2,
8066                                        box,tric_dist,
8067                                        normal,skew_fac2_d,skew_fac_01,
8068                                        v_d,v_0,v_1,&corners,sf2_round,
8069                                        bDistBonded,bBondComm,
8070                                        bDist2B,bDistMB,
8071                                        cg_cm,fr->cginfo,
8072                                        ind_p,
8073                                        ibuf_p,ibuf_nalloc_p,
8074                                        vbuf_p,
8075                                        nsend_p,nat_p,
8076                                        nsend_zone_p);
8077                 }
8078
8079                 /* Append data of threads>=1 to the communication buffers */
8080                 for(th=1; th<comm->nth; th++)
8081                 {
8082                     dd_comm_setup_work_t *dth;
8083                     int i,ns1;
8084
8085                     dth = &comm->dth[th];
8086
8087                     ns1 = nsend + dth->nsend_zone;
8088                     if (ns1 > ind->nalloc)
8089                     {
8090                         ind->nalloc = over_alloc_dd(ns1);
8091                         srenew(ind->index,ind->nalloc);
8092                     }
8093                     if (ns1 > comm->nalloc_int)
8094                     {
8095                         comm->nalloc_int = over_alloc_dd(ns1);
8096                         srenew(comm->buf_int,comm->nalloc_int);
8097                     }
8098                     if (ns1 > comm->vbuf.nalloc)
8099                     {
8100                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8101                         srenew(comm->vbuf.v,comm->vbuf.nalloc);
8102                     }
8103
8104                     for(i=0; i<dth->nsend_zone; i++)
8105                     {
8106                         ind->index[nsend] = dth->ind.index[i];
8107                         comm->buf_int[nsend] = dth->ibuf[i];
8108                         copy_rvec(dth->vbuf.v[i],
8109                                   comm->vbuf.v[nsend]);
8110                         nsend++;
8111                     }
8112                     nat              += dth->nat;
8113                     ind->nsend[zone] += dth->nsend_zone;
8114                 }
8115             }
8116             /* Clear the counts in case we do not have pbc */
8117             for(zone=nzone_send; zone<nzone; zone++)
8118             {
8119                 ind->nsend[zone] = 0;
8120             }
8121             ind->nsend[nzone]   = nsend;
8122             ind->nsend[nzone+1] = nat;
8123             /* Communicate the number of cg's and atoms to receive */
8124             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8125                             ind->nsend, nzone+2,
8126                             ind->nrecv, nzone+2);
8127             
8128             /* The rvec buffer is also required for atom buffers of size nsend
8129              * in dd_move_x and dd_move_f.
8130              */
8131             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
8132
8133             if (p > 0)
8134             {
8135                 /* We can receive in place if only the last zone is not empty */
8136                 for(zone=0; zone<nzone-1; zone++)
8137                 {
8138                     if (ind->nrecv[zone] > 0)
8139                     {
8140                         cd->bInPlace = FALSE;
8141                     }
8142                 }
8143                 if (!cd->bInPlace)
8144                 {
8145                     /* The int buffer is only required here for the cg indices */
8146                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8147                     {
8148                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8149                         srenew(comm->buf_int2,comm->nalloc_int2);
8150                     }
8151                     /* The rvec buffer is also required for atom buffers
8152                      * of size nrecv in dd_move_x and dd_move_f.
8153                      */
8154                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
8155                     vec_rvec_check_alloc(&comm->vbuf2,i);
8156                 }
8157             }
8158             
8159             /* Make space for the global cg indices */
8160             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8161                 || dd->cg_nalloc == 0)
8162             {
8163                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8164                 srenew(index_gl,dd->cg_nalloc);
8165                 srenew(cgindex,dd->cg_nalloc+1);
8166             }
8167             /* Communicate the global cg indices */
8168             if (cd->bInPlace)
8169             {
8170                 recv_i = index_gl + pos_cg;
8171             }
8172             else
8173             {
8174                 recv_i = comm->buf_int2;
8175             }
8176             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8177                             comm->buf_int, nsend,
8178                             recv_i,        ind->nrecv[nzone]);
8179
8180             /* Make space for cg_cm */
8181             dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]);
8182             if (fr->cutoff_scheme == ecutsGROUP)
8183             {
8184                 cg_cm = fr->cg_cm;
8185             }
8186             else
8187             {
8188                 cg_cm = state->x;
8189             }
8190             /* Communicate cg_cm */
8191             if (cd->bInPlace)
8192             {
8193                 recv_vr = cg_cm + pos_cg;
8194             }
8195             else
8196             {
8197                 recv_vr = comm->vbuf2.v;
8198             }
8199             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8200                              comm->vbuf.v, nsend,
8201                              recv_vr,      ind->nrecv[nzone]);
8202             
8203             /* Make the charge group index */
8204             if (cd->bInPlace)
8205             {
8206                 zone = (p == 0 ? 0 : nzone - 1);
8207                 while (zone < nzone)
8208                 {
8209                     for(cg=0; cg<ind->nrecv[zone]; cg++)
8210                     {
8211                         cg_gl = index_gl[pos_cg];
8212                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
8213                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8214                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8215                         if (bBondComm)
8216                         {
8217                             /* Update the charge group presence,
8218                              * so we can use it in the next pass of the loop.
8219                              */
8220                             comm->bLocalCG[cg_gl] = TRUE;
8221                         }
8222                         pos_cg++;
8223                     }
8224                     if (p == 0)
8225                     {
8226                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8227                     }
8228                     zone++;
8229                     zone_cg_range[nzone+zone] = pos_cg;
8230                 }
8231             }
8232             else
8233             {
8234                 /* This part of the code is never executed with bBondComm. */
8235                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
8236                                  index_gl,recv_i,cg_cm,recv_vr,
8237                                  cgindex,fr->cginfo_mb,fr->cginfo);
8238                 pos_cg += ind->nrecv[nzone];
8239             }
8240             nat_tot += ind->nrecv[nzone+1];
8241         }
8242         if (!cd->bInPlace)
8243         {
8244             /* Store the atom block for easy copying of communication buffers */
8245             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
8246         }
8247         nzone += nzone;
8248     }
8249     dd->index_gl = index_gl;
8250     dd->cgindex  = cgindex;
8251     
8252     dd->ncg_tot = zone_cg_range[zones->n];
8253     dd->nat_tot = nat_tot;
8254     comm->nat[ddnatHOME] = dd->nat_home;
8255     for(i=ddnatZONE; i<ddnatNR; i++)
8256     {
8257         comm->nat[i] = dd->nat_tot;
8258     }
8259
8260     if (!bBondComm)
8261     {
8262         /* We don't need to update cginfo, since that was alrady done above.
8263          * So we pass NULL for the forcerec.
8264          */
8265         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
8266                       NULL,comm->bLocalCG);
8267     }
8268
8269     if (debug)
8270     {
8271         fprintf(debug,"Finished setting up DD communication, zones:");
8272         for(c=0; c<zones->n; c++)
8273         {
8274             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
8275         }
8276         fprintf(debug,"\n");
8277     }
8278 }
8279
8280 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8281 {
8282     int c;
8283     
8284     for(c=0; c<zones->nizone; c++)
8285     {
8286         zones->izone[c].cg1  = zones->cg_range[c+1];
8287         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8288         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8289     }
8290 }
8291
8292 static void set_zones_size(gmx_domdec_t *dd,
8293                            matrix box,const gmx_ddbox_t *ddbox,
8294                            int zone_start,int zone_end)
8295 {
8296     gmx_domdec_comm_t *comm;
8297     gmx_domdec_zones_t *zones;
8298     gmx_bool bDistMB;
8299     int  z,zi,zj0,zj1,d,dim;
8300     real rcs,rcmbs;
8301     int  i,j;
8302     real size_j,add_tric;
8303     real vol;
8304
8305     comm = dd->comm;
8306
8307     zones = &comm->zones;
8308
8309     /* Do we need to determine extra distances for multi-body bondeds? */
8310     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8311
8312     for(z=zone_start; z<zone_end; z++)
8313     {
8314         /* Copy cell limits to zone limits.
8315          * Valid for non-DD dims and non-shifted dims.
8316          */
8317         copy_rvec(comm->cell_x0,zones->size[z].x0);
8318         copy_rvec(comm->cell_x1,zones->size[z].x1);
8319     }
8320
8321     for(d=0; d<dd->ndim; d++)
8322     {
8323         dim = dd->dim[d];
8324
8325         for(z=0; z<zones->n; z++)
8326         {
8327             /* With a staggered grid we have different sizes
8328              * for non-shifted dimensions.
8329              */
8330             if (dd->bGridJump && zones->shift[z][dim] == 0)
8331             {
8332                 if (d == 1)
8333                 {
8334                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8335                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8336                 }
8337                 else if (d == 2)
8338                 {
8339                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8340                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8341                 }
8342             }
8343         }
8344
8345         rcs   = comm->cutoff;
8346         rcmbs = comm->cutoff_mbody;
8347         if (ddbox->tric_dir[dim])
8348         {
8349             rcs   /= ddbox->skew_fac[dim];
8350             rcmbs /= ddbox->skew_fac[dim];
8351         }
8352
8353         /* Set the lower limit for the shifted zone dimensions */
8354         for(z=zone_start; z<zone_end; z++)
8355         {
8356             if (zones->shift[z][dim] > 0)
8357             {
8358                 dim = dd->dim[d];
8359                 if (!dd->bGridJump || d == 0)
8360                 {
8361                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8362                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8363                 }
8364                 else
8365                 {
8366                     /* Here we take the lower limit of the zone from
8367                      * the lowest domain of the zone below.
8368                      */
8369                     if (z < 4)
8370                     {
8371                         zones->size[z].x0[dim] =
8372                              comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8373                     }
8374                     else
8375                     {
8376                         if (d == 1)
8377                         {
8378                             zones->size[z].x0[dim] =
8379                                 zones->size[zone_perm[2][z-4]].x0[dim];
8380                         }
8381                         else
8382                         {
8383                             zones->size[z].x0[dim] =
8384                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8385                         }
8386                     }
8387                     /* A temporary limit, is updated below */
8388                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8389
8390                     if (bDistMB)
8391                     {
8392                         for(zi=0; zi<zones->nizone; zi++)
8393                         {
8394                             if (zones->shift[zi][dim] == 0)
8395                             {
8396                                 /* This takes the whole zone into account.
8397                                  * With multiple pulses this will lead
8398                                  * to a larger zone then strictly necessary.
8399                                  */
8400                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8401                                                              zones->size[zi].x1[dim]+rcmbs);
8402                             }
8403                         }
8404                     }
8405                 }
8406             }
8407         }
8408
8409         /* Loop over the i-zones to set the upper limit of each
8410          * j-zone they see.
8411          */
8412         for(zi=0; zi<zones->nizone; zi++)
8413         {
8414             if (zones->shift[zi][dim] == 0)
8415             {
8416                 for(z=zones->izone[zi].j0; z<zones->izone[zi].j1; z++)
8417                 {
8418                     if (zones->shift[z][dim] > 0)
8419                     {
8420                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8421                                                      zones->size[zi].x1[dim]+rcs);
8422                     }
8423                 }
8424             }
8425         }
8426     }
8427
8428     for(z=zone_start; z<zone_end; z++)
8429     {
8430         for(i=0; i<DIM; i++)
8431         {
8432             zones->size[z].bb_x0[i] = zones->size[z].x0[i];
8433             zones->size[z].bb_x1[i] = zones->size[z].x1[i];
8434
8435             for(j=i+1; j<ddbox->npbcdim; j++)
8436             {
8437                 /* With 1D domain decomposition the cg's are not in
8438                  * the triclinic box, but trilinic x-y and rectangular y-z.
8439                  */
8440                 if (box[j][i] != 0 &&
8441                     !(dd->ndim == 1 && i == YY && j == ZZ))
8442                 {
8443                     /* Correct for triclinic offset of the lower corner */
8444                     add_tric = zones->size[z].x0[j]*box[j][i]/box[j][j];
8445                     zones->size[z].bb_x0[i] += add_tric;
8446                     zones->size[z].bb_x1[i] += add_tric;
8447
8448                     /* Correct for triclinic offset of the upper corner */
8449                     size_j = zones->size[z].x1[j] - zones->size[z].x0[j];
8450                     add_tric = size_j*box[j][i]/box[j][j];
8451
8452                     if (box[j][i] < 0)
8453                     {
8454                         zones->size[z].bb_x0[i] += add_tric;
8455                     }
8456                     else
8457                     {
8458                         zones->size[z].bb_x1[i] += add_tric;
8459                     }
8460                 }
8461             }
8462         }
8463     }
8464
8465     if (zone_start == 0)
8466     {
8467         vol = 1;
8468         for(dim=0; dim<DIM; dim++)
8469         {
8470             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8471         }
8472         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8473     }
8474
8475     if (debug)
8476     {
8477         for(z=zone_start; z<zone_end; z++)
8478         {
8479             fprintf(debug,"zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8480                     z,
8481                     zones->size[z].x0[XX],zones->size[z].x1[XX],
8482                     zones->size[z].x0[YY],zones->size[z].x1[YY],
8483                     zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]);
8484             fprintf(debug,"zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8485                     z,
8486                     zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX],
8487                     zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY],
8488                     zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]);
8489         }
8490     }
8491 }
8492
8493 static int comp_cgsort(const void *a,const void *b)
8494 {
8495     int comp;
8496     
8497     gmx_cgsort_t *cga,*cgb;
8498     cga = (gmx_cgsort_t *)a;
8499     cgb = (gmx_cgsort_t *)b;
8500     
8501     comp = cga->nsc - cgb->nsc;
8502     if (comp == 0)
8503     {
8504         comp = cga->ind_gl - cgb->ind_gl;
8505     }
8506     
8507     return comp;
8508 }
8509
8510 static void order_int_cg(int n,const gmx_cgsort_t *sort,
8511                          int *a,int *buf)
8512 {
8513     int i;
8514     
8515     /* Order the data */
8516     for(i=0; i<n; i++)
8517     {
8518         buf[i] = a[sort[i].ind];
8519     }
8520     
8521     /* Copy back to the original array */
8522     for(i=0; i<n; i++)
8523     {
8524         a[i] = buf[i];
8525     }
8526 }
8527
8528 static void order_vec_cg(int n,const gmx_cgsort_t *sort,
8529                          rvec *v,rvec *buf)
8530 {
8531     int i;
8532     
8533     /* Order the data */
8534     for(i=0; i<n; i++)
8535     {
8536         copy_rvec(v[sort[i].ind],buf[i]);
8537     }
8538     
8539     /* Copy back to the original array */
8540     for(i=0; i<n; i++)
8541     {
8542         copy_rvec(buf[i],v[i]);
8543     }
8544 }
8545
8546 static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort,
8547                            rvec *v,rvec *buf)
8548 {
8549     int a,atot,cg,cg0,cg1,i;
8550     
8551     if (cgindex == NULL)
8552     {
8553         /* Avoid the useless loop of the atoms within a cg */
8554         order_vec_cg(ncg,sort,v,buf);
8555
8556         return;
8557     }
8558
8559     /* Order the data */
8560     a = 0;
8561     for(cg=0; cg<ncg; cg++)
8562     {
8563         cg0 = cgindex[sort[cg].ind];
8564         cg1 = cgindex[sort[cg].ind+1];
8565         for(i=cg0; i<cg1; i++)
8566         {
8567             copy_rvec(v[i],buf[a]);
8568             a++;
8569         }
8570     }
8571     atot = a;
8572     
8573     /* Copy back to the original array */
8574     for(a=0; a<atot; a++)
8575     {
8576         copy_rvec(buf[a],v[a]);
8577     }
8578 }
8579
8580 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
8581                          int nsort_new,gmx_cgsort_t *sort_new,
8582                          gmx_cgsort_t *sort1)
8583 {
8584     int i1,i2,i_new;
8585     
8586     /* The new indices are not very ordered, so we qsort them */
8587     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
8588     
8589     /* sort2 is already ordered, so now we can merge the two arrays */
8590     i1 = 0;
8591     i2 = 0;
8592     i_new = 0;
8593     while(i2 < nsort2 || i_new < nsort_new)
8594     {
8595         if (i2 == nsort2)
8596         {
8597             sort1[i1++] = sort_new[i_new++];
8598         }
8599         else if (i_new == nsort_new)
8600         {
8601             sort1[i1++] = sort2[i2++];
8602         }
8603         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8604                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8605                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8606         {
8607             sort1[i1++] = sort2[i2++];
8608         }
8609         else
8610         {
8611             sort1[i1++] = sort_new[i_new++];
8612         }
8613     }
8614 }
8615
8616 static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
8617 {
8618     gmx_domdec_sort_t *sort;
8619     gmx_cgsort_t *cgsort,*sort_i;
8620     int  ncg_new,nsort2,nsort_new,i,*a,moved,*ibuf;
8621     int  sort_last,sort_skip;
8622
8623     sort = dd->comm->sort;
8624
8625     a = fr->ns.grid->cell_index;
8626
8627     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8628
8629     if (ncg_home_old >= 0)
8630     {
8631         /* The charge groups that remained in the same ns grid cell
8632          * are completely ordered. So we can sort efficiently by sorting
8633          * the charge groups that did move into the stationary list.
8634          */
8635         ncg_new = 0;
8636         nsort2 = 0;
8637         nsort_new = 0;
8638         for(i=0; i<dd->ncg_home; i++)
8639         {
8640             /* Check if this cg did not move to another node */
8641             if (a[i] < moved)
8642             {
8643                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8644                 {
8645                     /* This cg is new on this node or moved ns grid cell */
8646                     if (nsort_new >= sort->sort_new_nalloc)
8647                     {
8648                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8649                         srenew(sort->sort_new,sort->sort_new_nalloc);
8650                     }
8651                     sort_i = &(sort->sort_new[nsort_new++]);
8652                 }
8653                 else
8654                 {
8655                     /* This cg did not move */
8656                     sort_i = &(sort->sort2[nsort2++]);
8657                 }
8658                 /* Sort on the ns grid cell indices
8659                  * and the global topology index.
8660                  * index_gl is irrelevant with cell ns,
8661                  * but we set it here anyhow to avoid a conditional.
8662                  */
8663                 sort_i->nsc    = a[i];
8664                 sort_i->ind_gl = dd->index_gl[i];
8665                 sort_i->ind    = i;
8666                 ncg_new++;
8667             }
8668         }
8669         if (debug)
8670         {
8671             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
8672                     nsort2,nsort_new);
8673         }
8674         /* Sort efficiently */
8675         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,
8676                      sort->sort);
8677     }
8678     else
8679     {
8680         cgsort = sort->sort;
8681         ncg_new = 0;
8682         for(i=0; i<dd->ncg_home; i++)
8683         {
8684             /* Sort on the ns grid cell indices
8685              * and the global topology index
8686              */
8687             cgsort[i].nsc    = a[i];
8688             cgsort[i].ind_gl = dd->index_gl[i];
8689             cgsort[i].ind    = i;
8690             if (cgsort[i].nsc < moved)
8691             {
8692                 ncg_new++;
8693             }
8694         }
8695         if (debug)
8696         {
8697             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
8698         }
8699         /* Determine the order of the charge groups using qsort */
8700         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
8701     }
8702
8703     return ncg_new;
8704 }
8705
8706 static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
8707 {
8708     gmx_cgsort_t *sort;
8709     int  ncg_new,i,*a,na;
8710
8711     sort = dd->comm->sort->sort;
8712
8713     nbnxn_get_atomorder(fr->nbv->nbs,&a,&na);
8714
8715     ncg_new = 0;
8716     for(i=0; i<na; i++)
8717     {
8718         if (a[i] >= 0)
8719         {
8720             sort[ncg_new].ind = a[i];
8721             ncg_new++;
8722         }
8723     }
8724
8725     return ncg_new;
8726 }
8727
8728 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
8729                           rvec *cgcm,t_forcerec *fr,t_state *state,
8730                           int ncg_home_old)
8731 {
8732     gmx_domdec_sort_t *sort;
8733     gmx_cgsort_t *cgsort,*sort_i;
8734     int  *cgindex;
8735     int  ncg_new,i,*ibuf,cgsize;
8736     rvec *vbuf;
8737     
8738     sort = dd->comm->sort;
8739     
8740     if (dd->ncg_home > sort->sort_nalloc)
8741     {
8742         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8743         srenew(sort->sort,sort->sort_nalloc);
8744         srenew(sort->sort2,sort->sort_nalloc);
8745     }
8746     cgsort = sort->sort;
8747
8748     switch (fr->cutoff_scheme)
8749     {
8750     case ecutsGROUP:
8751         ncg_new = dd_sort_order(dd,fr,ncg_home_old);
8752         break;
8753     case ecutsVERLET:
8754         ncg_new = dd_sort_order_nbnxn(dd,fr);
8755         break;
8756     default:
8757         gmx_incons("unimplemented");
8758         ncg_new = 0;
8759     }
8760
8761     /* We alloc with the old size, since cgindex is still old */
8762     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
8763     vbuf = dd->comm->vbuf.v;
8764     
8765     if (dd->comm->bCGs)
8766     {
8767         cgindex = dd->cgindex;
8768     }
8769     else
8770     {
8771         cgindex = NULL;
8772     }
8773
8774     /* Remove the charge groups which are no longer at home here */
8775     dd->ncg_home = ncg_new;
8776     if (debug)
8777     {
8778         fprintf(debug,"Set the new home charge group count to %d\n",
8779                 dd->ncg_home);
8780     }
8781     
8782     /* Reorder the state */
8783     for(i=0; i<estNR; i++)
8784     {
8785         if (EST_DISTR(i) && (state->flags & (1<<i)))
8786         {
8787             switch (i)
8788             {
8789             case estX:
8790                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf);
8791                 break;
8792             case estV:
8793                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf);
8794                 break;
8795             case estSDX:
8796                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf);
8797                 break;
8798             case estCGP:
8799                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf);
8800                 break;
8801             case estLD_RNG:
8802             case estLD_RNGI:
8803             case estDISRE_INITF:
8804             case estDISRE_RM3TAV:
8805             case estORIRE_INITF:
8806             case estORIRE_DTAV:
8807                 /* No ordering required */
8808                 break;
8809             default:
8810                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8811                 break;
8812             }
8813         }
8814     }
8815     if (fr->cutoff_scheme == ecutsGROUP)
8816     {
8817         /* Reorder cgcm */
8818         order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8819     }
8820     
8821     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8822     {
8823         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8824         srenew(sort->ibuf,sort->ibuf_nalloc);
8825     }
8826     ibuf = sort->ibuf;
8827     /* Reorder the global cg index */
8828     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8829     /* Reorder the cginfo */
8830     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8831     /* Rebuild the local cg index */
8832     if (dd->comm->bCGs)
8833     {
8834         ibuf[0] = 0;
8835         for(i=0; i<dd->ncg_home; i++)
8836         {
8837             cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8838             ibuf[i+1] = ibuf[i] + cgsize;
8839         }
8840         for(i=0; i<dd->ncg_home+1; i++)
8841         {
8842             dd->cgindex[i] = ibuf[i];
8843         }
8844     }
8845     else
8846     {
8847         for(i=0; i<dd->ncg_home+1; i++)
8848         {
8849             dd->cgindex[i] = i;
8850         }
8851     }
8852     /* Set the home atom number */
8853     dd->nat_home = dd->cgindex[dd->ncg_home];
8854
8855     if (fr->cutoff_scheme == ecutsVERLET)
8856     {
8857         /* The atoms are now exactly in grid order, update the grid order */
8858         nbnxn_set_atomorder(fr->nbv->nbs);
8859     }
8860     else
8861     {
8862         /* Copy the sorted ns cell indices back to the ns grid struct */
8863         for(i=0; i<dd->ncg_home; i++)
8864         {
8865             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8866         }
8867         fr->ns.grid->nr = dd->ncg_home;
8868     }
8869 }
8870
8871 static void add_dd_statistics(gmx_domdec_t *dd)
8872 {
8873     gmx_domdec_comm_t *comm;
8874     int ddnat;
8875     
8876     comm = dd->comm;
8877     
8878     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8879     {
8880         comm->sum_nat[ddnat-ddnatZONE] +=
8881             comm->nat[ddnat] - comm->nat[ddnat-1];
8882     }
8883     comm->ndecomp++;
8884 }
8885
8886 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8887 {
8888     gmx_domdec_comm_t *comm;
8889     int ddnat;
8890     
8891     comm = dd->comm;
8892
8893     /* Reset all the statistics and counters for total run counting */
8894     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8895     {
8896         comm->sum_nat[ddnat-ddnatZONE] = 0;
8897     }
8898     comm->ndecomp = 0;
8899     comm->nload = 0;
8900     comm->load_step = 0;
8901     comm->load_sum = 0;
8902     comm->load_max = 0;
8903     clear_ivec(comm->load_lim);
8904     comm->load_mdf = 0;
8905     comm->load_pme = 0;
8906 }
8907
8908 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8909 {
8910     gmx_domdec_comm_t *comm;
8911     int ddnat;
8912     double av;
8913    
8914     comm = cr->dd->comm;
8915     
8916     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8917     
8918     if (fplog == NULL)
8919     {
8920         return;
8921     }
8922     
8923     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8924             
8925     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8926     {
8927         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8928         switch(ddnat)
8929         {
8930         case ddnatZONE:
8931             fprintf(fplog,
8932                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8933                     2,av);
8934             break;
8935         case ddnatVSITE:
8936             if (cr->dd->vsite_comm)
8937             {
8938                 fprintf(fplog,
8939                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8940                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8941                         av);
8942             }
8943             break;
8944         case ddnatCON:
8945             if (cr->dd->constraint_comm)
8946             {
8947                 fprintf(fplog,
8948                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
8949                         1 + ir->nLincsIter,av);
8950             }
8951             break;
8952         default:
8953             gmx_incons(" Unknown type for DD statistics");
8954         }
8955     }
8956     fprintf(fplog,"\n");
8957     
8958     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8959     {
8960         print_dd_load_av(fplog,cr->dd);
8961     }
8962 }
8963
8964 void dd_partition_system(FILE            *fplog,
8965                          gmx_large_int_t      step,
8966                          t_commrec       *cr,
8967                          gmx_bool            bMasterState,
8968                          int             nstglobalcomm,
8969                          t_state         *state_global,
8970                          gmx_mtop_t      *top_global,
8971                          t_inputrec      *ir,
8972                          t_state         *state_local,
8973                          rvec            **f,
8974                          t_mdatoms       *mdatoms,
8975                          gmx_localtop_t  *top_local,
8976                          t_forcerec      *fr,
8977                          gmx_vsite_t     *vsite,
8978                          gmx_shellfc_t   shellfc,
8979                          gmx_constr_t    constr,
8980                          t_nrnb          *nrnb,
8981                          gmx_wallcycle_t wcycle,
8982                          gmx_bool            bVerbose)
8983 {
8984     gmx_domdec_t *dd;
8985     gmx_domdec_comm_t *comm;
8986     gmx_ddbox_t ddbox={0};
8987     t_block *cgs_gl;
8988     gmx_large_int_t step_pcoupl;
8989     rvec cell_ns_x0,cell_ns_x1;
8990     int  i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum;
8991     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8992     gmx_bool bRedist,bSortCG,bResortAll;
8993     ivec ncells_old={0,0,0},ncells_new={0,0,0},np;
8994     real grid_density;
8995     char sbuf[22];
8996         
8997     dd = cr->dd;
8998     comm = dd->comm;
8999
9000     bBoxChanged = (bMasterState || DEFORM(*ir));
9001     if (ir->epc != epcNO)
9002     {
9003         /* With nstpcouple > 1 pressure coupling happens.
9004          * one step after calculating the pressure.
9005          * Box scaling happens at the end of the MD step,
9006          * after the DD partitioning.
9007          * We therefore have to do DLB in the first partitioning
9008          * after an MD step where P-coupling occured.
9009          * We need to determine the last step in which p-coupling occurred.
9010          * MRS -- need to validate this for vv?
9011          */
9012         n = ir->nstpcouple;
9013         if (n == 1)
9014         {
9015             step_pcoupl = step - 1;
9016         }
9017         else
9018         {
9019             step_pcoupl = ((step - 1)/n)*n + 1;
9020         }
9021         if (step_pcoupl >= comm->partition_step)
9022         {
9023             bBoxChanged = TRUE;
9024         }
9025     }
9026
9027     bNStGlobalComm = (step % nstglobalcomm == 0);
9028
9029     if (!comm->bDynLoadBal)
9030     {
9031         bDoDLB = FALSE;
9032     }
9033     else
9034     {
9035         /* Should we do dynamic load balacing this step?
9036          * Since it requires (possibly expensive) global communication,
9037          * we might want to do DLB less frequently.
9038          */
9039         if (bBoxChanged || ir->epc != epcNO)
9040         {
9041             bDoDLB = bBoxChanged;
9042         }
9043         else
9044         {
9045             bDoDLB = bNStGlobalComm;
9046         }
9047     }
9048
9049     /* Check if we have recorded loads on the nodes */
9050     if (comm->bRecordLoad && dd_load_count(comm))
9051     {
9052         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9053         {
9054             /* Check if we should use DLB at the second partitioning
9055              * and every 100 partitionings,
9056              * so the extra communication cost is negligible.
9057              */
9058             n = max(100,nstglobalcomm);
9059             bCheckDLB = (comm->n_load_collect == 0 ||
9060                          comm->n_load_have % n == n-1);
9061         }
9062         else
9063         {
9064             bCheckDLB = FALSE;
9065         }
9066         
9067         /* Print load every nstlog, first and last step to the log file */
9068         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9069                     comm->n_load_collect == 0 ||
9070                     (ir->nsteps >= 0 &&
9071                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9072
9073         /* Avoid extra communication due to verbose screen output
9074          * when nstglobalcomm is set.
9075          */
9076         if (bDoDLB || bLogLoad || bCheckDLB ||
9077             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9078         {
9079             get_load_distribution(dd,wcycle);
9080             if (DDMASTER(dd))
9081             {
9082                 if (bLogLoad)
9083                 {
9084                     dd_print_load(fplog,dd,step-1);
9085                 }
9086                 if (bVerbose)
9087                 {
9088                     dd_print_load_verbose(dd);
9089                 }
9090             }
9091             comm->n_load_collect++;
9092
9093             if (bCheckDLB) {
9094                 /* Since the timings are node dependent, the master decides */
9095                 if (DDMASTER(dd))
9096                 {
9097                     bTurnOnDLB =
9098                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9099                     if (debug)
9100                     {
9101                         fprintf(debug,"step %s, imb loss %f\n",
9102                                 gmx_step_str(step,sbuf),
9103                                 dd_force_imb_perf_loss(dd));
9104                     }
9105                 }
9106                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
9107                 if (bTurnOnDLB)
9108                 {
9109                     turn_on_dlb(fplog,cr,step);
9110                     bDoDLB = TRUE;
9111                 }
9112             }
9113         }
9114         comm->n_load_have++;
9115     }
9116
9117     cgs_gl = &comm->cgs_gl;
9118
9119     bRedist = FALSE;
9120     if (bMasterState)
9121     {
9122         /* Clear the old state */
9123         clear_dd_indices(dd,0,0);
9124
9125         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
9126                   TRUE,cgs_gl,state_global->x,&ddbox);
9127     
9128         get_cg_distribution(fplog,step,dd,cgs_gl,
9129                             state_global->box,&ddbox,state_global->x);
9130         
9131         dd_distribute_state(dd,cgs_gl,
9132                             state_global,state_local,f);
9133         
9134         dd_make_local_cgs(dd,&top_local->cgs);
9135         
9136         /* Ensure that we have space for the new distribution */
9137         dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home);
9138
9139         if (fr->cutoff_scheme == ecutsGROUP)
9140         {
9141             calc_cgcm(fplog,0,dd->ncg_home,
9142                       &top_local->cgs,state_local->x,fr->cg_cm);
9143         }
9144         
9145         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9146         
9147         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9148
9149         cg0 = 0;
9150     }
9151     else if (state_local->ddp_count != dd->ddp_count)
9152     {
9153         if (state_local->ddp_count > dd->ddp_count)
9154         {
9155             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
9156         }
9157         
9158         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9159         {
9160             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
9161         }
9162         
9163         /* Clear the old state */
9164         clear_dd_indices(dd,0,0);
9165         
9166         /* Build the new indices */
9167         rebuild_cgindex(dd,cgs_gl->index,state_local);
9168         make_dd_indices(dd,cgs_gl->index,0);
9169
9170         if (fr->cutoff_scheme == ecutsGROUP)
9171         {
9172             /* Redetermine the cg COMs */
9173             calc_cgcm(fplog,0,dd->ncg_home,
9174                       &top_local->cgs,state_local->x,fr->cg_cm);
9175         }
9176         
9177         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9178
9179         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9180
9181         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9182                   TRUE,&top_local->cgs,state_local->x,&ddbox);
9183
9184         bRedist = comm->bDynLoadBal;
9185     }
9186     else
9187     {
9188         /* We have the full state, only redistribute the cgs */
9189
9190         /* Clear the non-home indices */
9191         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
9192
9193         /* Avoid global communication for dim's without pbc and -gcom */
9194         if (!bNStGlobalComm)
9195         {
9196             copy_rvec(comm->box0    ,ddbox.box0    );
9197             copy_rvec(comm->box_size,ddbox.box_size);
9198         }
9199         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9200                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
9201
9202         bBoxChanged = TRUE;
9203         bRedist = TRUE;
9204     }
9205     /* For dim's without pbc and -gcom */
9206     copy_rvec(ddbox.box0    ,comm->box0    );
9207     copy_rvec(ddbox.box_size,comm->box_size);
9208     
9209     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
9210                       step,wcycle);
9211     
9212     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9213     {
9214         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
9215     }
9216     
9217     /* Check if we should sort the charge groups */
9218     if (comm->nstSortCG > 0)
9219     {
9220         bSortCG = (bMasterState ||
9221                    (bRedist && (step % comm->nstSortCG == 0)));
9222     }
9223     else
9224     {
9225         bSortCG = FALSE;
9226     }
9227
9228     ncg_home_old = dd->ncg_home;
9229
9230     ncg_moved = 0;
9231     if (bRedist)
9232     {
9233         wallcycle_sub_start(wcycle,ewcsDD_REDIST);
9234
9235         dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
9236                            state_local,f,fr,mdatoms,
9237                            !bSortCG,nrnb,&cg0,&ncg_moved);
9238
9239         wallcycle_sub_stop(wcycle,ewcsDD_REDIST);
9240     }
9241     
9242     get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box,
9243                           dd,&ddbox,
9244                           &comm->cell_x0,&comm->cell_x1,
9245                           dd->ncg_home,fr->cg_cm,
9246                           cell_ns_x0,cell_ns_x1,&grid_density);
9247
9248     if (bBoxChanged)
9249     {
9250         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
9251     }
9252
9253     switch (fr->cutoff_scheme)
9254     {
9255     case ecutsGROUP:
9256         copy_ivec(fr->ns.grid->n,ncells_old);
9257         grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
9258                    state_local->box,cell_ns_x0,cell_ns_x1,
9259                    fr->rlistlong,grid_density);
9260         break;
9261     case ecutsVERLET:
9262         nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]);
9263         break;
9264     default:
9265         gmx_incons("unimplemented");
9266     }
9267     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9268     copy_ivec(ddbox.tric_dir,comm->tric_dir);
9269
9270     if (bSortCG)
9271     {
9272         wallcycle_sub_start(wcycle,ewcsDD_GRID);
9273
9274         /* Sort the state on charge group position.
9275          * This enables exact restarts from this step.
9276          * It also improves performance by about 15% with larger numbers
9277          * of atoms per node.
9278          */
9279         
9280         /* Fill the ns grid with the home cell,
9281          * so we can sort with the indices.
9282          */
9283         set_zones_ncg_home(dd);
9284
9285         switch (fr->cutoff_scheme)
9286         {
9287         case ecutsVERLET:
9288             set_zones_size(dd,state_local->box,&ddbox,0,1);
9289
9290             nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box,
9291                               0,
9292                               comm->zones.size[0].bb_x0,
9293                               comm->zones.size[0].bb_x1,
9294                               0,dd->ncg_home,
9295                               comm->zones.dens_zone0,
9296                               fr->cginfo,
9297                               state_local->x,
9298                               ncg_moved,comm->moved,
9299                               fr->nbv->grp[eintLocal].kernel_type,
9300                               fr->nbv->grp[eintLocal].nbat);
9301
9302             nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]);
9303             break;
9304         case ecutsGROUP:
9305             fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
9306                       0,dd->ncg_home,fr->cg_cm);
9307             
9308             copy_ivec(fr->ns.grid->n,ncells_new);
9309             break;
9310         default:
9311             gmx_incons("unimplemented");
9312         }
9313
9314         bResortAll = bMasterState;
9315    
9316         /* Check if we can use the old order and ns grid cell indices
9317          * of the charge groups to sort the charge groups efficiently.
9318          */
9319         if (ncells_new[XX] != ncells_old[XX] ||
9320             ncells_new[YY] != ncells_old[YY] ||
9321             ncells_new[ZZ] != ncells_old[ZZ])
9322         {
9323             bResortAll = TRUE;
9324         }
9325
9326         if (debug)
9327         {
9328             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
9329                     gmx_step_str(step,sbuf),dd->ncg_home);
9330         }
9331         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
9332                       bResortAll ? -1 : ncg_home_old);
9333         /* Rebuild all the indices */
9334         cg0 = 0;
9335         ga2la_clear(dd->ga2la);
9336
9337         wallcycle_sub_stop(wcycle,ewcsDD_GRID);
9338     }
9339
9340     wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM);
9341     
9342     /* Set up the communication and communicate the coordinates */
9343     setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f);
9344     
9345     /* Set the indices */
9346     make_dd_indices(dd,cgs_gl->index,cg0);
9347
9348     /* Set the charge group boundaries for neighbor searching */
9349     set_cg_boundaries(&comm->zones);
9350
9351     if (fr->cutoff_scheme == ecutsVERLET)
9352     {
9353         set_zones_size(dd,state_local->box,&ddbox,
9354                        bSortCG ? 1 : 0,comm->zones.n);
9355     }
9356
9357     wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM);
9358
9359     /*
9360     write_dd_pdb("dd_home",step,"dump",top_global,cr,
9361                  -1,state_local->x,state_local->box);
9362     */
9363
9364     wallcycle_sub_start(wcycle,ewcsDD_MAKETOP);
9365     
9366     /* Extract a local topology from the global topology */
9367     for(i=0; i<dd->ndim; i++)
9368     {
9369         np[dd->dim[i]] = comm->cd[i].np;
9370     }
9371     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
9372                       comm->cellsize_min,np,
9373                       fr,
9374                       fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x,
9375                       vsite,top_global,top_local);
9376
9377     wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP);
9378
9379     wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR);
9380     
9381     /* Set up the special atom communication */
9382     n = comm->nat[ddnatZONE];
9383     for(i=ddnatZONE+1; i<ddnatNR; i++)
9384     {
9385         switch(i)
9386         {
9387         case ddnatVSITE:
9388             if (vsite && vsite->n_intercg_vsite)
9389             {
9390                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
9391             }
9392             break;
9393         case ddnatCON:
9394             if (dd->bInterCGcons || dd->bInterCGsettles)
9395             {
9396                 /* Only for inter-cg constraints we need special code */
9397                 n = dd_make_local_constraints(dd,n,top_global,fr->cginfo,
9398                                               constr,ir->nProjOrder,
9399                                               top_local->idef.il);
9400             }
9401             break;
9402         default:
9403             gmx_incons("Unknown special atom type setup");
9404         }
9405         comm->nat[i] = n;
9406     }
9407
9408     wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR);
9409
9410     wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER);
9411
9412     /* Make space for the extra coordinates for virtual site
9413      * or constraint communication.
9414      */
9415     state_local->natoms = comm->nat[ddnatNR-1];
9416     if (state_local->natoms > state_local->nalloc)
9417     {
9418         dd_realloc_state(state_local,f,state_local->natoms);
9419     }
9420
9421     if (fr->bF_NoVirSum)
9422     {
9423         if (vsite && vsite->n_intercg_vsite)
9424         {
9425             nat_f_novirsum = comm->nat[ddnatVSITE];
9426         }
9427         else
9428         {
9429             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9430             {
9431                 nat_f_novirsum = dd->nat_tot;
9432             }
9433             else
9434             {
9435                 nat_f_novirsum = dd->nat_home;
9436             }
9437         }
9438     }
9439     else
9440     {
9441         nat_f_novirsum = 0;
9442     }
9443
9444     /* Set the number of atoms required for the force calculation.
9445      * Forces need to be constrained when using a twin-range setup
9446      * or with energy minimization. For simple simulations we could
9447      * avoid some allocation, zeroing and copying, but this is
9448      * probably not worth the complications and checking.
9449      */
9450     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
9451                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
9452
9453     /* We make all the mdatoms up to nat_tot_con.
9454      * We could save some work by only setting invmass
9455      * between nat_tot and nat_tot_con.
9456      */
9457     /* This call also sets the new number of home particles to dd->nat_home */
9458     atoms2md(top_global,ir,
9459              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
9460
9461     /* Now we have the charges we can sort the FE interactions */
9462     dd_sort_local_top(dd,mdatoms,top_local);
9463
9464     if (shellfc)
9465     {
9466         /* Make the local shell stuff, currently no communication is done */
9467         make_local_shells(cr,mdatoms,shellfc);
9468     }
9469     
9470         if (ir->implicit_solvent)
9471     {
9472         make_local_gb(cr,fr->born,ir->gb_algorithm);
9473     }
9474
9475     init_bonded_thread_force_reduction(fr,&top_local->idef);
9476
9477     if (!(cr->duty & DUTY_PME))
9478     {
9479         /* Send the charges to our PME only node */
9480         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
9481                        mdatoms->chargeA,mdatoms->chargeB,
9482                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
9483     }
9484     
9485     if (constr)
9486     {
9487         set_constraints(constr,top_local,ir,mdatoms,cr);
9488     }
9489     
9490     if (ir->ePull != epullNO)
9491     {
9492         /* Update the local pull groups */
9493         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
9494     }
9495     
9496     if (ir->bRot)
9497     {
9498         /* Update the local rotation groups */
9499         dd_make_local_rotation_groups(dd,ir->rot);
9500     }
9501
9502
9503     add_dd_statistics(dd);
9504     
9505     /* Make sure we only count the cycles for this DD partitioning */
9506     clear_dd_cycle_counts(dd);
9507     
9508     /* Because the order of the atoms might have changed since
9509      * the last vsite construction, we need to communicate the constructing
9510      * atom coordinates again (for spreading the forces this MD step).
9511      */
9512     dd_move_x_vsites(dd,state_local->box,state_local->x);
9513
9514     wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER);
9515     
9516     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9517     {
9518         dd_move_x(dd,state_local->box,state_local->x);
9519         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
9520                      -1,state_local->x,state_local->box);
9521     }
9522
9523     /* Store the partitioning step */
9524     comm->partition_step = step;
9525     
9526     /* Increase the DD partitioning counter */
9527     dd->ddp_count++;
9528     /* The state currently matches this DD partitioning count, store it */
9529     state_local->ddp_count = dd->ddp_count;
9530     if (bMasterState)
9531     {
9532         /* The DD master node knows the complete cg distribution,
9533          * store the count so we can possibly skip the cg info communication.
9534          */
9535         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9536     }
9537
9538     if (comm->DD_debug > 0)
9539     {
9540         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9541         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
9542                                 "after partitioning");
9543     }
9544 }