added Verlet scheme and NxN non-bonded functionality
[alexxy/gromacs.git] / src / mdlib / domdec.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  * 
4  * This file is part of Gromacs        Copyright (c) 1991-2008
5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * To help us fund GROMACS development, we humbly ask that you cite
13  * the research papers on the package. Check out http://www.gromacs.org
14  * 
15  * And Hey:
16  * Gnomes, ROck Monsters And Chili Sauce
17  */
18
19 #ifdef HAVE_CONFIG_H
20 #include <config.h>
21 #endif
22
23 #include <stdio.h>
24 #include <time.h>
25 #include <math.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include "typedefs.h"
29 #include "smalloc.h"
30 #include "gmx_fatal.h"
31 #include "gmx_fatal_collective.h"
32 #include "vec.h"
33 #include "domdec.h"
34 #include "domdec_network.h"
35 #include "nrnb.h"
36 #include "pbc.h"
37 #include "chargegroup.h"
38 #include "constr.h"
39 #include "mdatoms.h"
40 #include "names.h"
41 #include "pdbio.h"
42 #include "futil.h"
43 #include "force.h"
44 #include "pme.h"
45 #include "pull.h"
46 #include "pull_rotation.h"
47 #include "gmx_wallcycle.h"
48 #include "mdrun.h"
49 #include "nsgrid.h"
50 #include "shellfc.h"
51 #include "mtop_util.h"
52 #include "gmxfio.h"
53 #include "gmx_ga2la.h"
54 #include "gmx_sort.h"
55 #include "nbnxn_search.h"
56 #include "bondf.h"
57 #include "gmx_omp_nthreads.h"
58
59 #ifdef GMX_LIB_MPI
60 #include <mpi.h>
61 #endif
62 #ifdef GMX_THREAD_MPI
63 #include "tmpi.h"
64 #endif
65
66 #define DDRANK(dd,rank)    (rank)
67 #define DDMASTERRANK(dd)   (dd->masterrank)
68
69 typedef struct gmx_domdec_master
70 {
71     /* The cell boundaries */
72     real **cell_x;
73     /* The global charge group division */
74     int  *ncg;     /* Number of home charge groups for each node */
75     int  *index;   /* Index of nnodes+1 into cg */
76     int  *cg;      /* Global charge group index */
77     int  *nat;     /* Number of home atoms for each node. */
78     int  *ibuf;    /* Buffer for communication */
79     rvec *vbuf;    /* Buffer for state scattering and gathering */
80 } gmx_domdec_master_t;
81
82 typedef struct
83 {
84     /* The numbers of charge groups to send and receive for each cell
85      * that requires communication, the last entry contains the total
86      * number of atoms that needs to be communicated.
87      */
88     int nsend[DD_MAXIZONE+2];
89     int nrecv[DD_MAXIZONE+2];
90     /* The charge groups to send */
91     int *index;
92     int nalloc;
93     /* The atom range for non-in-place communication */
94     int cell2at0[DD_MAXIZONE];
95     int cell2at1[DD_MAXIZONE];
96 } gmx_domdec_ind_t;
97
98 typedef struct
99 {
100     int  np;                   /* Number of grid pulses in this dimension */
101     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
102     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
103     int  np_nalloc;
104     gmx_bool bInPlace;             /* Can we communicate in place?            */
105 } gmx_domdec_comm_dim_t;
106
107 typedef struct
108 {
109     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
110     real *cell_f;      /* State var.: cell boundaries, box relative      */
111     real *old_cell_f;  /* Temp. var.: old cell size                      */
112     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
113     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
114     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
115     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
116     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
117     real *buf_ncd;     /* Temp. var.                                     */
118 } gmx_domdec_root_t;
119
120 #define DD_NLOAD_MAX 9
121
122 /* Here floats are accurate enough, since these variables
123  * only influence the load balancing, not the actual MD results.
124  */
125 typedef struct
126 {
127     int  nload;
128     float *load;
129     float sum;
130     float max;
131     float sum_m;
132     float cvol_min;
133     float mdf;
134     float pme;
135     int   flags;
136 } gmx_domdec_load_t;
137
138 typedef struct
139 {
140     int  nsc;
141     int  ind_gl;
142     int  ind;
143 } gmx_cgsort_t;
144
145 typedef struct
146 {
147     gmx_cgsort_t *sort;
148     gmx_cgsort_t *sort2;
149     int  sort_nalloc;
150     gmx_cgsort_t *sort_new;
151     int  sort_new_nalloc;
152     int  *ibuf;
153     int  ibuf_nalloc;
154 } gmx_domdec_sort_t;
155
156 typedef struct
157 {
158     rvec *v;
159     int  nalloc;
160 } vec_rvec_t;
161
162 /* This enum determines the order of the coordinates.
163  * ddnatHOME and ddnatZONE should be first and second,
164  * the others can be ordered as wanted.
165  */
166 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
167
168 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
169 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
170
171 typedef struct
172 {
173     int  dim;      /* The dimension                                          */
174     gmx_bool dim_match;/* Tells if DD and PME dims match                         */
175     int  nslab;    /* The number of PME slabs in this dimension              */
176     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
177     int  *pp_min;  /* The minimum pp node location, size nslab               */
178     int  *pp_max;  /* The maximum pp node location,size nslab                */
179     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
180 } gmx_ddpme_t;
181
182 typedef struct
183 {
184     real min0;    /* The minimum bottom of this zone                        */
185     real max1;    /* The maximum top of this zone                           */
186     real min1;    /* The minimum top of this zone                           */
187     real mch0;    /* The maximum bottom communicaton height for this zone   */
188     real mch1;    /* The maximum top communicaton height for this zone      */
189     real p1_0;    /* The bottom value of the first cell in this zone        */
190     real p1_1;    /* The top value of the first cell in this zone           */
191 } gmx_ddzone_t;
192
193 typedef struct
194 {
195     gmx_domdec_ind_t ind;
196     int *ibuf;
197     int ibuf_nalloc;
198     vec_rvec_t vbuf;
199     int nsend;
200     int nat;
201     int nsend_zone;
202 } dd_comm_setup_work_t;
203
204 typedef struct gmx_domdec_comm
205 {
206     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
207      * unless stated otherwise.
208      */
209
210     /* The number of decomposition dimensions for PME, 0: no PME */
211     int  npmedecompdim;
212     /* The number of nodes doing PME (PP/PME or only PME) */
213     int  npmenodes;
214     int  npmenodes_x;
215     int  npmenodes_y;
216     /* The communication setup including the PME only nodes */
217     gmx_bool bCartesianPP_PME;
218     ivec ntot;
219     int  cartpmedim;
220     int  *pmenodes;          /* size npmenodes                         */
221     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
222                               * but with bCartesianPP_PME              */
223     gmx_ddpme_t ddpme[2];
224     
225     /* The DD particle-particle nodes only */
226     gmx_bool bCartesianPP;
227     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
228     
229     /* The global charge groups */
230     t_block cgs_gl;
231
232     /* Should we sort the cgs */
233     int  nstSortCG;
234     gmx_domdec_sort_t *sort;
235     
236     /* Are there charge groups? */
237     gmx_bool bCGs;
238
239     /* Are there bonded and multi-body interactions between charge groups? */
240     gmx_bool bInterCGBondeds;
241     gmx_bool bInterCGMultiBody;
242
243     /* Data for the optional bonded interaction atom communication range */
244     gmx_bool bBondComm;
245     t_blocka *cglink;
246     char *bLocalCG;
247
248     /* The DLB option */
249     int  eDLB;
250     /* Are we actually using DLB? */
251     gmx_bool bDynLoadBal;
252
253     /* Cell sizes for static load balancing, first index cartesian */
254     real **slb_frac;
255     
256     /* The width of the communicated boundaries */
257     real cutoff_mbody;
258     real cutoff;
259     /* The minimum cell size (including triclinic correction) */
260     rvec cellsize_min;
261     /* For dlb, for use with edlbAUTO */
262     rvec cellsize_min_dlb;
263     /* The lower limit for the DD cell size with DLB */
264     real cellsize_limit;
265     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
266     gmx_bool bVacDLBNoLimit;
267
268     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
269     ivec tric_dir;
270     /* box0 and box_size are required with dim's without pbc and -gcom */
271     rvec box0;
272     rvec box_size;
273     
274     /* The cell boundaries */
275     rvec cell_x0;
276     rvec cell_x1;
277
278     /* The old location of the cell boundaries, to check cg displacements */
279     rvec old_cell_x0;
280     rvec old_cell_x1;
281
282     /* The communication setup and charge group boundaries for the zones */
283     gmx_domdec_zones_t zones;
284     
285     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
286      * cell boundaries of neighboring cells for dynamic load balancing.
287      */
288     gmx_ddzone_t zone_d1[2];
289     gmx_ddzone_t zone_d2[2][2];
290     
291     /* The coordinate/force communication setup and indices */
292     gmx_domdec_comm_dim_t cd[DIM];
293     /* The maximum number of cells to communicate with in one dimension */
294     int  maxpulse;
295     
296     /* Which cg distribution is stored on the master node */
297     int master_cg_ddp_count;
298     
299     /* The number of cg's received from the direct neighbors */
300     int  zone_ncg1[DD_MAXZONE];
301     
302     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
303     int  nat[ddnatNR];
304
305     /* Array for signalling if atoms have moved to another domain */
306     int  *moved;
307     int  moved_nalloc;
308     
309     /* Communication buffer for general use */
310     int  *buf_int;
311     int  nalloc_int;
312
313     /* Communication buffer for general use */
314     vec_rvec_t vbuf;
315
316     /* Temporary storage for thread parallel communication setup */
317     int nth;
318     dd_comm_setup_work_t *dth;
319
320     /* Communication buffers only used with multiple grid pulses */
321     int  *buf_int2;
322     int  nalloc_int2;
323     vec_rvec_t vbuf2;
324     
325     /* Communication buffers for local redistribution */
326     int  **cggl_flag;
327     int  cggl_flag_nalloc[DIM*2];
328     rvec **cgcm_state;
329     int  cgcm_state_nalloc[DIM*2];
330     
331     /* Cell sizes for dynamic load balancing */
332     gmx_domdec_root_t **root;
333     real *cell_f_row;
334     real cell_f0[DIM];
335     real cell_f1[DIM];
336     real cell_f_max0[DIM];
337     real cell_f_min1[DIM];
338     
339     /* Stuff for load communication */
340     gmx_bool bRecordLoad;
341     gmx_domdec_load_t *load;
342 #ifdef GMX_MPI
343     MPI_Comm *mpi_comm_load;
344 #endif
345
346     /* Maximum DLB scaling per load balancing step in percent */
347     int dlb_scale_lim;
348
349     /* Cycle counters */
350     float cycl[ddCyclNr];
351     int   cycl_n[ddCyclNr];
352     float cycl_max[ddCyclNr];
353     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
354     int eFlop;
355     double flop;
356     int    flop_n;
357     /* Have often have did we have load measurements */
358     int    n_load_have;
359     /* Have often have we collected the load measurements */
360     int    n_load_collect;
361     
362     /* Statistics */
363     double sum_nat[ddnatNR-ddnatZONE];
364     int    ndecomp;
365     int    nload;
366     double load_step;
367     double load_sum;
368     double load_max;
369     ivec   load_lim;
370     double load_mdf;
371     double load_pme;
372
373     /* The last partition step */
374     gmx_large_int_t partition_step;
375
376     /* Debugging */
377     int  nstDDDump;
378     int  nstDDDumpGrid;
379     int  DD_debug;
380 } gmx_domdec_comm_t;
381
382 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
383 #define DD_CGIBS 2
384
385 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
386 #define DD_FLAG_NRCG  65535
387 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
388 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
389
390 /* Zone permutation required to obtain consecutive charge groups
391  * for neighbor searching.
392  */
393 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
394
395 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
396  * components see only j zones with that component 0.
397  */
398
399 /* The DD zone order */
400 static const ivec dd_zo[DD_MAXZONE] =
401   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
402
403 /* The 3D setup */
404 #define dd_z3n  8
405 #define dd_zp3n 4
406 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
407
408 /* The 2D setup */
409 #define dd_z2n  4
410 #define dd_zp2n 2
411 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
412
413 /* The 1D setup */
414 #define dd_z1n  2
415 #define dd_zp1n 1
416 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
417
418 /* Factors used to avoid problems due to rounding issues */
419 #define DD_CELL_MARGIN       1.0001
420 #define DD_CELL_MARGIN2      1.00005
421 /* Factor to account for pressure scaling during nstlist steps */
422 #define DD_PRES_SCALE_MARGIN 1.02
423
424 /* Allowed performance loss before we DLB or warn */
425 #define DD_PERF_LOSS 0.05
426
427 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
428
429 /* Use separate MPI send and receive commands
430  * when nnodes <= GMX_DD_NNODES_SENDRECV.
431  * This saves memory (and some copying for small nnodes).
432  * For high parallelization scatter and gather calls are used.
433  */
434 #define GMX_DD_NNODES_SENDRECV 4
435
436
437 /*
438 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
439
440 static void index2xyz(ivec nc,int ind,ivec xyz)
441 {
442   xyz[XX] = ind % nc[XX];
443   xyz[YY] = (ind / nc[XX]) % nc[YY];
444   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
445 }
446 */
447
448 /* This order is required to minimize the coordinate communication in PME
449  * which uses decomposition in the x direction.
450  */
451 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
452
453 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
454 {
455     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
456     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
457     xyz[ZZ] = ind % nc[ZZ];
458 }
459
460 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
461 {
462     int ddindex;
463     int ddnodeid=-1;
464     
465     ddindex = dd_index(dd->nc,c);
466     if (dd->comm->bCartesianPP_PME)
467     {
468         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
469     }
470     else if (dd->comm->bCartesianPP)
471     {
472 #ifdef GMX_MPI
473         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
474 #endif
475     }
476     else
477     {
478         ddnodeid = ddindex;
479     }
480     
481     return ddnodeid;
482 }
483
484 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
485 {
486     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
487 }
488
489 int ddglatnr(gmx_domdec_t *dd,int i)
490 {
491     int atnr;
492     
493     if (dd == NULL)
494     {
495         atnr = i + 1;
496     }
497     else
498     {
499         if (i >= dd->comm->nat[ddnatNR-1])
500         {
501             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
502         }
503         atnr = dd->gatindex[i] + 1;
504     }
505     
506     return atnr;
507 }
508
509 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
510 {
511     return &dd->comm->cgs_gl;
512 }
513
514 static void vec_rvec_init(vec_rvec_t *v)
515 {
516     v->nalloc = 0;
517     v->v      = NULL;
518 }
519
520 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
521 {
522     if (n > v->nalloc)
523     {
524         v->nalloc = over_alloc_dd(n);
525         srenew(v->v,v->nalloc);
526     }
527 }
528
529 void dd_store_state(gmx_domdec_t *dd,t_state *state)
530 {
531     int i;
532     
533     if (state->ddp_count != dd->ddp_count)
534     {
535         gmx_incons("The state does not the domain decomposition state");
536     }
537     
538     state->ncg_gl = dd->ncg_home;
539     if (state->ncg_gl > state->cg_gl_nalloc)
540     {
541         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
542         srenew(state->cg_gl,state->cg_gl_nalloc);
543     }
544     for(i=0; i<state->ncg_gl; i++)
545     {
546         state->cg_gl[i] = dd->index_gl[i];
547     }
548     
549     state->ddp_count_cg_gl = dd->ddp_count;
550 }
551
552 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
553 {
554     return &dd->comm->zones;
555 }
556
557 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
558                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
559 {
560     gmx_domdec_zones_t *zones;
561     int izone,d,dim;
562
563     zones = &dd->comm->zones;
564
565     izone = 0;
566     while (icg >= zones->izone[izone].cg1)
567     {
568         izone++;
569     }
570     
571     if (izone == 0)
572     {
573         *jcg0 = icg;
574     }
575     else if (izone < zones->nizone)
576     {
577         *jcg0 = zones->izone[izone].jcg0;
578     }
579     else
580     {
581         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
582                   icg,izone,zones->nizone);
583     }
584         
585     *jcg1 = zones->izone[izone].jcg1;
586     
587     for(d=0; d<dd->ndim; d++)
588     {
589         dim = dd->dim[d];
590         shift0[dim] = zones->izone[izone].shift0[dim];
591         shift1[dim] = zones->izone[izone].shift1[dim];
592         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
593         {
594             /* A conservative approach, this can be optimized */
595             shift0[dim] -= 1;
596             shift1[dim] += 1;
597         }
598     }
599 }
600
601 int dd_natoms_vsite(gmx_domdec_t *dd)
602 {
603     return dd->comm->nat[ddnatVSITE];
604 }
605
606 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
607 {
608     *at_start = dd->comm->nat[ddnatCON-1];
609     *at_end   = dd->comm->nat[ddnatCON];
610 }
611
612 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
613 {
614     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
615     int  *index,*cgindex;
616     gmx_domdec_comm_t *comm;
617     gmx_domdec_comm_dim_t *cd;
618     gmx_domdec_ind_t *ind;
619     rvec shift={0,0,0},*buf,*rbuf;
620     gmx_bool bPBC,bScrew;
621     
622     comm = dd->comm;
623     
624     cgindex = dd->cgindex;
625     
626     buf = comm->vbuf.v;
627
628     nzone = 1;
629     nat_tot = dd->nat_home;
630     for(d=0; d<dd->ndim; d++)
631     {
632         bPBC   = (dd->ci[dd->dim[d]] == 0);
633         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
634         if (bPBC)
635         {
636             copy_rvec(box[dd->dim[d]],shift);
637         }
638         cd = &comm->cd[d];
639         for(p=0; p<cd->np; p++)
640         {
641             ind = &cd->ind[p];
642             index = ind->index;
643             n = 0;
644             if (!bPBC)
645             {
646                 for(i=0; i<ind->nsend[nzone]; i++)
647                 {
648                     at0 = cgindex[index[i]];
649                     at1 = cgindex[index[i]+1];
650                     for(j=at0; j<at1; j++)
651                     {
652                         copy_rvec(x[j],buf[n]);
653                         n++;
654                     }
655                 }
656             }
657             else if (!bScrew)
658             {
659                 for(i=0; i<ind->nsend[nzone]; i++)
660                 {
661                     at0 = cgindex[index[i]];
662                     at1 = cgindex[index[i]+1];
663                     for(j=at0; j<at1; j++)
664                     {
665                         /* We need to shift the coordinates */
666                         rvec_add(x[j],shift,buf[n]);
667                         n++;
668                     }
669                 }
670             }
671             else
672             {
673                 for(i=0; i<ind->nsend[nzone]; i++)
674                 {
675                     at0 = cgindex[index[i]];
676                     at1 = cgindex[index[i]+1];
677                     for(j=at0; j<at1; j++)
678                     {
679                         /* Shift x */
680                         buf[n][XX] = x[j][XX] + shift[XX];
681                         /* Rotate y and z.
682                          * This operation requires a special shift force
683                          * treatment, which is performed in calc_vir.
684                          */
685                         buf[n][YY] = box[YY][YY] - x[j][YY];
686                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
687                         n++;
688                     }
689                 }
690             }
691             
692             if (cd->bInPlace)
693             {
694                 rbuf = x + nat_tot;
695             }
696             else
697             {
698                 rbuf = comm->vbuf2.v;
699             }
700             /* Send and receive the coordinates */
701             dd_sendrecv_rvec(dd, d, dddirBackward,
702                              buf,  ind->nsend[nzone+1],
703                              rbuf, ind->nrecv[nzone+1]);
704             if (!cd->bInPlace)
705             {
706                 j = 0;
707                 for(zone=0; zone<nzone; zone++)
708                 {
709                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
710                     {
711                         copy_rvec(rbuf[j],x[i]);
712                         j++;
713                     }
714                 }
715             }
716             nat_tot += ind->nrecv[nzone+1];
717         }
718         nzone += nzone;
719     }
720 }
721
722 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
723 {
724     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
725     int  *index,*cgindex;
726     gmx_domdec_comm_t *comm;
727     gmx_domdec_comm_dim_t *cd;
728     gmx_domdec_ind_t *ind;
729     rvec *buf,*sbuf;
730     ivec vis;
731     int  is;
732     gmx_bool bPBC,bScrew;
733     
734     comm = dd->comm;
735     
736     cgindex = dd->cgindex;
737
738     buf = comm->vbuf.v;
739
740     n = 0;
741     nzone = comm->zones.n/2;
742     nat_tot = dd->nat_tot;
743     for(d=dd->ndim-1; d>=0; d--)
744     {
745         bPBC   = (dd->ci[dd->dim[d]] == 0);
746         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
747         if (fshift == NULL && !bScrew)
748         {
749             bPBC = FALSE;
750         }
751         /* Determine which shift vector we need */
752         clear_ivec(vis);
753         vis[dd->dim[d]] = 1;
754         is = IVEC2IS(vis);
755         
756         cd = &comm->cd[d];
757         for(p=cd->np-1; p>=0; p--) {
758             ind = &cd->ind[p];
759             nat_tot -= ind->nrecv[nzone+1];
760             if (cd->bInPlace)
761             {
762                 sbuf = f + nat_tot;
763             }
764             else
765             {
766                 sbuf = comm->vbuf2.v;
767                 j = 0;
768                 for(zone=0; zone<nzone; zone++)
769                 {
770                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
771                     {
772                         copy_rvec(f[i],sbuf[j]);
773                         j++;
774                     }
775                 }
776             }
777             /* Communicate the forces */
778             dd_sendrecv_rvec(dd, d, dddirForward,
779                              sbuf, ind->nrecv[nzone+1],
780                              buf,  ind->nsend[nzone+1]);
781             index = ind->index;
782             /* Add the received forces */
783             n = 0;
784             if (!bPBC)
785             {
786                 for(i=0; i<ind->nsend[nzone]; i++)
787                 {
788                     at0 = cgindex[index[i]];
789                     at1 = cgindex[index[i]+1];
790                     for(j=at0; j<at1; j++)
791                     {
792                         rvec_inc(f[j],buf[n]);
793                         n++;
794                     }
795                 } 
796             }
797             else if (!bScrew)
798             {
799                 for(i=0; i<ind->nsend[nzone]; i++)
800                 {
801                     at0 = cgindex[index[i]];
802                     at1 = cgindex[index[i]+1];
803                     for(j=at0; j<at1; j++)
804                     {
805                         rvec_inc(f[j],buf[n]);
806                         /* Add this force to the shift force */
807                         rvec_inc(fshift[is],buf[n]);
808                         n++;
809                     }
810                 }
811             }
812             else
813             {
814                 for(i=0; i<ind->nsend[nzone]; i++)
815                 {
816                     at0 = cgindex[index[i]];
817                     at1 = cgindex[index[i]+1];
818                     for(j=at0; j<at1; j++)
819                     {
820                         /* Rotate the force */
821                         f[j][XX] += buf[n][XX];
822                         f[j][YY] -= buf[n][YY];
823                         f[j][ZZ] -= buf[n][ZZ];
824                         if (fshift)
825                         {
826                             /* Add this force to the shift force */
827                             rvec_inc(fshift[is],buf[n]);
828                         }
829                         n++;
830                     }
831                 }
832             }
833         }
834         nzone /= 2;
835     }
836 }
837
838 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
839 {
840     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
841     int  *index,*cgindex;
842     gmx_domdec_comm_t *comm;
843     gmx_domdec_comm_dim_t *cd;
844     gmx_domdec_ind_t *ind;
845     real *buf,*rbuf;
846     
847     comm = dd->comm;
848     
849     cgindex = dd->cgindex;
850     
851     buf = &comm->vbuf.v[0][0];
852
853     nzone = 1;
854     nat_tot = dd->nat_home;
855     for(d=0; d<dd->ndim; d++)
856     {
857         cd = &comm->cd[d];
858         for(p=0; p<cd->np; p++)
859         {
860             ind = &cd->ind[p];
861             index = ind->index;
862             n = 0;
863             for(i=0; i<ind->nsend[nzone]; i++)
864             {
865                 at0 = cgindex[index[i]];
866                 at1 = cgindex[index[i]+1];
867                 for(j=at0; j<at1; j++)
868                 {
869                     buf[n] = v[j];
870                     n++;
871                 }
872             }
873             
874             if (cd->bInPlace)
875             {
876                 rbuf = v + nat_tot;
877             }
878             else
879             {
880                 rbuf = &comm->vbuf2.v[0][0];
881             }
882             /* Send and receive the coordinates */
883             dd_sendrecv_real(dd, d, dddirBackward,
884                              buf,  ind->nsend[nzone+1],
885                              rbuf, ind->nrecv[nzone+1]);
886             if (!cd->bInPlace)
887             {
888                 j = 0;
889                 for(zone=0; zone<nzone; zone++)
890                 {
891                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
892                     {
893                         v[i] = rbuf[j];
894                         j++;
895                     }
896                 }
897             }
898             nat_tot += ind->nrecv[nzone+1];
899         }
900         nzone += nzone;
901     }
902 }
903
904 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
905 {
906     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
907     int  *index,*cgindex;
908     gmx_domdec_comm_t *comm;
909     gmx_domdec_comm_dim_t *cd;
910     gmx_domdec_ind_t *ind;
911     real *buf,*sbuf;
912     
913     comm = dd->comm;
914     
915     cgindex = dd->cgindex;
916
917     buf = &comm->vbuf.v[0][0];
918
919     n = 0;
920     nzone = comm->zones.n/2;
921     nat_tot = dd->nat_tot;
922     for(d=dd->ndim-1; d>=0; d--)
923     {
924         cd = &comm->cd[d];
925         for(p=cd->np-1; p>=0; p--) {
926             ind = &cd->ind[p];
927             nat_tot -= ind->nrecv[nzone+1];
928             if (cd->bInPlace)
929             {
930                 sbuf = v + nat_tot;
931             }
932             else
933             {
934                 sbuf = &comm->vbuf2.v[0][0];
935                 j = 0;
936                 for(zone=0; zone<nzone; zone++)
937                 {
938                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
939                     {
940                         sbuf[j] = v[i];
941                         j++;
942                     }
943                 }
944             }
945             /* Communicate the forces */
946             dd_sendrecv_real(dd, d, dddirForward,
947                              sbuf, ind->nrecv[nzone+1],
948                              buf,  ind->nsend[nzone+1]);
949             index = ind->index;
950             /* Add the received forces */
951             n = 0;
952             for(i=0; i<ind->nsend[nzone]; i++)
953             {
954                 at0 = cgindex[index[i]];
955                 at1 = cgindex[index[i]+1];
956                 for(j=at0; j<at1; j++)
957                 {
958                     v[j] += buf[n];
959                     n++;
960                 }
961             } 
962         }
963         nzone /= 2;
964     }
965 }
966
967 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
968 {
969     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
970             d,i,j,
971             zone->min0,zone->max1,
972             zone->mch0,zone->mch0,
973             zone->p1_0,zone->p1_1);
974 }
975
976
977 #define DDZONECOMM_MAXZONE  5
978 #define DDZONECOMM_BUFSIZE  3
979
980 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
981                                int ddimind,int direction,
982                                gmx_ddzone_t *buf_s,int n_s,
983                                gmx_ddzone_t *buf_r,int n_r)
984 {
985 #define ZBS  DDZONECOMM_BUFSIZE
986     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
987     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
988     int i;
989
990     for(i=0; i<n_s; i++)
991     {
992         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
993         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
994         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
995         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
996         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
997         vbuf_s[i*ZBS+1][2] = 0;
998         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
999         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1000         vbuf_s[i*ZBS+2][2] = 0;
1001     }
1002
1003     dd_sendrecv_rvec(dd, ddimind, direction,
1004                      vbuf_s, n_s*ZBS,
1005                      vbuf_r, n_r*ZBS);
1006
1007     for(i=0; i<n_r; i++)
1008     {
1009         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1010         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1011         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1012         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1013         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1014         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1015         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1016     }
1017
1018 #undef ZBS
1019 }
1020
1021 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
1022                           rvec cell_ns_x0,rvec cell_ns_x1)
1023 {
1024     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
1025     gmx_ddzone_t *zp;
1026     gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
1027     gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
1028     gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
1029     rvec extr_s[2],extr_r[2];
1030     rvec dh;
1031     real dist_d,c=0,det;
1032     gmx_domdec_comm_t *comm;
1033     gmx_bool bPBC,bUse;
1034
1035     comm = dd->comm;
1036
1037     for(d=1; d<dd->ndim; d++)
1038     {
1039         dim = dd->dim[d];
1040         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1041         zp->min0 = cell_ns_x0[dim];
1042         zp->max1 = cell_ns_x1[dim];
1043         zp->min1 = cell_ns_x1[dim];
1044         zp->mch0 = cell_ns_x0[dim];
1045         zp->mch1 = cell_ns_x1[dim];
1046         zp->p1_0 = cell_ns_x0[dim];
1047         zp->p1_1 = cell_ns_x1[dim];
1048     }
1049     
1050     for(d=dd->ndim-2; d>=0; d--)
1051     {
1052         dim  = dd->dim[d];
1053         bPBC = (dim < ddbox->npbcdim);
1054
1055         /* Use an rvec to store two reals */
1056         extr_s[d][0] = comm->cell_f0[d+1];
1057         extr_s[d][1] = comm->cell_f1[d+1];
1058         extr_s[d][2] = comm->cell_f1[d+1];
1059
1060         pos = 0;
1061         /* Store the extremes in the backward sending buffer,
1062          * so the get updated separately from the forward communication.
1063          */
1064         for(d1=d; d1<dd->ndim-1; d1++)
1065         {
1066             /* We invert the order to be able to use the same loop for buf_e */
1067             buf_s[pos].min0 = extr_s[d1][1];
1068             buf_s[pos].max1 = extr_s[d1][0];
1069             buf_s[pos].min1 = extr_s[d1][2];
1070             buf_s[pos].mch0 = 0;
1071             buf_s[pos].mch1 = 0;
1072             /* Store the cell corner of the dimension we communicate along */
1073             buf_s[pos].p1_0 = comm->cell_x0[dim];
1074             buf_s[pos].p1_1 = 0;
1075             pos++;
1076         }
1077
1078         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1079         pos++;
1080
1081         if (dd->ndim == 3 && d == 0)
1082         {
1083             buf_s[pos] = comm->zone_d2[0][1];
1084             pos++;
1085             buf_s[pos] = comm->zone_d1[0];
1086             pos++;
1087         }
1088
1089         /* We only need to communicate the extremes
1090          * in the forward direction
1091          */
1092         npulse = comm->cd[d].np;
1093         if (bPBC)
1094         {
1095             /* Take the minimum to avoid double communication */
1096             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1097         }
1098         else
1099         {
1100             /* Without PBC we should really not communicate over
1101              * the boundaries, but implementing that complicates
1102              * the communication setup and therefore we simply
1103              * do all communication, but ignore some data.
1104              */
1105             npulse_min = npulse;
1106         }
1107         for(p=0; p<npulse_min; p++)
1108         {
1109             /* Communicate the extremes forward */
1110             bUse = (bPBC || dd->ci[dim] > 0);
1111
1112             dd_sendrecv_rvec(dd, d, dddirForward,
1113                              extr_s+d, dd->ndim-d-1,
1114                              extr_r+d, dd->ndim-d-1);
1115
1116             if (bUse)
1117             {
1118                 for(d1=d; d1<dd->ndim-1; d1++)
1119                 {
1120                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1121                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1122                     extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]);
1123                 }
1124             }
1125         }
1126
1127         buf_size = pos;
1128         for(p=0; p<npulse; p++)
1129         {
1130             /* Communicate all the zone information backward */
1131             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1132
1133             dd_sendrecv_ddzone(dd, d, dddirBackward,
1134                                buf_s, buf_size,
1135                                buf_r, buf_size);
1136
1137             clear_rvec(dh);
1138             if (p > 0)
1139             {
1140                 for(d1=d+1; d1<dd->ndim; d1++)
1141                 {
1142                     /* Determine the decrease of maximum required
1143                      * communication height along d1 due to the distance along d,
1144                      * this avoids a lot of useless atom communication.
1145                      */
1146                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1147
1148                     if (ddbox->tric_dir[dim])
1149                     {
1150                         /* c is the off-diagonal coupling between the cell planes
1151                          * along directions d and d1.
1152                          */
1153                         c = ddbox->v[dim][dd->dim[d1]][dim];
1154                     }
1155                     else
1156                     {
1157                         c = 0;
1158                     }
1159                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1160                     if (det > 0)
1161                     {
1162                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1163                     }
1164                     else
1165                     {
1166                         /* A negative value signals out of range */
1167                         dh[d1] = -1;
1168                     }
1169                 }
1170             }
1171
1172             /* Accumulate the extremes over all pulses */
1173             for(i=0; i<buf_size; i++)
1174             {
1175                 if (p == 0)
1176                 {
1177                     buf_e[i] = buf_r[i];
1178                 }
1179                 else
1180                 {
1181                     if (bUse)
1182                     {
1183                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1184                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1185                         buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1);
1186                     }
1187
1188                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1189                     {
1190                         d1 = 1;
1191                     }
1192                     else
1193                     {
1194                         d1 = d + 1;
1195                     }
1196                     if (bUse && dh[d1] >= 0)
1197                     {
1198                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1199                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1200                     }
1201                 }
1202                 /* Copy the received buffer to the send buffer,
1203                  * to pass the data through with the next pulse.
1204                  */
1205                 buf_s[i] = buf_r[i];
1206             }
1207             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1208                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1209             {
1210                 /* Store the extremes */ 
1211                 pos = 0;
1212
1213                 for(d1=d; d1<dd->ndim-1; d1++)
1214                 {
1215                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1216                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1217                     extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1);
1218                     pos++;
1219                 }
1220
1221                 if (d == 1 || (d == 0 && dd->ndim == 3))
1222                 {
1223                     for(i=d; i<2; i++)
1224                     {
1225                         comm->zone_d2[1-d][i] = buf_e[pos];
1226                         pos++;
1227                     }
1228                 }
1229                 if (d == 0)
1230                 {
1231                     comm->zone_d1[1] = buf_e[pos];
1232                     pos++;
1233                 }
1234             }
1235         }
1236     }
1237     
1238     if (dd->ndim >= 2)
1239     {
1240         dim = dd->dim[1];
1241         for(i=0; i<2; i++)
1242         {
1243             if (debug)
1244             {
1245                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1246             }
1247             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1248             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1249         }
1250     }
1251     if (dd->ndim >= 3)
1252     {
1253         dim = dd->dim[2];
1254         for(i=0; i<2; i++)
1255         {
1256             for(j=0; j<2; j++)
1257             {
1258                 if (debug)
1259                 {
1260                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1261                 }
1262                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1263                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1264             }
1265         }
1266     }
1267     for(d=1; d<dd->ndim; d++)
1268     {
1269         comm->cell_f_max0[d] = extr_s[d-1][0];
1270         comm->cell_f_min1[d] = extr_s[d-1][1];
1271         if (debug)
1272         {
1273             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1274                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1275         }
1276     }
1277 }
1278
1279 static void dd_collect_cg(gmx_domdec_t *dd,
1280                           t_state *state_local)
1281 {
1282     gmx_domdec_master_t *ma=NULL;
1283     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1284     t_block *cgs_gl;
1285
1286     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1287     {
1288         /* The master has the correct distribution */
1289         return;
1290     }
1291     
1292     if (state_local->ddp_count == dd->ddp_count)
1293     {
1294         ncg_home = dd->ncg_home;
1295         cg       = dd->index_gl;
1296         nat_home = dd->nat_home;
1297     } 
1298     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1299     {
1300         cgs_gl = &dd->comm->cgs_gl;
1301
1302         ncg_home = state_local->ncg_gl;
1303         cg       = state_local->cg_gl;
1304         nat_home = 0;
1305         for(i=0; i<ncg_home; i++)
1306         {
1307             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1308         }
1309     }
1310     else
1311     {
1312         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1313     }
1314     
1315     buf2[0] = dd->ncg_home;
1316     buf2[1] = dd->nat_home;
1317     if (DDMASTER(dd))
1318     {
1319         ma = dd->ma;
1320         ibuf = ma->ibuf;
1321     }
1322     else
1323     {
1324         ibuf = NULL;
1325     }
1326     /* Collect the charge group and atom counts on the master */
1327     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1328     
1329     if (DDMASTER(dd))
1330     {
1331         ma->index[0] = 0;
1332         for(i=0; i<dd->nnodes; i++)
1333         {
1334             ma->ncg[i] = ma->ibuf[2*i];
1335             ma->nat[i] = ma->ibuf[2*i+1];
1336             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1337             
1338         }
1339         /* Make byte counts and indices */
1340         for(i=0; i<dd->nnodes; i++)
1341         {
1342             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1343             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1344         }
1345         if (debug)
1346         {
1347             fprintf(debug,"Initial charge group distribution: ");
1348             for(i=0; i<dd->nnodes; i++)
1349                 fprintf(debug," %d",ma->ncg[i]);
1350             fprintf(debug,"\n");
1351         }
1352     }
1353     
1354     /* Collect the charge group indices on the master */
1355     dd_gatherv(dd,
1356                dd->ncg_home*sizeof(int),dd->index_gl,
1357                DDMASTER(dd) ? ma->ibuf : NULL,
1358                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1359                DDMASTER(dd) ? ma->cg : NULL);
1360     
1361     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1362 }
1363
1364 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1365                                     rvec *lv,rvec *v)
1366 {
1367     gmx_domdec_master_t *ma;
1368     int  n,i,c,a,nalloc=0;
1369     rvec *buf=NULL;
1370     t_block *cgs_gl;
1371
1372     ma = dd->ma;
1373     
1374     if (!DDMASTER(dd))
1375     {
1376 #ifdef GMX_MPI
1377         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1378                  dd->rank,dd->mpi_comm_all);
1379 #endif
1380     } else {
1381         /* Copy the master coordinates to the global array */
1382         cgs_gl = &dd->comm->cgs_gl;
1383
1384         n = DDMASTERRANK(dd);
1385         a = 0;
1386         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1387         {
1388             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1389             {
1390                 copy_rvec(lv[a++],v[c]);
1391             }
1392         }
1393         
1394         for(n=0; n<dd->nnodes; n++)
1395         {
1396             if (n != dd->rank)
1397             {
1398                 if (ma->nat[n] > nalloc)
1399                 {
1400                     nalloc = over_alloc_dd(ma->nat[n]);
1401                     srenew(buf,nalloc);
1402                 }
1403 #ifdef GMX_MPI
1404                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1405                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1406 #endif
1407                 a = 0;
1408                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1409                 {
1410                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1411                     {
1412                         copy_rvec(buf[a++],v[c]);
1413                     }
1414                 }
1415             }
1416         }
1417         sfree(buf);
1418     }
1419 }
1420
1421 static void get_commbuffer_counts(gmx_domdec_t *dd,
1422                                   int **counts,int **disps)
1423 {
1424     gmx_domdec_master_t *ma;
1425     int n;
1426
1427     ma = dd->ma;
1428     
1429     /* Make the rvec count and displacment arrays */
1430     *counts  = ma->ibuf;
1431     *disps   = ma->ibuf + dd->nnodes;
1432     for(n=0; n<dd->nnodes; n++)
1433     {
1434         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1435         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1436     }
1437 }
1438
1439 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1440                                    rvec *lv,rvec *v)
1441 {
1442     gmx_domdec_master_t *ma;
1443     int  *rcounts=NULL,*disps=NULL;
1444     int  n,i,c,a;
1445     rvec *buf=NULL;
1446     t_block *cgs_gl;
1447     
1448     ma = dd->ma;
1449     
1450     if (DDMASTER(dd))
1451     {
1452         get_commbuffer_counts(dd,&rcounts,&disps);
1453
1454         buf = ma->vbuf;
1455     }
1456     
1457     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1458
1459     if (DDMASTER(dd))
1460     {
1461         cgs_gl = &dd->comm->cgs_gl;
1462
1463         a = 0;
1464         for(n=0; n<dd->nnodes; n++)
1465         {
1466             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1467             {
1468                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1469                 {
1470                     copy_rvec(buf[a++],v[c]);
1471                 }
1472             }
1473         }
1474     }
1475 }
1476
1477 void dd_collect_vec(gmx_domdec_t *dd,
1478                     t_state *state_local,rvec *lv,rvec *v)
1479 {
1480     gmx_domdec_master_t *ma;
1481     int  n,i,c,a,nalloc=0;
1482     rvec *buf=NULL;
1483     
1484     dd_collect_cg(dd,state_local);
1485
1486     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1487     {
1488         dd_collect_vec_sendrecv(dd,lv,v);
1489     }
1490     else
1491     {
1492         dd_collect_vec_gatherv(dd,lv,v);
1493     }
1494 }
1495
1496
1497 void dd_collect_state(gmx_domdec_t *dd,
1498                       t_state *state_local,t_state *state)
1499 {
1500     int est,i,j,nh;
1501
1502     nh = state->nhchainlength;
1503
1504     if (DDMASTER(dd))
1505     {
1506         for (i=0;i<efptNR;i++) {
1507             state->lambda[i] = state_local->lambda[i];
1508         }
1509         state->fep_state = state_local->fep_state;
1510         state->veta = state_local->veta;
1511         state->vol0 = state_local->vol0;
1512         copy_mat(state_local->box,state->box);
1513         copy_mat(state_local->boxv,state->boxv);
1514         copy_mat(state_local->svir_prev,state->svir_prev);
1515         copy_mat(state_local->fvir_prev,state->fvir_prev);
1516         copy_mat(state_local->pres_prev,state->pres_prev);
1517
1518
1519         for(i=0; i<state_local->ngtc; i++)
1520         {
1521             for(j=0; j<nh; j++) {
1522                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1523                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1524             }
1525             state->therm_integral[i] = state_local->therm_integral[i];            
1526         }
1527         for(i=0; i<state_local->nnhpres; i++) 
1528         {
1529             for(j=0; j<nh; j++) {
1530                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1531                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1532             }
1533         }
1534     }
1535     for(est=0; est<estNR; est++)
1536     {
1537         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1538         {
1539             switch (est) {
1540             case estX:
1541                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1542                 break;
1543             case estV:
1544                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1545                 break;
1546             case estSDX:
1547                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1548                 break;
1549             case estCGP:
1550                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1551                 break;
1552             case estLD_RNG:
1553                 if (state->nrngi == 1)
1554                 {
1555                     if (DDMASTER(dd))
1556                     {
1557                         for(i=0; i<state_local->nrng; i++)
1558                         {
1559                             state->ld_rng[i] = state_local->ld_rng[i];
1560                         }
1561                     }
1562                 }
1563                 else
1564                 {
1565                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1566                               state_local->ld_rng,state->ld_rng);
1567                 }
1568                 break;
1569             case estLD_RNGI:
1570                 if (state->nrngi == 1)
1571                 {
1572                    if (DDMASTER(dd))
1573                     {
1574                         state->ld_rngi[0] = state_local->ld_rngi[0];
1575                     } 
1576                 }
1577                 else
1578                 {
1579                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1580                               state_local->ld_rngi,state->ld_rngi);
1581                 }
1582                 break;
1583             case estDISRE_INITF:
1584             case estDISRE_RM3TAV:
1585             case estORIRE_INITF:
1586             case estORIRE_DTAV:
1587                 break;
1588             default:
1589                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1590             }
1591         }
1592     }
1593 }
1594
1595 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1596 {
1597     int est;
1598
1599     if (debug)
1600     {
1601         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1602     }
1603
1604     state->nalloc = over_alloc_dd(nalloc);
1605     
1606     for(est=0; est<estNR; est++)
1607     {
1608         if (EST_DISTR(est) && (state->flags & (1<<est)))
1609         {
1610             switch(est) {
1611             case estX:
1612                 srenew(state->x,state->nalloc);
1613                 break;
1614             case estV:
1615                 srenew(state->v,state->nalloc);
1616                 break;
1617             case estSDX:
1618                 srenew(state->sd_X,state->nalloc);
1619                 break;
1620             case estCGP:
1621                 srenew(state->cg_p,state->nalloc);
1622                 break;
1623             case estLD_RNG:
1624             case estLD_RNGI:
1625             case estDISRE_INITF:
1626             case estDISRE_RM3TAV:
1627             case estORIRE_INITF:
1628             case estORIRE_DTAV:
1629                 /* No reallocation required */
1630                 break;
1631             default:
1632                 gmx_incons("Unknown state entry encountered in dd_realloc_state");            
1633             }
1634         }
1635     }
1636     
1637     if (f != NULL)
1638     {
1639         srenew(*f,state->nalloc);
1640     }
1641 }
1642
1643 static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
1644                                int nalloc)
1645 {
1646     if (nalloc > fr->cg_nalloc)
1647     {
1648         if (debug)
1649         {
1650             fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1651         }
1652         fr->cg_nalloc = over_alloc_dd(nalloc);
1653         srenew(fr->cginfo,fr->cg_nalloc);
1654         if (fr->cutoff_scheme == ecutsGROUP)
1655         {
1656             srenew(fr->cg_cm,fr->cg_nalloc);
1657         }
1658     }
1659     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1660     {
1661         /* We don't use charge groups, we use x in state to set up
1662          * the atom communication.
1663          */
1664         dd_realloc_state(state,f,nalloc);
1665     }
1666 }
1667
1668 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1669                                        rvec *v,rvec *lv)
1670 {
1671     gmx_domdec_master_t *ma;
1672     int  n,i,c,a,nalloc=0;
1673     rvec *buf=NULL;
1674     
1675     if (DDMASTER(dd))
1676     {
1677         ma  = dd->ma;
1678         
1679         for(n=0; n<dd->nnodes; n++)
1680         {
1681             if (n != dd->rank)
1682             {
1683                 if (ma->nat[n] > nalloc)
1684                 {
1685                     nalloc = over_alloc_dd(ma->nat[n]);
1686                     srenew(buf,nalloc);
1687                 }
1688                 /* Use lv as a temporary buffer */
1689                 a = 0;
1690                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1691                 {
1692                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1693                     {
1694                         copy_rvec(v[c],buf[a++]);
1695                     }
1696                 }
1697                 if (a != ma->nat[n])
1698                 {
1699                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1700                               a,ma->nat[n]);
1701                 }
1702                 
1703 #ifdef GMX_MPI
1704                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1705                          DDRANK(dd,n),n,dd->mpi_comm_all);
1706 #endif
1707             }
1708         }
1709         sfree(buf);
1710         n = DDMASTERRANK(dd);
1711         a = 0;
1712         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1713         {
1714             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1715             {
1716                 copy_rvec(v[c],lv[a++]);
1717             }
1718         }
1719     }
1720     else
1721     {
1722 #ifdef GMX_MPI
1723         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1724                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1725 #endif
1726     }
1727 }
1728
1729 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1730                                        rvec *v,rvec *lv)
1731 {
1732     gmx_domdec_master_t *ma;
1733     int  *scounts=NULL,*disps=NULL;
1734     int  n,i,c,a,nalloc=0;
1735     rvec *buf=NULL;
1736     
1737     if (DDMASTER(dd))
1738     {
1739         ma  = dd->ma;
1740      
1741         get_commbuffer_counts(dd,&scounts,&disps);
1742
1743         buf = ma->vbuf;
1744         a = 0;
1745         for(n=0; n<dd->nnodes; n++)
1746         {
1747             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1748             {
1749                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1750                 {
1751                     copy_rvec(v[c],buf[a++]);
1752                 }
1753             }
1754         }
1755     }
1756
1757     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1758 }
1759
1760 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1761 {
1762     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1763     {
1764         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1765     }
1766     else
1767     {
1768         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1769     }
1770 }
1771
1772 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1773                                 t_state *state,t_state *state_local,
1774                                 rvec **f)
1775 {
1776     int  i,j,nh;
1777
1778     nh = state->nhchainlength;
1779
1780     if (DDMASTER(dd))
1781     {
1782         for(i=0;i<efptNR;i++)
1783         {
1784             state_local->lambda[i] = state->lambda[i];
1785         }
1786         state_local->fep_state = state->fep_state;
1787         state_local->veta   = state->veta;
1788         state_local->vol0   = state->vol0;
1789         copy_mat(state->box,state_local->box);
1790         copy_mat(state->box_rel,state_local->box_rel);
1791         copy_mat(state->boxv,state_local->boxv);
1792         copy_mat(state->svir_prev,state_local->svir_prev);
1793         copy_mat(state->fvir_prev,state_local->fvir_prev);
1794         for(i=0; i<state_local->ngtc; i++)
1795         {
1796             for(j=0; j<nh; j++) {
1797                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1798                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1799             }
1800             state_local->therm_integral[i] = state->therm_integral[i];
1801         }
1802         for(i=0; i<state_local->nnhpres; i++)
1803         {
1804             for(j=0; j<nh; j++) {
1805                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1806                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1807             }
1808         }
1809     }
1810     dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
1811     dd_bcast(dd,sizeof(int),&state_local->fep_state);
1812     dd_bcast(dd,sizeof(real),&state_local->veta);
1813     dd_bcast(dd,sizeof(real),&state_local->vol0);
1814     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1815     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1816     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1817     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1818     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1819     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1820     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1821     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1822     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1823     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1824
1825     if (dd->nat_home > state_local->nalloc)
1826     {
1827         dd_realloc_state(state_local,f,dd->nat_home);
1828     }
1829     for(i=0; i<estNR; i++)
1830     {
1831         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1832         {
1833             switch (i) {
1834             case estX:
1835                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1836                 break;
1837             case estV:
1838                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1839                 break;
1840             case estSDX:
1841                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1842                 break;
1843             case estCGP:
1844                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1845                 break;
1846             case estLD_RNG:
1847                 if (state->nrngi == 1)
1848                 {
1849                     dd_bcastc(dd,
1850                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1851                               state->ld_rng,state_local->ld_rng);
1852                 }
1853                 else
1854                 {
1855                     dd_scatter(dd,
1856                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1857                                state->ld_rng,state_local->ld_rng);
1858                 }
1859                 break;
1860             case estLD_RNGI:
1861                 if (state->nrngi == 1)
1862                 {
1863                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1864                               state->ld_rngi,state_local->ld_rngi);
1865                 }
1866                 else
1867                 {
1868                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1869                                state->ld_rngi,state_local->ld_rngi);
1870                 }   
1871                 break;
1872             case estDISRE_INITF:
1873             case estDISRE_RM3TAV:
1874             case estORIRE_INITF:
1875             case estORIRE_DTAV:
1876                 /* Not implemented yet */
1877                 break;
1878             default:
1879                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1880             }
1881         }
1882     }
1883 }
1884
1885 static char dim2char(int dim)
1886 {
1887     char c='?';
1888     
1889     switch (dim)
1890     {
1891     case XX: c = 'X'; break;
1892     case YY: c = 'Y'; break;
1893     case ZZ: c = 'Z'; break;
1894     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1895     }
1896     
1897     return c;
1898 }
1899
1900 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1901                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1902 {
1903     rvec grid_s[2],*grid_r=NULL,cx,r;
1904     char fname[STRLEN],format[STRLEN],buf[22];
1905     FILE *out;
1906     int  a,i,d,z,y,x;
1907     matrix tric;
1908     real vol;
1909
1910     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1911     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1912     
1913     if (DDMASTER(dd))
1914     {
1915         snew(grid_r,2*dd->nnodes);
1916     }
1917     
1918     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1919     
1920     if (DDMASTER(dd))
1921     {
1922         for(d=0; d<DIM; d++)
1923         {
1924             for(i=0; i<DIM; i++)
1925             {
1926                 if (d == i)
1927                 {
1928                     tric[d][i] = 1;
1929                 }
1930                 else
1931                 {
1932                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1933                     {
1934                         tric[d][i] = box[i][d]/box[i][i];
1935                     }
1936                     else
1937                     {
1938                         tric[d][i] = 0;
1939                     }
1940                 }
1941             }
1942         }
1943         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1944         sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1945         out = gmx_fio_fopen(fname,"w");
1946         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1947         a = 1;
1948         for(i=0; i<dd->nnodes; i++)
1949         {
1950             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1951             for(d=0; d<DIM; d++)
1952             {
1953                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1954             }
1955             for(z=0; z<2; z++)
1956             {
1957                 for(y=0; y<2; y++)
1958                 {
1959                     for(x=0; x<2; x++)
1960                     {
1961                         cx[XX] = grid_r[i*2+x][XX];
1962                         cx[YY] = grid_r[i*2+y][YY];
1963                         cx[ZZ] = grid_r[i*2+z][ZZ];
1964                         mvmul(tric,cx,r);
1965                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1966                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1967                     }
1968                 }
1969             }
1970             for(d=0; d<DIM; d++)
1971             {
1972                 for(x=0; x<4; x++)
1973                 {
1974                     switch(d)
1975                     {
1976                     case 0: y = 1 + i*8 + 2*x; break;
1977                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1978                     case 2: y = 1 + i*8 + x; break;
1979                     }
1980                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1981                 }
1982             }
1983         }
1984         gmx_fio_fclose(out);
1985         sfree(grid_r);
1986     }
1987 }
1988
1989 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1990                   gmx_mtop_t *mtop,t_commrec *cr,
1991                   int natoms,rvec x[],matrix box)
1992 {
1993     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1994     FILE *out;
1995     int  i,ii,resnr,c;
1996     char *atomname,*resname;
1997     real b;
1998     gmx_domdec_t *dd;
1999     
2000     dd = cr->dd;
2001     if (natoms == -1)
2002     {
2003         natoms = dd->comm->nat[ddnatVSITE];
2004     }
2005     
2006     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
2007     
2008     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
2009     sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");
2010     
2011     out = gmx_fio_fopen(fname,"w");
2012     
2013     fprintf(out,"TITLE     %s\n",title);
2014     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
2015     for(i=0; i<natoms; i++)
2016     {
2017         ii = dd->gatindex[i];
2018         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
2019         if (i < dd->comm->nat[ddnatZONE])
2020         {
2021             c = 0;
2022             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2023             {
2024                 c++;
2025             }
2026             b = c;
2027         }
2028         else if (i < dd->comm->nat[ddnatVSITE])
2029         {
2030             b = dd->comm->zones.n;
2031         }
2032         else
2033         {
2034             b = dd->comm->zones.n + 1;
2035         }
2036         fprintf(out,strlen(atomname)<4 ? format : format4,
2037                 "ATOM",(ii+1)%100000,
2038                 atomname,resname,' ',resnr%10000,' ',
2039                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
2040     }
2041     fprintf(out,"TER\n");
2042     
2043     gmx_fio_fclose(out);
2044 }
2045
2046 real dd_cutoff_mbody(gmx_domdec_t *dd)
2047 {
2048     gmx_domdec_comm_t *comm;
2049     int  di;
2050     real r;
2051
2052     comm = dd->comm;
2053
2054     r = -1;
2055     if (comm->bInterCGBondeds)
2056     {
2057         if (comm->cutoff_mbody > 0)
2058         {
2059             r = comm->cutoff_mbody;
2060         }
2061         else
2062         {
2063             /* cutoff_mbody=0 means we do not have DLB */
2064             r = comm->cellsize_min[dd->dim[0]];
2065             for(di=1; di<dd->ndim; di++)
2066             {
2067                 r = min(r,comm->cellsize_min[dd->dim[di]]);
2068             }
2069             if (comm->bBondComm)
2070             {
2071                 r = max(r,comm->cutoff_mbody);
2072             }
2073             else
2074             {
2075                 r = min(r,comm->cutoff);
2076             }
2077         }
2078     }
2079
2080     return r;
2081 }
2082
2083 real dd_cutoff_twobody(gmx_domdec_t *dd)
2084 {
2085     real r_mb;
2086
2087     r_mb = dd_cutoff_mbody(dd);
2088
2089     return max(dd->comm->cutoff,r_mb);
2090 }
2091
2092
2093 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2094 {
2095     int nc,ntot;
2096     
2097     nc   = dd->nc[dd->comm->cartpmedim];
2098     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2099     copy_ivec(coord,coord_pme);
2100     coord_pme[dd->comm->cartpmedim] =
2101         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2102 }
2103
2104 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2105 {
2106     /* Here we assign a PME node to communicate with this DD node
2107      * by assuming that the major index of both is x.
2108      * We add cr->npmenodes/2 to obtain an even distribution.
2109      */
2110     return (ddindex*npme + npme/2)/ndd;
2111 }
2112
2113 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2114 {
2115     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2116 }
2117
2118 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2119 {
2120     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2121 }
2122
2123 static int *dd_pmenodes(t_commrec *cr)
2124 {
2125     int *pmenodes;
2126     int n,i,p0,p1;
2127     
2128     snew(pmenodes,cr->npmenodes);
2129     n = 0;
2130     for(i=0; i<cr->dd->nnodes; i++) {
2131         p0 = cr_ddindex2pmeindex(cr,i);
2132         p1 = cr_ddindex2pmeindex(cr,i+1);
2133         if (i+1 == cr->dd->nnodes || p1 > p0) {
2134             if (debug)
2135                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2136             pmenodes[n] = i + 1 + n;
2137             n++;
2138         }
2139     }
2140
2141     return pmenodes;
2142 }
2143
2144 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2145 {
2146     gmx_domdec_t *dd;
2147     ivec coords,coords_pme,nc;
2148     int  slab;
2149     
2150     dd = cr->dd;
2151     /*
2152       if (dd->comm->bCartesian) {
2153       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2154       dd_coords2pmecoords(dd,coords,coords_pme);
2155       copy_ivec(dd->ntot,nc);
2156       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2157       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2158       
2159       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2160       } else {
2161       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2162       }
2163     */
2164     coords[XX] = x;
2165     coords[YY] = y;
2166     coords[ZZ] = z;
2167     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2168     
2169     return slab;
2170 }
2171
2172 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2173 {
2174     gmx_domdec_comm_t *comm;
2175     ivec coords;
2176     int  ddindex,nodeid=-1;
2177     
2178     comm = cr->dd->comm;
2179     
2180     coords[XX] = x;
2181     coords[YY] = y;
2182     coords[ZZ] = z;
2183     if (comm->bCartesianPP_PME)
2184     {
2185 #ifdef GMX_MPI
2186         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2187 #endif
2188     }
2189     else
2190     {
2191         ddindex = dd_index(cr->dd->nc,coords);
2192         if (comm->bCartesianPP)
2193         {
2194             nodeid = comm->ddindex2simnodeid[ddindex];
2195         }
2196         else
2197         {
2198             if (comm->pmenodes)
2199             {
2200                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2201             }
2202             else
2203             {
2204                 nodeid = ddindex;
2205             }
2206         }
2207     }
2208   
2209     return nodeid;
2210 }
2211
2212 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2213 {
2214     gmx_domdec_t *dd;
2215     gmx_domdec_comm_t *comm;
2216     ivec coord,coord_pme;
2217     int  i;
2218     int  pmenode=-1;
2219     
2220     dd = cr->dd;
2221     comm = dd->comm;
2222     
2223     /* This assumes a uniform x domain decomposition grid cell size */
2224     if (comm->bCartesianPP_PME)
2225     {
2226 #ifdef GMX_MPI
2227         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2228         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2229         {
2230             /* This is a PP node */
2231             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2232             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2233         }
2234 #endif
2235     }
2236     else if (comm->bCartesianPP)
2237     {
2238         if (sim_nodeid < dd->nnodes)
2239         {
2240             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2241         }
2242     }
2243     else
2244     {
2245         /* This assumes DD cells with identical x coordinates
2246          * are numbered sequentially.
2247          */
2248         if (dd->comm->pmenodes == NULL)
2249         {
2250             if (sim_nodeid < dd->nnodes)
2251             {
2252                 /* The DD index equals the nodeid */
2253                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2254             }
2255         }
2256         else
2257         {
2258             i = 0;
2259             while (sim_nodeid > dd->comm->pmenodes[i])
2260             {
2261                 i++;
2262             }
2263             if (sim_nodeid < dd->comm->pmenodes[i])
2264             {
2265                 pmenode = dd->comm->pmenodes[i];
2266             }
2267         }
2268     }
2269     
2270     return pmenode;
2271 }
2272
2273 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2274 {
2275     gmx_bool bPMEOnlyNode;
2276     
2277     if (DOMAINDECOMP(cr))
2278     {
2279         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2280     }
2281     else
2282     {
2283         bPMEOnlyNode = FALSE;
2284     }
2285     
2286     return bPMEOnlyNode;
2287 }
2288
2289 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2290                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2291 {
2292     gmx_domdec_t *dd;
2293     int x,y,z;
2294     ivec coord,coord_pme;
2295     
2296     dd = cr->dd;
2297     
2298     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2299     
2300     *nmy_ddnodes = 0;
2301     for(x=0; x<dd->nc[XX]; x++)
2302     {
2303         for(y=0; y<dd->nc[YY]; y++)
2304         {
2305             for(z=0; z<dd->nc[ZZ]; z++)
2306             {
2307                 if (dd->comm->bCartesianPP_PME)
2308                 {
2309                     coord[XX] = x;
2310                     coord[YY] = y;
2311                     coord[ZZ] = z;
2312                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2313                     if (dd->ci[XX] == coord_pme[XX] &&
2314                         dd->ci[YY] == coord_pme[YY] &&
2315                         dd->ci[ZZ] == coord_pme[ZZ])
2316                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2317                 }
2318                 else
2319                 {
2320                     /* The slab corresponds to the nodeid in the PME group */
2321                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2322                     {
2323                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2324                     }
2325                 }
2326             }
2327         }
2328     }
2329     
2330     /* The last PP-only node is the peer node */
2331     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2332     
2333     if (debug)
2334     {
2335         fprintf(debug,"Receive coordinates from PP nodes:");
2336         for(x=0; x<*nmy_ddnodes; x++)
2337         {
2338             fprintf(debug," %d",(*my_ddnodes)[x]);
2339         }
2340         fprintf(debug,"\n");
2341     }
2342 }
2343
2344 static gmx_bool receive_vir_ener(t_commrec *cr)
2345 {
2346     gmx_domdec_comm_t *comm;
2347     int  pmenode,coords[DIM],rank;
2348     gmx_bool bReceive;
2349     
2350     bReceive = TRUE;
2351     if (cr->npmenodes < cr->dd->nnodes)
2352     {
2353         comm = cr->dd->comm;
2354         if (comm->bCartesianPP_PME)
2355         {
2356             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2357 #ifdef GMX_MPI
2358             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2359             coords[comm->cartpmedim]++;
2360             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2361             {
2362                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2363                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2364                 {
2365                     /* This is not the last PP node for pmenode */
2366                     bReceive = FALSE;
2367                 }
2368             }
2369 #endif  
2370         }
2371         else
2372         {
2373             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2374             if (cr->sim_nodeid+1 < cr->nnodes &&
2375                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2376             {
2377                 /* This is not the last PP node for pmenode */
2378                 bReceive = FALSE;
2379             }
2380         }
2381     }
2382     
2383     return bReceive;
2384 }
2385
2386 static void set_zones_ncg_home(gmx_domdec_t *dd)
2387 {
2388     gmx_domdec_zones_t *zones;
2389     int i;
2390
2391     zones = &dd->comm->zones;
2392
2393     zones->cg_range[0] = 0;
2394     for(i=1; i<zones->n+1; i++)
2395     {
2396         zones->cg_range[i] = dd->ncg_home;
2397     }
2398 }
2399
2400 static void rebuild_cgindex(gmx_domdec_t *dd,
2401                             const int *gcgs_index,t_state *state)
2402 {
2403     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2404     
2405     ind = state->cg_gl;
2406     dd_cg_gl = dd->index_gl;
2407     cgindex  = dd->cgindex;
2408     nat = 0;
2409     cgindex[0] = nat;
2410     for(i=0; i<state->ncg_gl; i++)
2411     {
2412         cgindex[i] = nat;
2413         cg_gl = ind[i];
2414         dd_cg_gl[i] = cg_gl;
2415         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2416     }
2417     cgindex[i] = nat;
2418     
2419     dd->ncg_home = state->ncg_gl;
2420     dd->nat_home = nat;
2421
2422     set_zones_ncg_home(dd);
2423 }
2424
2425 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2426 {
2427     while (cg >= cginfo_mb->cg_end)
2428     {
2429         cginfo_mb++;
2430     }
2431
2432     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2433 }
2434
2435 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2436                           t_forcerec *fr,char *bLocalCG)
2437 {
2438     cginfo_mb_t *cginfo_mb;
2439     int *cginfo;
2440     int cg;
2441
2442     if (fr != NULL)
2443     {
2444         cginfo_mb = fr->cginfo_mb;
2445         cginfo    = fr->cginfo;
2446
2447         for(cg=cg0; cg<cg1; cg++)
2448         {
2449             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2450         }
2451     }
2452
2453     if (bLocalCG != NULL)
2454     {
2455         for(cg=cg0; cg<cg1; cg++)
2456         {
2457             bLocalCG[index_gl[cg]] = TRUE;
2458         }
2459     }
2460 }
2461
2462 static void make_dd_indices(gmx_domdec_t *dd,
2463                             const int *gcgs_index,int cg_start)
2464 {
2465     int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl;
2466     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2467     gmx_ga2la_t *ga2la;
2468     char *bLocalCG;
2469     gmx_bool bCGs;
2470
2471     bLocalCG = dd->comm->bLocalCG;
2472
2473     if (dd->nat_tot > dd->gatindex_nalloc)
2474     {
2475         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2476         srenew(dd->gatindex,dd->gatindex_nalloc);
2477     }
2478
2479     nzone      = dd->comm->zones.n;
2480     zone2cg    = dd->comm->zones.cg_range;
2481     zone_ncg1  = dd->comm->zone_ncg1;
2482     index_gl   = dd->index_gl;
2483     gatindex   = dd->gatindex;
2484     bCGs       = dd->comm->bCGs;
2485
2486     if (zone2cg[1] != dd->ncg_home)
2487     {
2488         gmx_incons("dd->ncg_zone is not up to date");
2489     }
2490     
2491     /* Make the local to global and global to local atom index */
2492     a = dd->cgindex[cg_start];
2493     for(zone=0; zone<nzone; zone++)
2494     {
2495         if (zone == 0)
2496         {
2497             cg0 = cg_start;
2498         }
2499         else
2500         {
2501             cg0 = zone2cg[zone];
2502         }
2503         cg1    = zone2cg[zone+1];
2504         cg1_p1 = cg0 + zone_ncg1[zone];
2505
2506         for(cg=cg0; cg<cg1; cg++)
2507         {
2508             zone1 = zone;
2509             if (cg >= cg1_p1)
2510             {
2511                 /* Signal that this cg is from more than one pulse away */
2512                 zone1 += nzone;
2513             }
2514             cg_gl = index_gl[cg];
2515             if (bCGs)
2516             {
2517                 for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2518                 {
2519                     gatindex[a] = a_gl;
2520                     ga2la_set(dd->ga2la,a_gl,a,zone1);
2521                     a++;
2522                 }
2523             }
2524             else
2525             {
2526                 gatindex[a] = cg_gl;
2527                 ga2la_set(dd->ga2la,cg_gl,a,zone1);
2528                 a++;
2529             }
2530         }
2531     }
2532 }
2533
2534 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2535                           const char *where)
2536 {
2537     int ncg,i,ngl,nerr;
2538
2539     nerr = 0;
2540     if (bLocalCG == NULL)
2541     {
2542         return nerr;
2543     }
2544     for(i=0; i<dd->ncg_tot; i++)
2545     {
2546         if (!bLocalCG[dd->index_gl[i]])
2547         {
2548             fprintf(stderr,
2549                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2550             nerr++;
2551         }
2552     }
2553     ngl = 0;
2554     for(i=0; i<ncg_sys; i++)
2555     {
2556         if (bLocalCG[i])
2557         {
2558             ngl++;
2559         }
2560     }
2561     if (ngl != dd->ncg_tot)
2562     {
2563         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2564         nerr++;
2565     }
2566
2567     return nerr;
2568 }
2569
2570 static void check_index_consistency(gmx_domdec_t *dd,
2571                                     int natoms_sys,int ncg_sys,
2572                                     const char *where)
2573 {
2574     int  nerr,ngl,i,a,cell;
2575     int  *have;
2576
2577     nerr = 0;
2578
2579     if (dd->comm->DD_debug > 1)
2580     {
2581         snew(have,natoms_sys);
2582         for(a=0; a<dd->nat_tot; a++)
2583         {
2584             if (have[dd->gatindex[a]] > 0)
2585             {
2586                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2587             }
2588             else
2589             {
2590                 have[dd->gatindex[a]] = a + 1;
2591             }
2592         }
2593         sfree(have);
2594     }
2595
2596     snew(have,dd->nat_tot);
2597
2598     ngl  = 0;
2599     for(i=0; i<natoms_sys; i++)
2600     {
2601         if (ga2la_get(dd->ga2la,i,&a,&cell))
2602         {
2603             if (a >= dd->nat_tot)
2604             {
2605                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2606                 nerr++;
2607             }
2608             else
2609             {
2610                 have[a] = 1;
2611                 if (dd->gatindex[a] != i)
2612                 {
2613                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2614                     nerr++;
2615                 }
2616             }
2617             ngl++;
2618         }
2619     }
2620     if (ngl != dd->nat_tot)
2621     {
2622         fprintf(stderr,
2623                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2624                 dd->rank,where,ngl,dd->nat_tot);
2625     }
2626     for(a=0; a<dd->nat_tot; a++)
2627     {
2628         if (have[a] == 0)
2629         {
2630             fprintf(stderr,
2631                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2632                     dd->rank,where,a+1,dd->gatindex[a]+1);
2633         }
2634     }
2635     sfree(have);
2636
2637     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2638
2639     if (nerr > 0) {
2640         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2641                   dd->rank,where,nerr);
2642     }
2643 }
2644
2645 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2646 {
2647     int  i;
2648     char *bLocalCG;
2649
2650     if (a_start == 0)
2651     {
2652         /* Clear the whole list without searching */
2653         ga2la_clear(dd->ga2la);
2654     }
2655     else
2656     {
2657         for(i=a_start; i<dd->nat_tot; i++)
2658         {
2659             ga2la_del(dd->ga2la,dd->gatindex[i]);
2660         }
2661     }
2662
2663     bLocalCG = dd->comm->bLocalCG;
2664     if (bLocalCG)
2665     {
2666         for(i=cg_start; i<dd->ncg_tot; i++)
2667         {
2668             bLocalCG[dd->index_gl[i]] = FALSE;
2669         }
2670     }
2671
2672     dd_clear_local_vsite_indices(dd);
2673     
2674     if (dd->constraints)
2675     {
2676         dd_clear_local_constraint_indices(dd);
2677     }
2678 }
2679
2680 static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
2681                             int dim_ind)
2682 {
2683     real grid_jump_limit;
2684
2685     /* The distance between the boundaries of cells at distance
2686      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2687      * and by the fact that cells should not be shifted by more than
2688      * half their size, such that cg's only shift by one cell
2689      * at redecomposition.
2690      */
2691     grid_jump_limit = comm->cellsize_limit;
2692     if (!comm->bVacDLBNoLimit)
2693     {
2694         grid_jump_limit = max(grid_jump_limit,
2695                               cutoff/comm->cd[dim_ind].np);
2696     }
2697
2698     return grid_jump_limit;
2699 }
2700
2701 static gmx_bool check_grid_jump(gmx_large_int_t step,
2702                                 gmx_domdec_t *dd,
2703                                 real cutoff,
2704                                 gmx_ddbox_t *ddbox,
2705                                 gmx_bool bFatal)
2706 {
2707     gmx_domdec_comm_t *comm;
2708     int  d,dim;
2709     real limit,bfac;
2710     gmx_bool bInvalid;
2711
2712     bInvalid = FALSE;
2713
2714     comm = dd->comm;
2715     
2716     for(d=1; d<dd->ndim; d++)
2717     {
2718         dim = dd->dim[d];
2719         limit = grid_jump_limit(comm,cutoff,d);
2720         bfac = ddbox->box_size[dim];
2721         if (ddbox->tric_dir[dim])
2722         {
2723             bfac *= ddbox->skew_fac[dim];
2724         }
2725         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2726             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2727         {
2728             bInvalid = TRUE;
2729
2730             if (bFatal)
2731             {
2732                 char buf[22];
2733
2734                 /* This error should never be triggered under normal
2735                  * circumstances, but you never know ...
2736                  */
2737                 gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2738                           gmx_step_str(step,buf),
2739                           dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2740             }
2741         }
2742     }
2743
2744     return bInvalid;
2745 }
2746
2747 static int dd_load_count(gmx_domdec_comm_t *comm)
2748 {
2749     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2750 }
2751
2752 static float dd_force_load(gmx_domdec_comm_t *comm)
2753 {
2754     float load;
2755     
2756     if (comm->eFlop)
2757     {
2758         load = comm->flop;
2759         if (comm->eFlop > 1)
2760         {
2761             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2762         }
2763     } 
2764     else
2765     {
2766         load = comm->cycl[ddCyclF];
2767         if (comm->cycl_n[ddCyclF] > 1)
2768         {
2769             /* Subtract the maximum of the last n cycle counts
2770              * to get rid of possible high counts due to other soures,
2771              * for instance system activity, that would otherwise
2772              * affect the dynamic load balancing.
2773              */
2774             load -= comm->cycl_max[ddCyclF];
2775         }
2776     }
2777     
2778     return load;
2779 }
2780
2781 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2782 {
2783     gmx_domdec_comm_t *comm;
2784     int i;
2785     
2786     comm = dd->comm;
2787     
2788     snew(*dim_f,dd->nc[dim]+1);
2789     (*dim_f)[0] = 0;
2790     for(i=1; i<dd->nc[dim]; i++)
2791     {
2792         if (comm->slb_frac[dim])
2793         {
2794             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2795         }
2796         else
2797         {
2798             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2799         }
2800     }
2801     (*dim_f)[dd->nc[dim]] = 1;
2802 }
2803
2804 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2805 {
2806     int  pmeindex,slab,nso,i;
2807     ivec xyz;
2808     
2809     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2810     {
2811         ddpme->dim = YY;
2812     }
2813     else
2814     {
2815         ddpme->dim = dimind;
2816     }
2817     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2818     
2819     ddpme->nslab = (ddpme->dim == 0 ?
2820                     dd->comm->npmenodes_x :
2821                     dd->comm->npmenodes_y);
2822
2823     if (ddpme->nslab <= 1)
2824     {
2825         return;
2826     }
2827
2828     nso = dd->comm->npmenodes/ddpme->nslab;
2829     /* Determine for each PME slab the PP location range for dimension dim */
2830     snew(ddpme->pp_min,ddpme->nslab);
2831     snew(ddpme->pp_max,ddpme->nslab);
2832     for(slab=0; slab<ddpme->nslab; slab++) {
2833         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2834         ddpme->pp_max[slab] = 0;
2835     }
2836     for(i=0; i<dd->nnodes; i++) {
2837         ddindex2xyz(dd->nc,i,xyz);
2838         /* For y only use our y/z slab.
2839          * This assumes that the PME x grid size matches the DD grid size.
2840          */
2841         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2842             pmeindex = ddindex2pmeindex(dd,i);
2843             if (dimind == 0) {
2844                 slab = pmeindex/nso;
2845             } else {
2846                 slab = pmeindex % ddpme->nslab;
2847             }
2848             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2849             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2850         }
2851     }
2852
2853     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2854 }
2855
2856 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2857 {
2858     if (dd->comm->ddpme[0].dim == XX)
2859     {
2860         return dd->comm->ddpme[0].maxshift;
2861     }
2862     else
2863     {
2864         return 0;
2865     }
2866 }
2867
2868 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2869 {
2870     if (dd->comm->ddpme[0].dim == YY)
2871     {
2872         return dd->comm->ddpme[0].maxshift;
2873     }
2874     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2875     {
2876         return dd->comm->ddpme[1].maxshift;
2877     }
2878     else
2879     {
2880         return 0;
2881     }
2882 }
2883
2884 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2885                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2886 {
2887     gmx_domdec_comm_t *comm;
2888     int  nc,ns,s;
2889     int  *xmin,*xmax;
2890     real range,pme_boundary;
2891     int  sh;
2892     
2893     comm = dd->comm;
2894     nc  = dd->nc[ddpme->dim];
2895     ns  = ddpme->nslab;
2896     
2897     if (!ddpme->dim_match)
2898     {
2899         /* PP decomposition is not along dim: the worst situation */
2900         sh = ns/2;
2901     }
2902     else if (ns <= 3 || (bUniform && ns == nc))
2903     {
2904         /* The optimal situation */
2905         sh = 1;
2906     }
2907     else
2908     {
2909         /* We need to check for all pme nodes which nodes they
2910          * could possibly need to communicate with.
2911          */
2912         xmin = ddpme->pp_min;
2913         xmax = ddpme->pp_max;
2914         /* Allow for atoms to be maximally 2/3 times the cut-off
2915          * out of their DD cell. This is a reasonable balance between
2916          * between performance and support for most charge-group/cut-off
2917          * combinations.
2918          */
2919         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2920         /* Avoid extra communication when we are exactly at a boundary */
2921         range *= 0.999;
2922         
2923         sh = 1;
2924         for(s=0; s<ns; s++)
2925         {
2926             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2927             pme_boundary = (real)s/ns;
2928             while (sh+1 < ns &&
2929                    ((s-(sh+1) >= 0 &&
2930                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2931                     (s-(sh+1) <  0 &&
2932                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2933             {
2934                 sh++;
2935             }
2936             pme_boundary = (real)(s+1)/ns;
2937             while (sh+1 < ns &&
2938                    ((s+(sh+1) <  ns &&
2939                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2940                     (s+(sh+1) >= ns &&
2941                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2942             {
2943                 sh++;
2944             }
2945         }
2946     }
2947     
2948     ddpme->maxshift = sh;
2949     
2950     if (debug)
2951     {
2952         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2953                 ddpme->dim,ddpme->maxshift);
2954     }
2955 }
2956
2957 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2958 {
2959     int d,dim;
2960     
2961     for(d=0; d<dd->ndim; d++)
2962     {
2963         dim = dd->dim[d];
2964         if (dim < ddbox->nboundeddim &&
2965             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2966             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2967         {
2968             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2969                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2970                       dd->nc[dim],dd->comm->cellsize_limit);
2971         }
2972     }
2973 }
2974
2975 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2976                                   gmx_bool bMaster,ivec npulse)
2977 {
2978     gmx_domdec_comm_t *comm;
2979     int  d,j;
2980     rvec cellsize_min;
2981     real *cell_x,cell_dx,cellsize;
2982     
2983     comm = dd->comm;
2984     
2985     for(d=0; d<DIM; d++)
2986     {
2987         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2988         npulse[d] = 1;
2989         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2990         {
2991             /* Uniform grid */
2992             cell_dx = ddbox->box_size[d]/dd->nc[d];
2993             if (bMaster)
2994             {
2995                 for(j=0; j<dd->nc[d]+1; j++)
2996                 {
2997                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2998                 }
2999             }
3000             else
3001             {
3002                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3003                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3004             }
3005             cellsize = cell_dx*ddbox->skew_fac[d];
3006             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3007             {
3008                 npulse[d]++;
3009             }
3010             cellsize_min[d] = cellsize;
3011         }
3012         else
3013         {
3014             /* Statically load balanced grid */
3015             /* Also when we are not doing a master distribution we determine
3016              * all cell borders in a loop to obtain identical values
3017              * to the master distribution case and to determine npulse.
3018              */
3019             if (bMaster)
3020             {
3021                 cell_x = dd->ma->cell_x[d];
3022             }
3023             else
3024             {
3025                 snew(cell_x,dd->nc[d]+1);
3026             }
3027             cell_x[0] = ddbox->box0[d];
3028             for(j=0; j<dd->nc[d]; j++)
3029             {
3030                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
3031                 cell_x[j+1] = cell_x[j] + cell_dx;
3032                 cellsize = cell_dx*ddbox->skew_fac[d];
3033                 while (cellsize*npulse[d] < comm->cutoff &&
3034                        npulse[d] < dd->nc[d]-1)
3035                 {
3036                     npulse[d]++;
3037                 }
3038                 cellsize_min[d] = min(cellsize_min[d],cellsize);
3039             }
3040             if (!bMaster)
3041             {
3042                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3043                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3044                 sfree(cell_x);
3045             }
3046         }
3047         /* The following limitation is to avoid that a cell would receive
3048          * some of its own home charge groups back over the periodic boundary.
3049          * Double charge groups cause trouble with the global indices.
3050          */
3051         if (d < ddbox->npbcdim &&
3052             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3053         {
3054             gmx_fatal_collective(FARGS,NULL,dd,
3055                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3056                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
3057                                  comm->cutoff,
3058                                  dd->nc[d],dd->nc[d],
3059                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
3060         }
3061     }
3062     
3063     if (!comm->bDynLoadBal)
3064     {
3065         copy_rvec(cellsize_min,comm->cellsize_min);
3066     }
3067    
3068     for(d=0; d<comm->npmedecompdim; d++)
3069     {
3070         set_pme_maxshift(dd,&comm->ddpme[d],
3071                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
3072                          comm->ddpme[d].slb_dim_f);
3073     }
3074 }
3075
3076
3077 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3078                                        int d,int dim,gmx_domdec_root_t *root,
3079                                        gmx_ddbox_t *ddbox,
3080                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
3081 {
3082     gmx_domdec_comm_t *comm;
3083     int  ncd,i,j,nmin,nmin_old;
3084     gmx_bool bLimLo,bLimHi;
3085     real *cell_size;
3086     real fac,halfway,cellsize_limit_f_i,region_size;
3087     gmx_bool bPBC,bLastHi=FALSE;
3088     int nrange[]={range[0],range[1]};
3089
3090     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
3091
3092     comm = dd->comm;
3093
3094     ncd = dd->nc[dim];
3095
3096     bPBC = (dim < ddbox->npbcdim);
3097
3098     cell_size = root->buf_ncd;
3099
3100     if (debug) 
3101     {
3102         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
3103     }
3104
3105     /* First we need to check if the scaling does not make cells
3106      * smaller than the smallest allowed size.
3107      * We need to do this iteratively, since if a cell is too small,
3108      * it needs to be enlarged, which makes all the other cells smaller,
3109      * which could in turn make another cell smaller than allowed.
3110      */
3111     for(i=range[0]; i<range[1]; i++)
3112     {
3113         root->bCellMin[i] = FALSE;
3114     }
3115     nmin = 0;
3116     do
3117     {
3118         nmin_old = nmin;
3119         /* We need the total for normalization */
3120         fac = 0;
3121         for(i=range[0]; i<range[1]; i++)
3122         {
3123             if (root->bCellMin[i] == FALSE)
3124             {
3125                 fac += cell_size[i];
3126             }
3127         }
3128         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3129         /* Determine the cell boundaries */
3130         for(i=range[0]; i<range[1]; i++)
3131         {
3132             if (root->bCellMin[i] == FALSE)
3133             {
3134                 cell_size[i] *= fac;
3135                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3136                 {
3137                     cellsize_limit_f_i = 0;
3138                 }
3139                 else
3140                 {
3141                     cellsize_limit_f_i = cellsize_limit_f;
3142                 }
3143                 if (cell_size[i] < cellsize_limit_f_i)
3144                 {
3145                     root->bCellMin[i] = TRUE;
3146                     cell_size[i] = cellsize_limit_f_i;
3147                     nmin++;
3148                 }
3149             }
3150             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3151         }
3152     }
3153     while (nmin > nmin_old);
3154     
3155     i=range[1]-1;
3156     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3157     /* For this check we should not use DD_CELL_MARGIN,
3158      * but a slightly smaller factor,
3159      * since rounding could get use below the limit.
3160      */
3161     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3162     {
3163         char buf[22];
3164         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3165                   gmx_step_str(step,buf),
3166                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3167                   ncd,comm->cellsize_min[dim]);
3168     }
3169     
3170     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3171     
3172     if (!bUniform)
3173     {
3174         /* Check if the boundary did not displace more than halfway
3175          * each of the cells it bounds, as this could cause problems,
3176          * especially when the differences between cell sizes are large.
3177          * If changes are applied, they will not make cells smaller
3178          * than the cut-off, as we check all the boundaries which
3179          * might be affected by a change and if the old state was ok,
3180          * the cells will at most be shrunk back to their old size.
3181          */
3182         for(i=range[0]+1; i<range[1]; i++)
3183         {
3184             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3185             if (root->cell_f[i] < halfway)
3186             {
3187                 root->cell_f[i] = halfway;
3188                 /* Check if the change also causes shifts of the next boundaries */
3189                 for(j=i+1; j<range[1]; j++)
3190                 {
3191                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3192                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3193                 }
3194             }
3195             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3196             if (root->cell_f[i] > halfway)
3197             {
3198                 root->cell_f[i] = halfway;
3199                 /* Check if the change also causes shifts of the next boundaries */
3200                 for(j=i-1; j>=range[0]+1; j--)
3201                 {
3202                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3203                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3204                 }
3205             }
3206         }
3207     }
3208     
3209     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3210     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3211      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3212      * for a and b nrange is used */
3213     if (d > 0)
3214     {
3215         /* Take care of the staggering of the cell boundaries */
3216         if (bUniform)
3217         {
3218             for(i=range[0]; i<range[1]; i++)
3219             {
3220                 root->cell_f_max0[i] = root->cell_f[i];
3221                 root->cell_f_min1[i] = root->cell_f[i+1];
3222             }
3223         }
3224         else
3225         {
3226             for(i=range[0]+1; i<range[1]; i++)
3227             {
3228                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3229                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3230                 if (bLimLo && bLimHi)
3231                 {
3232                     /* Both limits violated, try the best we can */
3233                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3234                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3235                     nrange[0]=range[0];
3236                     nrange[1]=i;
3237                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3238
3239                     nrange[0]=i;
3240                     nrange[1]=range[1];
3241                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3242
3243                     return;
3244                 }
3245                 else if (bLimLo)
3246                 {
3247                     /* root->cell_f[i] = root->bound_min[i]; */
3248                     nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3249                     bLastHi=FALSE;
3250                 }
3251                 else if (bLimHi && !bLastHi)
3252                 {
3253                     bLastHi=TRUE;
3254                     if (nrange[1] < range[1])   /* found a LimLo before */
3255                     {
3256                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3257                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3258                         nrange[0]=nrange[1];
3259                     }
3260                     root->cell_f[i] = root->bound_max[i];
3261                     nrange[1]=i; 
3262                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3263                     nrange[0]=i;
3264                     nrange[1]=range[1];
3265                 }
3266             }
3267             if (nrange[1] < range[1])   /* found last a LimLo */
3268             {
3269                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3270                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3271                 nrange[0]=nrange[1];
3272                 nrange[1]=range[1];
3273                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3274             } 
3275             else if (nrange[0] > range[0]) /* found at least one LimHi */
3276             {
3277                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3278             }
3279         }
3280     }
3281 }
3282
3283
3284 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3285                                        int d,int dim,gmx_domdec_root_t *root,
3286                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3287                                        gmx_bool bUniform,gmx_large_int_t step)
3288 {
3289     gmx_domdec_comm_t *comm;
3290     int  ncd,d1,i,j,pos;
3291     real *cell_size;
3292     real load_aver,load_i,imbalance,change,change_max,sc;
3293     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3294     real change_limit;
3295     real relax = 0.5;
3296     gmx_bool bPBC;
3297     int range[] = { 0, 0 };
3298
3299     comm = dd->comm;
3300
3301     /* Convert the maximum change from the input percentage to a fraction */
3302     change_limit = comm->dlb_scale_lim*0.01;
3303
3304     ncd = dd->nc[dim];
3305
3306     bPBC = (dim < ddbox->npbcdim);
3307
3308     cell_size = root->buf_ncd;
3309
3310     /* Store the original boundaries */
3311     for(i=0; i<ncd+1; i++)
3312     {
3313         root->old_cell_f[i] = root->cell_f[i];
3314     }
3315     if (bUniform) {
3316         for(i=0; i<ncd; i++)
3317         {
3318             cell_size[i] = 1.0/ncd;
3319         }
3320     }
3321     else if (dd_load_count(comm))
3322     {
3323         load_aver = comm->load[d].sum_m/ncd;
3324         change_max = 0;
3325         for(i=0; i<ncd; i++)
3326         {
3327             /* Determine the relative imbalance of cell i */
3328             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3329             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3330             /* Determine the change of the cell size using underrelaxation */
3331             change = -relax*imbalance;
3332             change_max = max(change_max,max(change,-change));
3333         }
3334         /* Limit the amount of scaling.
3335          * We need to use the same rescaling for all cells in one row,
3336          * otherwise the load balancing might not converge.
3337          */
3338         sc = relax;
3339         if (change_max > change_limit)
3340         {
3341             sc *= change_limit/change_max;
3342         }
3343         for(i=0; i<ncd; i++)
3344         {
3345             /* Determine the relative imbalance of cell i */
3346             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3347             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3348             /* Determine the change of the cell size using underrelaxation */
3349             change = -sc*imbalance;
3350             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3351         }
3352     }
3353     
3354     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3355     cellsize_limit_f *= DD_CELL_MARGIN;
3356     dist_min_f_hard   = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim];
3357     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3358     if (ddbox->tric_dir[dim])
3359     {
3360         cellsize_limit_f /= ddbox->skew_fac[dim];
3361         dist_min_f       /= ddbox->skew_fac[dim];
3362     }
3363     if (bDynamicBox && d > 0)
3364     {
3365         dist_min_f *= DD_PRES_SCALE_MARGIN;
3366     }
3367     if (d > 0 && !bUniform)
3368     {
3369         /* Make sure that the grid is not shifted too much */
3370         for(i=1; i<ncd; i++) {
3371             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
3372             {
3373                 gmx_incons("Inconsistent DD boundary staggering limits!");
3374             }
3375             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3376             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3377             if (space > 0) {
3378                 root->bound_min[i] += 0.5*space;
3379             }
3380             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3381             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3382             if (space < 0) {
3383                 root->bound_max[i] += 0.5*space;
3384             }
3385             if (debug)
3386             {
3387                 fprintf(debug,
3388                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3389                         d,i,
3390                         root->cell_f_max0[i-1] + dist_min_f,
3391                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3392                         root->cell_f_min1[i] - dist_min_f);
3393             }
3394         }
3395     }
3396     range[1]=ncd;
3397     root->cell_f[0] = 0;
3398     root->cell_f[ncd] = 1;
3399     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3400
3401
3402     /* After the checks above, the cells should obey the cut-off
3403      * restrictions, but it does not hurt to check.
3404      */
3405     for(i=0; i<ncd; i++)
3406     {
3407         if (debug)
3408         {
3409             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3410                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3411         }
3412
3413         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3414             root->cell_f[i+1] - root->cell_f[i] <
3415             cellsize_limit_f/DD_CELL_MARGIN)
3416         {
3417             char buf[22];
3418             fprintf(stderr,
3419                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3420                     gmx_step_str(step,buf),dim2char(dim),i,
3421                     (root->cell_f[i+1] - root->cell_f[i])
3422                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3423         }
3424     }
3425     
3426     pos = ncd + 1;
3427     /* Store the cell boundaries of the lower dimensions at the end */
3428     for(d1=0; d1<d; d1++)
3429     {
3430         root->cell_f[pos++] = comm->cell_f0[d1];
3431         root->cell_f[pos++] = comm->cell_f1[d1];
3432     }
3433     
3434     if (d < comm->npmedecompdim)
3435     {
3436         /* The master determines the maximum shift for
3437          * the coordinate communication between separate PME nodes.
3438          */
3439         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3440     }
3441     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3442     if (d >= 1)
3443     {
3444         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3445     }
3446 }    
3447
3448 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3449                                              gmx_ddbox_t *ddbox,int dimind)
3450 {
3451     gmx_domdec_comm_t *comm;
3452     int dim;
3453
3454     comm = dd->comm;
3455
3456     /* Set the cell dimensions */
3457     dim = dd->dim[dimind];
3458     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3459     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3460     if (dim >= ddbox->nboundeddim)
3461     {
3462         comm->cell_x0[dim] += ddbox->box0[dim];
3463         comm->cell_x1[dim] += ddbox->box0[dim];
3464     }
3465 }
3466
3467 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3468                                          int d,int dim,real *cell_f_row,
3469                                          gmx_ddbox_t *ddbox)
3470 {
3471     gmx_domdec_comm_t *comm;
3472     int d1,dim1,pos;
3473
3474     comm = dd->comm;
3475
3476 #ifdef GMX_MPI
3477     /* Each node would only need to know two fractions,
3478      * but it is probably cheaper to broadcast the whole array.
3479      */
3480     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3481               0,comm->mpi_comm_load[d]);
3482 #endif
3483     /* Copy the fractions for this dimension from the buffer */
3484     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3485     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3486     /* The whole array was communicated, so set the buffer position */
3487     pos = dd->nc[dim] + 1;
3488     for(d1=0; d1<=d; d1++)
3489     {
3490         if (d1 < d)
3491         {
3492             /* Copy the cell fractions of the lower dimensions */
3493             comm->cell_f0[d1] = cell_f_row[pos++];
3494             comm->cell_f1[d1] = cell_f_row[pos++];
3495         }
3496         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3497     }
3498     /* Convert the communicated shift from float to int */
3499     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3500     if (d >= 1)
3501     {
3502         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3503     }
3504 }
3505
3506 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3507                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3508                                          gmx_bool bUniform,gmx_large_int_t step)
3509 {
3510     gmx_domdec_comm_t *comm;
3511     int d,dim,d1;
3512     gmx_bool bRowMember,bRowRoot;
3513     real *cell_f_row;
3514     
3515     comm = dd->comm;
3516
3517     for(d=0; d<dd->ndim; d++)
3518     {
3519         dim = dd->dim[d];
3520         bRowMember = TRUE;
3521         bRowRoot = TRUE;
3522         for(d1=d; d1<dd->ndim; d1++)
3523         {
3524             if (dd->ci[dd->dim[d1]] > 0)
3525             {
3526                 if (d1 > d)
3527                 {
3528                     bRowMember = FALSE;
3529                 }
3530                 bRowRoot = FALSE;
3531             }
3532         }
3533         if (bRowMember)
3534         {
3535             if (bRowRoot)
3536             {
3537                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3538                                            ddbox,bDynamicBox,bUniform,step);
3539                 cell_f_row = comm->root[d]->cell_f;
3540             }
3541             else
3542             {
3543                 cell_f_row = comm->cell_f_row;
3544             }
3545             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3546         }
3547     }
3548 }    
3549
3550 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3551 {
3552     int d;
3553
3554     /* This function assumes the box is static and should therefore
3555      * not be called when the box has changed since the last
3556      * call to dd_partition_system.
3557      */
3558     for(d=0; d<dd->ndim; d++)
3559     {
3560         relative_to_absolute_cell_bounds(dd,ddbox,d); 
3561     }
3562 }
3563
3564
3565
3566 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3567                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3568                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3569                                   gmx_wallcycle_t wcycle)
3570 {
3571     gmx_domdec_comm_t *comm;
3572     int dim;
3573
3574     comm = dd->comm;
3575     
3576     if (bDoDLB)
3577     {
3578         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3579         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3580         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3581     }
3582     else if (bDynamicBox)
3583     {
3584         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3585     }
3586     
3587     /* Set the dimensions for which no DD is used */
3588     for(dim=0; dim<DIM; dim++) {
3589         if (dd->nc[dim] == 1) {
3590             comm->cell_x0[dim] = 0;
3591             comm->cell_x1[dim] = ddbox->box_size[dim];
3592             if (dim >= ddbox->nboundeddim)
3593             {
3594                 comm->cell_x0[dim] += ddbox->box0[dim];
3595                 comm->cell_x1[dim] += ddbox->box0[dim];
3596             }
3597         }
3598     }
3599 }
3600
3601 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3602 {
3603     int d,np,i;
3604     gmx_domdec_comm_dim_t *cd;
3605     
3606     for(d=0; d<dd->ndim; d++)
3607     {
3608         cd = &dd->comm->cd[d];
3609         np = npulse[dd->dim[d]];
3610         if (np > cd->np_nalloc)
3611         {
3612             if (debug)
3613             {
3614                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3615                         dim2char(dd->dim[d]),np);
3616             }
3617             if (DDMASTER(dd) && cd->np_nalloc > 0)
3618             {
3619                 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3620             }
3621             srenew(cd->ind,np);
3622             for(i=cd->np_nalloc; i<np; i++)
3623             {
3624                 cd->ind[i].index  = NULL;
3625                 cd->ind[i].nalloc = 0;
3626             }
3627             cd->np_nalloc = np;
3628         }
3629         cd->np = np;
3630     }
3631 }
3632
3633
3634 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3635                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3636                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3637                               gmx_wallcycle_t wcycle)
3638 {
3639     gmx_domdec_comm_t *comm;
3640     int  d;
3641     ivec npulse;
3642     
3643     comm = dd->comm;
3644
3645     /* Copy the old cell boundaries for the cg displacement check */
3646     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3647     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3648     
3649     if (comm->bDynLoadBal)
3650     {
3651         if (DDMASTER(dd))
3652         {
3653             check_box_size(dd,ddbox);
3654         }
3655         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3656     }
3657     else
3658     {
3659         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3660         realloc_comm_ind(dd,npulse);
3661     }
3662     
3663     if (debug)
3664     {
3665         for(d=0; d<DIM; d++)
3666         {
3667             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3668                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3669         }
3670     }
3671 }
3672
3673 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3674                                   gmx_ddbox_t *ddbox,
3675                                   rvec cell_ns_x0,rvec cell_ns_x1,
3676                                   gmx_large_int_t step)
3677 {
3678     gmx_domdec_comm_t *comm;
3679     int dim_ind,dim;
3680     
3681     comm = dd->comm;
3682
3683     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3684     {
3685         dim = dd->dim[dim_ind];
3686         
3687         /* Without PBC we don't have restrictions on the outer cells */
3688         if (!(dim >= ddbox->npbcdim && 
3689               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3690             comm->bDynLoadBal &&
3691             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3692             comm->cellsize_min[dim])
3693         {
3694             char buf[22];
3695             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3696                       gmx_step_str(step,buf),dim2char(dim),
3697                       comm->cell_x1[dim] - comm->cell_x0[dim],
3698                       ddbox->skew_fac[dim],
3699                       dd->comm->cellsize_min[dim],
3700                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3701         }
3702     }
3703     
3704     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3705     {
3706         /* Communicate the boundaries and update cell_ns_x0/1 */
3707         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3708         if (dd->bGridJump && dd->ndim > 1)
3709         {
3710             check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE);
3711         }
3712     }
3713 }
3714
3715 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3716 {
3717     if (YY < npbcdim)
3718     {
3719         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3720     }
3721     else
3722     {
3723         tcm[YY][XX] = 0;
3724     }
3725     if (ZZ < npbcdim)
3726     {
3727         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3728         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3729     }
3730     else
3731     {
3732         tcm[ZZ][XX] = 0;
3733         tcm[ZZ][YY] = 0;
3734     }
3735 }
3736
3737 static void check_screw_box(matrix box)
3738 {
3739     /* Mathematical limitation */
3740     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3741     {
3742         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3743     }
3744     
3745     /* Limitation due to the asymmetry of the eighth shell method */
3746     if (box[ZZ][YY] != 0)
3747     {
3748         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3749     }
3750 }
3751
3752 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3753                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3754                           gmx_domdec_t *dd)
3755 {
3756     gmx_domdec_master_t *ma;
3757     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3758     int  i,icg,j,k,k0,k1,d,npbcdim;
3759     matrix tcm;
3760     rvec box_size,cg_cm;
3761     ivec ind;
3762     real nrcg,inv_ncg,pos_d;
3763     atom_id *cgindex;
3764     gmx_bool bUnbounded,bScrew;
3765
3766     ma = dd->ma;
3767     
3768     if (tmp_ind == NULL)
3769     {
3770         snew(tmp_nalloc,dd->nnodes);
3771         snew(tmp_ind,dd->nnodes);
3772         for(i=0; i<dd->nnodes; i++)
3773         {
3774             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3775             snew(tmp_ind[i],tmp_nalloc[i]);
3776         }
3777     }
3778     
3779     /* Clear the count */
3780     for(i=0; i<dd->nnodes; i++)
3781     {
3782         ma->ncg[i] = 0;
3783         ma->nat[i] = 0;
3784     }
3785     
3786     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3787     
3788     cgindex = cgs->index;
3789     
3790     /* Compute the center of geometry for all charge groups */
3791     for(icg=0; icg<cgs->nr; icg++)
3792     {
3793         k0      = cgindex[icg];
3794         k1      = cgindex[icg+1];
3795         nrcg    = k1 - k0;
3796         if (nrcg == 1)
3797         {
3798             copy_rvec(pos[k0],cg_cm);
3799         }
3800         else
3801         {
3802             inv_ncg = 1.0/nrcg;
3803             
3804             clear_rvec(cg_cm);
3805             for(k=k0; (k<k1); k++)
3806             {
3807                 rvec_inc(cg_cm,pos[k]);
3808             }
3809             for(d=0; (d<DIM); d++)
3810             {
3811                 cg_cm[d] *= inv_ncg;
3812             }
3813         }
3814         /* Put the charge group in the box and determine the cell index */
3815         for(d=DIM-1; d>=0; d--) {
3816             pos_d = cg_cm[d];
3817             if (d < dd->npbcdim)
3818             {
3819                 bScrew = (dd->bScrewPBC && d == XX);
3820                 if (tric_dir[d] && dd->nc[d] > 1)
3821                 {
3822                     /* Use triclinic coordintates for this dimension */
3823                     for(j=d+1; j<DIM; j++)
3824                     {
3825                         pos_d += cg_cm[j]*tcm[j][d];
3826                     }
3827                 }
3828                 while(pos_d >= box[d][d])
3829                 {
3830                     pos_d -= box[d][d];
3831                     rvec_dec(cg_cm,box[d]);
3832                     if (bScrew)
3833                     {
3834                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3835                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3836                     }
3837                     for(k=k0; (k<k1); k++)
3838                     {
3839                         rvec_dec(pos[k],box[d]);
3840                         if (bScrew)
3841                         {
3842                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3843                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3844                         }
3845                     }
3846                 }
3847                 while(pos_d < 0)
3848                 {
3849                     pos_d += box[d][d];
3850                     rvec_inc(cg_cm,box[d]);
3851                     if (bScrew)
3852                     {
3853                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3854                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3855                     }
3856                     for(k=k0; (k<k1); k++)
3857                     {
3858                         rvec_inc(pos[k],box[d]);
3859                         if (bScrew) {
3860                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3861                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3862                         }
3863                     }
3864                 }
3865             }
3866             /* This could be done more efficiently */
3867             ind[d] = 0;
3868             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3869             {
3870                 ind[d]++;
3871             }
3872         }
3873         i = dd_index(dd->nc,ind);
3874         if (ma->ncg[i] == tmp_nalloc[i])
3875         {
3876             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3877             srenew(tmp_ind[i],tmp_nalloc[i]);
3878         }
3879         tmp_ind[i][ma->ncg[i]] = icg;
3880         ma->ncg[i]++;
3881         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3882     }
3883     
3884     k1 = 0;
3885     for(i=0; i<dd->nnodes; i++)
3886     {
3887         ma->index[i] = k1;
3888         for(k=0; k<ma->ncg[i]; k++)
3889         {
3890             ma->cg[k1++] = tmp_ind[i][k];
3891         }
3892     }
3893     ma->index[dd->nnodes] = k1;
3894     
3895     for(i=0; i<dd->nnodes; i++)
3896     {
3897         sfree(tmp_ind[i]);
3898     }
3899     sfree(tmp_ind);
3900     sfree(tmp_nalloc);
3901     
3902     if (fplog)
3903     {
3904         char buf[22];
3905         fprintf(fplog,"Charge group distribution at step %s:",
3906                 gmx_step_str(step,buf));
3907         for(i=0; i<dd->nnodes; i++)
3908         {
3909             fprintf(fplog," %d",ma->ncg[i]);
3910         }
3911         fprintf(fplog,"\n");
3912     }
3913 }
3914
3915 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3916                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3917                                 rvec pos[])
3918 {
3919     gmx_domdec_master_t *ma=NULL;
3920     ivec npulse;
3921     int  i,cg_gl;
3922     int  *ibuf,buf2[2] = { 0, 0 };
3923     gmx_bool bMaster = DDMASTER(dd);
3924     if (bMaster)
3925     {
3926         ma = dd->ma;
3927         
3928         if (dd->bScrewPBC)
3929         {
3930             check_screw_box(box);
3931         }
3932     
3933         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3934     
3935         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3936         for(i=0; i<dd->nnodes; i++)
3937         {
3938             ma->ibuf[2*i]   = ma->ncg[i];
3939             ma->ibuf[2*i+1] = ma->nat[i];
3940         }
3941         ibuf = ma->ibuf;
3942     }
3943     else
3944     {
3945         ibuf = NULL;
3946     }
3947     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3948     
3949     dd->ncg_home = buf2[0];
3950     dd->nat_home = buf2[1];
3951     dd->ncg_tot  = dd->ncg_home;
3952     dd->nat_tot  = dd->nat_home;
3953     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3954     {
3955         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3956         srenew(dd->index_gl,dd->cg_nalloc);
3957         srenew(dd->cgindex,dd->cg_nalloc+1);
3958     }
3959     if (bMaster)
3960     {
3961         for(i=0; i<dd->nnodes; i++)
3962         {
3963             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3964             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3965         }
3966     }
3967     
3968     dd_scatterv(dd,
3969                 DDMASTER(dd) ? ma->ibuf : NULL,
3970                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3971                 DDMASTER(dd) ? ma->cg : NULL,
3972                 dd->ncg_home*sizeof(int),dd->index_gl);
3973     
3974     /* Determine the home charge group sizes */
3975     dd->cgindex[0] = 0;
3976     for(i=0; i<dd->ncg_home; i++)
3977     {
3978         cg_gl = dd->index_gl[i];
3979         dd->cgindex[i+1] =
3980             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3981     }
3982     
3983     if (debug)
3984     {
3985         fprintf(debug,"Home charge groups:\n");
3986         for(i=0; i<dd->ncg_home; i++)
3987         {
3988             fprintf(debug," %d",dd->index_gl[i]);
3989             if (i % 10 == 9) 
3990                 fprintf(debug,"\n");
3991         }
3992         fprintf(debug,"\n");
3993     }
3994 }
3995
3996 static int compact_and_copy_vec_at(int ncg,int *move,
3997                                    int *cgindex,
3998                                    int nvec,int vec,
3999                                    rvec *src,gmx_domdec_comm_t *comm,
4000                                    gmx_bool bCompact)
4001 {
4002     int m,icg,i,i0,i1,nrcg;
4003     int home_pos;
4004     int pos_vec[DIM*2];
4005     
4006     home_pos = 0;
4007
4008     for(m=0; m<DIM*2; m++)
4009     {
4010         pos_vec[m] = 0;
4011     }
4012     
4013     i0 = 0;
4014     for(icg=0; icg<ncg; icg++)
4015     {
4016         i1 = cgindex[icg+1];
4017         m = move[icg];
4018         if (m == -1)
4019         {
4020             if (bCompact)
4021             {
4022                 /* Compact the home array in place */
4023                 for(i=i0; i<i1; i++)
4024                 {
4025                     copy_rvec(src[i],src[home_pos++]);
4026                 }
4027             }
4028         }
4029         else
4030         {
4031             /* Copy to the communication buffer */
4032             nrcg = i1 - i0;
4033             pos_vec[m] += 1 + vec*nrcg;
4034             for(i=i0; i<i1; i++)
4035             {
4036                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
4037             }
4038             pos_vec[m] += (nvec - vec - 1)*nrcg;
4039         }
4040         if (!bCompact)
4041         {
4042             home_pos += i1 - i0;
4043         }
4044         i0 = i1;
4045     }
4046     
4047     return home_pos;
4048 }
4049
4050 static int compact_and_copy_vec_cg(int ncg,int *move,
4051                                    int *cgindex,
4052                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
4053                                    gmx_bool bCompact)
4054 {
4055     int m,icg,i0,i1,nrcg;
4056     int home_pos;
4057     int pos_vec[DIM*2];
4058     
4059     home_pos = 0;
4060     
4061     for(m=0; m<DIM*2; m++)
4062     {
4063         pos_vec[m] = 0;
4064     }
4065     
4066     i0 = 0;
4067     for(icg=0; icg<ncg; icg++)
4068     {
4069         i1 = cgindex[icg+1];
4070         m = move[icg];
4071         if (m == -1)
4072         {
4073             if (bCompact)
4074             {
4075                 /* Compact the home array in place */
4076                 copy_rvec(src[icg],src[home_pos++]);
4077             }
4078         }
4079         else
4080         {
4081             nrcg = i1 - i0;
4082             /* Copy to the communication buffer */
4083             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
4084             pos_vec[m] += 1 + nrcg*nvec;
4085         }
4086         i0 = i1;
4087     }
4088     if (!bCompact)
4089     {
4090         home_pos = ncg;
4091     }
4092     
4093     return home_pos;
4094 }
4095
4096 static int compact_ind(int ncg,int *move,
4097                        int *index_gl,int *cgindex,
4098                        int *gatindex,
4099                        gmx_ga2la_t ga2la,char *bLocalCG,
4100                        int *cginfo)
4101 {
4102     int cg,nat,a0,a1,a,a_gl;
4103     int home_pos;
4104
4105     home_pos = 0;
4106     nat = 0;
4107     for(cg=0; cg<ncg; cg++)
4108     {
4109         a0 = cgindex[cg];
4110         a1 = cgindex[cg+1];
4111         if (move[cg] == -1)
4112         {
4113             /* Compact the home arrays in place.
4114              * Anything that can be done here avoids access to global arrays.
4115              */
4116             cgindex[home_pos] = nat;
4117             for(a=a0; a<a1; a++)
4118             {
4119                 a_gl = gatindex[a];
4120                 gatindex[nat] = a_gl;
4121                 /* The cell number stays 0, so we don't need to set it */
4122                 ga2la_change_la(ga2la,a_gl,nat);
4123                 nat++;
4124             }
4125             index_gl[home_pos] = index_gl[cg];
4126             cginfo[home_pos]   = cginfo[cg];
4127             /* The charge group remains local, so bLocalCG does not change */
4128             home_pos++;
4129         }
4130         else
4131         {
4132             /* Clear the global indices */
4133             for(a=a0; a<a1; a++)
4134             {
4135                 ga2la_del(ga2la,gatindex[a]);
4136             }
4137             if (bLocalCG)
4138             {
4139                 bLocalCG[index_gl[cg]] = FALSE;
4140             }
4141         }
4142     }
4143     cgindex[home_pos] = nat;
4144     
4145     return home_pos;
4146 }
4147
4148 static void clear_and_mark_ind(int ncg,int *move,
4149                                int *index_gl,int *cgindex,int *gatindex,
4150                                gmx_ga2la_t ga2la,char *bLocalCG,
4151                                int *cell_index)
4152 {
4153     int cg,a0,a1,a;
4154     
4155     for(cg=0; cg<ncg; cg++)
4156     {
4157         if (move[cg] >= 0)
4158         {
4159             a0 = cgindex[cg];
4160             a1 = cgindex[cg+1];
4161             /* Clear the global indices */
4162             for(a=a0; a<a1; a++)
4163             {
4164                 ga2la_del(ga2la,gatindex[a]);
4165             }
4166             if (bLocalCG)
4167             {
4168                 bLocalCG[index_gl[cg]] = FALSE;
4169             }
4170             /* Signal that this cg has moved using the ns cell index.
4171              * Here we set it to -1. fill_grid will change it
4172              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4173              */
4174             cell_index[cg] = -1;
4175         }
4176     }
4177 }
4178
4179 static void print_cg_move(FILE *fplog,
4180                           gmx_domdec_t *dd,
4181                           gmx_large_int_t step,int cg,int dim,int dir,
4182                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4183                           rvec cm_old,rvec cm_new,real pos_d)
4184 {
4185     gmx_domdec_comm_t *comm;
4186     char buf[22];
4187
4188     comm = dd->comm;
4189
4190     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4191     if (bHaveLimitdAndCMOld)
4192     {
4193         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4194                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4195     }
4196     else
4197     {
4198         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4199                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4200     }
4201     fprintf(fplog,"distance out of cell %f\n",
4202             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4203     if (bHaveLimitdAndCMOld)
4204     {
4205         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4206                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4207     }
4208     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4209             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4210     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4211             dim2char(dim),
4212             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4213     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4214             dim2char(dim),
4215             comm->cell_x0[dim],comm->cell_x1[dim]);
4216 }
4217
4218 static void cg_move_error(FILE *fplog,
4219                           gmx_domdec_t *dd,
4220                           gmx_large_int_t step,int cg,int dim,int dir,
4221                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4222                           rvec cm_old,rvec cm_new,real pos_d)
4223 {
4224     if (fplog)
4225     {
4226         print_cg_move(fplog, dd,step,cg,dim,dir,
4227                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4228     }
4229     print_cg_move(stderr,dd,step,cg,dim,dir,
4230                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4231     gmx_fatal(FARGS,
4232               "A charge group moved too far between two domain decomposition steps\n"
4233               "This usually means that your system is not well equilibrated");
4234 }
4235
4236 static void rotate_state_atom(t_state *state,int a)
4237 {
4238     int est;
4239
4240     for(est=0; est<estNR; est++)
4241     {
4242         if (EST_DISTR(est) && (state->flags & (1<<est))) {
4243             switch (est) {
4244             case estX:
4245                 /* Rotate the complete state; for a rectangular box only */
4246                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4247                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4248                 break;
4249             case estV:
4250                 state->v[a][YY] = -state->v[a][YY];
4251                 state->v[a][ZZ] = -state->v[a][ZZ];
4252                 break;
4253             case estSDX:
4254                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4255                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4256                 break;
4257             case estCGP:
4258                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4259                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4260                 break;
4261             case estDISRE_INITF:
4262             case estDISRE_RM3TAV:
4263             case estORIRE_INITF:
4264             case estORIRE_DTAV:
4265                 /* These are distances, so not affected by rotation */
4266                 break;
4267             default:
4268                 gmx_incons("Unknown state entry encountered in rotate_state_atom");            
4269             }
4270         }
4271     }
4272 }
4273
4274 static int *get_moved(gmx_domdec_comm_t *comm,int natoms)
4275 {
4276     if (natoms > comm->moved_nalloc)
4277     {
4278         /* Contents should be preserved here */
4279         comm->moved_nalloc = over_alloc_dd(natoms);
4280         srenew(comm->moved,comm->moved_nalloc);
4281     }
4282
4283     return comm->moved;
4284 }
4285
4286 static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
4287                          gmx_domdec_t *dd,
4288                          t_state *state,
4289                          ivec tric_dir,matrix tcm,
4290                          rvec cell_x0,rvec cell_x1,
4291                          rvec limitd,rvec limit0,rvec limit1,
4292                          const int *cgindex,
4293                          int cg_start,int cg_end,
4294                          rvec *cg_cm,
4295                          int *move)
4296 {
4297     int  npbcdim;
4298     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4299     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4300     int  flag;
4301     gmx_bool bScrew;
4302     ivec dev;
4303     real inv_ncg,pos_d;
4304     rvec cm_new;
4305
4306     npbcdim = dd->npbcdim;
4307
4308     for(cg=cg_start; cg<cg_end; cg++)
4309     {
4310         k0   = cgindex[cg];
4311         k1   = cgindex[cg+1];
4312         nrcg = k1 - k0;
4313         if (nrcg == 1)
4314         {
4315             copy_rvec(state->x[k0],cm_new);
4316         }
4317         else
4318         {
4319             inv_ncg = 1.0/nrcg;
4320             
4321             clear_rvec(cm_new);
4322             for(k=k0; (k<k1); k++)
4323             {
4324                 rvec_inc(cm_new,state->x[k]);
4325             }
4326             for(d=0; (d<DIM); d++)
4327             {
4328                 cm_new[d] = inv_ncg*cm_new[d];
4329             }
4330         }
4331         
4332         clear_ivec(dev);
4333         /* Do pbc and check DD cell boundary crossings */
4334         for(d=DIM-1; d>=0; d--)
4335         {
4336             if (dd->nc[d] > 1)
4337             {
4338                 bScrew = (dd->bScrewPBC && d == XX);
4339                 /* Determine the location of this cg in lattice coordinates */
4340                 pos_d = cm_new[d];
4341                 if (tric_dir[d])
4342                 {
4343                     for(d2=d+1; d2<DIM; d2++)
4344                     {
4345                         pos_d += cm_new[d2]*tcm[d2][d];
4346                     }
4347                 }
4348                 /* Put the charge group in the triclinic unit-cell */
4349                 if (pos_d >= cell_x1[d])
4350                 {
4351                     if (pos_d >= limit1[d])
4352                     {
4353                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4354                                       cg_cm[cg],cm_new,pos_d);
4355                     }
4356                     dev[d] = 1;
4357                     if (dd->ci[d] == dd->nc[d] - 1)
4358                     {
4359                         rvec_dec(cm_new,state->box[d]);
4360                         if (bScrew)
4361                         {
4362                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4363                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4364                         }
4365                         for(k=k0; (k<k1); k++)
4366                         {
4367                             rvec_dec(state->x[k],state->box[d]);
4368                             if (bScrew)
4369                             {
4370                                 rotate_state_atom(state,k);
4371                             }
4372                         }
4373                     }
4374                 }
4375                 else if (pos_d < cell_x0[d])
4376                 {
4377                     if (pos_d < limit0[d])
4378                     {
4379                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4380                                       cg_cm[cg],cm_new,pos_d);
4381                     }
4382                     dev[d] = -1;
4383                     if (dd->ci[d] == 0)
4384                     {
4385                         rvec_inc(cm_new,state->box[d]);
4386                         if (bScrew)
4387                         {
4388                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4389                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4390                         }
4391                         for(k=k0; (k<k1); k++)
4392                         {
4393                             rvec_inc(state->x[k],state->box[d]);
4394                             if (bScrew)
4395                             {
4396                                 rotate_state_atom(state,k);
4397                             }
4398                         }
4399                     }
4400                 }
4401             }
4402             else if (d < npbcdim)
4403             {
4404                 /* Put the charge group in the rectangular unit-cell */
4405                 while (cm_new[d] >= state->box[d][d])
4406                 {
4407                     rvec_dec(cm_new,state->box[d]);
4408                     for(k=k0; (k<k1); k++)
4409                     {
4410                         rvec_dec(state->x[k],state->box[d]);
4411                     }
4412                 }
4413                 while (cm_new[d] < 0)
4414                 {
4415                     rvec_inc(cm_new,state->box[d]);
4416                     for(k=k0; (k<k1); k++)
4417                     {
4418                         rvec_inc(state->x[k],state->box[d]);
4419                     }
4420                 }
4421             }
4422         }
4423     
4424         copy_rvec(cm_new,cg_cm[cg]);
4425         
4426         /* Determine where this cg should go */
4427         flag = 0;
4428         mc = -1;
4429         for(d=0; d<dd->ndim; d++)
4430         {
4431             dim = dd->dim[d];
4432             if (dev[dim] == 1)
4433             {
4434                 flag |= DD_FLAG_FW(d);
4435                 if (mc == -1)
4436                 {
4437                     mc = d*2;
4438                 }
4439             }
4440             else if (dev[dim] == -1)
4441             {
4442                 flag |= DD_FLAG_BW(d);
4443                 if (mc == -1) {
4444                     if (dd->nc[dim] > 2)
4445                     {
4446                         mc = d*2 + 1;
4447                     }
4448                     else
4449                     {
4450                         mc = d*2;
4451                     }
4452                 }
4453             }
4454         }
4455         /* Temporarily store the flag in move */
4456         move[cg] = mc + flag;
4457     }
4458 }
4459
4460 static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4461                                gmx_domdec_t *dd,ivec tric_dir,
4462                                t_state *state,rvec **f,
4463                                t_forcerec *fr,t_mdatoms *md,
4464                                gmx_bool bCompact,
4465                                t_nrnb *nrnb,
4466                                int *ncg_stay_home,
4467                                int *ncg_moved)
4468 {
4469     int  *move;
4470     int  npbcdim;
4471     int  ncg[DIM*2],nat[DIM*2];
4472     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4473     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4474     int  sbuf[2],rbuf[2];
4475     int  home_pos_cg,home_pos_at,buf_pos;
4476     int  flag;
4477     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4478     gmx_bool bScrew;
4479     ivec dev;
4480     real inv_ncg,pos_d;
4481     matrix tcm;
4482     rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4483     atom_id *cgindex;
4484     cginfo_mb_t *cginfo_mb;
4485     gmx_domdec_comm_t *comm;
4486     int  *moved;
4487     int  nthread,thread;
4488     
4489     if (dd->bScrewPBC)
4490     {
4491         check_screw_box(state->box);
4492     }
4493     
4494     comm  = dd->comm;
4495     if (fr->cutoff_scheme == ecutsGROUP)
4496     {
4497         cg_cm = fr->cg_cm;
4498     }
4499     
4500     for(i=0; i<estNR; i++)
4501     {
4502         if (EST_DISTR(i))
4503         {
4504             switch (i)
4505             {
4506             case estX:   /* Always present */            break;
4507             case estV:   bV   = (state->flags & (1<<i)); break;
4508             case estSDX: bSDX = (state->flags & (1<<i)); break;
4509             case estCGP: bCGP = (state->flags & (1<<i)); break;
4510             case estLD_RNG:
4511             case estLD_RNGI:
4512             case estDISRE_INITF:
4513             case estDISRE_RM3TAV:
4514             case estORIRE_INITF:
4515             case estORIRE_DTAV:
4516                 /* No processing required */
4517                 break;
4518             default:
4519             gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4520             }
4521         }
4522     }
4523     
4524     if (dd->ncg_tot > comm->nalloc_int)
4525     {
4526         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4527         srenew(comm->buf_int,comm->nalloc_int);
4528     }
4529     move = comm->buf_int;
4530     
4531     /* Clear the count */
4532     for(c=0; c<dd->ndim*2; c++)
4533     {
4534         ncg[c] = 0;
4535         nat[c] = 0;
4536     }
4537
4538     npbcdim = dd->npbcdim;
4539
4540     for(d=0; (d<DIM); d++)
4541     {
4542         limitd[d] = dd->comm->cellsize_min[d];
4543         if (d >= npbcdim && dd->ci[d] == 0)
4544         {
4545             cell_x0[d] = -GMX_FLOAT_MAX;
4546         }
4547         else
4548         {
4549             cell_x0[d] = comm->cell_x0[d];
4550         }
4551         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4552         {
4553             cell_x1[d] = GMX_FLOAT_MAX;
4554         }
4555         else
4556         {
4557             cell_x1[d] = comm->cell_x1[d];
4558         }
4559         if (d < npbcdim)
4560         {
4561             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4562             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4563         }
4564         else
4565         {
4566             /* We check after communication if a charge group moved
4567              * more than one cell. Set the pre-comm check limit to float_max.
4568              */
4569             limit0[d] = -GMX_FLOAT_MAX;
4570             limit1[d] =  GMX_FLOAT_MAX;
4571         }
4572     }
4573     
4574     make_tric_corr_matrix(npbcdim,state->box,tcm);
4575     
4576     cgindex = dd->cgindex;
4577
4578     nthread = gmx_omp_nthreads_get(emntDomdec);
4579
4580     /* Compute the center of geometry for all home charge groups
4581      * and put them in the box and determine where they should go.
4582      */
4583 #pragma omp parallel for num_threads(nthread) schedule(static)
4584     for(thread=0; thread<nthread; thread++)
4585     {
4586         calc_cg_move(fplog,step,dd,state,tric_dir,tcm,
4587                      cell_x0,cell_x1,limitd,limit0,limit1,
4588                      cgindex,
4589                      ( thread   *dd->ncg_home)/nthread,
4590                      ((thread+1)*dd->ncg_home)/nthread,
4591                      fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x,
4592                      move);
4593     }
4594
4595     for(cg=0; cg<dd->ncg_home; cg++)
4596     {
4597         if (move[cg] >= 0)
4598         {
4599             mc = move[cg];
4600             flag     = mc & ~DD_FLAG_NRCG;
4601             mc       = mc & DD_FLAG_NRCG;
4602             move[cg] = mc;
4603
4604             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4605             {
4606                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4607                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4608             }
4609             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4610             /* We store the cg size in the lower 16 bits
4611              * and the place where the charge group should go
4612              * in the next 6 bits. This saves some communication volume.
4613              */
4614             nrcg = cgindex[cg+1] - cgindex[cg];
4615             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4616             ncg[mc] += 1;
4617             nat[mc] += nrcg;
4618         }
4619     }
4620     
4621     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4622     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4623
4624     *ncg_moved = 0;
4625     for(i=0; i<dd->ndim*2; i++)
4626     {
4627         *ncg_moved += ncg[i];
4628     }
4629     
4630     nvec = 1;
4631     if (bV)
4632     {
4633         nvec++;
4634     }
4635     if (bSDX)
4636     {
4637         nvec++;
4638     }
4639     if (bCGP)
4640     {
4641         nvec++;
4642     }
4643     
4644     /* Make sure the communication buffers are large enough */
4645     for(mc=0; mc<dd->ndim*2; mc++)
4646     {
4647         nvr = ncg[mc] + nat[mc]*nvec;
4648         if (nvr > comm->cgcm_state_nalloc[mc])
4649         {
4650             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4651             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4652         }
4653     }
4654     
4655     switch (fr->cutoff_scheme)
4656     {
4657     case ecutsGROUP:
4658         /* Recalculating cg_cm might be cheaper than communicating,
4659          * but that could give rise to rounding issues.
4660          */
4661         home_pos_cg =
4662             compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4663                                     nvec,cg_cm,comm,bCompact);
4664     break;
4665     case ecutsVERLET:
4666         /* Without charge groups we send the moved atom coordinates
4667          * over twice. This is so the code below can be used without
4668          * many conditionals for both for with and without charge groups.
4669          */
4670         home_pos_cg =
4671             compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4672                                     nvec,state->x,comm,FALSE);
4673         if (bCompact)
4674         {
4675             home_pos_cg -= *ncg_moved;
4676         }
4677         break;
4678     default:
4679         gmx_incons("unimplemented");
4680         home_pos_cg = 0;
4681     }
4682     
4683     vec = 0;
4684     home_pos_at =
4685         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4686                                 nvec,vec++,state->x,comm,bCompact);
4687     if (bV)
4688     {
4689         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4690                                 nvec,vec++,state->v,comm,bCompact);
4691     }
4692     if (bSDX)
4693     {
4694         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4695                                 nvec,vec++,state->sd_X,comm,bCompact);
4696     }
4697     if (bCGP)
4698     {
4699         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4700                                 nvec,vec++,state->cg_p,comm,bCompact);
4701     }
4702     
4703     if (bCompact)
4704     {
4705         compact_ind(dd->ncg_home,move,
4706                     dd->index_gl,dd->cgindex,dd->gatindex,
4707                     dd->ga2la,comm->bLocalCG,
4708                     fr->cginfo);
4709     }
4710     else
4711     {
4712         if (fr->cutoff_scheme == ecutsVERLET)
4713         {
4714             moved = get_moved(comm,dd->ncg_home);
4715
4716             for(k=0; k<dd->ncg_home; k++)
4717             {
4718                 moved[k] = 0;
4719             }
4720         }
4721         else
4722         {
4723             moved = fr->ns.grid->cell_index;
4724         }
4725
4726         clear_and_mark_ind(dd->ncg_home,move,
4727                            dd->index_gl,dd->cgindex,dd->gatindex,
4728                            dd->ga2la,comm->bLocalCG,
4729                            moved);
4730     }
4731     
4732     cginfo_mb = fr->cginfo_mb;
4733
4734     *ncg_stay_home = home_pos_cg;
4735     for(d=0; d<dd->ndim; d++)
4736     {
4737         dim = dd->dim[d];
4738         ncg_recv = 0;
4739         nat_recv = 0;
4740         nvr      = 0;
4741         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4742         {
4743             cdd = d*2 + dir;
4744             /* Communicate the cg and atom counts */
4745             sbuf[0] = ncg[cdd];
4746             sbuf[1] = nat[cdd];
4747             if (debug)
4748             {
4749                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4750                         d,dir,sbuf[0],sbuf[1]);
4751             }
4752             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4753             
4754             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4755             {
4756                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4757                 srenew(comm->buf_int,comm->nalloc_int);
4758             }
4759             
4760             /* Communicate the charge group indices, sizes and flags */
4761             dd_sendrecv_int(dd, d, dir,
4762                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4763                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4764             
4765             nvs = ncg[cdd] + nat[cdd]*nvec;
4766             i   = rbuf[0]  + rbuf[1] *nvec;
4767             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4768             
4769             /* Communicate cgcm and state */
4770             dd_sendrecv_rvec(dd, d, dir,
4771                              comm->cgcm_state[cdd], nvs,
4772                              comm->vbuf.v+nvr, i);
4773             ncg_recv += rbuf[0];
4774             nat_recv += rbuf[1];
4775             nvr      += i;
4776         }
4777         
4778         /* Process the received charge groups */
4779         buf_pos = 0;
4780         for(cg=0; cg<ncg_recv; cg++)
4781         {
4782             flag = comm->buf_int[cg*DD_CGIBS+1];
4783
4784             if (dim >= npbcdim && dd->nc[dim] > 2)
4785             {
4786                 /* No pbc in this dim and more than one domain boundary.
4787                  * We do a separate check if a charge group didn't move too far.
4788                  */
4789                 if (((flag & DD_FLAG_FW(d)) &&
4790                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4791                     ((flag & DD_FLAG_BW(d)) &&
4792                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4793                 {
4794                     cg_move_error(fplog,dd,step,cg,dim,
4795                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4796                                    FALSE,0,
4797                                    comm->vbuf.v[buf_pos],
4798                                    comm->vbuf.v[buf_pos],
4799                                    comm->vbuf.v[buf_pos][dim]);
4800                 }
4801             }
4802
4803             mc = -1;
4804             if (d < dd->ndim-1)
4805             {
4806                 /* Check which direction this cg should go */
4807                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4808                 {
4809                     if (dd->bGridJump)
4810                     {
4811                         /* The cell boundaries for dimension d2 are not equal
4812                          * for each cell row of the lower dimension(s),
4813                          * therefore we might need to redetermine where
4814                          * this cg should go.
4815                          */
4816                         dim2 = dd->dim[d2];
4817                         /* If this cg crosses the box boundary in dimension d2
4818                          * we can use the communicated flag, so we do not
4819                          * have to worry about pbc.
4820                          */
4821                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4822                                (flag & DD_FLAG_FW(d2))) ||
4823                               (dd->ci[dim2] == 0 &&
4824                                (flag & DD_FLAG_BW(d2)))))
4825                         {
4826                             /* Clear the two flags for this dimension */
4827                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4828                             /* Determine the location of this cg
4829                              * in lattice coordinates
4830                              */
4831                             pos_d = comm->vbuf.v[buf_pos][dim2];
4832                             if (tric_dir[dim2])
4833                             {
4834                                 for(d3=dim2+1; d3<DIM; d3++)
4835                                 {
4836                                     pos_d +=
4837                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4838                                 }
4839                             }
4840                             /* Check of we are not at the box edge.
4841                              * pbc is only handled in the first step above,
4842                              * but this check could move over pbc while
4843                              * the first step did not due to different rounding.
4844                              */
4845                             if (pos_d >= cell_x1[dim2] &&
4846                                 dd->ci[dim2] != dd->nc[dim2]-1)
4847                             {
4848                                 flag |= DD_FLAG_FW(d2);
4849                             }
4850                             else if (pos_d < cell_x0[dim2] &&
4851                                      dd->ci[dim2] != 0)
4852                             {
4853                                 flag |= DD_FLAG_BW(d2);
4854                             }
4855                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4856                         }
4857                     }
4858                     /* Set to which neighboring cell this cg should go */
4859                     if (flag & DD_FLAG_FW(d2))
4860                     {
4861                         mc = d2*2;
4862                     }
4863                     else if (flag & DD_FLAG_BW(d2))
4864                     {
4865                         if (dd->nc[dd->dim[d2]] > 2)
4866                         {
4867                             mc = d2*2+1;
4868                         }
4869                         else
4870                         {
4871                             mc = d2*2;
4872                         }
4873                     }
4874                 }
4875             }
4876             
4877             nrcg = flag & DD_FLAG_NRCG;
4878             if (mc == -1)
4879             {
4880                 if (home_pos_cg+1 > dd->cg_nalloc)
4881                 {
4882                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4883                     srenew(dd->index_gl,dd->cg_nalloc);
4884                     srenew(dd->cgindex,dd->cg_nalloc+1);
4885                 }
4886                 /* Set the global charge group index and size */
4887                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4888                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4889                 /* Copy the state from the buffer */
4890                 dd_check_alloc_ncg(fr,state,f,home_pos_cg+1);
4891                 if (fr->cutoff_scheme == ecutsGROUP)
4892                 {
4893                     cg_cm = fr->cg_cm;
4894                     copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]);
4895                 }
4896                 buf_pos++;
4897
4898                 /* Set the cginfo */
4899                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4900                                                    dd->index_gl[home_pos_cg]);
4901                 if (comm->bLocalCG)
4902                 {
4903                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4904                 }
4905
4906                 if (home_pos_at+nrcg > state->nalloc)
4907                 {
4908                     dd_realloc_state(state,f,home_pos_at+nrcg);
4909                 }
4910                 for(i=0; i<nrcg; i++)
4911                 {
4912                     copy_rvec(comm->vbuf.v[buf_pos++],
4913                               state->x[home_pos_at+i]);
4914                 }
4915                 if (bV)
4916                 {
4917                     for(i=0; i<nrcg; i++)
4918                     {
4919                         copy_rvec(comm->vbuf.v[buf_pos++],
4920                                   state->v[home_pos_at+i]);
4921                     }
4922                 }
4923                 if (bSDX)
4924                 {
4925                     for(i=0; i<nrcg; i++)
4926                     {
4927                         copy_rvec(comm->vbuf.v[buf_pos++],
4928                                   state->sd_X[home_pos_at+i]);
4929                     }
4930                 }
4931                 if (bCGP)
4932                 {
4933                     for(i=0; i<nrcg; i++)
4934                     {
4935                         copy_rvec(comm->vbuf.v[buf_pos++],
4936                                   state->cg_p[home_pos_at+i]);
4937                     }
4938                 }
4939                 home_pos_cg += 1;
4940                 home_pos_at += nrcg;
4941             }
4942             else
4943             {
4944                 /* Reallocate the buffers if necessary  */
4945                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4946                 {
4947                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4948                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4949                 }
4950                 nvr = ncg[mc] + nat[mc]*nvec;
4951                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4952                 {
4953                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4954                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4955                 }
4956                 /* Copy from the receive to the send buffers */
4957                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4958                        comm->buf_int + cg*DD_CGIBS,
4959                        DD_CGIBS*sizeof(int));
4960                 memcpy(comm->cgcm_state[mc][nvr],
4961                        comm->vbuf.v[buf_pos],
4962                        (1+nrcg*nvec)*sizeof(rvec));
4963                 buf_pos += 1 + nrcg*nvec;
4964                 ncg[mc] += 1;
4965                 nat[mc] += nrcg;
4966             }
4967         }
4968     }
4969     
4970     /* With sorting (!bCompact) the indices are now only partially up to date
4971      * and ncg_home and nat_home are not the real count, since there are
4972      * "holes" in the arrays for the charge groups that moved to neighbors.
4973      */
4974     if (fr->cutoff_scheme == ecutsVERLET)
4975     {
4976         moved = get_moved(comm,home_pos_cg);
4977
4978         for(i=dd->ncg_home; i<home_pos_cg; i++)
4979         {
4980             moved[i] = 0;
4981         }
4982     }
4983     dd->ncg_home = home_pos_cg;
4984     dd->nat_home = home_pos_at;
4985
4986     if (debug)
4987     {
4988         fprintf(debug,
4989                 "Finished repartitioning: cgs moved out %d, new home %d\n",
4990                 *ncg_moved,dd->ncg_home-*ncg_moved);
4991                 
4992     }
4993 }
4994
4995 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4996 {
4997     dd->comm->cycl[ddCycl] += cycles;
4998     dd->comm->cycl_n[ddCycl]++;
4999     if (cycles > dd->comm->cycl_max[ddCycl])
5000     {
5001         dd->comm->cycl_max[ddCycl] = cycles;
5002     }
5003 }
5004
5005 static double force_flop_count(t_nrnb *nrnb)
5006 {
5007     int i;
5008     double sum;
5009     const char *name;
5010
5011     sum = 0;
5012     for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
5013     {
5014         /* To get closer to the real timings, we half the count
5015          * for the normal loops and again half it for water loops.
5016          */
5017         name = nrnb_str(i);
5018         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5019         {
5020             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5021         }
5022         else
5023         {
5024             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5025         }
5026     }
5027     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
5028     {
5029         name = nrnb_str(i);
5030         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5031         sum += nrnb->n[i]*cost_nrnb(i);
5032     }
5033     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
5034     {
5035         sum += nrnb->n[i]*cost_nrnb(i);
5036     }
5037
5038     return sum;
5039 }
5040
5041 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
5042 {
5043     if (dd->comm->eFlop)
5044     {
5045         dd->comm->flop -= force_flop_count(nrnb);
5046     }
5047 }
5048 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
5049 {
5050     if (dd->comm->eFlop)
5051     {
5052         dd->comm->flop += force_flop_count(nrnb);
5053         dd->comm->flop_n++;
5054     }
5055 }  
5056
5057 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5058 {
5059     int i;
5060     
5061     for(i=0; i<ddCyclNr; i++)
5062     {
5063         dd->comm->cycl[i] = 0;
5064         dd->comm->cycl_n[i] = 0;
5065         dd->comm->cycl_max[i] = 0;
5066     }
5067     dd->comm->flop = 0;
5068     dd->comm->flop_n = 0;
5069 }
5070
5071 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
5072 {
5073     gmx_domdec_comm_t *comm;
5074     gmx_domdec_load_t *load;
5075     gmx_domdec_root_t *root=NULL;
5076     int  d,dim,cid,i,pos;
5077     float cell_frac=0,sbuf[DD_NLOAD_MAX];
5078     gmx_bool bSepPME;
5079     
5080     if (debug)
5081     {
5082         fprintf(debug,"get_load_distribution start\n");
5083     }
5084
5085     wallcycle_start(wcycle,ewcDDCOMMLOAD);
5086     
5087     comm = dd->comm;
5088     
5089     bSepPME = (dd->pme_nodeid >= 0);
5090     
5091     for(d=dd->ndim-1; d>=0; d--)
5092     {
5093         dim = dd->dim[d];
5094         /* Check if we participate in the communication in this dimension */
5095         if (d == dd->ndim-1 || 
5096             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
5097         {
5098             load = &comm->load[d];
5099             if (dd->bGridJump)
5100             {
5101                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5102             }
5103             pos = 0;
5104             if (d == dd->ndim-1)
5105             {
5106                 sbuf[pos++] = dd_force_load(comm);
5107                 sbuf[pos++] = sbuf[0];
5108                 if (dd->bGridJump)
5109                 {
5110                     sbuf[pos++] = sbuf[0];
5111                     sbuf[pos++] = cell_frac;
5112                     if (d > 0)
5113                     {
5114                         sbuf[pos++] = comm->cell_f_max0[d];
5115                         sbuf[pos++] = comm->cell_f_min1[d];
5116                     }
5117                 }
5118                 if (bSepPME)
5119                 {
5120                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5121                     sbuf[pos++] = comm->cycl[ddCyclPME];
5122                 }
5123             }
5124             else
5125             {
5126                 sbuf[pos++] = comm->load[d+1].sum;
5127                 sbuf[pos++] = comm->load[d+1].max;
5128                 if (dd->bGridJump)
5129                 {
5130                     sbuf[pos++] = comm->load[d+1].sum_m;
5131                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5132                     sbuf[pos++] = comm->load[d+1].flags;
5133                     if (d > 0)
5134                     {
5135                         sbuf[pos++] = comm->cell_f_max0[d];
5136                         sbuf[pos++] = comm->cell_f_min1[d];
5137                     }
5138                 }
5139                 if (bSepPME)
5140                 {
5141                     sbuf[pos++] = comm->load[d+1].mdf;
5142                     sbuf[pos++] = comm->load[d+1].pme;
5143                 }
5144             }
5145             load->nload = pos;
5146             /* Communicate a row in DD direction d.
5147              * The communicators are setup such that the root always has rank 0.
5148              */
5149 #ifdef GMX_MPI
5150             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
5151                        load->load,load->nload*sizeof(float),MPI_BYTE,
5152                        0,comm->mpi_comm_load[d]);
5153 #endif
5154             if (dd->ci[dim] == dd->master_ci[dim])
5155             {
5156                 /* We are the root, process this row */
5157                 if (comm->bDynLoadBal)
5158                 {
5159                     root = comm->root[d];
5160                 }
5161                 load->sum = 0;
5162                 load->max = 0;
5163                 load->sum_m = 0;
5164                 load->cvol_min = 1;
5165                 load->flags = 0;
5166                 load->mdf = 0;
5167                 load->pme = 0;
5168                 pos = 0;
5169                 for(i=0; i<dd->nc[dim]; i++)
5170                 {
5171                     load->sum += load->load[pos++];
5172                     load->max = max(load->max,load->load[pos]);
5173                     pos++;
5174                     if (dd->bGridJump)
5175                     {
5176                         if (root->bLimited)
5177                         {
5178                             /* This direction could not be load balanced properly,
5179                              * therefore we need to use the maximum iso the average load.
5180                              */
5181                             load->sum_m = max(load->sum_m,load->load[pos]);
5182                         }
5183                         else
5184                         {
5185                             load->sum_m += load->load[pos];
5186                         }
5187                         pos++;
5188                         load->cvol_min = min(load->cvol_min,load->load[pos]);
5189                         pos++;
5190                         if (d < dd->ndim-1)
5191                         {
5192                             load->flags = (int)(load->load[pos++] + 0.5);
5193                         }
5194                         if (d > 0)
5195                         {
5196                             root->cell_f_max0[i] = load->load[pos++];
5197                             root->cell_f_min1[i] = load->load[pos++];
5198                         }
5199                     }
5200                     if (bSepPME)
5201                     {
5202                         load->mdf = max(load->mdf,load->load[pos]);
5203                         pos++;
5204                         load->pme = max(load->pme,load->load[pos]);
5205                         pos++;
5206                     }
5207                 }
5208                 if (comm->bDynLoadBal && root->bLimited)
5209                 {
5210                     load->sum_m *= dd->nc[dim];
5211                     load->flags |= (1<<d);
5212                 }
5213             }
5214         }
5215     }
5216
5217     if (DDMASTER(dd))
5218     {
5219         comm->nload      += dd_load_count(comm);
5220         comm->load_step  += comm->cycl[ddCyclStep];
5221         comm->load_sum   += comm->load[0].sum;
5222         comm->load_max   += comm->load[0].max;
5223         if (comm->bDynLoadBal)
5224         {
5225             for(d=0; d<dd->ndim; d++)
5226             {
5227                 if (comm->load[0].flags & (1<<d))
5228                 {
5229                     comm->load_lim[d]++;
5230                 }
5231             }
5232         }
5233         if (bSepPME)
5234         {
5235             comm->load_mdf += comm->load[0].mdf;
5236             comm->load_pme += comm->load[0].pme;
5237         }
5238     }
5239
5240     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5241     
5242     if (debug)
5243     {
5244         fprintf(debug,"get_load_distribution finished\n");
5245     }
5246 }
5247
5248 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5249 {
5250     /* Return the relative performance loss on the total run time
5251      * due to the force calculation load imbalance.
5252      */
5253     if (dd->comm->nload > 0)
5254     {
5255         return
5256             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5257             (dd->comm->load_step*dd->nnodes);
5258     }
5259     else
5260     {
5261         return 0;
5262     }
5263 }
5264
5265 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5266 {
5267     char  buf[STRLEN];
5268     int   npp,npme,nnodes,d,limp;
5269     float imbal,pme_f_ratio,lossf,lossp=0;
5270     gmx_bool  bLim;
5271     gmx_domdec_comm_t *comm;
5272
5273     comm = dd->comm;
5274     if (DDMASTER(dd) && comm->nload > 0)
5275     {
5276         npp    = dd->nnodes;
5277         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5278         nnodes = npp + npme;
5279         imbal = comm->load_max*npp/comm->load_sum - 1;
5280         lossf = dd_force_imb_perf_loss(dd);
5281         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5282         fprintf(fplog,"%s",buf);
5283         fprintf(stderr,"\n");
5284         fprintf(stderr,"%s",buf);
5285         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5286         fprintf(fplog,"%s",buf);
5287         fprintf(stderr,"%s",buf);
5288         bLim = FALSE;
5289         if (comm->bDynLoadBal)
5290         {
5291             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5292             for(d=0; d<dd->ndim; d++)
5293             {
5294                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5295                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5296                 if (limp >= 50)
5297                 {
5298                     bLim = TRUE;
5299                 }
5300             }
5301             sprintf(buf+strlen(buf),"\n");
5302             fprintf(fplog,"%s",buf);
5303             fprintf(stderr,"%s",buf);
5304         }
5305         if (npme > 0)
5306         {
5307             pme_f_ratio = comm->load_pme/comm->load_mdf;
5308             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5309             if (lossp <= 0)
5310             {
5311                 lossp *= (float)npme/(float)nnodes;
5312             }
5313             else
5314             {
5315                 lossp *= (float)npp/(float)nnodes;
5316             }
5317             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5318             fprintf(fplog,"%s",buf);
5319             fprintf(stderr,"%s",buf);
5320             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5321             fprintf(fplog,"%s",buf);
5322             fprintf(stderr,"%s",buf);
5323         }
5324         fprintf(fplog,"\n");
5325         fprintf(stderr,"\n");
5326         
5327         if (lossf >= DD_PERF_LOSS)
5328         {
5329             sprintf(buf,
5330                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
5331                     "      in the domain decomposition.\n",lossf*100);
5332             if (!comm->bDynLoadBal)
5333             {
5334                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
5335             }
5336             else if (bLim)
5337             {
5338                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5339             }
5340             fprintf(fplog,"%s\n",buf);
5341             fprintf(stderr,"%s\n",buf);
5342         }
5343         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5344         {
5345             sprintf(buf,
5346                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5347                     "      had %s work to do than the PP nodes.\n"
5348                     "      You might want to %s the number of PME nodes\n"
5349                     "      or %s the cut-off and the grid spacing.\n",
5350                     fabs(lossp*100),
5351                     (lossp < 0) ? "less"     : "more",
5352                     (lossp < 0) ? "decrease" : "increase",
5353                     (lossp < 0) ? "decrease" : "increase");
5354             fprintf(fplog,"%s\n",buf);
5355             fprintf(stderr,"%s\n",buf);
5356         }
5357     }
5358 }
5359
5360 static float dd_vol_min(gmx_domdec_t *dd)
5361 {
5362     return dd->comm->load[0].cvol_min*dd->nnodes;
5363 }
5364
5365 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5366 {
5367     return dd->comm->load[0].flags;
5368 }
5369
5370 static float dd_f_imbal(gmx_domdec_t *dd)
5371 {
5372     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5373 }
5374
5375 float dd_pme_f_ratio(gmx_domdec_t *dd)
5376 {
5377     if (dd->comm->cycl_n[ddCyclPME] > 0)
5378     {
5379         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5380     }
5381     else
5382     {
5383         return -1.0;
5384     }
5385 }
5386
5387 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5388 {
5389     int flags,d;
5390     char buf[22];
5391     
5392     flags = dd_load_flags(dd);
5393     if (flags)
5394     {
5395         fprintf(fplog,
5396                 "DD  load balancing is limited by minimum cell size in dimension");
5397         for(d=0; d<dd->ndim; d++)
5398         {
5399             if (flags & (1<<d))
5400             {
5401                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5402             }
5403         }
5404         fprintf(fplog,"\n");
5405     }
5406     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5407     if (dd->comm->bDynLoadBal)
5408     {
5409         fprintf(fplog,"  vol min/aver %5.3f%c",
5410                 dd_vol_min(dd),flags ? '!' : ' ');
5411     }
5412     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5413     if (dd->comm->cycl_n[ddCyclPME])
5414     {
5415         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5416     }
5417     fprintf(fplog,"\n\n");
5418 }
5419
5420 static void dd_print_load_verbose(gmx_domdec_t *dd)
5421 {
5422     if (dd->comm->bDynLoadBal)
5423     {
5424         fprintf(stderr,"vol %4.2f%c ",
5425                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5426     }
5427     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5428     if (dd->comm->cycl_n[ddCyclPME])
5429     {
5430         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5431     }
5432 }
5433
5434 #ifdef GMX_MPI
5435 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
5436 {
5437     MPI_Comm  c_row;
5438     int  dim, i, rank;
5439     ivec loc_c;
5440     gmx_domdec_root_t *root;
5441     gmx_bool bPartOfGroup = FALSE;
5442     
5443     dim = dd->dim[dim_ind];
5444     copy_ivec(loc,loc_c);
5445     for(i=0; i<dd->nc[dim]; i++)
5446     {
5447         loc_c[dim] = i;
5448         rank = dd_index(dd->nc,loc_c);
5449         if (rank == dd->rank)
5450         {
5451             /* This process is part of the group */
5452             bPartOfGroup = TRUE;
5453         }
5454     }
5455     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup?0:MPI_UNDEFINED, dd->rank,
5456                    &c_row);
5457     if (bPartOfGroup)
5458     {
5459         dd->comm->mpi_comm_load[dim_ind] = c_row;
5460         if (dd->comm->eDLB != edlbNO)
5461         {
5462             if (dd->ci[dim] == dd->master_ci[dim])
5463             {
5464                 /* This is the root process of this row */
5465                 snew(dd->comm->root[dim_ind],1);
5466                 root = dd->comm->root[dim_ind];
5467                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5468                 snew(root->old_cell_f,dd->nc[dim]+1);
5469                 snew(root->bCellMin,dd->nc[dim]);
5470                 if (dim_ind > 0)
5471                 {
5472                     snew(root->cell_f_max0,dd->nc[dim]);
5473                     snew(root->cell_f_min1,dd->nc[dim]);
5474                     snew(root->bound_min,dd->nc[dim]);
5475                     snew(root->bound_max,dd->nc[dim]);
5476                 }
5477                 snew(root->buf_ncd,dd->nc[dim]);
5478             }
5479             else
5480             {
5481                 /* This is not a root process, we only need to receive cell_f */
5482                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5483             }
5484         }
5485         if (dd->ci[dim] == dd->master_ci[dim])
5486         {
5487             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5488         }
5489     }
5490 }
5491 #endif
5492
5493 static void make_load_communicators(gmx_domdec_t *dd)
5494 {
5495 #ifdef GMX_MPI
5496   int  dim0,dim1,i,j;
5497   ivec loc;
5498
5499   if (debug)
5500     fprintf(debug,"Making load communicators\n");
5501
5502   snew(dd->comm->load,dd->ndim);
5503   snew(dd->comm->mpi_comm_load,dd->ndim);
5504   
5505   clear_ivec(loc);
5506   make_load_communicator(dd,0,loc);
5507   if (dd->ndim > 1) {
5508     dim0 = dd->dim[0];
5509     for(i=0; i<dd->nc[dim0]; i++) {
5510       loc[dim0] = i;
5511       make_load_communicator(dd,1,loc);
5512     }
5513   }
5514   if (dd->ndim > 2) {
5515     dim0 = dd->dim[0];
5516     for(i=0; i<dd->nc[dim0]; i++) {
5517       loc[dim0] = i;
5518       dim1 = dd->dim[1];
5519       for(j=0; j<dd->nc[dim1]; j++) {
5520           loc[dim1] = j;
5521           make_load_communicator(dd,2,loc);
5522       }
5523     }
5524   }
5525
5526   if (debug)
5527     fprintf(debug,"Finished making load communicators\n");
5528 #endif
5529 }
5530
5531 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5532 {
5533     gmx_bool bZYX;
5534     int  d,dim,i,j,m;
5535     ivec tmp,s;
5536     int  nzone,nzonep;
5537     ivec dd_zp[DD_MAXIZONE];
5538     gmx_domdec_zones_t *zones;
5539     gmx_domdec_ns_ranges_t *izone;
5540     
5541     for(d=0; d<dd->ndim; d++)
5542     {
5543         dim = dd->dim[d];
5544         copy_ivec(dd->ci,tmp);
5545         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5546         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5547         copy_ivec(dd->ci,tmp);
5548         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5549         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5550         if (debug)
5551         {
5552             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5553                     dd->rank,dim,
5554                     dd->neighbor[d][0],
5555                     dd->neighbor[d][1]);
5556         }
5557     }
5558     
5559     if (DDMASTER(dd))
5560     {
5561         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5562             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5563     }
5564     if (fplog)
5565     {
5566         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5567                 dd->ndim,
5568                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5569                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5570     }
5571     switch (dd->ndim)
5572     {
5573     case 3:
5574         nzone  = dd_z3n;
5575         nzonep = dd_zp3n;
5576         for(i=0; i<nzonep; i++)
5577         {
5578             copy_ivec(dd_zp3[i],dd_zp[i]);
5579         }
5580         break;
5581     case 2:
5582         nzone  = dd_z2n;
5583         nzonep = dd_zp2n;
5584         for(i=0; i<nzonep; i++)
5585         {
5586             copy_ivec(dd_zp2[i],dd_zp[i]);
5587         }
5588         break;
5589     case 1:
5590         nzone  = dd_z1n;
5591         nzonep = dd_zp1n;
5592         for(i=0; i<nzonep; i++)
5593         {
5594             copy_ivec(dd_zp1[i],dd_zp[i]);
5595         }
5596         break;
5597     default:
5598         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5599         nzone = 0;
5600         nzonep = 0;
5601     }
5602
5603     zones = &dd->comm->zones;
5604
5605     for(i=0; i<nzone; i++)
5606     {
5607         m = 0;
5608         clear_ivec(zones->shift[i]);
5609         for(d=0; d<dd->ndim; d++)
5610         {
5611             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5612         }
5613     }
5614     
5615     zones->n = nzone;
5616     for(i=0; i<nzone; i++)
5617     {
5618         for(d=0; d<DIM; d++)
5619         {
5620             s[d] = dd->ci[d] - zones->shift[i][d];
5621             if (s[d] < 0)
5622             {
5623                 s[d] += dd->nc[d];
5624             }
5625             else if (s[d] >= dd->nc[d])
5626             {
5627                 s[d] -= dd->nc[d];
5628             }
5629         }
5630     }
5631     zones->nizone = nzonep;
5632     for(i=0; i<zones->nizone; i++)
5633     {
5634         if (dd_zp[i][0] != i)
5635         {
5636             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5637         }
5638         izone = &zones->izone[i];
5639         izone->j0 = dd_zp[i][1];
5640         izone->j1 = dd_zp[i][2];
5641         for(dim=0; dim<DIM; dim++)
5642         {
5643             if (dd->nc[dim] == 1)
5644             {
5645                 /* All shifts should be allowed */
5646                 izone->shift0[dim] = -1;
5647                 izone->shift1[dim] = 1;
5648             }
5649             else
5650             {
5651                 /*
5652                   izone->shift0[d] = 0;
5653                   izone->shift1[d] = 0;
5654                   for(j=izone->j0; j<izone->j1; j++) {
5655                   if (dd->shift[j][d] > dd->shift[i][d])
5656                   izone->shift0[d] = -1;
5657                   if (dd->shift[j][d] < dd->shift[i][d])
5658                   izone->shift1[d] = 1;
5659                   }
5660                 */
5661                 
5662                 int shift_diff;
5663                 
5664                 /* Assume the shift are not more than 1 cell */
5665                 izone->shift0[dim] = 1;
5666                 izone->shift1[dim] = -1;
5667                 for(j=izone->j0; j<izone->j1; j++)
5668                 {
5669                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5670                     if (shift_diff < izone->shift0[dim])
5671                     {
5672                         izone->shift0[dim] = shift_diff;
5673                     }
5674                     if (shift_diff > izone->shift1[dim])
5675                     {
5676                         izone->shift1[dim] = shift_diff;
5677                     }
5678                 }
5679             }
5680         }
5681     }
5682     
5683     if (dd->comm->eDLB != edlbNO)
5684     {
5685         snew(dd->comm->root,dd->ndim);
5686     }
5687     
5688     if (dd->comm->bRecordLoad)
5689     {
5690         make_load_communicators(dd);
5691     }
5692 }
5693
5694 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5695 {
5696     gmx_domdec_t *dd;
5697     gmx_domdec_comm_t *comm;
5698     int  i,rank,*buf;
5699     ivec periods;
5700 #ifdef GMX_MPI
5701     MPI_Comm comm_cart;
5702 #endif
5703     
5704     dd = cr->dd;
5705     comm = dd->comm;
5706     
5707 #ifdef GMX_MPI
5708     if (comm->bCartesianPP)
5709     {
5710         /* Set up cartesian communication for the particle-particle part */
5711         if (fplog)
5712         {
5713             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5714                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5715         }
5716         
5717         for(i=0; i<DIM; i++)
5718         {
5719             periods[i] = TRUE;
5720         }
5721         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5722                         &comm_cart);
5723         /* We overwrite the old communicator with the new cartesian one */
5724         cr->mpi_comm_mygroup = comm_cart;
5725     }
5726     
5727     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5728     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5729     
5730     if (comm->bCartesianPP_PME)
5731     {
5732         /* Since we want to use the original cartesian setup for sim,
5733          * and not the one after split, we need to make an index.
5734          */
5735         snew(comm->ddindex2ddnodeid,dd->nnodes);
5736         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5737         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5738         /* Get the rank of the DD master,
5739          * above we made sure that the master node is a PP node.
5740          */
5741         if (MASTER(cr))
5742         {
5743             rank = dd->rank;
5744         }
5745         else
5746         {
5747             rank = 0;
5748         }
5749         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5750     }
5751     else if (comm->bCartesianPP)
5752     {
5753         if (cr->npmenodes == 0)
5754         {
5755             /* The PP communicator is also
5756              * the communicator for this simulation
5757              */
5758             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5759         }
5760         cr->nodeid = dd->rank;
5761         
5762         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5763         
5764         /* We need to make an index to go from the coordinates
5765          * to the nodeid of this simulation.
5766          */
5767         snew(comm->ddindex2simnodeid,dd->nnodes);
5768         snew(buf,dd->nnodes);
5769         if (cr->duty & DUTY_PP)
5770         {
5771             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5772         }
5773         /* Communicate the ddindex to simulation nodeid index */
5774         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5775                       cr->mpi_comm_mysim);
5776         sfree(buf);
5777         
5778         /* Determine the master coordinates and rank.
5779          * The DD master should be the same node as the master of this sim.
5780          */
5781         for(i=0; i<dd->nnodes; i++)
5782         {
5783             if (comm->ddindex2simnodeid[i] == 0)
5784             {
5785                 ddindex2xyz(dd->nc,i,dd->master_ci);
5786                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5787             }
5788         }
5789         if (debug)
5790         {
5791             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5792         }
5793     }
5794     else
5795     {
5796         /* No Cartesian communicators */
5797         /* We use the rank in dd->comm->all as DD index */
5798         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5799         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5800         dd->masterrank = 0;
5801         clear_ivec(dd->master_ci);
5802     }
5803 #endif
5804   
5805     if (fplog)
5806     {
5807         fprintf(fplog,
5808                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5809                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5810     }
5811     if (debug)
5812     {
5813         fprintf(debug,
5814                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5815                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5816     }
5817 }
5818
5819 static void receive_ddindex2simnodeid(t_commrec *cr)
5820 {
5821     gmx_domdec_t *dd;
5822     
5823     gmx_domdec_comm_t *comm;
5824     int  *buf;
5825     
5826     dd = cr->dd;
5827     comm = dd->comm;
5828     
5829 #ifdef GMX_MPI
5830     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5831     {
5832         snew(comm->ddindex2simnodeid,dd->nnodes);
5833         snew(buf,dd->nnodes);
5834         if (cr->duty & DUTY_PP)
5835         {
5836             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5837         }
5838 #ifdef GMX_MPI
5839         /* Communicate the ddindex to simulation nodeid index */
5840         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5841                       cr->mpi_comm_mysim);
5842 #endif
5843         sfree(buf);
5844     }
5845 #endif
5846 }
5847
5848 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5849                                                      int ncg,int natoms)
5850 {
5851     gmx_domdec_master_t *ma;
5852     int i;
5853
5854     snew(ma,1);
5855     
5856     snew(ma->ncg,dd->nnodes);
5857     snew(ma->index,dd->nnodes+1);
5858     snew(ma->cg,ncg);
5859     snew(ma->nat,dd->nnodes);
5860     snew(ma->ibuf,dd->nnodes*2);
5861     snew(ma->cell_x,DIM);
5862     for(i=0; i<DIM; i++)
5863     {
5864         snew(ma->cell_x[i],dd->nc[i]+1);
5865     }
5866
5867     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5868     {
5869         ma->vbuf = NULL;
5870     }
5871     else
5872     {
5873         snew(ma->vbuf,natoms);
5874     }
5875
5876     return ma;
5877 }
5878
5879 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5880                                int reorder)
5881 {
5882     gmx_domdec_t *dd;
5883     gmx_domdec_comm_t *comm;
5884     int  i,rank;
5885     gmx_bool bDiv[DIM];
5886     ivec periods;
5887 #ifdef GMX_MPI
5888     MPI_Comm comm_cart;
5889 #endif
5890     
5891     dd = cr->dd;
5892     comm = dd->comm;
5893     
5894     if (comm->bCartesianPP)
5895     {
5896         for(i=1; i<DIM; i++)
5897         {
5898             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5899         }
5900         if (bDiv[YY] || bDiv[ZZ])
5901         {
5902             comm->bCartesianPP_PME = TRUE;
5903             /* If we have 2D PME decomposition, which is always in x+y,
5904              * we stack the PME only nodes in z.
5905              * Otherwise we choose the direction that provides the thinnest slab
5906              * of PME only nodes as this will have the least effect
5907              * on the PP communication.
5908              * But for the PME communication the opposite might be better.
5909              */
5910             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5911                              !bDiv[YY] ||
5912                              dd->nc[YY] > dd->nc[ZZ]))
5913             {
5914                 comm->cartpmedim = ZZ;
5915             }
5916             else
5917             {
5918                 comm->cartpmedim = YY;
5919             }
5920             comm->ntot[comm->cartpmedim]
5921                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5922         }
5923         else if (fplog)
5924         {
5925             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5926             fprintf(fplog,
5927                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5928         }
5929     }
5930     
5931 #ifdef GMX_MPI
5932     if (comm->bCartesianPP_PME)
5933     {
5934         if (fplog)
5935         {
5936             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5937         }
5938         
5939         for(i=0; i<DIM; i++)
5940         {
5941             periods[i] = TRUE;
5942         }
5943         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5944                         &comm_cart);
5945         
5946         MPI_Comm_rank(comm_cart,&rank);
5947         if (MASTERNODE(cr) && rank != 0)
5948         {
5949             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5950         }
5951         
5952         /* With this assigment we loose the link to the original communicator
5953          * which will usually be MPI_COMM_WORLD, unless have multisim.
5954          */
5955         cr->mpi_comm_mysim = comm_cart;
5956         cr->sim_nodeid = rank;
5957         
5958         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5959         
5960         if (fplog)
5961         {
5962             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5963                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5964         }
5965         
5966         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5967         {
5968             cr->duty = DUTY_PP;
5969         }
5970         if (cr->npmenodes == 0 ||
5971             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5972         {
5973             cr->duty = DUTY_PME;
5974         }
5975         
5976         /* Split the sim communicator into PP and PME only nodes */
5977         MPI_Comm_split(cr->mpi_comm_mysim,
5978                        cr->duty,
5979                        dd_index(comm->ntot,dd->ci),
5980                        &cr->mpi_comm_mygroup);
5981     }
5982     else
5983     {
5984         switch (dd_node_order)
5985         {
5986         case ddnoPP_PME:
5987             if (fplog)
5988             {
5989                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5990             }
5991             break;
5992         case ddnoINTERLEAVE:
5993             /* Interleave the PP-only and PME-only nodes,
5994              * as on clusters with dual-core machines this will double
5995              * the communication bandwidth of the PME processes
5996              * and thus speed up the PP <-> PME and inter PME communication.
5997              */
5998             if (fplog)
5999             {
6000                 fprintf(fplog,"Interleaving PP and PME nodes\n");
6001             }
6002             comm->pmenodes = dd_pmenodes(cr);
6003             break;
6004         case ddnoCARTESIAN:
6005             break;
6006         default:
6007             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
6008         }
6009     
6010         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
6011         {
6012             cr->duty = DUTY_PME;
6013         }
6014         else
6015         {
6016             cr->duty = DUTY_PP;
6017         }
6018         
6019         /* Split the sim communicator into PP and PME only nodes */
6020         MPI_Comm_split(cr->mpi_comm_mysim,
6021                        cr->duty,
6022                        cr->nodeid,
6023                        &cr->mpi_comm_mygroup);
6024         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
6025     }
6026 #endif
6027
6028     if (fplog)
6029     {
6030         fprintf(fplog,"This is a %s only node\n\n",
6031                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6032     }
6033 }
6034
6035 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
6036 {
6037     gmx_domdec_t *dd;
6038     gmx_domdec_comm_t *comm;
6039     int CartReorder;
6040     
6041     dd = cr->dd;
6042     comm = dd->comm;
6043     
6044     copy_ivec(dd->nc,comm->ntot);
6045     
6046     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
6047     comm->bCartesianPP_PME = FALSE;
6048     
6049     /* Reorder the nodes by default. This might change the MPI ranks.
6050      * Real reordering is only supported on very few architectures,
6051      * Blue Gene is one of them.
6052      */
6053     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6054     
6055     if (cr->npmenodes > 0)
6056     {
6057         /* Split the communicator into a PP and PME part */
6058         split_communicator(fplog,cr,dd_node_order,CartReorder);
6059         if (comm->bCartesianPP_PME)
6060         {
6061             /* We (possibly) reordered the nodes in split_communicator,
6062              * so it is no longer required in make_pp_communicator.
6063              */
6064             CartReorder = FALSE;
6065         }
6066     }
6067     else
6068     {
6069         /* All nodes do PP and PME */
6070 #ifdef GMX_MPI    
6071         /* We do not require separate communicators */
6072         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6073 #endif
6074     }
6075     
6076     if (cr->duty & DUTY_PP)
6077     {
6078         /* Copy or make a new PP communicator */
6079         make_pp_communicator(fplog,cr,CartReorder);
6080     }
6081     else
6082     {
6083         receive_ddindex2simnodeid(cr);
6084     }
6085     
6086     if (!(cr->duty & DUTY_PME))
6087     {
6088         /* Set up the commnuication to our PME node */
6089         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
6090         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6091         if (debug)
6092         {
6093             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
6094                     dd->pme_nodeid,dd->pme_receive_vir_ener);
6095         }
6096     }
6097     else
6098     {
6099         dd->pme_nodeid = -1;
6100     }
6101
6102     if (DDMASTER(dd))
6103     {
6104         dd->ma = init_gmx_domdec_master_t(dd,
6105                                           comm->cgs_gl.nr,
6106                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6107     }
6108 }
6109
6110 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
6111 {
6112     real *slb_frac,tot;
6113     int  i,n;
6114     double dbl;
6115     
6116     slb_frac = NULL;
6117     if (nc > 1 && size_string != NULL)
6118     {
6119         if (fplog)
6120         {
6121             fprintf(fplog,"Using static load balancing for the %s direction\n",
6122                     dir);
6123         }
6124         snew(slb_frac,nc);
6125         tot = 0;
6126         for (i=0; i<nc; i++)
6127         {
6128             dbl = 0;
6129             sscanf(size_string,"%lf%n",&dbl,&n);
6130             if (dbl == 0)
6131             {
6132                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
6133             }
6134             slb_frac[i] = dbl;
6135             size_string += n;
6136             tot += slb_frac[i];
6137         }
6138         /* Normalize */
6139         if (fplog)
6140         {
6141             fprintf(fplog,"Relative cell sizes:");
6142         }
6143         for (i=0; i<nc; i++)
6144         {
6145             slb_frac[i] /= tot;
6146             if (fplog)
6147             {
6148                 fprintf(fplog," %5.3f",slb_frac[i]);
6149             }
6150         }
6151         if (fplog)
6152         {
6153             fprintf(fplog,"\n");
6154         }
6155     }
6156     
6157     return slb_frac;
6158 }
6159
6160 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6161 {
6162     int n,nmol,ftype;
6163     gmx_mtop_ilistloop_t iloop;
6164     t_ilist *il;
6165     
6166     n = 0;
6167     iloop = gmx_mtop_ilistloop_init(mtop);
6168     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
6169     {
6170         for(ftype=0; ftype<F_NRE; ftype++)
6171         {
6172             if ((interaction_function[ftype].flags & IF_BOND) &&
6173                 NRAL(ftype) >  2)
6174             {
6175                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6176             }
6177         }
6178   }
6179
6180   return n;
6181 }
6182
6183 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
6184 {
6185     char *val;
6186     int  nst;
6187     
6188     nst = def;
6189     val = getenv(env_var);
6190     if (val)
6191     {
6192         if (sscanf(val,"%d",&nst) <= 0)
6193         {
6194             nst = 1;
6195         }
6196         if (fplog)
6197         {
6198             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
6199                     env_var,val,nst);
6200         }
6201     }
6202     
6203     return nst;
6204 }
6205
6206 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
6207 {
6208     if (MASTER(cr))
6209     {
6210         fprintf(stderr,"\n%s\n",warn_string);
6211     }
6212     if (fplog)
6213     {
6214         fprintf(fplog,"\n%s\n",warn_string);
6215     }
6216 }
6217
6218 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
6219                                   t_inputrec *ir,FILE *fplog)
6220 {
6221     if (ir->ePBC == epbcSCREW &&
6222         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6223     {
6224         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6225     }
6226
6227     if (ir->ns_type == ensSIMPLE)
6228     {
6229         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6230     }
6231
6232     if (ir->nstlist == 0)
6233     {
6234         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6235     }
6236
6237     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6238     {
6239         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6240     }
6241 }
6242
6243 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6244 {
6245     int  di,d;
6246     real r;
6247
6248     r = ddbox->box_size[XX];
6249     for(di=0; di<dd->ndim; di++)
6250     {
6251         d = dd->dim[di];
6252         /* Check using the initial average cell size */
6253         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6254     }
6255
6256     return r;
6257 }
6258
6259 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6260                              const char *dlb_opt,gmx_bool bRecordLoad,
6261                              unsigned long Flags,t_inputrec *ir)
6262 {
6263     gmx_domdec_t *dd;
6264     int  eDLB=-1;
6265     char buf[STRLEN];
6266
6267     switch (dlb_opt[0])
6268     {
6269     case 'a': eDLB = edlbAUTO; break;
6270     case 'n': eDLB = edlbNO;   break;
6271     case 'y': eDLB = edlbYES;  break;
6272     default: gmx_incons("Unknown dlb_opt");
6273     }
6274
6275     if (Flags & MD_RERUN)
6276     {
6277         return edlbNO;
6278     }
6279
6280     if (!EI_DYNAMICS(ir->eI))
6281     {
6282         if (eDLB == edlbYES)
6283         {
6284             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6285             dd_warning(cr,fplog,buf);
6286         }
6287             
6288         return edlbNO;
6289     }
6290
6291     if (!bRecordLoad)
6292     {
6293         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6294
6295         return edlbNO;
6296     }
6297
6298     if (Flags & MD_REPRODUCIBLE)
6299     {
6300         switch (eDLB)
6301         {
6302                         case edlbNO: 
6303                                 break;
6304                         case edlbAUTO:
6305                                 dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6306                                 eDLB = edlbNO;
6307                                 break;
6308                         case edlbYES:
6309                                 dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6310                                 break;
6311                         default:
6312                                 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6313                                 break;
6314         }
6315     }
6316
6317     return eDLB;
6318 }
6319
6320 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6321 {
6322     int dim;
6323
6324     dd->ndim = 0;
6325     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6326     {
6327         /* Decomposition order z,y,x */
6328         if (fplog)
6329         {
6330             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6331         }
6332         for(dim=DIM-1; dim>=0; dim--)
6333         {
6334             if (dd->nc[dim] > 1)
6335             {
6336                 dd->dim[dd->ndim++] = dim;
6337             }
6338         }
6339     }
6340     else
6341     {
6342         /* Decomposition order x,y,z */
6343         for(dim=0; dim<DIM; dim++)
6344         {
6345             if (dd->nc[dim] > 1)
6346             {
6347                 dd->dim[dd->ndim++] = dim;
6348             }
6349         }
6350     }
6351 }
6352
6353 static gmx_domdec_comm_t *init_dd_comm()
6354 {
6355     gmx_domdec_comm_t *comm;
6356     int  i;
6357
6358     snew(comm,1);
6359     snew(comm->cggl_flag,DIM*2);
6360     snew(comm->cgcm_state,DIM*2);
6361     for(i=0; i<DIM*2; i++)
6362     {
6363         comm->cggl_flag_nalloc[i]  = 0;
6364         comm->cgcm_state_nalloc[i] = 0;
6365     }
6366     
6367     comm->nalloc_int = 0;
6368     comm->buf_int    = NULL;
6369
6370     vec_rvec_init(&comm->vbuf);
6371
6372     comm->n_load_have    = 0;
6373     comm->n_load_collect = 0;
6374
6375     for(i=0; i<ddnatNR-ddnatZONE; i++)
6376     {
6377         comm->sum_nat[i] = 0;
6378     }
6379     comm->ndecomp = 0;
6380     comm->nload   = 0;
6381     comm->load_step = 0;
6382     comm->load_sum  = 0;
6383     comm->load_max  = 0;
6384     clear_ivec(comm->load_lim);
6385     comm->load_mdf  = 0;
6386     comm->load_pme  = 0;
6387
6388     return comm;
6389 }
6390
6391 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6392                                         unsigned long Flags,
6393                                         ivec nc,
6394                                         real comm_distance_min,real rconstr,
6395                                         const char *dlb_opt,real dlb_scale,
6396                                         const char *sizex,const char *sizey,const char *sizez,
6397                                         gmx_mtop_t *mtop,t_inputrec *ir,
6398                                         matrix box,rvec *x,
6399                                         gmx_ddbox_t *ddbox,
6400                                         int *npme_x,int *npme_y)
6401 {
6402     gmx_domdec_t *dd;
6403     gmx_domdec_comm_t *comm;
6404     int  recload;
6405     int  d,i,j;
6406     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6407     gmx_bool bC;
6408     char buf[STRLEN];
6409     
6410     if (fplog)
6411     {
6412         fprintf(fplog,
6413                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6414     }
6415     
6416     snew(dd,1);
6417
6418     dd->comm = init_dd_comm();
6419     comm = dd->comm;
6420     snew(comm->cggl_flag,DIM*2);
6421     snew(comm->cgcm_state,DIM*2);
6422
6423     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6424     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6425     
6426     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6427     comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6428     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6429     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6430     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6431     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6432     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6433     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6434
6435     dd->pme_recv_f_alloc = 0;
6436     dd->pme_recv_f_buf = NULL;
6437
6438     if (dd->bSendRecv2 && fplog)
6439     {
6440         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6441     }
6442     if (comm->eFlop)
6443     {
6444         if (fplog)
6445         {
6446             fprintf(fplog,"Will load balance based on FLOP count\n");
6447         }
6448         if (comm->eFlop > 1)
6449         {
6450             srand(1+cr->nodeid);
6451         }
6452         comm->bRecordLoad = TRUE;
6453     }
6454     else
6455     {
6456         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6457                              
6458     }
6459     
6460     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6461     
6462     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6463     if (fplog)
6464     {
6465         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6466     }
6467     dd->bGridJump = comm->bDynLoadBal;
6468     
6469     if (comm->nstSortCG)
6470     {
6471         if (fplog)
6472         {
6473             if (comm->nstSortCG == 1)
6474             {
6475                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6476             }
6477             else
6478             {
6479                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6480                         comm->nstSortCG);
6481             }
6482         }
6483         snew(comm->sort,1);
6484     }
6485     else
6486     {
6487         if (fplog)
6488         {
6489             fprintf(fplog,"Will not sort the charge groups\n");
6490         }
6491     }
6492
6493     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6494     
6495     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6496     if (comm->bInterCGBondeds)
6497     {
6498         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6499     }
6500     else
6501     {
6502         comm->bInterCGMultiBody = FALSE;
6503     }
6504     
6505     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6506     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6507
6508     if (ir->rlistlong == 0)
6509     {
6510         /* Set the cut-off to some very large value,
6511          * so we don't need if statements everywhere in the code.
6512          * We use sqrt, since the cut-off is squared in some places.
6513          */
6514         comm->cutoff   = GMX_CUTOFF_INF;
6515     }
6516     else
6517     {
6518         comm->cutoff   = ir->rlistlong;
6519     }
6520     comm->cutoff_mbody = 0;
6521     
6522     comm->cellsize_limit = 0;
6523     comm->bBondComm = FALSE;
6524
6525     if (comm->bInterCGBondeds)
6526     {
6527         if (comm_distance_min > 0)
6528         {
6529             comm->cutoff_mbody = comm_distance_min;
6530             if (Flags & MD_DDBONDCOMM)
6531             {
6532                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6533             }
6534             else
6535             {
6536                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6537             }
6538             r_bonded_limit = comm->cutoff_mbody;
6539         }
6540         else if (ir->bPeriodicMols)
6541         {
6542             /* Can not easily determine the required cut-off */
6543             dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6544             comm->cutoff_mbody = comm->cutoff/2;
6545             r_bonded_limit = comm->cutoff_mbody;
6546         }
6547         else
6548         {
6549             if (MASTER(cr))
6550             {
6551                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6552                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6553             }
6554             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6555             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6556
6557             /* We use an initial margin of 10% for the minimum cell size,
6558              * except when we are just below the non-bonded cut-off.
6559              */
6560             if (Flags & MD_DDBONDCOMM)
6561             {
6562                 if (max(r_2b,r_mb) > comm->cutoff)
6563                 {
6564                     r_bonded       = max(r_2b,r_mb);
6565                     r_bonded_limit = 1.1*r_bonded;
6566                     comm->bBondComm = TRUE;
6567                 }
6568                 else
6569                 {
6570                     r_bonded       = r_mb;
6571                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6572                 }
6573                 /* We determine cutoff_mbody later */
6574             }
6575             else
6576             {
6577                 /* No special bonded communication,
6578                  * simply increase the DD cut-off.
6579                  */
6580                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6581                 comm->cutoff_mbody = r_bonded_limit;
6582                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6583             }
6584         }
6585         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6586         if (fplog)
6587         {
6588             fprintf(fplog,
6589                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6590                     comm->cellsize_limit);
6591         }
6592     }
6593
6594     if (dd->bInterCGcons && rconstr <= 0)
6595     {
6596         /* There is a cell size limit due to the constraints (P-LINCS) */
6597         rconstr = constr_r_max(fplog,mtop,ir);
6598         if (fplog)
6599         {
6600             fprintf(fplog,
6601                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6602                     rconstr);
6603             if (rconstr > comm->cellsize_limit)
6604             {
6605                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6606             }
6607         }
6608     }
6609     else if (rconstr > 0 && fplog)
6610     {
6611         /* Here we do not check for dd->bInterCGcons,
6612          * because one can also set a cell size limit for virtual sites only
6613          * and at this point we don't know yet if there are intercg v-sites.
6614          */
6615         fprintf(fplog,
6616                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6617                 rconstr);
6618     }
6619     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6620
6621     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6622
6623     if (nc[XX] > 0)
6624     {
6625         copy_ivec(nc,dd->nc);
6626         set_dd_dim(fplog,dd);
6627         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6628
6629         if (cr->npmenodes == -1)
6630         {
6631             cr->npmenodes = 0;
6632         }
6633         acs = average_cellsize_min(dd,ddbox);
6634         if (acs < comm->cellsize_limit)
6635         {
6636             if (fplog)
6637             {
6638                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6639             }
6640             gmx_fatal_collective(FARGS,cr,NULL,
6641                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6642                                  acs,comm->cellsize_limit);
6643         }
6644     }
6645     else
6646     {
6647         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6648
6649         /* We need to choose the optimal DD grid and possibly PME nodes */
6650         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6651                                comm->eDLB!=edlbNO,dlb_scale,
6652                                comm->cellsize_limit,comm->cutoff,
6653                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6654         
6655         if (dd->nc[XX] == 0)
6656         {
6657             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6658             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6659                     !bC ? "-rdd" : "-rcon",
6660                     comm->eDLB!=edlbNO ? " or -dds" : "",
6661                     bC ? " or your LINCS settings" : "");
6662
6663             gmx_fatal_collective(FARGS,cr,NULL,
6664                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6665                                  "%s\n"
6666                                  "Look in the log file for details on the domain decomposition",
6667                                  cr->nnodes-cr->npmenodes,limit,buf);
6668         }
6669         set_dd_dim(fplog,dd);
6670     }
6671
6672     if (fplog)
6673     {
6674         fprintf(fplog,
6675                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6676                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6677     }
6678     
6679     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6680     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6681     {
6682         gmx_fatal_collective(FARGS,cr,NULL,
6683                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6684                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6685     }
6686     if (cr->npmenodes > dd->nnodes)
6687     {
6688         gmx_fatal_collective(FARGS,cr,NULL,
6689                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6690     }
6691     if (cr->npmenodes > 0)
6692     {
6693         comm->npmenodes = cr->npmenodes;
6694     }
6695     else
6696     {
6697         comm->npmenodes = dd->nnodes;
6698     }
6699
6700     if (EEL_PME(ir->coulombtype))
6701     {
6702         /* The following choices should match those
6703          * in comm_cost_est in domdec_setup.c.
6704          * Note that here the checks have to take into account
6705          * that the decomposition might occur in a different order than xyz
6706          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6707          * in which case they will not match those in comm_cost_est,
6708          * but since that is mainly for testing purposes that's fine.
6709          */
6710         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6711             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6712             getenv("GMX_PMEONEDD") == NULL)
6713         {
6714             comm->npmedecompdim = 2;
6715             comm->npmenodes_x   = dd->nc[XX];
6716             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6717         }
6718         else
6719         {
6720             /* In case nc is 1 in both x and y we could still choose to
6721              * decompose pme in y instead of x, but we use x for simplicity.
6722              */
6723             comm->npmedecompdim = 1;
6724             if (dd->dim[0] == YY)
6725             {
6726                 comm->npmenodes_x = 1;
6727                 comm->npmenodes_y = comm->npmenodes;
6728             }
6729             else
6730             {
6731                 comm->npmenodes_x = comm->npmenodes;
6732                 comm->npmenodes_y = 1;
6733             }
6734         }    
6735         if (fplog)
6736         {
6737             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6738                     comm->npmenodes_x,comm->npmenodes_y,1);
6739         }
6740     }
6741     else
6742     {
6743         comm->npmedecompdim = 0;
6744         comm->npmenodes_x   = 0;
6745         comm->npmenodes_y   = 0;
6746     }
6747     
6748     /* Technically we don't need both of these,
6749      * but it simplifies code not having to recalculate it.
6750      */
6751     *npme_x = comm->npmenodes_x;
6752     *npme_y = comm->npmenodes_y;
6753         
6754     snew(comm->slb_frac,DIM);
6755     if (comm->eDLB == edlbNO)
6756     {
6757         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6758         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6759         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6760     }
6761
6762     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6763     {
6764         if (comm->bBondComm || comm->eDLB != edlbNO)
6765         {
6766             /* Set the bonded communication distance to halfway
6767              * the minimum and the maximum,
6768              * since the extra communication cost is nearly zero.
6769              */
6770             acs = average_cellsize_min(dd,ddbox);
6771             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6772             if (comm->eDLB != edlbNO)
6773             {
6774                 /* Check if this does not limit the scaling */
6775                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6776             }
6777             if (!comm->bBondComm)
6778             {
6779                 /* Without bBondComm do not go beyond the n.b. cut-off */
6780                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6781                 if (comm->cellsize_limit >= comm->cutoff)
6782                 {
6783                     /* We don't loose a lot of efficieny
6784                      * when increasing it to the n.b. cut-off.
6785                      * It can even be slightly faster, because we need
6786                      * less checks for the communication setup.
6787                      */
6788                     comm->cutoff_mbody = comm->cutoff;
6789                 }
6790             }
6791             /* Check if we did not end up below our original limit */
6792             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6793
6794             if (comm->cutoff_mbody > comm->cellsize_limit)
6795             {
6796                 comm->cellsize_limit = comm->cutoff_mbody;
6797             }
6798         }
6799         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6800     }
6801
6802     if (debug)
6803     {
6804         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6805                 "cellsize limit %f\n",
6806                 comm->bBondComm,comm->cellsize_limit);
6807     }
6808     
6809     if (MASTER(cr))
6810     {
6811         check_dd_restrictions(cr,dd,ir,fplog);
6812     }
6813
6814     comm->partition_step = INT_MIN;
6815     dd->ddp_count = 0;
6816
6817     clear_dd_cycle_counts(dd);
6818
6819     return dd;
6820 }
6821
6822 static void set_dlb_limits(gmx_domdec_t *dd)
6823
6824 {
6825     int d;
6826
6827     for(d=0; d<dd->ndim; d++)
6828     {
6829         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6830         dd->comm->cellsize_min[dd->dim[d]] =
6831             dd->comm->cellsize_min_dlb[dd->dim[d]];
6832     }
6833 }
6834
6835
6836 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6837 {
6838     gmx_domdec_t *dd;
6839     gmx_domdec_comm_t *comm;
6840     real cellsize_min;
6841     int  d,nc,i;
6842     char buf[STRLEN];
6843     
6844     dd = cr->dd;
6845     comm = dd->comm;
6846     
6847     if (fplog)
6848     {
6849         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6850     }
6851
6852     cellsize_min = comm->cellsize_min[dd->dim[0]];
6853     for(d=1; d<dd->ndim; d++)
6854     {
6855         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6856     }
6857
6858     if (cellsize_min < comm->cellsize_limit*1.05)
6859     {
6860         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6861
6862         /* Change DLB from "auto" to "no". */
6863         comm->eDLB = edlbNO;
6864
6865         return;
6866     }
6867
6868     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6869     comm->bDynLoadBal = TRUE;
6870     dd->bGridJump = TRUE;
6871     
6872     set_dlb_limits(dd);
6873
6874     /* We can set the required cell size info here,
6875      * so we do not need to communicate this.
6876      * The grid is completely uniform.
6877      */
6878     for(d=0; d<dd->ndim; d++)
6879     {
6880         if (comm->root[d])
6881         {
6882             comm->load[d].sum_m = comm->load[d].sum;
6883
6884             nc = dd->nc[dd->dim[d]];
6885             for(i=0; i<nc; i++)
6886             {
6887                 comm->root[d]->cell_f[i]    = i/(real)nc;
6888                 if (d > 0)
6889                 {
6890                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6891                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6892                 }
6893             }
6894             comm->root[d]->cell_f[nc] = 1.0;
6895         }
6896     }
6897 }
6898
6899 static char *init_bLocalCG(gmx_mtop_t *mtop)
6900 {
6901     int  ncg,cg;
6902     char *bLocalCG;
6903     
6904     ncg = ncg_mtop(mtop);
6905     snew(bLocalCG,ncg);
6906     for(cg=0; cg<ncg; cg++)
6907     {
6908         bLocalCG[cg] = FALSE;
6909     }
6910
6911     return bLocalCG;
6912 }
6913
6914 void dd_init_bondeds(FILE *fplog,
6915                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6916                      gmx_vsite_t *vsite,gmx_constr_t constr,
6917                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6918 {
6919     gmx_domdec_comm_t *comm;
6920     gmx_bool bBondComm;
6921     int  d;
6922
6923     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6924
6925     comm = dd->comm;
6926
6927     if (comm->bBondComm)
6928     {
6929         /* Communicate atoms beyond the cut-off for bonded interactions */
6930         comm = dd->comm;
6931
6932         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6933
6934         comm->bLocalCG = init_bLocalCG(mtop);
6935     }
6936     else
6937     {
6938         /* Only communicate atoms based on cut-off */
6939         comm->cglink   = NULL;
6940         comm->bLocalCG = NULL;
6941     }
6942 }
6943
6944 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6945                               t_inputrec *ir,
6946                               gmx_bool bDynLoadBal,real dlb_scale,
6947                               gmx_ddbox_t *ddbox)
6948 {
6949     gmx_domdec_comm_t *comm;
6950     int  d;
6951     ivec np;
6952     real limit,shrink;
6953     char buf[64];
6954
6955     if (fplog == NULL)
6956     {
6957         return;
6958     }
6959
6960     comm = dd->comm;
6961
6962     if (bDynLoadBal)
6963     {
6964         fprintf(fplog,"The maximum number of communication pulses is:");
6965         for(d=0; d<dd->ndim; d++)
6966         {
6967             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6968         }
6969         fprintf(fplog,"\n");
6970         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6971         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6972         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6973         for(d=0; d<DIM; d++)
6974         {
6975             if (dd->nc[d] > 1)
6976             {
6977                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6978                 {
6979                     shrink = 0;
6980                 }
6981                 else
6982                 {
6983                     shrink =
6984                         comm->cellsize_min_dlb[d]/
6985                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6986                 }
6987                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6988             }
6989         }
6990         fprintf(fplog,"\n");
6991     }
6992     else
6993     {
6994         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6995         fprintf(fplog,"The initial number of communication pulses is:");
6996         for(d=0; d<dd->ndim; d++)
6997         {
6998             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
6999         }
7000         fprintf(fplog,"\n");
7001         fprintf(fplog,"The initial domain decomposition cell size is:");
7002         for(d=0; d<DIM; d++) {
7003             if (dd->nc[d] > 1)
7004             {
7005                 fprintf(fplog," %c %.2f nm",
7006                         dim2char(d),dd->comm->cellsize_min[d]);
7007             }
7008         }
7009         fprintf(fplog,"\n\n");
7010     }
7011     
7012     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7013     {
7014         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
7015         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7016                 "non-bonded interactions","",comm->cutoff);
7017
7018         if (bDynLoadBal)
7019         {
7020             limit = dd->comm->cellsize_limit;
7021         }
7022         else
7023         {
7024             if (dynamic_dd_box(ddbox,ir))
7025             {
7026                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
7027             }
7028             limit = dd->comm->cellsize_min[XX];
7029             for(d=1; d<DIM; d++)
7030             {
7031                 limit = min(limit,dd->comm->cellsize_min[d]);
7032             }
7033         }
7034
7035         if (comm->bInterCGBondeds)
7036         {
7037             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7038                     "two-body bonded interactions","(-rdd)",
7039                     max(comm->cutoff,comm->cutoff_mbody));
7040             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7041                     "multi-body bonded interactions","(-rdd)",
7042                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
7043         }
7044         if (dd->vsite_comm)
7045         {
7046             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7047                     "virtual site constructions","(-rcon)",limit);
7048         }
7049         if (dd->constraint_comm)
7050         {
7051             sprintf(buf,"atoms separated by up to %d constraints",
7052                     1+ir->nProjOrder);
7053             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7054                     buf,"(-rcon)",limit);
7055         }
7056         fprintf(fplog,"\n");
7057     }
7058     
7059     fflush(fplog);
7060 }
7061
7062 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7063                                 real dlb_scale,
7064                                 const t_inputrec *ir,
7065                                 const gmx_ddbox_t *ddbox)
7066 {
7067     gmx_domdec_comm_t *comm;
7068     int  d,dim,npulse,npulse_d_max,npulse_d;
7069     gmx_bool bNoCutOff;
7070
7071     comm = dd->comm;
7072
7073     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7074
7075     /* Determine the maximum number of comm. pulses in one dimension */
7076         
7077     comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7078         
7079     /* Determine the maximum required number of grid pulses */
7080     if (comm->cellsize_limit >= comm->cutoff)
7081     {
7082         /* Only a single pulse is required */
7083         npulse = 1;
7084     }
7085     else if (!bNoCutOff && comm->cellsize_limit > 0)
7086     {
7087         /* We round down slightly here to avoid overhead due to the latency
7088          * of extra communication calls when the cut-off
7089          * would be only slightly longer than the cell size.
7090          * Later cellsize_limit is redetermined,
7091          * so we can not miss interactions due to this rounding.
7092          */
7093         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7094     }
7095     else
7096     {
7097         /* There is no cell size limit */
7098         npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
7099     }
7100
7101     if (!bNoCutOff && npulse > 1)
7102     {
7103         /* See if we can do with less pulses, based on dlb_scale */
7104         npulse_d_max = 0;
7105         for(d=0; d<dd->ndim; d++)
7106         {
7107             dim = dd->dim[d];
7108             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7109                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7110             npulse_d_max = max(npulse_d_max,npulse_d);
7111         }
7112         npulse = min(npulse,npulse_d_max);
7113     }
7114
7115     /* This env var can override npulse */
7116     d = dd_nst_env(debug,"GMX_DD_NPULSE",0);
7117     if (d > 0)
7118     {
7119         npulse = d;
7120     }
7121
7122     comm->maxpulse = 1;
7123     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7124     for(d=0; d<dd->ndim; d++)
7125     {
7126         comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
7127         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7128         snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
7129         comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
7130         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7131         {
7132             comm->bVacDLBNoLimit = FALSE;
7133         }
7134     }
7135
7136     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7137     if (!comm->bVacDLBNoLimit)
7138     {
7139         comm->cellsize_limit = max(comm->cellsize_limit,
7140                                    comm->cutoff/comm->maxpulse);
7141     }
7142     comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7143     /* Set the minimum cell size for each DD dimension */
7144     for(d=0; d<dd->ndim; d++)
7145     {
7146         if (comm->bVacDLBNoLimit ||
7147             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7148         {
7149             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7150         }
7151         else
7152         {
7153             comm->cellsize_min_dlb[dd->dim[d]] =
7154                 comm->cutoff/comm->cd[d].np_dlb;
7155         }
7156     }
7157     if (comm->cutoff_mbody <= 0)
7158     {
7159         comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
7160     }
7161     if (comm->bDynLoadBal)
7162     {
7163         set_dlb_limits(dd);
7164     }
7165 }
7166
7167 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC)
7168 {
7169     /* If each molecule is a single charge group
7170      * or we use domain decomposition for each periodic dimension,
7171      * we do not need to take pbc into account for the bonded interactions.
7172      */
7173     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7174             !(dd->nc[XX]>1 &&
7175               dd->nc[YY]>1 &&
7176               (dd->nc[ZZ]>1 || ePBC==epbcXY)));
7177 }
7178
7179 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
7180                        t_inputrec *ir,t_forcerec *fr,
7181                        gmx_ddbox_t *ddbox)
7182 {
7183     gmx_domdec_comm_t *comm;
7184     int  natoms_tot;
7185     real vol_frac;
7186
7187     comm = dd->comm;
7188
7189     /* Initialize the thread data.
7190      * This can not be done in init_domain_decomposition,
7191      * as the numbers of threads is determined later.
7192      */
7193     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7194     if (comm->nth > 1)
7195     {
7196         snew(comm->dth,comm->nth);
7197     }
7198
7199     if (EEL_PME(ir->coulombtype))
7200     {
7201         init_ddpme(dd,&comm->ddpme[0],0);
7202         if (comm->npmedecompdim >= 2)
7203         {
7204             init_ddpme(dd,&comm->ddpme[1],1);
7205         }
7206     }
7207     else
7208     {
7209         comm->npmenodes = 0;
7210         if (dd->pme_nodeid >= 0)
7211         {
7212             gmx_fatal_collective(FARGS,NULL,dd,
7213                                  "Can not have separate PME nodes without PME electrostatics");
7214         }
7215     }
7216         
7217     if (debug)
7218     {
7219         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
7220     }
7221     if (comm->eDLB != edlbNO)
7222     {
7223         set_cell_limits_dlb(dd,dlb_scale,ir,ddbox);
7224     }
7225     
7226     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
7227     if (comm->eDLB == edlbAUTO)
7228     {
7229         if (fplog)
7230         {
7231             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
7232         }
7233         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
7234     }
7235
7236     if (ir->ePBC == epbcNONE)
7237     {
7238         vol_frac = 1 - 1/(double)dd->nnodes;
7239     }
7240     else
7241     {
7242         vol_frac =
7243             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7244     }
7245     if (debug)
7246     {
7247         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7248     }
7249     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7250    
7251     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7252 }
7253
7254 gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
7255                           real cutoff_req)
7256 {
7257     gmx_domdec_t *dd;
7258     gmx_ddbox_t ddbox;
7259     int d,dim,np;
7260     real inv_cell_size;
7261     int LocallyLimited;
7262
7263     dd = cr->dd;
7264
7265     set_ddbox(dd,FALSE,cr,ir,state->box,
7266               TRUE,&dd->comm->cgs_gl,state->x,&ddbox);
7267
7268     LocallyLimited = 0;
7269
7270     for(d=0; d<dd->ndim; d++)
7271     {
7272         dim = dd->dim[d];
7273
7274         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7275         if (dynamic_dd_box(&ddbox,ir))
7276         {
7277             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7278         }
7279
7280         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7281
7282         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7283             dd->comm->cd[d].np_dlb > 0)
7284         {
7285             if (np > dd->comm->cd[d].np_dlb)
7286             {
7287                 return FALSE;
7288             }
7289
7290             /* If a current local cell size is smaller than the requested
7291              * cut-off, we could still fix it, but this gets very complicated.
7292              * Without fixing here, we might actually need more checks.
7293              */
7294             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7295             {
7296                 LocallyLimited = 1;
7297             }
7298         }
7299     }
7300
7301     if (dd->comm->eDLB != edlbNO)
7302     {
7303         if (check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE))
7304         {
7305             LocallyLimited = 1; 
7306         }
7307
7308         gmx_sumi(1,&LocallyLimited,cr);
7309
7310         if (LocallyLimited > 0)
7311         {
7312             return FALSE;
7313         }
7314     }
7315
7316     dd->comm->cutoff = cutoff_req;
7317
7318     return TRUE;
7319 }
7320
7321 static void merge_cg_buffers(int ncell,
7322                              gmx_domdec_comm_dim_t *cd, int pulse,
7323                              int  *ncg_cell,
7324                              int  *index_gl, int  *recv_i,
7325                              rvec *cg_cm,    rvec *recv_vr,
7326                              int *cgindex,
7327                              cginfo_mb_t *cginfo_mb,int *cginfo)
7328 {
7329     gmx_domdec_ind_t *ind,*ind_p;
7330     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7331     int shift,shift_at;
7332     
7333     ind = &cd->ind[pulse];
7334     
7335     /* First correct the already stored data */
7336     shift = ind->nrecv[ncell];
7337     for(cell=ncell-1; cell>=0; cell--)
7338     {
7339         shift -= ind->nrecv[cell];
7340         if (shift > 0)
7341         {
7342             /* Move the cg's present from previous grid pulses */
7343             cg0 = ncg_cell[ncell+cell];
7344             cg1 = ncg_cell[ncell+cell+1];
7345             cgindex[cg1+shift] = cgindex[cg1];
7346             for(cg=cg1-1; cg>=cg0; cg--)
7347             {
7348                 index_gl[cg+shift] = index_gl[cg];
7349                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7350                 cgindex[cg+shift] = cgindex[cg];
7351                 cginfo[cg+shift] = cginfo[cg];
7352             }
7353             /* Correct the already stored send indices for the shift */
7354             for(p=1; p<=pulse; p++)
7355             {
7356                 ind_p = &cd->ind[p];
7357                 cg0 = 0;
7358                 for(c=0; c<cell; c++)
7359                 {
7360                     cg0 += ind_p->nsend[c];
7361                 }
7362                 cg1 = cg0 + ind_p->nsend[cell];
7363                 for(cg=cg0; cg<cg1; cg++)
7364                 {
7365                     ind_p->index[cg] += shift;
7366                 }
7367             }
7368         }
7369     }
7370
7371     /* Merge in the communicated buffers */
7372     shift = 0;
7373     shift_at = 0;
7374     cg0 = 0;
7375     for(cell=0; cell<ncell; cell++)
7376     {
7377         cg1 = ncg_cell[ncell+cell+1] + shift;
7378         if (shift_at > 0)
7379         {
7380             /* Correct the old cg indices */
7381             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7382             {
7383                 cgindex[cg+1] += shift_at;
7384             }
7385         }
7386         for(cg=0; cg<ind->nrecv[cell]; cg++)
7387         {
7388             /* Copy this charge group from the buffer */
7389             index_gl[cg1] = recv_i[cg0];
7390             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7391             /* Add it to the cgindex */
7392             cg_gl = index_gl[cg1];
7393             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7394             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7395             cgindex[cg1+1] = cgindex[cg1] + nat;
7396             cg0++;
7397             cg1++;
7398             shift_at += nat;
7399         }
7400         shift += ind->nrecv[cell];
7401         ncg_cell[ncell+cell+1] = cg1;
7402     }
7403 }
7404
7405 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7406                                int nzone,int cg0,const int *cgindex)
7407 {
7408     int cg,zone,p;
7409     
7410     /* Store the atom block boundaries for easy copying of communication buffers
7411      */
7412     cg = cg0;
7413     for(zone=0; zone<nzone; zone++)
7414     {
7415         for(p=0; p<cd->np; p++) {
7416             cd->ind[p].cell2at0[zone] = cgindex[cg];
7417             cg += cd->ind[p].nrecv[zone];
7418             cd->ind[p].cell2at1[zone] = cgindex[cg];
7419         }
7420     }
7421 }
7422
7423 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7424 {
7425     int  i;
7426     gmx_bool bMiss;
7427
7428     bMiss = FALSE;
7429     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7430     {
7431         if (!bLocalCG[link->a[i]])
7432         {
7433             bMiss = TRUE;
7434         }
7435     }
7436
7437     return bMiss;
7438 }
7439
7440 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7441 typedef struct {
7442     real c[DIM][4]; /* the corners for the non-bonded communication */
7443     real cr0;       /* corner for rounding */
7444     real cr1[4];    /* corners for rounding */
7445     real bc[DIM];   /* corners for bounded communication */
7446     real bcr1;      /* corner for rounding for bonded communication */
7447 } dd_corners_t;
7448
7449 /* Determine the corners of the domain(s) we are communicating with */
7450 static void
7451 set_dd_corners(const gmx_domdec_t *dd,
7452                int dim0, int dim1, int dim2,
7453                gmx_bool bDistMB,
7454                dd_corners_t *c)
7455 {
7456     const gmx_domdec_comm_t *comm;
7457     const gmx_domdec_zones_t *zones;
7458     int i,j;
7459
7460     comm = dd->comm;
7461
7462     zones = &comm->zones;
7463
7464     /* Keep the compiler happy */
7465     c->cr0  = 0;
7466     c->bcr1 = 0;
7467
7468     /* The first dimension is equal for all cells */
7469     c->c[0][0] = comm->cell_x0[dim0];
7470     if (bDistMB)
7471     {
7472         c->bc[0] = c->c[0][0];
7473     }
7474     if (dd->ndim >= 2)
7475     {
7476         dim1 = dd->dim[1];
7477         /* This cell row is only seen from the first row */
7478         c->c[1][0] = comm->cell_x0[dim1];
7479         /* All rows can see this row */
7480         c->c[1][1] = comm->cell_x0[dim1];
7481         if (dd->bGridJump)
7482         {
7483             c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7484             if (bDistMB)
7485             {
7486                 /* For the multi-body distance we need the maximum */
7487                 c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7488             }
7489         }
7490         /* Set the upper-right corner for rounding */
7491         c->cr0 = comm->cell_x1[dim0];
7492         
7493         if (dd->ndim >= 3)
7494         {
7495             dim2 = dd->dim[2];
7496             for(j=0; j<4; j++)
7497             {
7498                 c->c[2][j] = comm->cell_x0[dim2];
7499             }
7500             if (dd->bGridJump)
7501             {
7502                 /* Use the maximum of the i-cells that see a j-cell */
7503                 for(i=0; i<zones->nizone; i++)
7504                 {
7505                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7506                     {
7507                         if (j >= 4)
7508                         {
7509                             c->c[2][j-4] =
7510                                 max(c->c[2][j-4],
7511                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7512                         }
7513                     }
7514                 }
7515                 if (bDistMB)
7516                 {
7517                     /* For the multi-body distance we need the maximum */
7518                     c->bc[2] = comm->cell_x0[dim2];
7519                     for(i=0; i<2; i++)
7520                     {
7521                         for(j=0; j<2; j++)
7522                         {
7523                             c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0);
7524                         }
7525                     }
7526                 }
7527             }
7528             
7529             /* Set the upper-right corner for rounding */
7530             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7531              * Only cell (0,0,0) can see cell 7 (1,1,1)
7532              */
7533             c->cr1[0] = comm->cell_x1[dim1];
7534             c->cr1[3] = comm->cell_x1[dim1];
7535             if (dd->bGridJump)
7536             {
7537                 c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1);
7538                 if (bDistMB)
7539                 {
7540                     /* For the multi-body distance we need the maximum */
7541                     c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1);
7542                 }
7543             }
7544         }
7545     }
7546 }
7547
7548 /* Determine which cg's we need to send in this pulse from this zone */
7549 static void
7550 get_zone_pulse_cgs(gmx_domdec_t *dd,
7551                    int zonei, int zone,
7552                    int cg0, int cg1,
7553                    const int *index_gl,
7554                    const int *cgindex,
7555                    int dim, int dim_ind,
7556                    int dim0, int dim1, int dim2,
7557                    real r_comm2, real r_bcomm2,
7558                    matrix box,
7559                    ivec tric_dist,
7560                    rvec *normal,
7561                    real skew_fac2_d, real skew_fac_01,
7562                    rvec *v_d, rvec *v_0, rvec *v_1,
7563                    const dd_corners_t *c,
7564                    rvec sf2_round,
7565                    gmx_bool bDistBonded,
7566                    gmx_bool bBondComm,
7567                    gmx_bool bDist2B,
7568                    gmx_bool bDistMB,
7569                    rvec *cg_cm,
7570                    int *cginfo,
7571                    gmx_domdec_ind_t *ind,
7572                    int **ibuf, int *ibuf_nalloc,
7573                    vec_rvec_t *vbuf,
7574                    int *nsend_ptr,
7575                    int *nat_ptr,
7576                    int *nsend_z_ptr)
7577 {
7578     gmx_domdec_comm_t *comm;
7579     gmx_bool bScrew;
7580     gmx_bool bDistMB_pulse;
7581     int  cg,i;
7582     real r2,rb2,r,tric_sh;
7583     rvec rn,rb;
7584     int  dimd;
7585     int  nsend_z,nsend,nat;
7586
7587     comm = dd->comm;
7588
7589     bScrew = (dd->bScrewPBC && dim == XX);
7590
7591     bDistMB_pulse = (bDistMB && bDistBonded);
7592
7593     nsend_z = 0;
7594     nsend   = *nsend_ptr;
7595     nat     = *nat_ptr;
7596
7597     for(cg=cg0; cg<cg1; cg++)
7598     {
7599         r2  = 0;
7600         rb2 = 0;
7601         if (tric_dist[dim_ind] == 0)
7602         {
7603             /* Rectangular direction, easy */
7604             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7605             if (r > 0)
7606             {
7607                 r2 += r*r;
7608             }
7609             if (bDistMB_pulse)
7610             {
7611                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7612                 if (r > 0)
7613                 {
7614                     rb2 += r*r;
7615                 }
7616             }
7617             /* Rounding gives at most a 16% reduction
7618              * in communicated atoms
7619              */
7620             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7621             {
7622                 r = cg_cm[cg][dim0] - c->cr0;
7623                 /* This is the first dimension, so always r >= 0 */
7624                 r2 += r*r;
7625                 if (bDistMB_pulse)
7626                 {
7627                     rb2 += r*r;
7628                 }
7629             }
7630             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7631             {
7632                 r = cg_cm[cg][dim1] - c->cr1[zone];
7633                 if (r > 0)
7634                 {
7635                     r2 += r*r;
7636                 }
7637                 if (bDistMB_pulse)
7638                 {
7639                     r = cg_cm[cg][dim1] - c->bcr1;
7640                     if (r > 0)
7641                     {
7642                         rb2 += r*r;
7643                     }
7644                 }
7645             }
7646         }
7647         else
7648         {
7649             /* Triclinic direction, more complicated */
7650             clear_rvec(rn);
7651             clear_rvec(rb);
7652             /* Rounding, conservative as the skew_fac multiplication
7653              * will slightly underestimate the distance.
7654              */
7655             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7656             {
7657                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7658                 for(i=dim0+1; i<DIM; i++)
7659                 {
7660                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7661                 }
7662                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7663                 if (bDistMB_pulse)
7664                 {
7665                     rb[dim0] = rn[dim0];
7666                     rb2 = r2;
7667                 }
7668                 /* Take care that the cell planes along dim0 might not
7669                  * be orthogonal to those along dim1 and dim2.
7670                  */
7671                 for(i=1; i<=dim_ind; i++)
7672                 {
7673                     dimd = dd->dim[i];
7674                     if (normal[dim0][dimd] > 0)
7675                     {
7676                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7677                         if (bDistMB_pulse)
7678                         {
7679                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7680                         }
7681                     }
7682                 }
7683             }
7684             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7685             {
7686                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7687                 tric_sh = 0;
7688                 for(i=dim1+1; i<DIM; i++)
7689                 {
7690                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7691                 }
7692                 rn[dim1] += tric_sh;
7693                 if (rn[dim1] > 0)
7694                 {
7695                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7696                     /* Take care of coupling of the distances
7697                      * to the planes along dim0 and dim1 through dim2.
7698                      */
7699                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7700                     /* Take care that the cell planes along dim1
7701                      * might not be orthogonal to that along dim2.
7702                      */
7703                     if (normal[dim1][dim2] > 0)
7704                     {
7705                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7706                     }
7707                 }
7708                 if (bDistMB_pulse)
7709                 {
7710                     rb[dim1] +=
7711                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7712                     if (rb[dim1] > 0)
7713                     {
7714                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7715                         /* Take care of coupling of the distances
7716                          * to the planes along dim0 and dim1 through dim2.
7717                          */
7718                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7719                         /* Take care that the cell planes along dim1
7720                          * might not be orthogonal to that along dim2.
7721                          */
7722                         if (normal[dim1][dim2] > 0)
7723                         {
7724                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7725                         }
7726                     }
7727                 }
7728             }
7729             /* The distance along the communication direction */
7730             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7731             tric_sh = 0;
7732             for(i=dim+1; i<DIM; i++)
7733             {
7734                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7735             }
7736             rn[dim] += tric_sh;
7737             if (rn[dim] > 0)
7738             {
7739                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7740                 /* Take care of coupling of the distances
7741                  * to the planes along dim0 and dim1 through dim2.
7742                  */
7743                 if (dim_ind == 1 && zonei == 1)
7744                 {
7745                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7746                 }
7747             }
7748             if (bDistMB_pulse)
7749             {
7750                 clear_rvec(rb);
7751                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7752                 if (rb[dim] > 0)
7753                 {
7754                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7755                     /* Take care of coupling of the distances
7756                      * to the planes along dim0 and dim1 through dim2.
7757                      */
7758                     if (dim_ind == 1 && zonei == 1)
7759                     {
7760                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7761                     }
7762                 }
7763             }
7764         }
7765         
7766         if (r2 < r_comm2 ||
7767             (bDistBonded &&
7768              ((bDistMB && rb2 < r_bcomm2) ||
7769               (bDist2B && r2  < r_bcomm2)) &&
7770              (!bBondComm ||
7771               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7772                missing_link(comm->cglink,index_gl[cg],
7773                             comm->bLocalCG)))))
7774         {
7775             /* Make an index to the local charge groups */
7776             if (nsend+1 > ind->nalloc)
7777             {
7778                 ind->nalloc = over_alloc_large(nsend+1);
7779                 srenew(ind->index,ind->nalloc);
7780             }
7781             if (nsend+1 > *ibuf_nalloc)
7782             {
7783                 *ibuf_nalloc = over_alloc_large(nsend+1);
7784                 srenew(*ibuf,*ibuf_nalloc);
7785             }
7786             ind->index[nsend] = cg;
7787             (*ibuf)[nsend] = index_gl[cg];
7788             nsend_z++;
7789             vec_rvec_check_alloc(vbuf,nsend+1);
7790             
7791             if (dd->ci[dim] == 0)
7792             {
7793                 /* Correct cg_cm for pbc */
7794                 rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]);
7795                 if (bScrew)
7796                 {
7797                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7798                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7799                 }
7800             }
7801             else
7802             {
7803                 copy_rvec(cg_cm[cg],vbuf->v[nsend]);
7804             }
7805             nsend++;
7806             nat += cgindex[cg+1] - cgindex[cg];
7807         }
7808     }
7809
7810     *nsend_ptr   = nsend;
7811     *nat_ptr     = nat;
7812     *nsend_z_ptr = nsend_z;
7813 }
7814
7815 static void setup_dd_communication(gmx_domdec_t *dd,
7816                                    matrix box,gmx_ddbox_t *ddbox,
7817                                    t_forcerec *fr,t_state *state,rvec **f)
7818 {
7819     int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot;
7820     int nzone,nzone_send,zone,zonei,cg0,cg1;
7821     int c,i,j,cg,cg_gl,nrcg;
7822     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7823     gmx_domdec_comm_t *comm;
7824     gmx_domdec_zones_t *zones;
7825     gmx_domdec_comm_dim_t *cd;
7826     gmx_domdec_ind_t *ind;
7827     cginfo_mb_t *cginfo_mb;
7828     gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded;
7829     real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg;
7830     dd_corners_t corners;
7831     ivec tric_dist;
7832     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7833     real skew_fac2_d,skew_fac_01;
7834     rvec sf2_round;
7835     int  nsend,nat;
7836     int  th;
7837     
7838     if (debug)
7839     {
7840         fprintf(debug,"Setting up DD communication\n");
7841     }
7842     
7843     comm  = dd->comm;
7844
7845     switch (fr->cutoff_scheme)
7846     {
7847     case ecutsGROUP:
7848         cg_cm = fr->cg_cm;
7849         break;
7850     case ecutsVERLET:
7851         cg_cm = state->x;
7852         break;
7853     default:
7854         gmx_incons("unimplemented");
7855         cg_cm = NULL;
7856     }
7857
7858     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7859     {
7860         dim = dd->dim[dim_ind];
7861
7862         /* Check if we need to use triclinic distances */
7863         tric_dist[dim_ind] = 0;
7864         for(i=0; i<=dim_ind; i++)
7865         {
7866             if (ddbox->tric_dir[dd->dim[i]])
7867             {
7868                 tric_dist[dim_ind] = 1;
7869             }
7870         }
7871     }
7872
7873     bBondComm = comm->bBondComm;
7874
7875     /* Do we need to determine extra distances for multi-body bondeds? */
7876     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7877     
7878     /* Do we need to determine extra distances for only two-body bondeds? */
7879     bDist2B = (bBondComm && !bDistMB);
7880
7881     r_comm2  = sqr(comm->cutoff);
7882     r_bcomm2 = sqr(comm->cutoff_mbody);
7883
7884     if (debug)
7885     {
7886         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7887     }
7888
7889     zones = &comm->zones;
7890     
7891     dim0 = dd->dim[0];
7892     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
7893     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
7894
7895     set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners);
7896     
7897     /* Triclinic stuff */
7898     normal = ddbox->normal;
7899     skew_fac_01 = 0;
7900     if (dd->ndim >= 2)
7901     {
7902         v_0 = ddbox->v[dim0];
7903         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7904         {
7905             /* Determine the coupling coefficient for the distances
7906              * to the cell planes along dim0 and dim1 through dim2.
7907              * This is required for correct rounding.
7908              */
7909             skew_fac_01 =
7910                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7911             if (debug)
7912             {
7913                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7914             }
7915         }
7916     }
7917     if (dd->ndim >= 3)
7918     {
7919         v_1 = ddbox->v[dim1];
7920     }
7921     
7922     zone_cg_range = zones->cg_range;
7923     index_gl = dd->index_gl;
7924     cgindex  = dd->cgindex;
7925     cginfo_mb = fr->cginfo_mb;
7926     
7927     zone_cg_range[0]   = 0;
7928     zone_cg_range[1]   = dd->ncg_home;
7929     comm->zone_ncg1[0] = dd->ncg_home;
7930     pos_cg             = dd->ncg_home;
7931     
7932     nat_tot = dd->nat_home;
7933     nzone = 1;
7934     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7935     {
7936         dim = dd->dim[dim_ind];
7937         cd = &comm->cd[dim_ind];
7938         
7939         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7940         {
7941             /* No pbc in this dimension, the first node should not comm. */
7942             nzone_send = 0;
7943         }
7944         else
7945         {
7946             nzone_send = nzone;
7947         }
7948
7949         v_d = ddbox->v[dim];
7950         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7951
7952         cd->bInPlace = TRUE;
7953         for(p=0; p<cd->np; p++)
7954         {
7955             /* Only atoms communicated in the first pulse are used
7956              * for multi-body bonded interactions or for bBondComm.
7957              */
7958             bDistBonded = ((bDistMB || bDist2B) && p == 0);
7959
7960             ind = &cd->ind[p];
7961             nsend = 0;
7962             nat = 0;
7963             for(zone=0; zone<nzone_send; zone++)
7964             {
7965                 if (tric_dist[dim_ind] && dim_ind > 0)
7966                 {
7967                     /* Determine slightly more optimized skew_fac's
7968                      * for rounding.
7969                      * This reduces the number of communicated atoms
7970                      * by about 10% for 3D DD of rhombic dodecahedra.
7971                      */
7972                     for(dimd=0; dimd<dim; dimd++)
7973                     {
7974                         sf2_round[dimd] = 1;
7975                         if (ddbox->tric_dir[dimd])
7976                         {
7977                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7978                             {
7979                                 /* If we are shifted in dimension i
7980                                  * and the cell plane is tilted forward
7981                                  * in dimension i, skip this coupling.
7982                                  */
7983                                 if (!(zones->shift[nzone+zone][i] &&
7984                                       ddbox->v[dimd][i][dimd] >= 0))
7985                                 {
7986                                     sf2_round[dimd] +=
7987                                         sqr(ddbox->v[dimd][i][dimd]);
7988                                 }
7989                             }
7990                             sf2_round[dimd] = 1/sf2_round[dimd];
7991                         }
7992                     }
7993                 }
7994
7995                 zonei = zone_perm[dim_ind][zone];
7996                 if (p == 0)
7997                 {
7998                     /* Here we permutate the zones to obtain a convenient order
7999                      * for neighbor searching
8000                      */
8001                     cg0 = zone_cg_range[zonei];
8002                     cg1 = zone_cg_range[zonei+1];
8003                 }
8004                 else
8005                 {
8006                     /* Look only at the cg's received in the previous grid pulse
8007                      */
8008                     cg1 = zone_cg_range[nzone+zone+1];
8009                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8010                 }
8011
8012 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8013                 for(th=0; th<comm->nth; th++)
8014                 {
8015                     gmx_domdec_ind_t *ind_p;
8016                     int **ibuf_p,*ibuf_nalloc_p;
8017                     vec_rvec_t *vbuf_p;
8018                     int *nsend_p,*nat_p;
8019                     int *nsend_zone_p;
8020                     int cg0_th,cg1_th;
8021
8022                     if (th == 0)
8023                     {
8024                         /* Thread 0 writes in the comm buffers */
8025                         ind_p         = ind;
8026                         ibuf_p        = &comm->buf_int;
8027                         ibuf_nalloc_p = &comm->nalloc_int;
8028                         vbuf_p        = &comm->vbuf;
8029                         nsend_p       = &nsend;
8030                         nat_p         = &nat;
8031                         nsend_zone_p  = &ind->nsend[zone];
8032                     }
8033                     else
8034                     {
8035                         /* Other threads write into temp buffers */
8036                         ind_p         = &comm->dth[th].ind;
8037                         ibuf_p        = &comm->dth[th].ibuf;
8038                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8039                         vbuf_p        = &comm->dth[th].vbuf;
8040                         nsend_p       = &comm->dth[th].nsend;
8041                         nat_p         = &comm->dth[th].nat;
8042                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8043
8044                         comm->dth[th].nsend      = 0;
8045                         comm->dth[th].nat        = 0;
8046                         comm->dth[th].nsend_zone = 0;
8047                     }
8048
8049                     if (comm->nth == 1)
8050                     {
8051                         cg0_th = cg0;
8052                         cg1_th = cg1;
8053                     }
8054                     else
8055                     {
8056                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8057                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8058                     }
8059                     
8060                     /* Get the cg's for this pulse in this zone */
8061                     get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th,
8062                                        index_gl,cgindex,
8063                                        dim,dim_ind,dim0,dim1,dim2,
8064                                        r_comm2,r_bcomm2,
8065                                        box,tric_dist,
8066                                        normal,skew_fac2_d,skew_fac_01,
8067                                        v_d,v_0,v_1,&corners,sf2_round,
8068                                        bDistBonded,bBondComm,
8069                                        bDist2B,bDistMB,
8070                                        cg_cm,fr->cginfo,
8071                                        ind_p,
8072                                        ibuf_p,ibuf_nalloc_p,
8073                                        vbuf_p,
8074                                        nsend_p,nat_p,
8075                                        nsend_zone_p);
8076                 }
8077
8078                 /* Append data of threads>=1 to the communication buffers */
8079                 for(th=1; th<comm->nth; th++)
8080                 {
8081                     dd_comm_setup_work_t *dth;
8082                     int i,ns1;
8083
8084                     dth = &comm->dth[th];
8085
8086                     ns1 = nsend + dth->nsend_zone;
8087                     if (ns1 > ind->nalloc)
8088                     {
8089                         ind->nalloc = over_alloc_dd(ns1);
8090                         srenew(ind->index,ind->nalloc);
8091                     }
8092                     if (ns1 > comm->nalloc_int)
8093                     {
8094                         comm->nalloc_int = over_alloc_dd(ns1);
8095                         srenew(comm->buf_int,comm->nalloc_int);
8096                     }
8097                     if (ns1 > comm->vbuf.nalloc)
8098                     {
8099                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8100                         srenew(comm->vbuf.v,comm->vbuf.nalloc);
8101                     }
8102
8103                     for(i=0; i<dth->nsend_zone; i++)
8104                     {
8105                         ind->index[nsend] = dth->ind.index[i];
8106                         comm->buf_int[nsend] = dth->ibuf[i];
8107                         copy_rvec(dth->vbuf.v[i],
8108                                   comm->vbuf.v[nsend]);
8109                         nsend++;
8110                     }
8111                     nat              += dth->nat;
8112                     ind->nsend[zone] += dth->nsend_zone;
8113                 }
8114             }
8115             /* Clear the counts in case we do not have pbc */
8116             for(zone=nzone_send; zone<nzone; zone++)
8117             {
8118                 ind->nsend[zone] = 0;
8119             }
8120             ind->nsend[nzone]   = nsend;
8121             ind->nsend[nzone+1] = nat;
8122             /* Communicate the number of cg's and atoms to receive */
8123             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8124                             ind->nsend, nzone+2,
8125                             ind->nrecv, nzone+2);
8126             
8127             /* The rvec buffer is also required for atom buffers of size nsend
8128              * in dd_move_x and dd_move_f.
8129              */
8130             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
8131
8132             if (p > 0)
8133             {
8134                 /* We can receive in place if only the last zone is not empty */
8135                 for(zone=0; zone<nzone-1; zone++)
8136                 {
8137                     if (ind->nrecv[zone] > 0)
8138                     {
8139                         cd->bInPlace = FALSE;
8140                     }
8141                 }
8142                 if (!cd->bInPlace)
8143                 {
8144                     /* The int buffer is only required here for the cg indices */
8145                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8146                     {
8147                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8148                         srenew(comm->buf_int2,comm->nalloc_int2);
8149                     }
8150                     /* The rvec buffer is also required for atom buffers
8151                      * of size nrecv in dd_move_x and dd_move_f.
8152                      */
8153                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
8154                     vec_rvec_check_alloc(&comm->vbuf2,i);
8155                 }
8156             }
8157             
8158             /* Make space for the global cg indices */
8159             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8160                 || dd->cg_nalloc == 0)
8161             {
8162                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8163                 srenew(index_gl,dd->cg_nalloc);
8164                 srenew(cgindex,dd->cg_nalloc+1);
8165             }
8166             /* Communicate the global cg indices */
8167             if (cd->bInPlace)
8168             {
8169                 recv_i = index_gl + pos_cg;
8170             }
8171             else
8172             {
8173                 recv_i = comm->buf_int2;
8174             }
8175             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8176                             comm->buf_int, nsend,
8177                             recv_i,        ind->nrecv[nzone]);
8178
8179             /* Make space for cg_cm */
8180             dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]);
8181             if (fr->cutoff_scheme == ecutsGROUP)
8182             {
8183                 cg_cm = fr->cg_cm;
8184             }
8185             else
8186             {
8187                 cg_cm = state->x;
8188             }
8189             /* Communicate cg_cm */
8190             if (cd->bInPlace)
8191             {
8192                 recv_vr = cg_cm + pos_cg;
8193             }
8194             else
8195             {
8196                 recv_vr = comm->vbuf2.v;
8197             }
8198             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8199                              comm->vbuf.v, nsend,
8200                              recv_vr,      ind->nrecv[nzone]);
8201             
8202             /* Make the charge group index */
8203             if (cd->bInPlace)
8204             {
8205                 zone = (p == 0 ? 0 : nzone - 1);
8206                 while (zone < nzone)
8207                 {
8208                     for(cg=0; cg<ind->nrecv[zone]; cg++)
8209                     {
8210                         cg_gl = index_gl[pos_cg];
8211                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
8212                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8213                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8214                         if (bBondComm)
8215                         {
8216                             /* Update the charge group presence,
8217                              * so we can use it in the next pass of the loop.
8218                              */
8219                             comm->bLocalCG[cg_gl] = TRUE;
8220                         }
8221                         pos_cg++;
8222                     }
8223                     if (p == 0)
8224                     {
8225                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8226                     }
8227                     zone++;
8228                     zone_cg_range[nzone+zone] = pos_cg;
8229                 }
8230             }
8231             else
8232             {
8233                 /* This part of the code is never executed with bBondComm. */
8234                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
8235                                  index_gl,recv_i,cg_cm,recv_vr,
8236                                  cgindex,fr->cginfo_mb,fr->cginfo);
8237                 pos_cg += ind->nrecv[nzone];
8238             }
8239             nat_tot += ind->nrecv[nzone+1];
8240         }
8241         if (!cd->bInPlace)
8242         {
8243             /* Store the atom block for easy copying of communication buffers */
8244             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
8245         }
8246         nzone += nzone;
8247     }
8248     dd->index_gl = index_gl;
8249     dd->cgindex  = cgindex;
8250     
8251     dd->ncg_tot = zone_cg_range[zones->n];
8252     dd->nat_tot = nat_tot;
8253     comm->nat[ddnatHOME] = dd->nat_home;
8254     for(i=ddnatZONE; i<ddnatNR; i++)
8255     {
8256         comm->nat[i] = dd->nat_tot;
8257     }
8258
8259     if (!bBondComm)
8260     {
8261         /* We don't need to update cginfo, since that was alrady done above.
8262          * So we pass NULL for the forcerec.
8263          */
8264         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
8265                       NULL,comm->bLocalCG);
8266     }
8267
8268     if (debug)
8269     {
8270         fprintf(debug,"Finished setting up DD communication, zones:");
8271         for(c=0; c<zones->n; c++)
8272         {
8273             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
8274         }
8275         fprintf(debug,"\n");
8276     }
8277 }
8278
8279 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8280 {
8281     int c;
8282     
8283     for(c=0; c<zones->nizone; c++)
8284     {
8285         zones->izone[c].cg1  = zones->cg_range[c+1];
8286         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8287         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8288     }
8289 }
8290
8291 static void set_zones_size(gmx_domdec_t *dd,
8292                            matrix box,const gmx_ddbox_t *ddbox,
8293                            int zone_start,int zone_end)
8294 {
8295     gmx_domdec_comm_t *comm;
8296     gmx_domdec_zones_t *zones;
8297     gmx_bool bDistMB;
8298     int  z,zi,zj0,zj1,d,dim;
8299     real rcs,rcmbs;
8300     int  i,j;
8301     real size_j,add_tric;
8302     real vol;
8303
8304     comm = dd->comm;
8305
8306     zones = &comm->zones;
8307
8308     /* Do we need to determine extra distances for multi-body bondeds? */
8309     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8310
8311     for(z=zone_start; z<zone_end; z++)
8312     {
8313         /* Copy cell limits to zone limits.
8314          * Valid for non-DD dims and non-shifted dims.
8315          */
8316         copy_rvec(comm->cell_x0,zones->size[z].x0);
8317         copy_rvec(comm->cell_x1,zones->size[z].x1);
8318     }
8319
8320     for(d=0; d<dd->ndim; d++)
8321     {
8322         dim = dd->dim[d];
8323
8324         for(z=0; z<zones->n; z++)
8325         {
8326             /* With a staggered grid we have different sizes
8327              * for non-shifted dimensions.
8328              */
8329             if (dd->bGridJump && zones->shift[z][dim] == 0)
8330             {
8331                 if (d == 1)
8332                 {
8333                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8334                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8335                 }
8336                 else if (d == 2)
8337                 {
8338                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8339                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8340                 }
8341             }
8342         }
8343
8344         rcs   = comm->cutoff;
8345         rcmbs = comm->cutoff_mbody;
8346         if (ddbox->tric_dir[dim])
8347         {
8348             rcs   /= ddbox->skew_fac[dim];
8349             rcmbs /= ddbox->skew_fac[dim];
8350         }
8351
8352         /* Set the lower limit for the shifted zone dimensions */
8353         for(z=zone_start; z<zone_end; z++)
8354         {
8355             if (zones->shift[z][dim] > 0)
8356             {
8357                 dim = dd->dim[d];
8358                 if (!dd->bGridJump || d == 0)
8359                 {
8360                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8361                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8362                 }
8363                 else
8364                 {
8365                     /* Here we take the lower limit of the zone from
8366                      * the lowest domain of the zone below.
8367                      */
8368                     if (z < 4)
8369                     {
8370                         zones->size[z].x0[dim] =
8371                              comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8372                     }
8373                     else
8374                     {
8375                         if (d == 1)
8376                         {
8377                             zones->size[z].x0[dim] =
8378                                 zones->size[zone_perm[2][z-4]].x0[dim];
8379                         }
8380                         else
8381                         {
8382                             zones->size[z].x0[dim] =
8383                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8384                         }
8385                     }
8386                     /* A temporary limit, is updated below */
8387                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8388
8389                     if (bDistMB)
8390                     {
8391                         for(zi=0; zi<zones->nizone; zi++)
8392                         {
8393                             if (zones->shift[zi][dim] == 0)
8394                             {
8395                                 /* This takes the whole zone into account.
8396                                  * With multiple pulses this will lead
8397                                  * to a larger zone then strictly necessary.
8398                                  */
8399                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8400                                                              zones->size[zi].x1[dim]+rcmbs);
8401                             }
8402                         }
8403                     }
8404                 }
8405             }
8406         }
8407
8408         /* Loop over the i-zones to set the upper limit of each
8409          * j-zone they see.
8410          */
8411         for(zi=0; zi<zones->nizone; zi++)
8412         {
8413             if (zones->shift[zi][dim] == 0)
8414             {
8415                 for(z=zones->izone[zi].j0; z<zones->izone[zi].j1; z++)
8416                 {
8417                     if (zones->shift[z][dim] > 0)
8418                     {
8419                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8420                                                      zones->size[zi].x1[dim]+rcs);
8421                     }
8422                 }
8423             }
8424         }
8425     }
8426
8427     for(z=zone_start; z<zone_end; z++)
8428     {
8429         for(i=0; i<DIM; i++)
8430         {
8431             zones->size[z].bb_x0[i] = zones->size[z].x0[i];
8432             zones->size[z].bb_x1[i] = zones->size[z].x1[i];
8433
8434             for(j=i+1; j<ddbox->npbcdim; j++)
8435             {
8436                 /* With 1D domain decomposition the cg's are not in
8437                  * the triclinic box, but trilinic x-y and rectangular y-z.
8438                  */
8439                 if (box[j][i] != 0 &&
8440                     !(dd->ndim == 1 && i == YY && j == ZZ))
8441                 {
8442                     /* Correct for triclinic offset of the lower corner */
8443                     add_tric = zones->size[z].x0[j]*box[j][i]/box[j][j];
8444                     zones->size[z].bb_x0[i] += add_tric;
8445                     zones->size[z].bb_x1[i] += add_tric;
8446
8447                     /* Correct for triclinic offset of the upper corner */
8448                     size_j = zones->size[z].x1[j] - zones->size[z].x0[j];
8449                     add_tric = size_j*box[j][i]/box[j][j];
8450
8451                     if (box[j][i] < 0)
8452                     {
8453                         zones->size[z].bb_x0[i] += add_tric;
8454                     }
8455                     else
8456                     {
8457                         zones->size[z].bb_x1[i] += add_tric;
8458                     }
8459                 }
8460             }
8461         }
8462     }
8463
8464     if (zone_start == 0)
8465     {
8466         vol = 1;
8467         for(dim=0; dim<DIM; dim++)
8468         {
8469             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8470         }
8471         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8472     }
8473
8474     if (debug)
8475     {
8476         for(z=zone_start; z<zone_end; z++)
8477         {
8478             fprintf(debug,"zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8479                     z,
8480                     zones->size[z].x0[XX],zones->size[z].x1[XX],
8481                     zones->size[z].x0[YY],zones->size[z].x1[YY],
8482                     zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]);
8483             fprintf(debug,"zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8484                     z,
8485                     zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX],
8486                     zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY],
8487                     zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]);
8488         }
8489     }
8490 }
8491
8492 static int comp_cgsort(const void *a,const void *b)
8493 {
8494     int comp;
8495     
8496     gmx_cgsort_t *cga,*cgb;
8497     cga = (gmx_cgsort_t *)a;
8498     cgb = (gmx_cgsort_t *)b;
8499     
8500     comp = cga->nsc - cgb->nsc;
8501     if (comp == 0)
8502     {
8503         comp = cga->ind_gl - cgb->ind_gl;
8504     }
8505     
8506     return comp;
8507 }
8508
8509 static void order_int_cg(int n,const gmx_cgsort_t *sort,
8510                          int *a,int *buf)
8511 {
8512     int i;
8513     
8514     /* Order the data */
8515     for(i=0; i<n; i++)
8516     {
8517         buf[i] = a[sort[i].ind];
8518     }
8519     
8520     /* Copy back to the original array */
8521     for(i=0; i<n; i++)
8522     {
8523         a[i] = buf[i];
8524     }
8525 }
8526
8527 static void order_vec_cg(int n,const gmx_cgsort_t *sort,
8528                          rvec *v,rvec *buf)
8529 {
8530     int i;
8531     
8532     /* Order the data */
8533     for(i=0; i<n; i++)
8534     {
8535         copy_rvec(v[sort[i].ind],buf[i]);
8536     }
8537     
8538     /* Copy back to the original array */
8539     for(i=0; i<n; i++)
8540     {
8541         copy_rvec(buf[i],v[i]);
8542     }
8543 }
8544
8545 static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort,
8546                            rvec *v,rvec *buf)
8547 {
8548     int a,atot,cg,cg0,cg1,i;
8549     
8550     if (cgindex == NULL)
8551     {
8552         /* Avoid the useless loop of the atoms within a cg */
8553         order_vec_cg(ncg,sort,v,buf);
8554
8555         return;
8556     }
8557
8558     /* Order the data */
8559     a = 0;
8560     for(cg=0; cg<ncg; cg++)
8561     {
8562         cg0 = cgindex[sort[cg].ind];
8563         cg1 = cgindex[sort[cg].ind+1];
8564         for(i=cg0; i<cg1; i++)
8565         {
8566             copy_rvec(v[i],buf[a]);
8567             a++;
8568         }
8569     }
8570     atot = a;
8571     
8572     /* Copy back to the original array */
8573     for(a=0; a<atot; a++)
8574     {
8575         copy_rvec(buf[a],v[a]);
8576     }
8577 }
8578
8579 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
8580                          int nsort_new,gmx_cgsort_t *sort_new,
8581                          gmx_cgsort_t *sort1)
8582 {
8583     int i1,i2,i_new;
8584     
8585     /* The new indices are not very ordered, so we qsort them */
8586     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
8587     
8588     /* sort2 is already ordered, so now we can merge the two arrays */
8589     i1 = 0;
8590     i2 = 0;
8591     i_new = 0;
8592     while(i2 < nsort2 || i_new < nsort_new)
8593     {
8594         if (i2 == nsort2)
8595         {
8596             sort1[i1++] = sort_new[i_new++];
8597         }
8598         else if (i_new == nsort_new)
8599         {
8600             sort1[i1++] = sort2[i2++];
8601         }
8602         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8603                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8604                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8605         {
8606             sort1[i1++] = sort2[i2++];
8607         }
8608         else
8609         {
8610             sort1[i1++] = sort_new[i_new++];
8611         }
8612     }
8613 }
8614
8615 static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
8616 {
8617     gmx_domdec_sort_t *sort;
8618     gmx_cgsort_t *cgsort,*sort_i;
8619     int  ncg_new,nsort2,nsort_new,i,*a,moved,*ibuf;
8620     int  sort_last,sort_skip;
8621
8622     sort = dd->comm->sort;
8623
8624     a = fr->ns.grid->cell_index;
8625
8626     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8627
8628     if (ncg_home_old >= 0)
8629     {
8630         /* The charge groups that remained in the same ns grid cell
8631          * are completely ordered. So we can sort efficiently by sorting
8632          * the charge groups that did move into the stationary list.
8633          */
8634         ncg_new = 0;
8635         nsort2 = 0;
8636         nsort_new = 0;
8637         for(i=0; i<dd->ncg_home; i++)
8638         {
8639             /* Check if this cg did not move to another node */
8640             if (a[i] < moved)
8641             {
8642                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8643                 {
8644                     /* This cg is new on this node or moved ns grid cell */
8645                     if (nsort_new >= sort->sort_new_nalloc)
8646                     {
8647                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8648                         srenew(sort->sort_new,sort->sort_new_nalloc);
8649                     }
8650                     sort_i = &(sort->sort_new[nsort_new++]);
8651                 }
8652                 else
8653                 {
8654                     /* This cg did not move */
8655                     sort_i = &(sort->sort2[nsort2++]);
8656                 }
8657                 /* Sort on the ns grid cell indices
8658                  * and the global topology index.
8659                  * index_gl is irrelevant with cell ns,
8660                  * but we set it here anyhow to avoid a conditional.
8661                  */
8662                 sort_i->nsc    = a[i];
8663                 sort_i->ind_gl = dd->index_gl[i];
8664                 sort_i->ind    = i;
8665                 ncg_new++;
8666             }
8667         }
8668         if (debug)
8669         {
8670             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
8671                     nsort2,nsort_new);
8672         }
8673         /* Sort efficiently */
8674         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,
8675                      sort->sort);
8676     }
8677     else
8678     {
8679         cgsort = sort->sort;
8680         ncg_new = 0;
8681         for(i=0; i<dd->ncg_home; i++)
8682         {
8683             /* Sort on the ns grid cell indices
8684              * and the global topology index
8685              */
8686             cgsort[i].nsc    = a[i];
8687             cgsort[i].ind_gl = dd->index_gl[i];
8688             cgsort[i].ind    = i;
8689             if (cgsort[i].nsc < moved)
8690             {
8691                 ncg_new++;
8692             }
8693         }
8694         if (debug)
8695         {
8696             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
8697         }
8698         /* Determine the order of the charge groups using qsort */
8699         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
8700     }
8701
8702     return ncg_new;
8703 }
8704
8705 static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
8706 {
8707     gmx_cgsort_t *sort;
8708     int  ncg_new,i,*a,na;
8709
8710     sort = dd->comm->sort->sort;
8711
8712     nbnxn_get_atomorder(fr->nbv->nbs,&a,&na);
8713
8714     ncg_new = 0;
8715     for(i=0; i<na; i++)
8716     {
8717         if (a[i] >= 0)
8718         {
8719             sort[ncg_new].ind = a[i];
8720             ncg_new++;
8721         }
8722     }
8723
8724     return ncg_new;
8725 }
8726
8727 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
8728                           rvec *cgcm,t_forcerec *fr,t_state *state,
8729                           int ncg_home_old)
8730 {
8731     gmx_domdec_sort_t *sort;
8732     gmx_cgsort_t *cgsort,*sort_i;
8733     int  *cgindex;
8734     int  ncg_new,i,*ibuf,cgsize;
8735     rvec *vbuf;
8736     
8737     sort = dd->comm->sort;
8738     
8739     if (dd->ncg_home > sort->sort_nalloc)
8740     {
8741         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8742         srenew(sort->sort,sort->sort_nalloc);
8743         srenew(sort->sort2,sort->sort_nalloc);
8744     }
8745     cgsort = sort->sort;
8746
8747     switch (fr->cutoff_scheme)
8748     {
8749     case ecutsGROUP:
8750         ncg_new = dd_sort_order(dd,fr,ncg_home_old);
8751         break;
8752     case ecutsVERLET:
8753         ncg_new = dd_sort_order_nbnxn(dd,fr);
8754         break;
8755     default:
8756         gmx_incons("unimplemented");
8757         ncg_new = 0;
8758     }
8759
8760     /* We alloc with the old size, since cgindex is still old */
8761     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
8762     vbuf = dd->comm->vbuf.v;
8763     
8764     if (dd->comm->bCGs)
8765     {
8766         cgindex = dd->cgindex;
8767     }
8768     else
8769     {
8770         cgindex = NULL;
8771     }
8772
8773     /* Remove the charge groups which are no longer at home here */
8774     dd->ncg_home = ncg_new;
8775     if (debug)
8776     {
8777         fprintf(debug,"Set the new home charge group count to %d\n",
8778                 dd->ncg_home);
8779     }
8780     
8781     /* Reorder the state */
8782     for(i=0; i<estNR; i++)
8783     {
8784         if (EST_DISTR(i) && (state->flags & (1<<i)))
8785         {
8786             switch (i)
8787             {
8788             case estX:
8789                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf);
8790                 break;
8791             case estV:
8792                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf);
8793                 break;
8794             case estSDX:
8795                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf);
8796                 break;
8797             case estCGP:
8798                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf);
8799                 break;
8800             case estLD_RNG:
8801             case estLD_RNGI:
8802             case estDISRE_INITF:
8803             case estDISRE_RM3TAV:
8804             case estORIRE_INITF:
8805             case estORIRE_DTAV:
8806                 /* No ordering required */
8807                 break;
8808             default:
8809                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8810                 break;
8811             }
8812         }
8813     }
8814     if (fr->cutoff_scheme == ecutsGROUP)
8815     {
8816         /* Reorder cgcm */
8817         order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8818     }
8819     
8820     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8821     {
8822         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8823         srenew(sort->ibuf,sort->ibuf_nalloc);
8824     }
8825     ibuf = sort->ibuf;
8826     /* Reorder the global cg index */
8827     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8828     /* Reorder the cginfo */
8829     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8830     /* Rebuild the local cg index */
8831     if (dd->comm->bCGs)
8832     {
8833         ibuf[0] = 0;
8834         for(i=0; i<dd->ncg_home; i++)
8835         {
8836             cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8837             ibuf[i+1] = ibuf[i] + cgsize;
8838         }
8839         for(i=0; i<dd->ncg_home+1; i++)
8840         {
8841             dd->cgindex[i] = ibuf[i];
8842         }
8843     }
8844     else
8845     {
8846         for(i=0; i<dd->ncg_home+1; i++)
8847         {
8848             dd->cgindex[i] = i;
8849         }
8850     }
8851     /* Set the home atom number */
8852     dd->nat_home = dd->cgindex[dd->ncg_home];
8853
8854     if (fr->cutoff_scheme == ecutsVERLET)
8855     {
8856         /* The atoms are now exactly in grid order, update the grid order */
8857         nbnxn_set_atomorder(fr->nbv->nbs);
8858     }
8859     else
8860     {
8861         /* Copy the sorted ns cell indices back to the ns grid struct */
8862         for(i=0; i<dd->ncg_home; i++)
8863         {
8864             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8865         }
8866         fr->ns.grid->nr = dd->ncg_home;
8867     }
8868 }
8869
8870 static void add_dd_statistics(gmx_domdec_t *dd)
8871 {
8872     gmx_domdec_comm_t *comm;
8873     int ddnat;
8874     
8875     comm = dd->comm;
8876     
8877     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8878     {
8879         comm->sum_nat[ddnat-ddnatZONE] +=
8880             comm->nat[ddnat] - comm->nat[ddnat-1];
8881     }
8882     comm->ndecomp++;
8883 }
8884
8885 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8886 {
8887     gmx_domdec_comm_t *comm;
8888     int ddnat;
8889     
8890     comm = dd->comm;
8891
8892     /* Reset all the statistics and counters for total run counting */
8893     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8894     {
8895         comm->sum_nat[ddnat-ddnatZONE] = 0;
8896     }
8897     comm->ndecomp = 0;
8898     comm->nload = 0;
8899     comm->load_step = 0;
8900     comm->load_sum = 0;
8901     comm->load_max = 0;
8902     clear_ivec(comm->load_lim);
8903     comm->load_mdf = 0;
8904     comm->load_pme = 0;
8905 }
8906
8907 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8908 {
8909     gmx_domdec_comm_t *comm;
8910     int ddnat;
8911     double av;
8912    
8913     comm = cr->dd->comm;
8914     
8915     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8916     
8917     if (fplog == NULL)
8918     {
8919         return;
8920     }
8921     
8922     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8923             
8924     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8925     {
8926         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8927         switch(ddnat)
8928         {
8929         case ddnatZONE:
8930             fprintf(fplog,
8931                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8932                     2,av);
8933             break;
8934         case ddnatVSITE:
8935             if (cr->dd->vsite_comm)
8936             {
8937                 fprintf(fplog,
8938                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8939                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8940                         av);
8941             }
8942             break;
8943         case ddnatCON:
8944             if (cr->dd->constraint_comm)
8945             {
8946                 fprintf(fplog,
8947                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
8948                         1 + ir->nLincsIter,av);
8949             }
8950             break;
8951         default:
8952             gmx_incons(" Unknown type for DD statistics");
8953         }
8954     }
8955     fprintf(fplog,"\n");
8956     
8957     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8958     {
8959         print_dd_load_av(fplog,cr->dd);
8960     }
8961 }
8962
8963 void dd_partition_system(FILE            *fplog,
8964                          gmx_large_int_t      step,
8965                          t_commrec       *cr,
8966                          gmx_bool            bMasterState,
8967                          int             nstglobalcomm,
8968                          t_state         *state_global,
8969                          gmx_mtop_t      *top_global,
8970                          t_inputrec      *ir,
8971                          t_state         *state_local,
8972                          rvec            **f,
8973                          t_mdatoms       *mdatoms,
8974                          gmx_localtop_t  *top_local,
8975                          t_forcerec      *fr,
8976                          gmx_vsite_t     *vsite,
8977                          gmx_shellfc_t   shellfc,
8978                          gmx_constr_t    constr,
8979                          t_nrnb          *nrnb,
8980                          gmx_wallcycle_t wcycle,
8981                          gmx_bool            bVerbose)
8982 {
8983     gmx_domdec_t *dd;
8984     gmx_domdec_comm_t *comm;
8985     gmx_ddbox_t ddbox={0};
8986     t_block *cgs_gl;
8987     gmx_large_int_t step_pcoupl;
8988     rvec cell_ns_x0,cell_ns_x1;
8989     int  i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum;
8990     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8991     gmx_bool bRedist,bSortCG,bResortAll;
8992     ivec ncells_old={0,0,0},ncells_new={0,0,0},np;
8993     real grid_density;
8994     char sbuf[22];
8995         
8996     dd = cr->dd;
8997     comm = dd->comm;
8998
8999     bBoxChanged = (bMasterState || DEFORM(*ir));
9000     if (ir->epc != epcNO)
9001     {
9002         /* With nstpcouple > 1 pressure coupling happens.
9003          * one step after calculating the pressure.
9004          * Box scaling happens at the end of the MD step,
9005          * after the DD partitioning.
9006          * We therefore have to do DLB in the first partitioning
9007          * after an MD step where P-coupling occured.
9008          * We need to determine the last step in which p-coupling occurred.
9009          * MRS -- need to validate this for vv?
9010          */
9011         n = ir->nstpcouple;
9012         if (n == 1)
9013         {
9014             step_pcoupl = step - 1;
9015         }
9016         else
9017         {
9018             step_pcoupl = ((step - 1)/n)*n + 1;
9019         }
9020         if (step_pcoupl >= comm->partition_step)
9021         {
9022             bBoxChanged = TRUE;
9023         }
9024     }
9025
9026     bNStGlobalComm = (step % nstglobalcomm == 0);
9027
9028     if (!comm->bDynLoadBal)
9029     {
9030         bDoDLB = FALSE;
9031     }
9032     else
9033     {
9034         /* Should we do dynamic load balacing this step?
9035          * Since it requires (possibly expensive) global communication,
9036          * we might want to do DLB less frequently.
9037          */
9038         if (bBoxChanged || ir->epc != epcNO)
9039         {
9040             bDoDLB = bBoxChanged;
9041         }
9042         else
9043         {
9044             bDoDLB = bNStGlobalComm;
9045         }
9046     }
9047
9048     /* Check if we have recorded loads on the nodes */
9049     if (comm->bRecordLoad && dd_load_count(comm))
9050     {
9051         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9052         {
9053             /* Check if we should use DLB at the second partitioning
9054              * and every 100 partitionings,
9055              * so the extra communication cost is negligible.
9056              */
9057             n = max(100,nstglobalcomm);
9058             bCheckDLB = (comm->n_load_collect == 0 ||
9059                          comm->n_load_have % n == n-1);
9060         }
9061         else
9062         {
9063             bCheckDLB = FALSE;
9064         }
9065         
9066         /* Print load every nstlog, first and last step to the log file */
9067         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9068                     comm->n_load_collect == 0 ||
9069                     (ir->nsteps >= 0 &&
9070                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9071
9072         /* Avoid extra communication due to verbose screen output
9073          * when nstglobalcomm is set.
9074          */
9075         if (bDoDLB || bLogLoad || bCheckDLB ||
9076             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9077         {
9078             get_load_distribution(dd,wcycle);
9079             if (DDMASTER(dd))
9080             {
9081                 if (bLogLoad)
9082                 {
9083                     dd_print_load(fplog,dd,step-1);
9084                 }
9085                 if (bVerbose)
9086                 {
9087                     dd_print_load_verbose(dd);
9088                 }
9089             }
9090             comm->n_load_collect++;
9091
9092             if (bCheckDLB) {
9093                 /* Since the timings are node dependent, the master decides */
9094                 if (DDMASTER(dd))
9095                 {
9096                     bTurnOnDLB =
9097                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9098                     if (debug)
9099                     {
9100                         fprintf(debug,"step %s, imb loss %f\n",
9101                                 gmx_step_str(step,sbuf),
9102                                 dd_force_imb_perf_loss(dd));
9103                     }
9104                 }
9105                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
9106                 if (bTurnOnDLB)
9107                 {
9108                     turn_on_dlb(fplog,cr,step);
9109                     bDoDLB = TRUE;
9110                 }
9111             }
9112         }
9113         comm->n_load_have++;
9114     }
9115
9116     cgs_gl = &comm->cgs_gl;
9117
9118     bRedist = FALSE;
9119     if (bMasterState)
9120     {
9121         /* Clear the old state */
9122         clear_dd_indices(dd,0,0);
9123
9124         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
9125                   TRUE,cgs_gl,state_global->x,&ddbox);
9126     
9127         get_cg_distribution(fplog,step,dd,cgs_gl,
9128                             state_global->box,&ddbox,state_global->x);
9129         
9130         dd_distribute_state(dd,cgs_gl,
9131                             state_global,state_local,f);
9132         
9133         dd_make_local_cgs(dd,&top_local->cgs);
9134         
9135         /* Ensure that we have space for the new distribution */
9136         dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home);
9137
9138         if (fr->cutoff_scheme == ecutsGROUP)
9139         {
9140             calc_cgcm(fplog,0,dd->ncg_home,
9141                       &top_local->cgs,state_local->x,fr->cg_cm);
9142         }
9143         
9144         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9145         
9146         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9147
9148         cg0 = 0;
9149     }
9150     else if (state_local->ddp_count != dd->ddp_count)
9151     {
9152         if (state_local->ddp_count > dd->ddp_count)
9153         {
9154             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
9155         }
9156         
9157         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9158         {
9159             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
9160         }
9161         
9162         /* Clear the old state */
9163         clear_dd_indices(dd,0,0);
9164         
9165         /* Build the new indices */
9166         rebuild_cgindex(dd,cgs_gl->index,state_local);
9167         make_dd_indices(dd,cgs_gl->index,0);
9168
9169         if (fr->cutoff_scheme == ecutsGROUP)
9170         {
9171             /* Redetermine the cg COMs */
9172             calc_cgcm(fplog,0,dd->ncg_home,
9173                       &top_local->cgs,state_local->x,fr->cg_cm);
9174         }
9175         
9176         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9177
9178         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9179
9180         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9181                   TRUE,&top_local->cgs,state_local->x,&ddbox);
9182
9183         bRedist = comm->bDynLoadBal;
9184     }
9185     else
9186     {
9187         /* We have the full state, only redistribute the cgs */
9188
9189         /* Clear the non-home indices */
9190         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
9191
9192         /* Avoid global communication for dim's without pbc and -gcom */
9193         if (!bNStGlobalComm)
9194         {
9195             copy_rvec(comm->box0    ,ddbox.box0    );
9196             copy_rvec(comm->box_size,ddbox.box_size);
9197         }
9198         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9199                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
9200
9201         bBoxChanged = TRUE;
9202         bRedist = TRUE;
9203     }
9204     /* For dim's without pbc and -gcom */
9205     copy_rvec(ddbox.box0    ,comm->box0    );
9206     copy_rvec(ddbox.box_size,comm->box_size);
9207     
9208     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
9209                       step,wcycle);
9210     
9211     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9212     {
9213         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
9214     }
9215     
9216     /* Check if we should sort the charge groups */
9217     if (comm->nstSortCG > 0)
9218     {
9219         bSortCG = (bMasterState ||
9220                    (bRedist && (step % comm->nstSortCG == 0)));
9221     }
9222     else
9223     {
9224         bSortCG = FALSE;
9225     }
9226
9227     ncg_home_old = dd->ncg_home;
9228
9229     ncg_moved = 0;
9230     if (bRedist)
9231     {
9232         wallcycle_sub_start(wcycle,ewcsDD_REDIST);
9233
9234         dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
9235                            state_local,f,fr,mdatoms,
9236                            !bSortCG,nrnb,&cg0,&ncg_moved);
9237
9238         wallcycle_sub_stop(wcycle,ewcsDD_REDIST);
9239     }
9240     
9241     get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box,
9242                           dd,&ddbox,
9243                           &comm->cell_x0,&comm->cell_x1,
9244                           dd->ncg_home,fr->cg_cm,
9245                           cell_ns_x0,cell_ns_x1,&grid_density);
9246
9247     if (bBoxChanged)
9248     {
9249         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
9250     }
9251
9252     switch (fr->cutoff_scheme)
9253     {
9254     case ecutsGROUP:
9255         copy_ivec(fr->ns.grid->n,ncells_old);
9256         grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
9257                    state_local->box,cell_ns_x0,cell_ns_x1,
9258                    fr->rlistlong,grid_density);
9259         break;
9260     case ecutsVERLET:
9261         nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]);
9262         break;
9263     default:
9264         gmx_incons("unimplemented");
9265     }
9266     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9267     copy_ivec(ddbox.tric_dir,comm->tric_dir);
9268
9269     if (bSortCG)
9270     {
9271         wallcycle_sub_start(wcycle,ewcsDD_GRID);
9272
9273         /* Sort the state on charge group position.
9274          * This enables exact restarts from this step.
9275          * It also improves performance by about 15% with larger numbers
9276          * of atoms per node.
9277          */
9278         
9279         /* Fill the ns grid with the home cell,
9280          * so we can sort with the indices.
9281          */
9282         set_zones_ncg_home(dd);
9283
9284         switch (fr->cutoff_scheme)
9285         {
9286         case ecutsVERLET:
9287             set_zones_size(dd,state_local->box,&ddbox,0,1);
9288
9289             nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box,
9290                               0,
9291                               comm->zones.size[0].bb_x0,
9292                               comm->zones.size[0].bb_x1,
9293                               0,dd->ncg_home,
9294                               comm->zones.dens_zone0,
9295                               fr->cginfo,
9296                               state_local->x,
9297                               ncg_moved,comm->moved,
9298                               fr->nbv->grp[eintLocal].kernel_type,
9299                               fr->nbv->grp[eintLocal].nbat);
9300
9301             nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]);
9302             break;
9303         case ecutsGROUP:
9304             fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
9305                       0,dd->ncg_home,fr->cg_cm);
9306             
9307             copy_ivec(fr->ns.grid->n,ncells_new);
9308             break;
9309         default:
9310             gmx_incons("unimplemented");
9311         }
9312
9313         bResortAll = bMasterState;
9314    
9315         /* Check if we can user the old order and ns grid cell indices
9316          * of the charge groups to sort the charge groups efficiently.
9317          */
9318         if (ncells_new[XX] != ncells_old[XX] ||
9319             ncells_new[YY] != ncells_old[YY] ||
9320             ncells_new[ZZ] != ncells_old[ZZ])
9321         {
9322             bResortAll = TRUE;
9323         }
9324
9325         if (debug)
9326         {
9327             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
9328                     gmx_step_str(step,sbuf),dd->ncg_home);
9329         }
9330         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
9331                       bResortAll ? -1 : ncg_home_old);
9332         /* Rebuild all the indices */
9333         cg0 = 0;
9334         ga2la_clear(dd->ga2la);
9335
9336         wallcycle_sub_stop(wcycle,ewcsDD_GRID);
9337     }
9338
9339     wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM);
9340     
9341     /* Setup up the communication and communicate the coordinates */
9342     setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f);
9343     
9344     /* Set the indices */
9345     make_dd_indices(dd,cgs_gl->index,cg0);
9346
9347     /* Set the charge group boundaries for neighbor searching */
9348     set_cg_boundaries(&comm->zones);
9349
9350     if (fr->cutoff_scheme == ecutsVERLET)
9351     {
9352         set_zones_size(dd,state_local->box,&ddbox,
9353                        bSortCG ? 1 : 0,comm->zones.n);
9354     }
9355
9356     wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM);
9357
9358     /*
9359     write_dd_pdb("dd_home",step,"dump",top_global,cr,
9360                  -1,state_local->x,state_local->box);
9361     */
9362
9363     wallcycle_sub_start(wcycle,ewcsDD_MAKETOP);
9364     
9365     /* Extract a local topology from the global topology */
9366     for(i=0; i<dd->ndim; i++)
9367     {
9368         np[dd->dim[i]] = comm->cd[i].np;
9369     }
9370     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
9371                       comm->cellsize_min,np,
9372                       fr,
9373                       fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x,
9374                       vsite,top_global,top_local);
9375
9376     wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP);
9377
9378     wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR);
9379     
9380     /* Set up the special atom communication */
9381     n = comm->nat[ddnatZONE];
9382     for(i=ddnatZONE+1; i<ddnatNR; i++)
9383     {
9384         switch(i)
9385         {
9386         case ddnatVSITE:
9387             if (vsite && vsite->n_intercg_vsite)
9388             {
9389                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
9390             }
9391             break;
9392         case ddnatCON:
9393             if (dd->bInterCGcons || dd->bInterCGsettles)
9394             {
9395                 /* Only for inter-cg constraints we need special code */
9396                 n = dd_make_local_constraints(dd,n,top_global,fr->cginfo,
9397                                               constr,ir->nProjOrder,
9398                                               top_local->idef.il);
9399             }
9400             break;
9401         default:
9402             gmx_incons("Unknown special atom type setup");
9403         }
9404         comm->nat[i] = n;
9405     }
9406
9407     wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR);
9408
9409     wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER);
9410
9411     /* Make space for the extra coordinates for virtual site
9412      * or constraint communication.
9413      */
9414     state_local->natoms = comm->nat[ddnatNR-1];
9415     if (state_local->natoms > state_local->nalloc)
9416     {
9417         dd_realloc_state(state_local,f,state_local->natoms);
9418     }
9419
9420     if (fr->bF_NoVirSum)
9421     {
9422         if (vsite && vsite->n_intercg_vsite)
9423         {
9424             nat_f_novirsum = comm->nat[ddnatVSITE];
9425         }
9426         else
9427         {
9428             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9429             {
9430                 nat_f_novirsum = dd->nat_tot;
9431             }
9432             else
9433             {
9434                 nat_f_novirsum = dd->nat_home;
9435             }
9436         }
9437     }
9438     else
9439     {
9440         nat_f_novirsum = 0;
9441     }
9442
9443     /* Set the number of atoms required for the force calculation.
9444      * Forces need to be constrained when using a twin-range setup
9445      * or with energy minimization. For simple simulations we could
9446      * avoid some allocation, zeroing and copying, but this is
9447      * probably not worth the complications ande checking.
9448      */
9449     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
9450                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
9451
9452     /* We make the all mdatoms up to nat_tot_con.
9453      * We could save some work by only setting invmass
9454      * between nat_tot and nat_tot_con.
9455      */
9456     /* This call also sets the new number of home particles to dd->nat_home */
9457     atoms2md(top_global,ir,
9458              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
9459
9460     /* Now we have the charges we can sort the FE interactions */
9461     dd_sort_local_top(dd,mdatoms,top_local);
9462
9463     if (shellfc)
9464     {
9465         /* Make the local shell stuff, currently no communication is done */
9466         make_local_shells(cr,mdatoms,shellfc);
9467     }
9468     
9469         if (ir->implicit_solvent)
9470     {
9471         make_local_gb(cr,fr->born,ir->gb_algorithm);
9472     }
9473
9474     init_bonded_thread_force_reduction(fr,&top_local->idef);
9475
9476     if (!(cr->duty & DUTY_PME))
9477     {
9478         /* Send the charges to our PME only node */
9479         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
9480                        mdatoms->chargeA,mdatoms->chargeB,
9481                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
9482     }
9483     
9484     if (constr)
9485     {
9486         set_constraints(constr,top_local,ir,mdatoms,cr);
9487     }
9488     
9489     if (ir->ePull != epullNO)
9490     {
9491         /* Update the local pull groups */
9492         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
9493     }
9494     
9495     if (ir->bRot)
9496     {
9497         /* Update the local rotation groups */
9498         dd_make_local_rotation_groups(dd,ir->rot);
9499     }
9500
9501
9502     add_dd_statistics(dd);
9503     
9504     /* Make sure we only count the cycles for this DD partitioning */
9505     clear_dd_cycle_counts(dd);
9506     
9507     /* Because the order of the atoms might have changed since
9508      * the last vsite construction, we need to communicate the constructing
9509      * atom coordinates again (for spreading the forces this MD step).
9510      */
9511     dd_move_x_vsites(dd,state_local->box,state_local->x);
9512
9513     wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER);
9514     
9515     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9516     {
9517         dd_move_x(dd,state_local->box,state_local->x);
9518         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
9519                      -1,state_local->x,state_local->box);
9520     }
9521
9522     /* Store the partitioning step */
9523     comm->partition_step = step;
9524     
9525     /* Increase the DD partitioning counter */
9526     dd->ddp_count++;
9527     /* The state currently matches this DD partitioning count, store it */
9528     state_local->ddp_count = dd->ddp_count;
9529     if (bMasterState)
9530     {
9531         /* The DD master node knows the complete cg distribution,
9532          * store the count so we can possibly skip the cg info communication.
9533          */
9534         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9535     }
9536
9537     if (comm->DD_debug > 0)
9538     {
9539         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9540         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
9541                                 "after partitioning");
9542     }
9543 }