[alexxy/gromacs.git] / src / mdlib / domdec.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  * 
4  * This file is part of Gromacs        Copyright (c) 1991-2008
5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * To help us fund GROMACS development, we humbly ask that you cite
13  * the research papers on the package. Check out http://www.gromacs.org
14  * 
15  * And Hey:
16  * Gnomes, ROck Monsters And Chili Sauce
17  */
18
19 #ifdef HAVE_CONFIG_H
20 #include <config.h>
21 #endif
22
23 #include <stdio.h>
24 #include <time.h>
25 #include <math.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include "typedefs.h"
29 #include "smalloc.h"
30 #include "vec.h"
31 #include "domdec.h"
32 #include "domdec_network.h"
33 #include "nrnb.h"
34 #include "pbc.h"
35 #include "chargegroup.h"
36 #include "constr.h"
37 #include "mdatoms.h"
38 #include "names.h"
39 #include "pdbio.h"
40 #include "futil.h"
41 #include "force.h"
42 #include "pme.h"
43 #include "pull.h"
44 #include "pull_rotation.h"
45 #include "gmx_wallcycle.h"
46 #include "mdrun.h"
47 #include "nsgrid.h"
48 #include "shellfc.h"
49 #include "mtop_util.h"
50 #include "gmxfio.h"
51 #include "gmx_ga2la.h"
52 #include "gmx_sort.h"
53
54 #ifdef GMX_LIB_MPI
55 #include <mpi.h>
56 #endif
57 #ifdef GMX_THREAD_MPI
58 #include "tmpi.h"
59 #endif
60
61 #define DDRANK(dd,rank)    (rank)
62 #define DDMASTERRANK(dd)   (dd->masterrank)
63
64 typedef struct gmx_domdec_master
65 {
66     /* The cell boundaries */
67     real **cell_x;
68     /* The global charge group division */
69     int  *ncg;     /* Number of home charge groups for each node */
70     int  *index;   /* Index (size nnodes+1) into cg */
71     int  *cg;      /* Global charge group index */
72     int  *nat;     /* Number of home atoms for each node. */
73     int  *ibuf;    /* Buffer for communication */
74     rvec *vbuf;    /* Buffer for state scattering and gathering */
75 } gmx_domdec_master_t;
76
77 typedef struct
78 {
79     /* The numbers of charge groups to send and receive for each cell
80      * that requires communication, the last entry contains the total
81      * number of atoms that needs to be communicated.
82      */
83     int nsend[DD_MAXIZONE+2];
84     int nrecv[DD_MAXIZONE+2];
85     /* The charge groups to send */
86     int *index;
87     int nalloc;
88     /* The atom range for non-in-place communication */
89     int cell2at0[DD_MAXIZONE];
90     int cell2at1[DD_MAXIZONE];
91 } gmx_domdec_ind_t;
92
93 typedef struct
94 {
95     int  np;                   /* Number of grid pulses in this dimension */
96     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
97     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
98     int  np_nalloc;
99     gmx_bool bInPlace;             /* Can we communicate in place?            */
100 } gmx_domdec_comm_dim_t;
101
102 typedef struct
103 {
104     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
105     real *cell_f;      /* State var.: cell boundaries, box relative      */
106     real *old_cell_f;  /* Temp. var.: old cell size                      */
107     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
108     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
109     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
110     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
111     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
112     real *buf_ncd;     /* Temp. var.                                     */
113 } gmx_domdec_root_t;
114
115 #define DD_NLOAD_MAX 9
116
117 /* Here floats are accurate enough, since these variables
118  * only influence the load balancing, not the actual MD results.
119  */
120 typedef struct
121 {
122     int  nload;
123     float *load;
124     float sum;
125     float max;
126     float sum_m;
127     float cvol_min;
128     float mdf;
129     float pme;
130     int   flags;
131 } gmx_domdec_load_t;
132
133 typedef struct
134 {
135     int  nsc;
136     int  ind_gl;
137     int  ind;
138 } gmx_cgsort_t;
139
140 typedef struct
141 {
142     gmx_cgsort_t *sort1,*sort2;
143     int  sort_nalloc;
144     gmx_cgsort_t *sort_new;
145     int  sort_new_nalloc;
146     int  *ibuf;
147     int  ibuf_nalloc;
148 } gmx_domdec_sort_t;
149
150 typedef struct
151 {
152     rvec *v;
153     int  nalloc;
154 } vec_rvec_t;
155
156 /* This enum determines the order of the coordinates.
157  * ddnatHOME and ddnatZONE should be first and second,
158  * the others can be ordered as wanted.
159  */
160 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
161
162 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
163 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
164
165 typedef struct
166 {
167     int  dim;      /* The dimension                                          */
168     gmx_bool dim_match;/* Tells if DD and PME dims match                         */
169     int  nslab;    /* The number of PME slabs in this dimension              */
170     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
171     int  *pp_min;  /* The minimum pp node location, size nslab               */
172     int  *pp_max;  /* The maximum pp node location,size nslab                */
173     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
174 } gmx_ddpme_t;
175
176 typedef struct
177 {
178     real min0;    /* The minimum bottom of this zone                        */
179     real max1;    /* The maximum top of this zone                           */
180     real mch0;    /* The maximum bottom communication height for this zone  */
181     real mch1;    /* The maximum top communication height for this zone     */
182     real p1_0;    /* The bottom value of the first cell in this zone        */
183     real p1_1;    /* The top value of the first cell in this zone           */
184 } gmx_ddzone_t;
185
186 typedef struct gmx_domdec_comm
187 {
188     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
189      * unless stated otherwise.
190      */
191
192     /* The number of decomposition dimensions for PME, 0: no PME */
193     int  npmedecompdim;
194     /* The number of nodes doing PME (PP/PME or only PME) */
195     int  npmenodes;
196     int  npmenodes_x;
197     int  npmenodes_y;
198     /* The communication setup including the PME only nodes */
199     gmx_bool bCartesianPP_PME;
200     ivec ntot;
201     int  cartpmedim;
202     int  *pmenodes;          /* size npmenodes                         */
203     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
204                               * but with bCartesianPP_PME              */
205     gmx_ddpme_t ddpme[2];
206     
207     /* The DD particle-particle nodes only */
208     gmx_bool bCartesianPP;
209     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
210     
211     /* The global charge groups */
212     t_block cgs_gl;
213
214     /* Should we sort the cgs */
215     int  nstSortCG;
216     gmx_domdec_sort_t *sort;
217     
218     /* Are there bonded and multi-body interactions between charge groups? */
219     gmx_bool bInterCGBondeds;
220     gmx_bool bInterCGMultiBody;
221
222     /* Data for the optional bonded interaction atom communication range */
223     gmx_bool bBondComm;
224     t_blocka *cglink;
225     char *bLocalCG;
226
227     /* The DLB option */
228     int  eDLB;
229     /* Are we actually using DLB? */
230     gmx_bool bDynLoadBal;
231
232     /* Cell sizes for static load balancing, first index cartesian */
233     real **slb_frac;
234     
235     /* The width of the communicated boundaries */
236     real cutoff_mbody;
237     real cutoff;
238     /* The minimum cell size (including triclinic correction) */
239     rvec cellsize_min;
240     /* For dlb, for use with edlbAUTO */
241     rvec cellsize_min_dlb;
242     /* The lower limit for the DD cell size with DLB */
243     real cellsize_limit;
244     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
245     gmx_bool bVacDLBNoLimit;
246
247     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
248     ivec tric_dir;
249     /* box0 and box_size are required with dims without pbc and -gcom */
250     rvec box0;
251     rvec box_size;
252     
253     /* The cell boundaries */
254     rvec cell_x0;
255     rvec cell_x1;
256
257     /* The old location of the cell boundaries, to check cg displacements */
258     rvec old_cell_x0;
259     rvec old_cell_x1;
260
261     /* The communication setup and charge group boundaries for the zones */
262     gmx_domdec_zones_t zones;
263     
264     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
265      * cell boundaries of neighboring cells for dynamic load balancing.
266      */
267     gmx_ddzone_t zone_d1[2];
268     gmx_ddzone_t zone_d2[2][2];
269     
270     /* The coordinate/force communication setup and indices */
271     gmx_domdec_comm_dim_t cd[DIM];
272     /* The maximum number of cells to communicate with in one dimension */
273     int  maxpulse;
274     
275     /* Which cg distribution is stored on the master node */
276     int master_cg_ddp_count;
277     
278     /* The number of cgs received from the direct neighbors */
279     int  zone_ncg1[DD_MAXZONE];
280     
281     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
282     int  nat[ddnatNR];
283     
284     /* Communication buffer for general use */
285     int  *buf_int;
286     int  nalloc_int;
287
288     /* Communication buffer of rvecs for general use */
289     vec_rvec_t vbuf;
290     
291     /* Communication buffers only used with multiple grid pulses */
292     int  *buf_int2;
293     int  nalloc_int2;
294     vec_rvec_t vbuf2;
295     
296     /* Communication buffers for local redistribution */
297     int  **cggl_flag;
298     int  cggl_flag_nalloc[DIM*2];
299     rvec **cgcm_state;
300     int  cgcm_state_nalloc[DIM*2];
301     
302     /* Cell sizes for dynamic load balancing */
303     gmx_domdec_root_t **root;
304     real *cell_f_row;
305     real cell_f0[DIM];
306     real cell_f1[DIM];
307     real cell_f_max0[DIM];
308     real cell_f_min1[DIM];
309     
310     /* Stuff for load communication */
311     gmx_bool bRecordLoad;
312     gmx_domdec_load_t *load;
313 #ifdef GMX_MPI
314     MPI_Comm *mpi_comm_load;
315 #endif
316
317     /* Maximum DLB scaling per load balancing step in percent */
318     int dlb_scale_lim;
319
320     /* Cycle counters */
321     float cycl[ddCyclNr];
322     int   cycl_n[ddCyclNr];
323     float cycl_max[ddCyclNr];
324     /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
325     int eFlop;
326     double flop;
327     int    flop_n;
328     /* How often did we have load measurements */
329     int    n_load_have;
330     /* How often have we collected the load measurements */
331     int    n_load_collect;
332     
333     /* Statistics */
334     double sum_nat[ddnatNR-ddnatZONE];
335     int    ndecomp;
336     int    nload;
337     double load_step;
338     double load_sum;
339     double load_max;
340     ivec   load_lim;
341     double load_mdf;
342     double load_pme;
343
344     /* The last partition step */
345     gmx_large_int_t globalcomm_step;
346
347     /* Debugging */
348     int  nstDDDump;
349     int  nstDDDumpGrid;
350     int  DD_debug;
351 } gmx_domdec_comm_t;
352
353 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
354 #define DD_CGIBS 2
355
356 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
357 #define DD_FLAG_NRCG  65535
358 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
359 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
360
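/* Illustrative sketch (not part of the original file): how one cggl_flag
 * entry is laid out, judging from the macros above.  The low 16 bits
 * (masked by DD_FLAG_NRCG) hold a per-charge-group count, bit 16+2*d marks
 * a move in the forward direction along DD dimension d and bit 16+2*d+1 a
 * move in the backward direction.  The function name is hypothetical.
 */
#if 0
static void decode_cggl_flag_example(int flag, int d)
{
    int      count     = flag & DD_FLAG_NRCG;           /* low 16 bits: the count       */
    gmx_bool bForward  = ((flag & DD_FLAG_FW(d)) != 0); /* move forward along DD dim d  */
    gmx_bool bBackward = ((flag & DD_FLAG_BW(d)) != 0); /* move backward along DD dim d */

    fprintf(stderr, "cg flag: count %d, fw %d, bw %d\n", count, bForward, bBackward);
}
#endif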
361 /* Zone permutation required to obtain consecutive charge groups
362  * for neighbor searching.
363  */
364 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
365
366 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
367  * components see only j zones with that component 0.
368  */
369
370 /* The DD zone order */
371 static const ivec dd_zo[DD_MAXZONE] =
372   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
373
374 /* The 3D setup */
375 #define dd_z3n  8
376 #define dd_zp3n 4
377 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
378
379 /* The 2D setup */
380 #define dd_z2n  4
381 #define dd_zp2n 2
382 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
383
384 /* The 1D setup */
385 #define dd_z1n  2
386 #define dd_zp1n 1
387 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
388
389 /* Factors used to avoid problems due to rounding issues */
390 #define DD_CELL_MARGIN       1.0001
391 #define DD_CELL_MARGIN2      1.00005
392 /* Factor to account for pressure scaling during nstlist steps */
393 #define DD_PRES_SCALE_MARGIN 1.02
394
395 /* Allowed performance loss before we DLB or warn */
396 #define DD_PERF_LOSS 0.05
397
398 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
399
400 /* Use separate MPI send and receive commands
401  * when nnodes <= GMX_DD_NNODES_SENDRECV.
402  * This saves memory (and some copying for small nnodes).
403  * For high parallelization scatter and gather calls are used.
404  */
405 #define GMX_DD_NNODES_SENDRECV 4
406
407
408 /*
409 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
410
411 static void index2xyz(ivec nc,int ind,ivec xyz)
412 {
413   xyz[XX] = ind % nc[XX];
414   xyz[YY] = (ind / nc[XX]) % nc[YY];
415   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
416 }
417 */
418
419 /* This order is required to minimize the coordinate communication in PME
420  * which uses decomposition in the x direction.
421  */
422 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
423
424 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
425 {
426     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
427     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
428     xyz[ZZ] = ind % nc[ZZ];
429 }
430
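/* Illustrative sketch (not part of the original file): dd_index() and
 * ddindex2xyz() are inverses for the x-major, z-minor ordering used here.
 * With nc = {2,3,4} the node at (1,2,3) gets index (1*3 + 2)*4 + 3 = 23 and
 * ddindex2xyz() recovers {1,2,3} from it.  The function name is hypothetical.
 */
#if 0
static void ddindex_roundtrip_example(void)
{
    ivec nc  = { 2, 3, 4 };
    ivec ci  = { 1, 2, 3 };
    ivec xyz;
    int  ind = dd_index(nc,ci);   /* (1*3 + 2)*4 + 3 = 23 */

    ddindex2xyz(nc,ind,xyz);      /* xyz becomes {1,2,3} again */
}
#endif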
431 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
432 {
433     int ddindex;
434     int ddnodeid=-1;
435     
436     ddindex = dd_index(dd->nc,c);
437     if (dd->comm->bCartesianPP_PME)
438     {
439         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
440     }
441     else if (dd->comm->bCartesianPP)
442     {
443 #ifdef GMX_MPI
444         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
445 #endif
446     }
447     else
448     {
449         ddnodeid = ddindex;
450     }
451     
452     return ddnodeid;
453 }
454
455 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
456 {
457     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
458 }
459
460 int ddglatnr(gmx_domdec_t *dd,int i)
461 {
462     int atnr;
463     
464     if (dd == NULL)
465     {
466         atnr = i + 1;
467     }
468     else
469     {
470         if (i >= dd->comm->nat[ddnatNR-1])
471         {
472             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
473         }
474         atnr = dd->gatindex[i] + 1;
475     }
476     
477     return atnr;
478 }
479
480 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
481 {
482     return &dd->comm->cgs_gl;
483 }
484
485 static void vec_rvec_init(vec_rvec_t *v)
486 {
487     v->nalloc = 0;
488     v->v      = NULL;
489 }
490
491 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
492 {
493     if (n > v->nalloc)
494     {
495         v->nalloc = over_alloc_dd(n);
496         srenew(v->v,v->nalloc);
497     }
498 }
499
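/* Illustrative sketch (not part of the original file): the usual pattern for
 * these growable rvec buffers is to request the required size right before
 * filling them; over_alloc_dd() adds headroom so that gradual growth does not
 * cause a reallocation every step.  The function name is hypothetical.
 */
#if 0
static void vbuf_fill_example(gmx_domdec_comm_t *comm, int n, rvec *x)
{
    int i;

    vec_rvec_check_alloc(&comm->vbuf, n);
    for (i = 0; i < n; i++)
    {
        copy_rvec(x[i], comm->vbuf.v[i]);
    }
}
#endif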
500 void dd_store_state(gmx_domdec_t *dd,t_state *state)
501 {
502     int i;
503     
504     if (state->ddp_count != dd->ddp_count)
505     {
506         gmx_incons("The state does not match the domain decomposition state");
507     }
508     
509     state->ncg_gl = dd->ncg_home;
510     if (state->ncg_gl > state->cg_gl_nalloc)
511     {
512         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
513         srenew(state->cg_gl,state->cg_gl_nalloc);
514     }
515     for(i=0; i<state->ncg_gl; i++)
516     {
517         state->cg_gl[i] = dd->index_gl[i];
518     }
519     
520     state->ddp_count_cg_gl = dd->ddp_count;
521 }
522
523 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
524 {
525     return &dd->comm->zones;
526 }
527
528 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
529                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
530 {
531     gmx_domdec_zones_t *zones;
532     int izone,d,dim;
533
534     zones = &dd->comm->zones;
535
536     izone = 0;
537     while (icg >= zones->izone[izone].cg1)
538     {
539         izone++;
540     }
541     
542     if (izone == 0)
543     {
544         *jcg0 = icg;
545     }
546     else if (izone < zones->nizone)
547     {
548         *jcg0 = zones->izone[izone].jcg0;
549     }
550     else
551     {
552         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
553                   icg,izone,zones->nizone);
554     }
555         
556     *jcg1 = zones->izone[izone].jcg1;
557     
558     for(d=0; d<dd->ndim; d++)
559     {
560         dim = dd->dim[d];
561         shift0[dim] = zones->izone[izone].shift0[dim];
562         shift1[dim] = zones->izone[izone].shift1[dim];
563         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
564         {
565             /* A conservative approach, this can be optimized */
566             shift0[dim] -= 1;
567             shift1[dim] += 1;
568         }
569     }
570 }
571
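/* Illustrative sketch (not part of the original file): querying the
 * neighbor-search j-range for a home charge group.  Charge groups
 * jcg0 <= jcg < jcg1 are the candidate j-partners of icg, with the allowed
 * cell shifts per dimension bounded by shift0 and shift1.  The function name
 * is hypothetical.
 */
#if 0
static void ns_range_example(gmx_domdec_t *dd, int icg)
{
    int  jcg0, jcg1;
    ivec shift0, shift1;

    dd_get_ns_ranges(dd, icg, &jcg0, &jcg1, shift0, shift1);
    /* loop over jcg0 <= jcg < jcg1 here */
}
#endif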
572 int dd_natoms_vsite(gmx_domdec_t *dd)
573 {
574     return dd->comm->nat[ddnatVSITE];
575 }
576
577 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
578 {
579     *at_start = dd->comm->nat[ddnatCON-1];
580     *at_end   = dd->comm->nat[ddnatCON];
581 }
582
583 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
584 {
585     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
586     int  *index,*cgindex;
587     gmx_domdec_comm_t *comm;
588     gmx_domdec_comm_dim_t *cd;
589     gmx_domdec_ind_t *ind;
590     rvec shift={0,0,0},*buf,*rbuf;
591     gmx_bool bPBC,bScrew;
592     
593     comm = dd->comm;
594     
595     cgindex = dd->cgindex;
596     
597     buf = comm->vbuf.v;
598
599     nzone = 1;
600     nat_tot = dd->nat_home;
601     for(d=0; d<dd->ndim; d++)
602     {
603         bPBC   = (dd->ci[dd->dim[d]] == 0);
604         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
605         if (bPBC)
606         {
607             copy_rvec(box[dd->dim[d]],shift);
608         }
609         cd = &comm->cd[d];
610         for(p=0; p<cd->np; p++)
611         {
612             ind = &cd->ind[p];
613             index = ind->index;
614             n = 0;
615             if (!bPBC)
616             {
617                 for(i=0; i<ind->nsend[nzone]; i++)
618                 {
619                     at0 = cgindex[index[i]];
620                     at1 = cgindex[index[i]+1];
621                     for(j=at0; j<at1; j++)
622                     {
623                         copy_rvec(x[j],buf[n]);
624                         n++;
625                     }
626                 }
627             }
628             else if (!bScrew)
629             {
630                 for(i=0; i<ind->nsend[nzone]; i++)
631                 {
632                     at0 = cgindex[index[i]];
633                     at1 = cgindex[index[i]+1];
634                     for(j=at0; j<at1; j++)
635                     {
636                         /* We need to shift the coordinates */
637                         rvec_add(x[j],shift,buf[n]);
638                         n++;
639                     }
640                 }
641             }
642             else
643             {
644                 for(i=0; i<ind->nsend[nzone]; i++)
645                 {
646                     at0 = cgindex[index[i]];
647                     at1 = cgindex[index[i]+1];
648                     for(j=at0; j<at1; j++)
649                     {
650                         /* Shift x */
651                         buf[n][XX] = x[j][XX] + shift[XX];
652                         /* Rotate y and z.
653                          * This operation requires a special shift force
654                          * treatment, which is performed in calc_vir.
655                          */
656                         buf[n][YY] = box[YY][YY] - x[j][YY];
657                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
658                         n++;
659                     }
660                 }
661             }
662             
663             if (cd->bInPlace)
664             {
665                 rbuf = x + nat_tot;
666             }
667             else
668             {
669                 rbuf = comm->vbuf2.v;
670             }
671             /* Send and receive the coordinates */
672             dd_sendrecv_rvec(dd, d, dddirBackward,
673                              buf,  ind->nsend[nzone+1],
674                              rbuf, ind->nrecv[nzone+1]);
675             if (!cd->bInPlace)
676             {
677                 j = 0;
678                 for(zone=0; zone<nzone; zone++)
679                 {
680                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
681                     {
682                         copy_rvec(rbuf[j],x[i]);
683                         j++;
684                     }
685                 }
686             }
687             nat_tot += ind->nrecv[nzone+1];
688         }
689         nzone += nzone;
690     }
691 }
692
693 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
694 {
695     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
696     int  *index,*cgindex;
697     gmx_domdec_comm_t *comm;
698     gmx_domdec_comm_dim_t *cd;
699     gmx_domdec_ind_t *ind;
700     rvec *buf,*sbuf;
701     ivec vis;
702     int  is;
703     gmx_bool bPBC,bScrew;
704     
705     comm = dd->comm;
706     
707     cgindex = dd->cgindex;
708
709     buf = comm->vbuf.v;
710
711     n = 0;
712     nzone = comm->zones.n/2;
713     nat_tot = dd->nat_tot;
714     for(d=dd->ndim-1; d>=0; d--)
715     {
716         bPBC   = (dd->ci[dd->dim[d]] == 0);
717         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
718         if (fshift == NULL && !bScrew)
719         {
720             bPBC = FALSE;
721         }
722         /* Determine which shift vector we need */
723         clear_ivec(vis);
724         vis[dd->dim[d]] = 1;
725         is = IVEC2IS(vis);
726         
727         cd = &comm->cd[d];
728         for(p=cd->np-1; p>=0; p--) {
729             ind = &cd->ind[p];
730             nat_tot -= ind->nrecv[nzone+1];
731             if (cd->bInPlace)
732             {
733                 sbuf = f + nat_tot;
734             }
735             else
736             {
737                 sbuf = comm->vbuf2.v;
738                 j = 0;
739                 for(zone=0; zone<nzone; zone++)
740                 {
741                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
742                     {
743                         copy_rvec(f[i],sbuf[j]);
744                         j++;
745                     }
746                 }
747             }
748             /* Communicate the forces */
749             dd_sendrecv_rvec(dd, d, dddirForward,
750                              sbuf, ind->nrecv[nzone+1],
751                              buf,  ind->nsend[nzone+1]);
752             index = ind->index;
753             /* Add the received forces */
754             n = 0;
755             if (!bPBC)
756             {
757                 for(i=0; i<ind->nsend[nzone]; i++)
758                 {
759                     at0 = cgindex[index[i]];
760                     at1 = cgindex[index[i]+1];
761                     for(j=at0; j<at1; j++)
762                     {
763                         rvec_inc(f[j],buf[n]);
764                         n++;
765                     }
766                 } 
767             }
768             else if (!bScrew)
769             {
770                 for(i=0; i<ind->nsend[nzone]; i++)
771                 {
772                     at0 = cgindex[index[i]];
773                     at1 = cgindex[index[i]+1];
774                     for(j=at0; j<at1; j++)
775                     {
776                         rvec_inc(f[j],buf[n]);
777                         /* Add this force to the shift force */
778                         rvec_inc(fshift[is],buf[n]);
779                         n++;
780                     }
781                 }
782             }
783             else
784             {
785                 for(i=0; i<ind->nsend[nzone]; i++)
786                 {
787                     at0 = cgindex[index[i]];
788                     at1 = cgindex[index[i]+1];
789                     for(j=at0; j<at1; j++)
790                     {
791                         /* Rotate the force */
792                         f[j][XX] += buf[n][XX];
793                         f[j][YY] -= buf[n][YY];
794                         f[j][ZZ] -= buf[n][ZZ];
795                         if (fshift)
796                         {
797                             /* Add this force to the shift force */
798                             rvec_inc(fshift[is],buf[n]);
799                         }
800                         n++;
801                     }
802                 }
803             }
804         }
805         nzone /= 2;
806     }
807 }
808
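/* Illustrative sketch (not part of the original file): dd_move_x() and
 * dd_move_f() are used as a mirrored pair around the force calculation.
 * dd_move_x() fills the coordinates of the communicated zone atoms before
 * the forces are computed; dd_move_f() then walks the same pulses in reverse
 * order and adds the forces computed on non-home atoms back onto their home
 * node.  The function name is hypothetical.
 */
#if 0
static void halo_exchange_example(gmx_domdec_t *dd, matrix box,
                                  rvec x[], rvec f[], rvec *fshift)
{
    dd_move_x(dd, box, x);    /* coordinates out to the zones        */
    /* ... compute forces into f for home and zone atoms ... */
    dd_move_f(dd, f, fshift); /* forces back, summed onto home atoms */
}
#endif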
809 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
810 {
811     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
812     int  *index,*cgindex;
813     gmx_domdec_comm_t *comm;
814     gmx_domdec_comm_dim_t *cd;
815     gmx_domdec_ind_t *ind;
816     real *buf,*rbuf;
817     
818     comm = dd->comm;
819     
820     cgindex = dd->cgindex;
821     
822     buf = &comm->vbuf.v[0][0];
823
824     nzone = 1;
825     nat_tot = dd->nat_home;
826     for(d=0; d<dd->ndim; d++)
827     {
828         cd = &comm->cd[d];
829         for(p=0; p<cd->np; p++)
830         {
831             ind = &cd->ind[p];
832             index = ind->index;
833             n = 0;
834             for(i=0; i<ind->nsend[nzone]; i++)
835             {
836                 at0 = cgindex[index[i]];
837                 at1 = cgindex[index[i]+1];
838                 for(j=at0; j<at1; j++)
839                 {
840                     buf[n] = v[j];
841                     n++;
842                 }
843             }
844             
845             if (cd->bInPlace)
846             {
847                 rbuf = v + nat_tot;
848             }
849             else
850             {
851                 rbuf = &comm->vbuf2.v[0][0];
852             }
853             /* Send and receive the coordinates */
854             dd_sendrecv_real(dd, d, dddirBackward,
855                              buf,  ind->nsend[nzone+1],
856                              rbuf, ind->nrecv[nzone+1]);
857             if (!cd->bInPlace)
858             {
859                 j = 0;
860                 for(zone=0; zone<nzone; zone++)
861                 {
862                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
863                     {
864                         v[i] = rbuf[j];
865                         j++;
866                     }
867                 }
868             }
869             nat_tot += ind->nrecv[nzone+1];
870         }
871         nzone += nzone;
872     }
873 }
874
875 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
876 {
877     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
878     int  *index,*cgindex;
879     gmx_domdec_comm_t *comm;
880     gmx_domdec_comm_dim_t *cd;
881     gmx_domdec_ind_t *ind;
882     real *buf,*sbuf;
883     
884     comm = dd->comm;
885     
886     cgindex = dd->cgindex;
887
888     buf = &comm->vbuf.v[0][0];
889
890     n = 0;
891     nzone = comm->zones.n/2;
892     nat_tot = dd->nat_tot;
893     for(d=dd->ndim-1; d>=0; d--)
894     {
895         cd = &comm->cd[d];
896         for(p=cd->np-1; p>=0; p--) {
897             ind = &cd->ind[p];
898             nat_tot -= ind->nrecv[nzone+1];
899             if (cd->bInPlace)
900             {
901                 sbuf = v + nat_tot;
902             }
903             else
904             {
905                 sbuf = &comm->vbuf2.v[0][0];
906                 j = 0;
907                 for(zone=0; zone<nzone; zone++)
908                 {
909                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
910                     {
911                         sbuf[j] = v[i];
912                         j++;
913                     }
914                 }
915             }
916             /* Communicate the forces */
917             dd_sendrecv_real(dd, d, dddirForward,
918                              sbuf, ind->nrecv[nzone+1],
919                              buf,  ind->nsend[nzone+1]);
920             index = ind->index;
921             /* Add the received forces */
922             n = 0;
923             for(i=0; i<ind->nsend[nzone]; i++)
924             {
925                 at0 = cgindex[index[i]];
926                 at1 = cgindex[index[i]+1];
927                 for(j=at0; j<at1; j++)
928                 {
929                     v[j] += buf[n];
930                     n++;
931                 }
932             } 
933         }
934         nzone /= 2;
935     }
936 }
937
938 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
939 {
940     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
941             d,i,j,
942             zone->min0,zone->max1,
943             zone->mch0,zone->mch1,
944             zone->p1_0,zone->p1_1);
945 }
946
947 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
948                                int ddimind,int direction,
949                                gmx_ddzone_t *buf_s,int n_s,
950                                gmx_ddzone_t *buf_r,int n_r)
951 {
952     rvec vbuf_s[5*2],vbuf_r[5*2];
953     int i;
954
955     for(i=0; i<n_s; i++)
956     {
957         vbuf_s[i*2  ][0] = buf_s[i].min0;
958         vbuf_s[i*2  ][1] = buf_s[i].max1;
959         vbuf_s[i*2  ][2] = buf_s[i].mch0;
960         vbuf_s[i*2+1][0] = buf_s[i].mch1;
961         vbuf_s[i*2+1][1] = buf_s[i].p1_0;
962         vbuf_s[i*2+1][2] = buf_s[i].p1_1;
963     }
964
965     dd_sendrecv_rvec(dd, ddimind, direction,
966                      vbuf_s, n_s*2,
967                      vbuf_r, n_r*2);
968
969     for(i=0; i<n_r; i++)
970     {
971         buf_r[i].min0 = vbuf_r[i*2  ][0];
972         buf_r[i].max1 = vbuf_r[i*2  ][1];
973         buf_r[i].mch0 = vbuf_r[i*2  ][2];
974         buf_r[i].mch1 = vbuf_r[i*2+1][0];
975         buf_r[i].p1_0 = vbuf_r[i*2+1][1];
976         buf_r[i].p1_1 = vbuf_r[i*2+1][2];
977     }
978 }
979
980 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
981                           rvec cell_ns_x0,rvec cell_ns_x1)
982 {
983     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
984     gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
985     rvec extr_s[2],extr_r[2];
986     rvec dh;
987     real dist_d,c=0,det;
988     gmx_domdec_comm_t *comm;
989     gmx_bool bPBC,bUse;
990
991     comm = dd->comm;
992
993     for(d=1; d<dd->ndim; d++)
994     {
995         dim = dd->dim[d];
996         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
997         zp->min0 = cell_ns_x0[dim];
998         zp->max1 = cell_ns_x1[dim];
999         zp->mch0 = cell_ns_x0[dim];
1000         zp->mch1 = cell_ns_x1[dim];
1001         zp->p1_0 = cell_ns_x0[dim];
1002         zp->p1_1 = cell_ns_x1[dim];
1003     }
1004     
1005     for(d=dd->ndim-2; d>=0; d--)
1006     {
1007         dim  = dd->dim[d];
1008         bPBC = (dim < ddbox->npbcdim);
1009
1010         /* Use an rvec to store two reals */
1011         extr_s[d][0] = comm->cell_f0[d+1];
1012         extr_s[d][1] = comm->cell_f1[d+1];
1013         extr_s[d][2] = 0;
1014
1015         pos = 0;
1016         /* Store the extremes in the backward sending buffer,
1017          * so they get updated separately from the forward communication.
1018          */
1019         for(d1=d; d1<dd->ndim-1; d1++)
1020         {
1021             /* We invert the order to be able to use the same loop for buf_e */
1022             buf_s[pos].min0 = extr_s[d1][1];
1023             buf_s[pos].max1 = extr_s[d1][0];
1024             buf_s[pos].mch0 = 0;
1025             buf_s[pos].mch1 = 0;
1026             /* Store the cell corner of the dimension we communicate along */
1027             buf_s[pos].p1_0 = comm->cell_x0[dim];
1028             buf_s[pos].p1_1 = 0;
1029             pos++;
1030         }
1031
1032         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1033         pos++;
1034
1035         if (dd->ndim == 3 && d == 0)
1036         {
1037             buf_s[pos] = comm->zone_d2[0][1];
1038             pos++;
1039             buf_s[pos] = comm->zone_d1[0];
1040             pos++;
1041         }
1042
1043         /* We only need to communicate the extremes
1044          * in the forward direction
1045          */
1046         npulse = comm->cd[d].np;
1047         if (bPBC)
1048         {
1049             /* Take the minimum to avoid double communication */
1050             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1051         }
1052         else
1053         {
1054             /* Without PBC we should really not communicate over
1055              * the boundaries, but implementing that complicates
1056              * the communication setup and therefore we simply
1057              * do all communication, but ignore some data.
1058              */
1059             npulse_min = npulse;
1060         }
1061         for(p=0; p<npulse_min; p++)
1062         {
1063             /* Communicate the extremes forward */
1064             bUse = (bPBC || dd->ci[dim] > 0);
1065
1066             dd_sendrecv_rvec(dd, d, dddirForward,
1067                              extr_s+d, dd->ndim-d-1,
1068                              extr_r+d, dd->ndim-d-1);
1069
1070             if (bUse)
1071             {
1072                 for(d1=d; d1<dd->ndim-1; d1++)
1073                 {
1074                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1075                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1076                 }
1077             }
1078         }
1079
1080         buf_size = pos;
1081         for(p=0; p<npulse; p++)
1082         {
1083             /* Communicate all the zone information backward */
1084             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1085
1086             dd_sendrecv_ddzone(dd, d, dddirBackward,
1087                                buf_s, buf_size,
1088                                buf_r, buf_size);
1089
1090             clear_rvec(dh);
1091             if (p > 0)
1092             {
1093                 for(d1=d+1; d1<dd->ndim; d1++)
1094                 {
1095                     /* Determine the decrease of maximum required
1096                      * communication height along d1 due to the distance along d,
1097                      * this avoids a lot of useless atom communication.
1098                      */
1099                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1100
1101                     if (ddbox->tric_dir[dim])
1102                     {
1103                         /* c is the off-diagonal coupling between the cell planes
1104                          * along directions d and d1.
1105                          */
1106                         c = ddbox->v[dim][dd->dim[d1]][dim];
1107                     }
1108                     else
1109                     {
1110                         c = 0;
1111                     }
1112                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1113                     if (det > 0)
1114                     {
1115                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1116                     }
1117                     else
1118                     {
1119                         /* A negative value signals out of range */
1120                         dh[d1] = -1;
1121                     }
1122                 }
1123             }
1124
1125             /* Accumulate the extremes over all pulses */
1126             for(i=0; i<buf_size; i++)
1127             {
1128                 if (p == 0)
1129                 {
1130                     buf_e[i] = buf_r[i];
1131                 }
1132                 else
1133                 {
1134                     if (bUse)
1135                     {
1136                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1137                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1138                     }
1139
1140                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1141                     {
1142                         d1 = 1;
1143                     }
1144                     else
1145                     {
1146                         d1 = d + 1;
1147                     }
1148                     if (bUse && dh[d1] >= 0)
1149                     {
1150                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1151                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1152                     }
1153                 }
1154                 /* Copy the received buffer to the send buffer,
1155                  * to pass the data through with the next pulse.
1156                  */
1157                 buf_s[i] = buf_r[i];
1158             }
1159             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1160                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1161             {
1162                 /* Store the extremes */ 
1163                 pos = 0;
1164
1165                 for(d1=d; d1<dd->ndim-1; d1++)
1166                 {
1167                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1168                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1169                     pos++;
1170                 }
1171
1172                 if (d == 1 || (d == 0 && dd->ndim == 3))
1173                 {
1174                     for(i=d; i<2; i++)
1175                     {
1176                         comm->zone_d2[1-d][i] = buf_e[pos];
1177                         pos++;
1178                     }
1179                 }
1180                 if (d == 0)
1181                 {
1182                     comm->zone_d1[1] = buf_e[pos];
1183                     pos++;
1184                 }
1185             }
1186         }
1187     }
1188     
1189     if (dd->ndim >= 2)
1190     {
1191         dim = dd->dim[1];
1192         for(i=0; i<2; i++)
1193         {
1194             if (debug)
1195             {
1196                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1197             }
1198             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1199             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1200         }
1201     }
1202     if (dd->ndim >= 3)
1203     {
1204         dim = dd->dim[2];
1205         for(i=0; i<2; i++)
1206         {
1207             for(j=0; j<2; j++)
1208             {
1209                 if (debug)
1210                 {
1211                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1212                 }
1213                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1214                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1215             }
1216         }
1217     }
1218     for(d=1; d<dd->ndim; d++)
1219     {
1220         comm->cell_f_max0[d] = extr_s[d-1][0];
1221         comm->cell_f_min1[d] = extr_s[d-1][1];
1222         if (debug)
1223         {
1224             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1225                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1226         }
1227     }
1228 }
1229
1230 static void dd_collect_cg(gmx_domdec_t *dd,
1231                           t_state *state_local)
1232 {
1233     gmx_domdec_master_t *ma=NULL;
1234     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1235     t_block *cgs_gl;
1236
1237     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1238     {
1239         /* The master has the correct distribution */
1240         return;
1241     }
1242     
1243     if (state_local->ddp_count == dd->ddp_count)
1244     {
1245         ncg_home = dd->ncg_home;
1246         cg       = dd->index_gl;
1247         nat_home = dd->nat_home;
1248     } 
1249     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1250     {
1251         cgs_gl = &dd->comm->cgs_gl;
1252
1253         ncg_home = state_local->ncg_gl;
1254         cg       = state_local->cg_gl;
1255         nat_home = 0;
1256         for(i=0; i<ncg_home; i++)
1257         {
1258             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1259         }
1260     }
1261     else
1262     {
1263         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1264     }
1265     
1266     buf2[0] = dd->ncg_home;
1267     buf2[1] = dd->nat_home;
1268     if (DDMASTER(dd))
1269     {
1270         ma = dd->ma;
1271         ibuf = ma->ibuf;
1272     }
1273     else
1274     {
1275         ibuf = NULL;
1276     }
1277     /* Collect the charge group and atom counts on the master */
1278     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1279     
1280     if (DDMASTER(dd))
1281     {
1282         ma->index[0] = 0;
1283         for(i=0; i<dd->nnodes; i++)
1284         {
1285             ma->ncg[i] = ma->ibuf[2*i];
1286             ma->nat[i] = ma->ibuf[2*i+1];
1287             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1288             
1289         }
1290         /* Make byte counts and indices */
1291         for(i=0; i<dd->nnodes; i++)
1292         {
1293             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1294             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1295         }
1296         if (debug)
1297         {
1298             fprintf(debug,"Initial charge group distribution: ");
1299             for(i=0; i<dd->nnodes; i++)
1300                 fprintf(debug," %d",ma->ncg[i]);
1301             fprintf(debug,"\n");
1302         }
1303     }
1304     
1305     /* Collect the charge group indices on the master */
1306     dd_gatherv(dd,
1307                dd->ncg_home*sizeof(int),dd->index_gl,
1308                DDMASTER(dd) ? ma->ibuf : NULL,
1309                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1310                DDMASTER(dd) ? ma->cg : NULL);
1311     
1312     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1313 }
1314
1315 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1316                                     rvec *lv,rvec *v)
1317 {
1318     gmx_domdec_master_t *ma;
1319     int  n,i,c,a,nalloc=0;
1320     rvec *buf=NULL;
1321     t_block *cgs_gl;
1322
1323     ma = dd->ma;
1324     
1325     if (!DDMASTER(dd))
1326     {
1327 #ifdef GMX_MPI
1328         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1329                  dd->rank,dd->mpi_comm_all);
1330 #endif
1331     } else {
1332         /* Copy the master coordinates to the global array */
1333         cgs_gl = &dd->comm->cgs_gl;
1334
1335         n = DDMASTERRANK(dd);
1336         a = 0;
1337         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1338         {
1339             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1340             {
1341                 copy_rvec(lv[a++],v[c]);
1342             }
1343         }
1344         
1345         for(n=0; n<dd->nnodes; n++)
1346         {
1347             if (n != dd->rank)
1348             {
1349                 if (ma->nat[n] > nalloc)
1350                 {
1351                     nalloc = over_alloc_dd(ma->nat[n]);
1352                     srenew(buf,nalloc);
1353                 }
1354 #ifdef GMX_MPI
1355                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1356                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1357 #endif
1358                 a = 0;
1359                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1360                 {
1361                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1362                     {
1363                         copy_rvec(buf[a++],v[c]);
1364                     }
1365                 }
1366             }
1367         }
1368         sfree(buf);
1369     }
1370 }
1371
1372 static void get_commbuffer_counts(gmx_domdec_t *dd,
1373                                   int **counts,int **disps)
1374 {
1375     gmx_domdec_master_t *ma;
1376     int n;
1377
1378     ma = dd->ma;
1379     
1380     /* Make the rvec count and displacement arrays */
1381     *counts  = ma->ibuf;
1382     *disps   = ma->ibuf + dd->nnodes;
1383     for(n=0; n<dd->nnodes; n++)
1384     {
1385         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1386         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1387     }
1388 }
1389
1390 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1391                                    rvec *lv,rvec *v)
1392 {
1393     gmx_domdec_master_t *ma;
1394     int  *rcounts=NULL,*disps=NULL;
1395     int  n,i,c,a;
1396     rvec *buf=NULL;
1397     t_block *cgs_gl;
1398     
1399     ma = dd->ma;
1400     
1401     if (DDMASTER(dd))
1402     {
1403         get_commbuffer_counts(dd,&rcounts,&disps);
1404
1405         buf = ma->vbuf;
1406     }
1407     
1408     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1409
1410     if (DDMASTER(dd))
1411     {
1412         cgs_gl = &dd->comm->cgs_gl;
1413
1414         a = 0;
1415         for(n=0; n<dd->nnodes; n++)
1416         {
1417             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1418             {
1419                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1420                 {
1421                     copy_rvec(buf[a++],v[c]);
1422                 }
1423             }
1424         }
1425     }
1426 }
1427
1428 void dd_collect_vec(gmx_domdec_t *dd,
1429                     t_state *state_local,rvec *lv,rvec *v)
1430 {
1431     gmx_domdec_master_t *ma;
1432     int  n,i,c,a,nalloc=0;
1433     rvec *buf=NULL;
1434     
1435     dd_collect_cg(dd,state_local);
1436
1437     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1438     {
1439         dd_collect_vec_sendrecv(dd,lv,v);
1440     }
1441     else
1442     {
1443         dd_collect_vec_gatherv(dd,lv,v);
1444     }
1445 }
1446
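/* Illustrative sketch (not part of the original file): collecting one
 * distributed rvec array on the master.  With few nodes
 * (<= GMX_DD_NNODES_SENDRECV) point-to-point send/receive is used, otherwise
 * a gatherv; the destination array only needs to be meaningful on the master.
 * The function name is hypothetical.
 */
#if 0
static void collect_coordinates_example(gmx_domdec_t *dd,
                                        t_state *state_local,
                                        rvec *x_global)
{
    dd_collect_vec(dd, state_local, state_local->x, x_global);
}
#endif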
1447
1448 void dd_collect_state(gmx_domdec_t *dd,
1449                       t_state *state_local,t_state *state)
1450 {
1451     int est,i,j,nh;
1452
1453     nh = state->nhchainlength;
1454
1455     if (DDMASTER(dd))
1456     {
1457         state->lambda = state_local->lambda;
1458         state->veta = state_local->veta;
1459         state->vol0 = state_local->vol0;
1460         copy_mat(state_local->box,state->box);
1461         copy_mat(state_local->boxv,state->boxv);
1462         copy_mat(state_local->svir_prev,state->svir_prev);
1463         copy_mat(state_local->fvir_prev,state->fvir_prev);
1464         copy_mat(state_local->pres_prev,state->pres_prev);
1465
1466
1467         for(i=0; i<state_local->ngtc; i++)
1468         {
1469             for(j=0; j<nh; j++) {
1470                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1471                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1472             }
1473             state->therm_integral[i] = state_local->therm_integral[i];            
1474         }
1475         for(i=0; i<state_local->nnhpres; i++) 
1476         {
1477             for(j=0; j<nh; j++) {
1478                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1479                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1480             }
1481         }
1482     }
1483     for(est=0; est<estNR; est++)
1484     {
1485         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1486         {
1487             switch (est) {
1488             case estX:
1489                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1490                 break;
1491             case estV:
1492                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1493                 break;
1494             case estSDX:
1495                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1496                 break;
1497             case estCGP:
1498                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1499                 break;
1500             case estLD_RNG:
1501                 if (state->nrngi == 1)
1502                 {
1503                     if (DDMASTER(dd))
1504                     {
1505                         for(i=0; i<state_local->nrng; i++)
1506                         {
1507                             state->ld_rng[i] = state_local->ld_rng[i];
1508                         }
1509                     }
1510                 }
1511                 else
1512                 {
1513                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1514                               state_local->ld_rng,state->ld_rng);
1515                 }
1516                 break;
1517             case estLD_RNGI:
1518                 if (state->nrngi == 1)
1519                 {
1520                     if (DDMASTER(dd))
1521                     {
1522                         state->ld_rngi[0] = state_local->ld_rngi[0];
1523                     } 
1524                 }
1525                 else
1526                 {
1527                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1528                               state_local->ld_rngi,state->ld_rngi);
1529                 }
1530                 break;
1531             case estDISRE_INITF:
1532             case estDISRE_RM3TAV:
1533             case estORIRE_INITF:
1534             case estORIRE_DTAV:
1535                 break;
1536             default:
1537                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1538             }
1539         }
1540     }
1541 }
1542
1543 static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
1544 {
1545     if (debug)
1546     {
1547         fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1548     }
1549     fr->cg_nalloc = over_alloc_dd(nalloc);
1550     srenew(fr->cg_cm,fr->cg_nalloc);
1551     srenew(fr->cginfo,fr->cg_nalloc);
1552 }
1553
1554 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1555 {
1556     int est;
1557
1558     if (debug)
1559     {
1560         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1561     }
1562
1563     state->nalloc = over_alloc_dd(nalloc);
1564     
1565     for(est=0; est<estNR; est++)
1566     {
1567         if (EST_DISTR(est) && (state->flags & (1<<est)))
1568         {
1569             switch(est) {
1570             case estX:
1571                 srenew(state->x,state->nalloc);
1572                 break;
1573             case estV:
1574                 srenew(state->v,state->nalloc);
1575                 break;
1576             case estSDX:
1577                 srenew(state->sd_X,state->nalloc);
1578                 break;
1579             case estCGP:
1580                 srenew(state->cg_p,state->nalloc);
1581                 break;
1582             case estLD_RNG:
1583             case estLD_RNGI:
1584             case estDISRE_INITF:
1585             case estDISRE_RM3TAV:
1586             case estORIRE_INITF:
1587             case estORIRE_DTAV:
1588                 /* No reallocation required */
1589                 break;
1590             default:
1591                 gmx_incons("Unknown state entry encountered in dd_realloc_state");            
1592             }
1593         }
1594     }
1595     
1596     if (f != NULL)
1597     {
1598         srenew(*f,state->nalloc);
1599     }
1600 }
1601
1602 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1603                                        rvec *v,rvec *lv)
1604 {
1605     gmx_domdec_master_t *ma;
1606     int  n,i,c,a,nalloc=0;
1607     rvec *buf=NULL;
1608     
1609     if (DDMASTER(dd))
1610     {
1611         ma  = dd->ma;
1612         
1613         for(n=0; n<dd->nnodes; n++)
1614         {
1615             if (n != dd->rank)
1616             {
1617                 if (ma->nat[n] > nalloc)
1618                 {
1619                     nalloc = over_alloc_dd(ma->nat[n]);
1620                     srenew(buf,nalloc);
1621                 }
1622                 /* Use lv as a temporary buffer */
1623                 a = 0;
1624                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1625                 {
1626                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1627                     {
1628                         copy_rvec(v[c],buf[a++]);
1629                     }
1630                 }
1631                 if (a != ma->nat[n])
1632                 {
1633                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1634                               a,ma->nat[n]);
1635                 }
1636                 
1637 #ifdef GMX_MPI
1638                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1639                          DDRANK(dd,n),n,dd->mpi_comm_all);
1640 #endif
1641             }
1642         }
1643         sfree(buf);
1644         n = DDMASTERRANK(dd);
1645         a = 0;
1646         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1647         {
1648             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1649             {
1650                 copy_rvec(v[c],lv[a++]);
1651             }
1652         }
1653     }
1654     else
1655     {
1656 #ifdef GMX_MPI
1657         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1658                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1659 #endif
1660     }
1661 }
1662
1663 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1664                                        rvec *v,rvec *lv)
1665 {
1666     gmx_domdec_master_t *ma;
1667     int  *scounts=NULL,*disps=NULL;
1668     int  n,i,c,a,nalloc=0;
1669     rvec *buf=NULL;
1670     
1671     if (DDMASTER(dd))
1672     {
1673         ma  = dd->ma;
1674      
1675         get_commbuffer_counts(dd,&scounts,&disps);
1676
1677         buf = ma->vbuf;
1678         a = 0;
1679         for(n=0; n<dd->nnodes; n++)
1680         {
1681             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1682             {
1683                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1684                 {
1685                     copy_rvec(v[c],buf[a++]);
1686                 }
1687             }
1688         }
1689     }
1690
1691     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1692 }
1693
1694 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1695 {
1696     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1697     {
1698         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1699     }
1700     else
1701     {
1702         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1703     }
1704 }
1705
1706 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1707                                 t_state *state,t_state *state_local,
1708                                 rvec **f)
1709 {
1710     int  i,j,ngtch,ngtcp,nh;
1711
1712     nh = state->nhchainlength;
1713
1714     if (DDMASTER(dd))
1715     {
1716         state_local->lambda = state->lambda;
1717         state_local->veta   = state->veta;
1718         state_local->vol0   = state->vol0;
1719         copy_mat(state->box,state_local->box);
1720         copy_mat(state->box_rel,state_local->box_rel);
1721         copy_mat(state->boxv,state_local->boxv);
1722         copy_mat(state->svir_prev,state_local->svir_prev);
1723         copy_mat(state->fvir_prev,state_local->fvir_prev);
1724         for(i=0; i<state_local->ngtc; i++)
1725         {
1726             for(j=0; j<nh; j++) {
1727                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1728                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1729             }
1730             state_local->therm_integral[i] = state->therm_integral[i];
1731         }
1732         for(i=0; i<state_local->nnhpres; i++)
1733         {
1734             for(j=0; j<nh; j++) {
1735                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1736                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1737             }
1738         }
1739     }
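    /* Broadcast the global (non-distributed) part of the state to all nodes;
     * the distributed per-atom arrays are handled further below.
     */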
1740     dd_bcast(dd,sizeof(real),&state_local->lambda);
1741     dd_bcast(dd,sizeof(real),&state_local->veta);
1742     dd_bcast(dd,sizeof(real),&state_local->vol0);
1743     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1744     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1745     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1746     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1747     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1748     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1749     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1750     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1751     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1752     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1753
1754     if (dd->nat_home > state_local->nalloc)
1755     {
1756         dd_realloc_state(state_local,f,dd->nat_home);
1757     }
1758     for(i=0; i<estNR; i++)
1759     {
1760         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1761         {
1762             switch (i) {
1763             case estX:
1764                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1765                 break;
1766             case estV:
1767                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1768                 break;
1769             case estSDX:
1770                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1771                 break;
1772             case estCGP:
1773                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1774                 break;
1775             case estLD_RNG:
1776                 if (state->nrngi == 1)
1777                 {
1778                     dd_bcastc(dd,
1779                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1780                               state->ld_rng,state_local->ld_rng);
1781                 }
1782                 else
1783                 {
1784                     dd_scatter(dd,
1785                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1786                                state->ld_rng,state_local->ld_rng);
1787                 }
1788                 break;
1789             case estLD_RNGI:
1790                 if (state->nrngi == 1)
1791                 {
1792                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1793                               state->ld_rngi,state_local->ld_rngi);
1794                 }
1795                 else
1796                 {
1797                     dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1798                                state->ld_rngi,state_local->ld_rngi);
1799                 }
1800                 break;
1801             case estDISRE_INITF:
1802             case estDISRE_RM3TAV:
1803             case estORIRE_INITF:
1804             case estORIRE_DTAV:
1805                 /* Not implemented yet */
1806                 break;
1807             default:
1808                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1809             }
1810         }
1811     }
1812 }
1813
1814 static char dim2char(int dim)
1815 {
1816     char c='?';
1817     
1818     switch (dim)
1819     {
1820     case XX: c = 'X'; break;
1821     case YY: c = 'Y'; break;
1822     case ZZ: c = 'Z'; break;
1823     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1824     }
1825     
1826     return c;
1827 }
1828
1829 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1830                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1831 {
1832     rvec grid_s[2],*grid_r=NULL,cx,r;
1833     char fname[STRLEN],format[STRLEN],buf[22];
1834     FILE *out;
1835     int  a,i,d,z,y,x;
1836     matrix tric;
1837     real vol;
1838
1839     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1840     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1841     
1842     if (DDMASTER(dd))
1843     {
1844         snew(grid_r,2*dd->nnodes);
1845     }
1846     
1847     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1848     
1849     if (DDMASTER(dd))
1850     {
1851         for(d=0; d<DIM; d++)
1852         {
1853             for(i=0; i<DIM; i++)
1854             {
1855                 if (d == i)
1856                 {
1857                     tric[d][i] = 1;
1858                 }
1859                 else
1860                 {
1861                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1862                     {
1863                         tric[d][i] = box[i][d]/box[i][i];
1864                     }
1865                     else
1866                     {
1867                         tric[d][i] = 0;
1868                     }
1869                 }
1870             }
1871         }
1872         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1873         sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1874         out = gmx_fio_fopen(fname,"w");
1875         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1876         a = 1;
1877         for(i=0; i<dd->nnodes; i++)
1878         {
1879             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1880             for(d=0; d<DIM; d++)
1881             {
1882                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1883             }
1884             for(z=0; z<2; z++)
1885             {
1886                 for(y=0; y<2; y++)
1887                 {
1888                     for(x=0; x<2; x++)
1889                     {
1890                         cx[XX] = grid_r[i*2+x][XX];
1891                         cx[YY] = grid_r[i*2+y][YY];
1892                         cx[ZZ] = grid_r[i*2+z][ZZ];
1893                         mvmul(tric,cx,r);
1894                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1895                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1896                     }
1897                 }
1898             }
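            /* Write CONECT records for the 12 edges of this cell,
             * connecting the 8 corner atoms written above.
             */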
1899             for(d=0; d<DIM; d++)
1900             {
1901                 for(x=0; x<4; x++)
1902                 {
1903                     switch(d)
1904                     {
1905                     case 0: y = 1 + i*8 + 2*x; break;
1906                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1907                     case 2: y = 1 + i*8 + x; break;
1908                     }
1909                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1910                 }
1911             }
1912         }
1913         gmx_fio_fclose(out);
1914         sfree(grid_r);
1915     }
1916 }
1917
1918 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1919                   gmx_mtop_t *mtop,t_commrec *cr,
1920                   int natoms,rvec x[],matrix box)
1921 {
1922     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1923     FILE *out;
1924     int  i,ii,resnr,c;
1925     char *atomname,*resname;
1926     real b;
1927     gmx_domdec_t *dd;
1928     
1929     dd = cr->dd;
1930     if (natoms == -1)
1931     {
1932         natoms = dd->comm->nat[ddnatVSITE];
1933     }
1934     
1935     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
1936     
1937     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1938     sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");
1939     
1940     out = gmx_fio_fopen(fname,"w");
1941     
1942     fprintf(out,"TITLE     %s\n",title);
1943     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
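    /* The occupancy column is written as 1.0 and the B-factor column encodes
     * the zone of each atom; atoms received for virtual sites get zones.n and
     * the remaining communicated atoms zones.n + 1.
     */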
1944     for(i=0; i<natoms; i++)
1945     {
1946         ii = dd->gatindex[i];
1947         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
1948         if (i < dd->comm->nat[ddnatZONE])
1949         {
1950             c = 0;
1951             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1952             {
1953                 c++;
1954             }
1955             b = c;
1956         }
1957         else if (i < dd->comm->nat[ddnatVSITE])
1958         {
1959             b = dd->comm->zones.n;
1960         }
1961         else
1962         {
1963             b = dd->comm->zones.n + 1;
1964         }
1965         fprintf(out,strlen(atomname)<4 ? format : format4,
1966                 "ATOM",(ii+1)%100000,
1967                 atomname,resname,' ',resnr%10000,' ',
1968                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
1969     }
1970     fprintf(out,"TER\n");
1971     
1972     gmx_fio_fclose(out);
1973 }
1974
1975 real dd_cutoff_mbody(gmx_domdec_t *dd)
1976 {
1977     gmx_domdec_comm_t *comm;
1978     int  di;
1979     real r;
1980
1981     comm = dd->comm;
1982
1983     r = -1;
1984     if (comm->bInterCGBondeds)
1985     {
1986         if (comm->cutoff_mbody > 0)
1987         {
1988             r = comm->cutoff_mbody;
1989         }
1990         else
1991         {
1992             /* cutoff_mbody=0 means we do not have DLB */
1993             r = comm->cellsize_min[dd->dim[0]];
1994             for(di=1; di<dd->ndim; di++)
1995             {
1996                 r = min(r,comm->cellsize_min[dd->dim[di]]);
1997             }
1998             if (comm->bBondComm)
1999             {
2000                 r = max(r,comm->cutoff_mbody);
2001             }
2002             else
2003             {
2004                 r = min(r,comm->cutoff);
2005             }
2006         }
2007     }
2008
2009     return r;
2010 }
2011
2012 real dd_cutoff_twobody(gmx_domdec_t *dd)
2013 {
2014     real r_mb;
2015
2016     r_mb = dd_cutoff_mbody(dd);
2017
2018     return max(dd->comm->cutoff,r_mb);
2019 }
2020
2021
2022 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2023 {
2024     int nc,ntot;
2025     
2026     nc   = dd->nc[dd->comm->cartpmedim];
2027     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2028     copy_ivec(coord,coord_pme);
2029     coord_pme[dd->comm->cartpmedim] =
2030         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2031 }
2032
2033 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2034 {
2035     /* Here we assign a PME node to communicate with this DD node
2036      * by assuming that the major index of both is x.
2037      * We add cr->npmenodes/2 to obtain an even distribution.
2038      */
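    /* For example, with ndd=8 and npme=4 this maps DD indices 0..7
     * to PME indices 0,0,1,1,2,2,3,3.
     */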
2039     return (ddindex*npme + npme/2)/ndd;
2040 }
2041
2042 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2043 {
2044     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2045 }
2046
2047 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2048 {
2049     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2050 }
2051
2052 static int *dd_pmenodes(t_commrec *cr)
2053 {
2054     int *pmenodes;
2055     int n,i,p0,p1;
2056     
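    /* This assumes interleaved rank ordering, where each PME rank directly
     * follows the group of PP ranks it serves; e.g. with 4 PP and 2 PME nodes
     * the PME ranks are 2 and 5.
     */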
2057     snew(pmenodes,cr->npmenodes);
2058     n = 0;
2059     for(i=0; i<cr->dd->nnodes; i++) {
2060         p0 = cr_ddindex2pmeindex(cr,i);
2061         p1 = cr_ddindex2pmeindex(cr,i+1);
2062         if (i+1 == cr->dd->nnodes || p1 > p0) {
2063             if (debug)
2064                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2065             pmenodes[n] = i + 1 + n;
2066             n++;
2067         }
2068     }
2069
2070     return pmenodes;
2071 }
2072
2073 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2074 {
2075     gmx_domdec_t *dd;
2076     ivec coords,coords_pme,nc;
2077     int  slab;
2078     
2079     dd = cr->dd;
2080     /*
2081       if (dd->comm->bCartesian) {
2082       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2083       dd_coords2pmecoords(dd,coords,coords_pme);
2084       copy_ivec(dd->ntot,nc);
2085       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2086       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2087       
2088       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2089       } else {
2090       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2091       }
2092     */
2093     coords[XX] = x;
2094     coords[YY] = y;
2095     coords[ZZ] = z;
2096     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2097     
2098     return slab;
2099 }
2100
2101 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2102 {
2103     gmx_domdec_comm_t *comm;
2104     ivec coords;
2105     int  ddindex,nodeid=-1;
2106     
2107     comm = cr->dd->comm;
2108     
2109     coords[XX] = x;
2110     coords[YY] = y;
2111     coords[ZZ] = z;
2112     if (comm->bCartesianPP_PME)
2113     {
2114 #ifdef GMX_MPI
2115         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2116 #endif
2117     }
2118     else
2119     {
2120         ddindex = dd_index(cr->dd->nc,coords);
2121         if (comm->bCartesianPP)
2122         {
2123             nodeid = comm->ddindex2simnodeid[ddindex];
2124         }
2125         else
2126         {
2127             if (comm->pmenodes)
2128             {
2129                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2130             }
2131             else
2132             {
2133                 nodeid = ddindex;
2134             }
2135         }
2136     }
2137   
2138     return nodeid;
2139 }
2140
2141 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2142 {
2143     gmx_domdec_t *dd;
2144     gmx_domdec_comm_t *comm;
2145     ivec coord,coord_pme;
2146     int  i;
2147     int  pmenode=-1;
2148     
2149     dd = cr->dd;
2150     comm = dd->comm;
2151     
2152     /* This assumes a uniform x domain decomposition grid cell size */
2153     if (comm->bCartesianPP_PME)
2154     {
2155 #ifdef GMX_MPI
2156         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2157         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2158         {
2159             /* This is a PP node */
2160             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2161             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2162         }
2163 #endif
2164     }
2165     else if (comm->bCartesianPP)
2166     {
2167         if (sim_nodeid < dd->nnodes)
2168         {
2169             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2170         }
2171     }
2172     else
2173     {
2174         /* This assumes DD cells with identical x coordinates
2175          * are numbered sequentially.
2176          */
2177         if (dd->comm->pmenodes == NULL)
2178         {
2179             if (sim_nodeid < dd->nnodes)
2180             {
2181                 /* The DD index equals the nodeid */
2182                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2183             }
2184         }
2185         else
2186         {
2187             i = 0;
2188             while (sim_nodeid > dd->comm->pmenodes[i])
2189             {
2190                 i++;
2191             }
2192             if (sim_nodeid < dd->comm->pmenodes[i])
2193             {
2194                 pmenode = dd->comm->pmenodes[i];
2195             }
2196         }
2197     }
2198     
2199     return pmenode;
2200 }
2201
2202 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2203 {
2204     gmx_bool bPMEOnlyNode;
2205     
2206     if (DOMAINDECOMP(cr))
2207     {
2208         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2209     }
2210     else
2211     {
2212         bPMEOnlyNode = FALSE;
2213     }
2214     
2215     return bPMEOnlyNode;
2216 }
2217
2218 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2219                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2220 {
2221     gmx_domdec_t *dd;
2222     int x,y,z;
2223     ivec coord,coord_pme;
2224     
2225     dd = cr->dd;
2226     
2227     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2228     
2229     *nmy_ddnodes = 0;
2230     for(x=0; x<dd->nc[XX]; x++)
2231     {
2232         for(y=0; y<dd->nc[YY]; y++)
2233         {
2234             for(z=0; z<dd->nc[ZZ]; z++)
2235             {
2236                 if (dd->comm->bCartesianPP_PME)
2237                 {
2238                     coord[XX] = x;
2239                     coord[YY] = y;
2240                     coord[ZZ] = z;
2241                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2242                     if (dd->ci[XX] == coord_pme[XX] &&
2243                         dd->ci[YY] == coord_pme[YY] &&
2244                         dd->ci[ZZ] == coord_pme[ZZ])
2245                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2246                 }
2247                 else
2248                 {
2249                     /* The slab corresponds to the nodeid in the PME group */
2250                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2251                     {
2252                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2253                     }
2254                 }
2255             }
2256         }
2257     }
2258     
2259     /* The last PP-only node is the peer node */
2260     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2261     
2262     if (debug)
2263     {
2264         fprintf(debug,"Receive coordinates from PP nodes:");
2265         for(x=0; x<*nmy_ddnodes; x++)
2266         {
2267             fprintf(debug," %d",(*my_ddnodes)[x]);
2268         }
2269         fprintf(debug,"\n");
2270     }
2271 }
2272
2273 static gmx_bool receive_vir_ener(t_commrec *cr)
2274 {
2275     gmx_domdec_comm_t *comm;
2276     int  pmenode,coords[DIM],rank;
2277     gmx_bool bReceive;
2278     
2279     bReceive = TRUE;
2280     if (cr->npmenodes < cr->dd->nnodes)
2281     {
2282         comm = cr->dd->comm;
2283         if (comm->bCartesianPP_PME)
2284         {
2285             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2286 #ifdef GMX_MPI
2287             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2288             coords[comm->cartpmedim]++;
2289             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2290             {
2291                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2292                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2293                 {
2294                     /* This is not the last PP node for pmenode */
2295                     bReceive = FALSE;
2296                 }
2297             }
2298 #endif  
2299         }
2300         else
2301         {
2302             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2303             if (cr->sim_nodeid+1 < cr->nnodes &&
2304                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2305             {
2306                 /* This is not the last PP node for pmenode */
2307                 bReceive = FALSE;
2308             }
2309         }
2310     }
2311     
2312     return bReceive;
2313 }
2314
2315 static void set_zones_ncg_home(gmx_domdec_t *dd)
2316 {
2317     gmx_domdec_zones_t *zones;
2318     int i;
2319
2320     zones = &dd->comm->zones;
2321
2322     zones->cg_range[0] = 0;
2323     for(i=1; i<zones->n+1; i++)
2324     {
2325         zones->cg_range[i] = dd->ncg_home;
2326     }
2327 }
2328
2329 static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
2330 {
2331     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2332     
2333     ind = state->cg_gl;
2334     dd_cg_gl = dd->index_gl;
2335     cgindex  = dd->cgindex;
2336     nat = 0;
2337     cgindex[0] = nat;
2338     for(i=0; i<state->ncg_gl; i++)
2339     {
2340         cgindex[i] = nat;
2341         cg_gl = ind[i];
2342         dd_cg_gl[i] = cg_gl;
2343         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2344     }
2345     cgindex[i] = nat;
2346     
2347     dd->ncg_home = state->ncg_gl;
2348     dd->nat_home = nat;
2349
2350     set_zones_ncg_home(dd);
2351 }
2352
2353 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2354 {
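    /* Locate the block this charge group belongs to; within a block the
     * cginfo entries repeat with period cg_mod, hence the modulo below.
     */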
2355     while (cg >= cginfo_mb->cg_end)
2356     {
2357         cginfo_mb++;
2358     }
2359
2360     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2361 }
2362
2363 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2364                           t_forcerec *fr,char *bLocalCG)
2365 {
2366     cginfo_mb_t *cginfo_mb;
2367     int *cginfo;
2368     int cg;
2369
2370     if (fr != NULL)
2371     {
2372         cginfo_mb = fr->cginfo_mb;
2373         cginfo    = fr->cginfo;
2374
2375         for(cg=cg0; cg<cg1; cg++)
2376         {
2377             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2378         }
2379     }
2380
2381     if (bLocalCG != NULL)
2382     {
2383         for(cg=cg0; cg<cg1; cg++)
2384         {
2385             bLocalCG[index_gl[cg]] = TRUE;
2386         }
2387     }
2388 }
2389
2390 static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
2391 {
2392     int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
2393     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2394     gmx_ga2la_t *ga2la;
2395     char *bLocalCG;
2396
2397     bLocalCG = dd->comm->bLocalCG;
2398
2399     if (dd->nat_tot > dd->gatindex_nalloc)
2400     {
2401         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2402         srenew(dd->gatindex,dd->gatindex_nalloc);
2403     }
2404
2405     nzone      = dd->comm->zones.n;
2406     zone2cg    = dd->comm->zones.cg_range;
2407     zone_ncg1  = dd->comm->zone_ncg1;
2408     index_gl   = dd->index_gl;
2409     gatindex   = dd->gatindex;
2410
2411     if (zone2cg[1] != dd->ncg_home)
2412     {
2413         gmx_incons("dd->ncg_zone is not up to date");
2414     }
2415     
2416     /* Make the local to global and global to local atom index */
2417     a = dd->cgindex[cg_start];
2418     for(zone=0; zone<nzone; zone++)
2419     {
2420         if (zone == 0)
2421         {
2422             cg0 = cg_start;
2423         }
2424         else
2425         {
2426             cg0 = zone2cg[zone];
2427         }
2428         for(cg=cg0; cg<zone2cg[zone+1]; cg++)
2429         {
2430             zone1 = zone;
2431             if (cg - cg0 >= zone_ncg1[zone])
2432             {
2433                 /* Signal that this cg is from more than one pulse away */
2434                 zone1 += nzone;
2435             }
2436             cg_gl = index_gl[cg];
2437             for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2438             {
2439                 gatindex[a] = a_gl;
2440                 ga2la_set(dd->ga2la,a_gl,a,zone1);
2441                 a++;
2442             }
2443         }
2444     }
2445 }
2446
2447 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2448                           const char *where)
2449 {
2450     int ncg,i,ngl,nerr;
2451
2452     nerr = 0;
2453     if (bLocalCG == NULL)
2454     {
2455         return nerr;
2456     }
2457     for(i=0; i<dd->ncg_tot; i++)
2458     {
2459         if (!bLocalCG[dd->index_gl[i]])
2460         {
2461             fprintf(stderr,
2462                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2463             nerr++;
2464         }
2465     }
2466     ngl = 0;
2467     for(i=0; i<ncg_sys; i++)
2468     {
2469         if (bLocalCG[i])
2470         {
2471             ngl++;
2472         }
2473     }
2474     if (ngl != dd->ncg_tot)
2475     {
2476         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2477         nerr++;
2478     }
2479
2480     return nerr;
2481 }
2482
2483 static void check_index_consistency(gmx_domdec_t *dd,
2484                                     int natoms_sys,int ncg_sys,
2485                                     const char *where)
2486 {
2487     int  nerr,ngl,i,a,cell;
2488     int  *have;
2489
2490     nerr = 0;
2491
2492     if (dd->comm->DD_debug > 1)
2493     {
2494         snew(have,natoms_sys);
2495         for(a=0; a<dd->nat_tot; a++)
2496         {
2497             if (have[dd->gatindex[a]] > 0)
2498             {
2499                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2500             }
2501             else
2502             {
2503                 have[dd->gatindex[a]] = a + 1;
2504             }
2505         }
2506         sfree(have);
2507     }
2508
2509     snew(have,dd->nat_tot);
2510
2511     ngl  = 0;
2512     for(i=0; i<natoms_sys; i++)
2513     {
2514         if (ga2la_get(dd->ga2la,i,&a,&cell))
2515         {
2516             if (a >= dd->nat_tot)
2517             {
2518                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2519                 nerr++;
2520             }
2521             else
2522             {
2523                 have[a] = 1;
2524                 if (dd->gatindex[a] != i)
2525                 {
2526                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2527                     nerr++;
2528                 }
2529             }
2530             ngl++;
2531         }
2532     }
2533     if (ngl != dd->nat_tot)
2534     {
2535         fprintf(stderr,
2536                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2537                 dd->rank,where,ngl,dd->nat_tot);
2538     }
2539     for(a=0; a<dd->nat_tot; a++)
2540     {
2541         if (have[a] == 0)
2542         {
2543             fprintf(stderr,
2544                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2545                     dd->rank,where,a+1,dd->gatindex[a]+1);
2546         }
2547     }
2548     sfree(have);
2549
2550     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2551
2552     if (nerr > 0) {
2553         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2554                   dd->rank,where,nerr);
2555     }
2556 }
2557
2558 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2559 {
2560     int  i;
2561     char *bLocalCG;
2562
2563     if (a_start == 0)
2564     {
2565         /* Clear the whole list without searching */
2566         ga2la_clear(dd->ga2la);
2567     }
2568     else
2569     {
2570         for(i=a_start; i<dd->nat_tot; i++)
2571         {
2572             ga2la_del(dd->ga2la,dd->gatindex[i]);
2573         }
2574     }
2575
2576     bLocalCG = dd->comm->bLocalCG;
2577     if (bLocalCG)
2578     {
2579         for(i=cg_start; i<dd->ncg_tot; i++)
2580         {
2581             bLocalCG[dd->index_gl[i]] = FALSE;
2582         }
2583     }
2584
2585     dd_clear_local_vsite_indices(dd);
2586     
2587     if (dd->constraints)
2588     {
2589         dd_clear_local_constraint_indices(dd);
2590     }
2591 }
2592
2593 static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
2594 {
2595     real grid_jump_limit;
2596
2597     /* The distance between the boundaries of cells at distance
2598      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2599      * and by the fact that cells should not be shifted by more than
2600      * half their size, such that cg's only shift by one cell
2601      * at redecomposition.
2602      */
2603     grid_jump_limit = comm->cellsize_limit;
2604     if (!comm->bVacDLBNoLimit)
2605     {
2606         grid_jump_limit = max(grid_jump_limit,
2607                               comm->cutoff/comm->cd[dim_ind].np);
2608     }
2609
2610     return grid_jump_limit;
2611 }
2612
2613 static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2614 {
2615     gmx_domdec_comm_t *comm;
2616     int  d,dim;
2617     real limit,bfac;
2618     
2619     comm = dd->comm;
2620     
2621     for(d=1; d<dd->ndim; d++)
2622     {
2623         dim = dd->dim[d];
2624         limit = grid_jump_limit(comm,d);
2625         bfac = ddbox->box_size[dim];
2626         if (ddbox->tric_dir[dim])
2627         {
2628             bfac *= ddbox->skew_fac[dim];
2629         }
2630         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2631             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2632         {
2633             char buf[22];
2634             gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
2635                       gmx_step_str(step,buf),
2636                       dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2637         }
2638     }
2639 }
2640
2641 static int dd_load_count(gmx_domdec_comm_t *comm)
2642 {
2643     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2644 }
2645
2646 static float dd_force_load(gmx_domdec_comm_t *comm)
2647 {
2648     float load;
2649     
2650     if (comm->eFlop)
2651     {
2652         load = comm->flop;
2653         if (comm->eFlop > 1)
2654         {
2655             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2656         }
2657     } 
2658     else
2659     {
2660         load = comm->cycl[ddCyclF];
2661         if (comm->cycl_n[ddCyclF] > 1)
2662         {
2663             /* Subtract the maximum of the last n cycle counts
2664              * to get rid of possible high counts due to other sources,
2665              * for instance system activity, that would otherwise
2666              * affect the dynamic load balancing.
2667              */
2668             load -= comm->cycl_max[ddCyclF];
2669         }
2670     }
2671     
2672     return load;
2673 }
2674
2675 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2676 {
2677     gmx_domdec_comm_t *comm;
2678     int i;
2679     
2680     comm = dd->comm;
2681     
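    /* Build the cumulative cell fraction array for this dimension;
     * e.g. with 4 equally sized cells this gives 0, 0.25, 0.5, 0.75, 1.
     */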
2682     snew(*dim_f,dd->nc[dim]+1);
2683     (*dim_f)[0] = 0;
2684     for(i=1; i<dd->nc[dim]; i++)
2685     {
2686         if (comm->slb_frac[dim])
2687         {
2688             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2689         }
2690         else
2691         {
2692             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2693         }
2694     }
2695     (*dim_f)[dd->nc[dim]] = 1;
2696 }
2697
2698 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2699 {
2700     int  pmeindex,slab,nso,i;
2701     ivec xyz;
2702     
2703     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2704     {
2705         ddpme->dim = YY;
2706     }
2707     else
2708     {
2709         ddpme->dim = dimind;
2710     }
2711     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2712     
2713     ddpme->nslab = (ddpme->dim == 0 ?
2714                     dd->comm->npmenodes_x :
2715                     dd->comm->npmenodes_y);
2716
2717     if (ddpme->nslab <= 1)
2718     {
2719         return;
2720     }
2721
2722     nso = dd->comm->npmenodes/ddpme->nslab;
2723     /* Determine for each PME slab the PP location range for dimension dim */
2724     snew(ddpme->pp_min,ddpme->nslab);
2725     snew(ddpme->pp_max,ddpme->nslab);
2726     for(slab=0; slab<ddpme->nslab; slab++) {
2727         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2728         ddpme->pp_max[slab] = 0;
2729     }
2730     for(i=0; i<dd->nnodes; i++) {
2731         ddindex2xyz(dd->nc,i,xyz);
2732         /* For y only use our y/z slab.
2733          * This assumes that the PME x grid size matches the DD grid size.
2734          */
2735         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2736             pmeindex = ddindex2pmeindex(dd,i);
2737             if (dimind == 0) {
2738                 slab = pmeindex/nso;
2739             } else {
2740                 slab = pmeindex % ddpme->nslab;
2741             }
2742             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2743             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2744         }
2745     }
2746
2747     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2748 }
2749
2750 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2751 {
2752     if (dd->comm->ddpme[0].dim == XX)
2753     {
2754         return dd->comm->ddpme[0].maxshift;
2755     }
2756     else
2757     {
2758         return 0;
2759     }
2760 }
2761
2762 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2763 {
2764     if (dd->comm->ddpme[0].dim == YY)
2765     {
2766         return dd->comm->ddpme[0].maxshift;
2767     }
2768     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2769     {
2770         return dd->comm->ddpme[1].maxshift;
2771     }
2772     else
2773     {
2774         return 0;
2775     }
2776 }
2777
2778 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2779                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2780 {
2781     gmx_domdec_comm_t *comm;
2782     int  nc,ns,s;
2783     int  *xmin,*xmax;
2784     real range,pme_boundary;
2785     int  sh;
2786     
2787     comm = dd->comm;
2788     nc  = dd->nc[ddpme->dim];
2789     ns  = ddpme->nslab;
2790     
2791     if (!ddpme->dim_match)
2792     {
2793         /* PP decomposition is not along dim: the worst situation */
2794         sh = ns/2;
2795     }
2796     else if (ns <= 3 || (bUniform && ns == nc))
2797     {
2798         /* The optimal situation */
2799         sh = 1;
2800     }
2801     else
2802     {
2803         /* We need to check for all pme nodes which nodes they
2804          * could possibly need to communicate with.
2805          */
2806         xmin = ddpme->pp_min;
2807         xmax = ddpme->pp_max;
2808         /* Allow for atoms to be maximally 2/3 times the cut-off
2809          * out of their DD cell. This is a reasonable balance between
2810          * performance and support for most charge-group/cut-off
2811          * combinations.
2812          */
2813         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2814         /* Avoid extra communication when we are exactly at a boundary */
2815         range *= 0.999;
2816         
2817         sh = 1;
2818         for(s=0; s<ns; s++)
2819         {
2820             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2821             pme_boundary = (real)s/ns;
2822             while (sh+1 < ns &&
2823                    ((s-(sh+1) >= 0 &&
2824                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2825                     (s-(sh+1) <  0 &&
2826                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2827             {
2828                 sh++;
2829             }
2830             pme_boundary = (real)(s+1)/ns;
2831             while (sh+1 < ns &&
2832                    ((s+(sh+1) <  ns &&
2833                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2834                     (s+(sh+1) >= ns &&
2835                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2836             {
2837                 sh++;
2838             }
2839         }
2840     }
2841     
2842     ddpme->maxshift = sh;
2843     
2844     if (debug)
2845     {
2846         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2847                 ddpme->dim,ddpme->maxshift);
2848     }
2849 }
2850
2851 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2852 {
2853     int d,dim;
2854     
2855     for(d=0; d<dd->ndim; d++)
2856     {
2857         dim = dd->dim[d];
2858         if (dim < ddbox->nboundeddim &&
2859             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2860             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2861         {
2862             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2863                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2864                       dd->nc[dim],dd->comm->cellsize_limit);
2865         }
2866     }
2867 }
2868
2869 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2870                                   gmx_bool bMaster,ivec npulse)
2871 {
2872     gmx_domdec_comm_t *comm;
2873     int  d,j;
2874     rvec cellsize_min;
2875     real *cell_x,cell_dx,cellsize;
2876     
2877     comm = dd->comm;
2878     
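    /* For each dimension determine the cell boundaries and the number of
     * communication pulses (npulse) of neighbor cells needed to cover the
     * cut-off, given the smallest cell size in that dimension.
     */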
2879     for(d=0; d<DIM; d++)
2880     {
2881         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2882         npulse[d] = 1;
2883         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2884         {
2885             /* Uniform grid */
2886             cell_dx = ddbox->box_size[d]/dd->nc[d];
2887             if (bMaster)
2888             {
2889                 for(j=0; j<dd->nc[d]+1; j++)
2890                 {
2891                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2892                 }
2893             }
2894             else
2895             {
2896                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
2897                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2898             }
2899             cellsize = cell_dx*ddbox->skew_fac[d];
2900             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
2901             {
2902                 npulse[d]++;
2903             }
2904             cellsize_min[d] = cellsize;
2905         }
2906         else
2907         {
2908             /* Statically load balanced grid */
2909             /* Even when we are not doing a master distribution, we determine
2910              * all cell borders in a loop to obtain values identical to the
2911              * master distribution case and to determine npulse.
2912              */
2913             if (bMaster)
2914             {
2915                 cell_x = dd->ma->cell_x[d];
2916             }
2917             else
2918             {
2919                 snew(cell_x,dd->nc[d]+1);
2920             }
2921             cell_x[0] = ddbox->box0[d];
2922             for(j=0; j<dd->nc[d]; j++)
2923             {
2924                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2925                 cell_x[j+1] = cell_x[j] + cell_dx;
2926                 cellsize = cell_dx*ddbox->skew_fac[d];
2927                 while (cellsize*npulse[d] < comm->cutoff &&
2928                        npulse[d] < dd->nc[d]-1)
2929                 {
2930                     npulse[d]++;
2931                 }
2932                 cellsize_min[d] = min(cellsize_min[d],cellsize);
2933             }
2934             if (!bMaster)
2935             {
2936                 comm->cell_x0[d] = cell_x[dd->ci[d]];
2937                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2938                 sfree(cell_x);
2939             }
2940         }
2941         /* The following limitation avoids a cell receiving some of its own
2942          * home charge groups back over the periodic boundary.
2943          * Duplicate charge groups cause trouble with the global indices.
2944          */
2945         if (d < ddbox->npbcdim &&
2946             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2947         {
2948             gmx_fatal_collective(FARGS,NULL,dd,
2949                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2950                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
2951                                  comm->cutoff,
2952                                  dd->nc[d],dd->nc[d],
2953                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
2954         }
2955     }
2956     
2957     if (!comm->bDynLoadBal)
2958     {
2959         copy_rvec(cellsize_min,comm->cellsize_min);
2960     }
2961    
2962     for(d=0; d<comm->npmedecompdim; d++)
2963     {
2964         set_pme_maxshift(dd,&comm->ddpme[d],
2965                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
2966                          comm->ddpme[d].slb_dim_f);
2967     }
2968 }
2969
2970
2971 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2972                                        int d,int dim,gmx_domdec_root_t *root,
2973                                        gmx_ddbox_t *ddbox,
2974                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
2975 {
2976     gmx_domdec_comm_t *comm;
2977     int  ncd,i,j,nmin,nmin_old;
2978     gmx_bool bLimLo,bLimHi;
2979     real *cell_size;
2980     real fac,halfway,cellsize_limit_f_i,region_size;
2981     gmx_bool bPBC,bLastHi=FALSE;
2982     int nrange[]={range[0],range[1]};
2983
2984     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
2985
2986     comm = dd->comm;
2987
2988     ncd = dd->nc[dim];
2989
2990     bPBC = (dim < ddbox->npbcdim);
2991
2992     cell_size = root->buf_ncd;
2993
2994     if (debug) 
2995     {
2996         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
2997     }
2998
2999     /* First we need to check that the scaling does not make cells
3000      * smaller than the smallest allowed size.
3001      * We need to do this iteratively, since if a cell is too small,
3002      * it needs to be enlarged, which makes all the other cells smaller,
3003      * which could in turn make another cell smaller than allowed.
3004      */
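    /* For example, three cells of relative size 0.4, 0.1 and 0.5 with a
     * limit of 0.2: the middle cell is raised to 0.2 and the remaining 0.8
     * is redistributed over the other two (about 0.36 and 0.44), which are
     * then checked against the limit again.
     */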
3005     for(i=range[0]; i<range[1]; i++)
3006     {
3007         root->bCellMin[i] = FALSE;
3008     }
3009     nmin = 0;
3010     do
3011     {
3012         nmin_old = nmin;
3013         /* We need the total for normalization */
3014         fac = 0;
3015         for(i=range[0]; i<range[1]; i++)
3016         {
3017             if (root->bCellMin[i] == FALSE)
3018             {
3019                 fac += cell_size[i];
3020             }
3021         }
3022         fac = (region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
3023         /* Determine the cell boundaries */
3024         for(i=range[0]; i<range[1]; i++)
3025         {
3026             if (root->bCellMin[i] == FALSE)
3027             {
3028                 cell_size[i] *= fac;
3029                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3030                 {
3031                     cellsize_limit_f_i = 0;
3032                 }
3033                 else
3034                 {
3035                     cellsize_limit_f_i = cellsize_limit_f;
3036                 }
3037                 if (cell_size[i] < cellsize_limit_f_i)
3038                 {
3039                     root->bCellMin[i] = TRUE;
3040                     cell_size[i] = cellsize_limit_f_i;
3041                     nmin++;
3042                 }
3043             }
3044             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3045         }
3046     }
3047     while (nmin > nmin_old);
3048     
3049     i=range[1]-1;
3050     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3051     /* For this check we should not use DD_CELL_MARGIN,
3052      * but a slightly smaller factor,
3053      * since rounding could get us below the limit.
3054      */
3055     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3056     {
3057         char buf[22];
3058         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3059                   gmx_step_str(step,buf),
3060                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3061                   ncd,comm->cellsize_min[dim]);
3062     }
3063     
3064     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3065     
3066     if (!bUniform)
3067     {
3068         /* Check that no boundary has moved by more than halfway into
3069          * either of the cells it bounds, as this could cause problems,
3070          * especially when the differences between cell sizes are large.
3071          * If changes are applied, they will not make cells smaller
3072          * than the cut-off, as we check all the boundaries which
3073          * might be affected by a change and if the old state was ok,
3074          * the cells will at most be shrunk back to their old size.
3075          */
3076         for(i=range[0]+1; i<range[1]; i++)
3077         {
3078             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3079             if (root->cell_f[i] < halfway)
3080             {
3081                 root->cell_f[i] = halfway;
3082                 /* Check if the change also causes shifts of the next boundaries */
3083                 for(j=i+1; j<range[1]; j++)
3084                 {
3085                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3086                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3087                 }
3088             }
3089             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3090             if (root->cell_f[i] > halfway)
3091             {
3092                 root->cell_f[i] = halfway;
3093                 /* Check if the change also causes shifts of the next boundaries */
3094                 for(j=i-1; j>=range[0]+1; j--)
3095                 {
3096                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3097                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3098                 }
3099             }
3100         }
3101     }
3102     
3103     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3104     /* Find the highest violation of LimLo (a) and the following (lowest) violation
3105      * of LimHi (b), then call enforce_limits for (oldb,a) and (a,b). In the next
3106      * step: (b,nexta). oldb and nexta can be the range boundaries; nrange holds a and b. */
3107     if (d > 0)
3108     {
3109         /* Take care of the staggering of the cell boundaries */
3110         if (bUniform)
3111         {
3112             for(i=range[0]; i<range[1]; i++)
3113             {
3114                 root->cell_f_max0[i] = root->cell_f[i];
3115                 root->cell_f_min1[i] = root->cell_f[i+1];
3116             }
3117         }
3118         else
3119         {
3120             for(i=range[0]+1; i<range[1]; i++)
3121             {
3122                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3123                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3124                 if (bLimLo && bLimHi)
3125                 {
3126                     /* Both limits violated, try the best we can */
3127                     /* For this case we split the original range (range) into two parts and handle the other limitations in the next iteration. */
3128                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3129                     nrange[0]=range[0];
3130                     nrange[1]=i;
3131                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3132
3133                     nrange[0]=i;
3134                     nrange[1]=range[1];
3135                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3136
3137                     return;
3138                 }
3139                 else if (bLimLo)
3140                 {
3141                     /* root->cell_f[i] = root->bound_min[i]; */
3142                     nrange[1]=i;  /* only store the violation location; there could be a following LimLo violation with a higher index */
3143                     bLastHi=FALSE;
3144                 }
3145                 else if (bLimHi && !bLastHi)
3146                 {
3147                     bLastHi=TRUE;
3148                     if (nrange[1] < range[1])   /* found a LimLo before */
3149                     {
3150                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3151                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3152                         nrange[0]=nrange[1];
3153                     }
3154                     root->cell_f[i] = root->bound_max[i];
3155                     nrange[1]=i; 
3156                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3157                     nrange[0]=i;
3158                     nrange[1]=range[1];
3159                 }
3160             }
3161             if (nrange[1] < range[1])   /* a LimLo was found last */
3162             {
3163                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3164                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3165                 nrange[0]=nrange[1];
3166                 nrange[1]=range[1];
3167                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3168             } 
3169             else if (nrange[0] > range[0]) /* found at least one LimHi */
3170             {
3171                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3172             }
3173         }
3174     }
3175 }
3176
3177
3178 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3179                                        int d,int dim,gmx_domdec_root_t *root,
3180                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3181                                        gmx_bool bUniform,gmx_large_int_t step)
3182 {
3183     gmx_domdec_comm_t *comm;
3184     int  ncd,d1,i,j,pos;
3185     real *cell_size;
3186     real load_aver,load_i,imbalance,change,change_max,sc;
3187     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3188     real change_limit;
3189     real relax = 0.5;
3190     gmx_bool bPBC;
3191     int range[] = { 0, 0 };
3192
3193     comm = dd->comm;
3194
3195     /* Convert the maximum change from the input percentage to a fraction */
3196     change_limit = comm->dlb_scale_lim*0.01;
3197
3198     ncd = dd->nc[dim];
3199
3200     bPBC = (dim < ddbox->npbcdim);
3201
3202     cell_size = root->buf_ncd;
3203
3204     /* Store the original boundaries */
3205     for(i=0; i<ncd+1; i++)
3206     {
3207         root->old_cell_f[i] = root->cell_f[i];
3208     }
3209     if (bUniform) {
3210         for(i=0; i<ncd; i++)
3211         {
3212             cell_size[i] = 1.0/ncd;
3213         }
3214     }
3215     else if (dd_load_count(comm))
3216     {
3217         load_aver = comm->load[d].sum_m/ncd;
3218         change_max = 0;
3219         for(i=0; i<ncd; i++)
3220         {
3221             /* Determine the relative imbalance of cell i */
3222             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3223             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3224             /* Determine the change of the cell size using underrelaxation */
3225             change = -relax*imbalance;
3226             change_max = max(change_max,max(change,-change));
3227         }
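        /* With relax = 0.5, a cell that is e.g. 20% above the average load
         * is shrunk by 10%, as long as change_max stays within change_limit.
         */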
3228         /* Limit the amount of scaling.
3229          * We need to use the same rescaling for all cells in one row,
3230          * otherwise the load balancing might not converge.
3231          */
3232         sc = relax;
3233         if (change_max > change_limit)
3234         {
3235             sc *= change_limit/change_max;
3236         }
3237         for(i=0; i<ncd; i++)
3238         {
3239             /* Determine the relative imbalance of cell i */
3240             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3241             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3242             /* Determine the change of the cell size using underrelaxation */
3243             change = -sc*imbalance;
3244             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3245         }
3246     }
3247     
3248     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3249     cellsize_limit_f *= DD_CELL_MARGIN;
3250     dist_min_f_hard        = grid_jump_limit(comm,d)/ddbox->box_size[dim];
3251     dist_min_f       = dist_min_f_hard * DD_CELL_MARGIN;
3252     if (ddbox->tric_dir[dim])
3253     {
3254         cellsize_limit_f /= ddbox->skew_fac[dim];
3255         dist_min_f       /= ddbox->skew_fac[dim];
3256     }
3257     if (bDynamicBox && d > 0)
3258     {
3259         dist_min_f *= DD_PRES_SCALE_MARGIN;
3260     }
3261     if (d > 0 && !bUniform)
3262     {
3263         /* Make sure that the grid is not shifted too much */
3264         for(i=1; i<ncd; i++) {
3265             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
3266             {
3267                 gmx_incons("Inconsistent DD boundary staggering limits!");
3268             }
3269             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3270             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3271             if (space > 0) {
3272                 root->bound_min[i] += 0.5*space;
3273             }
3274             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3275             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3276             if (space < 0) {
3277                 root->bound_max[i] += 0.5*space;
3278             }
3279             if (debug)
3280             {
3281                 fprintf(debug,
3282                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3283                         d,i,
3284                         root->cell_f_max0[i-1] + dist_min_f,
3285                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3286                         root->cell_f_min1[i] - dist_min_f);
3287             }
3288         }
3289     }
3290     range[1]=ncd;
3291     root->cell_f[0] = 0;
3292     root->cell_f[ncd] = 1;
3293     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3294
3295
3296     /* After the checks above, the cells should obey the cut-off
3297      * restrictions, but it does not hurt to check.
3298      */
3299     for(i=0; i<ncd; i++)
3300     {
3301         if (debug)
3302         {
3303             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3304                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3305         }
3306
3307         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3308             root->cell_f[i+1] - root->cell_f[i] <
3309             cellsize_limit_f/DD_CELL_MARGIN)
3310         {
3311             char buf[22];
3312             fprintf(stderr,
3313                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3314                     gmx_step_str(step,buf),dim2char(dim),i,
3315                     (root->cell_f[i+1] - root->cell_f[i])
3316                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3317         }
3318     }
3319     
3320     pos = ncd + 1;
3321     /* Store the cell boundaries of the lower dimensions at the end */
3322     for(d1=0; d1<d; d1++)
3323     {
3324         root->cell_f[pos++] = comm->cell_f0[d1];
3325         root->cell_f[pos++] = comm->cell_f1[d1];
3326     }
3327     
3328     if (d < comm->npmedecompdim)
3329     {
3330         /* The master determines the maximum shift for
3331          * the coordinate communication between separate PME nodes.
3332          */
3333         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3334     }
3335     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3336     if (d >= 1)
3337     {
3338         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3339     }
3340 }    
3341
3342 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3343                                              gmx_ddbox_t *ddbox,int dimind)
3344 {
3345     gmx_domdec_comm_t *comm;
3346     int dim;
3347
3348     comm = dd->comm;
3349
3350     /* Set the cell dimensions */
3351     dim = dd->dim[dimind];
3352     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3353     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3354     if (dim >= ddbox->nboundeddim)
3355     {
3356         comm->cell_x0[dim] += ddbox->box0[dim];
3357         comm->cell_x1[dim] += ddbox->box0[dim];
3358     }
3359 }
3360
3361 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3362                                          int d,int dim,real *cell_f_row,
3363                                          gmx_ddbox_t *ddbox)
3364 {
3365     gmx_domdec_comm_t *comm;
3366     int d1,dim1,pos;
3367
3368     comm = dd->comm;
3369
3370 #ifdef GMX_MPI
3371     /* Each node would only need to know two fractions,
3372      * but it is probably cheaper to broadcast the whole array.
3373      */
3374     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3375               0,comm->mpi_comm_load[d]);
3376 #endif
3377     /* Copy the fractions for this dimension from the buffer */
3378     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3379     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3380     /* The whole array was communicated, so set the buffer position */
3381     pos = dd->nc[dim] + 1;
3382     for(d1=0; d1<=d; d1++)
3383     {
3384         if (d1 < d)
3385         {
3386             /* Copy the cell fractions of the lower dimensions */
3387             comm->cell_f0[d1] = cell_f_row[pos++];
3388             comm->cell_f1[d1] = cell_f_row[pos++];
3389         }
3390         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3391     }
3392     /* Convert the communicated PME maxshift from real back to int */
3393     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3394     if (d >= 1)
3395     {
3396         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3397     }
3398 }
3399
3400 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3401                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3402                                          gmx_bool bUniform,gmx_large_int_t step)
3403 {
3404     gmx_domdec_comm_t *comm;
3405     int d,dim,d1;
3406     gmx_bool bRowMember,bRowRoot;
3407     real *cell_f_row;
3408     
3409     comm = dd->comm;
3410
3411     for(d=0; d<dd->ndim; d++)
3412     {
3413         dim = dd->dim[d];
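        /* A rank takes part in the boundary communication for DD dimension d
         * only when its grid coordinate is 0 in all dimensions that come
         * after d (bRowMember); it is the root of that row when its
         * coordinate in dimension d itself is also 0 (bRowRoot).
         */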
3414         bRowMember = TRUE;
3415         bRowRoot = TRUE;
3416         for(d1=d; d1<dd->ndim; d1++)
3417         {
3418             if (dd->ci[dd->dim[d1]] > 0)
3419             {
3420                 if (d1 > d)
3421                 {
3422                     bRowMember = FALSE;
3423                 }
3424                 bRowRoot = FALSE;
3425             }
3426         }
3427         if (bRowMember)
3428         {
3429             if (bRowRoot)
3430             {
3431                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3432                                            ddbox,bDynamicBox,bUniform,step);
3433                 cell_f_row = comm->root[d]->cell_f;
3434             }
3435             else
3436             {
3437                 cell_f_row = comm->cell_f_row;
3438             }
3439             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3440         }
3441     }
3442 }    
3443
3444 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3445 {
3446     int d;
3447
3448     /* The relative DD cell fractions have not changed here;
3449      * we only recompute the absolute cell boundaries from the
3450      * current box, which may have changed.
3451      */
3452     for(d=0; d<dd->ndim; d++)
3453     {
3454         relative_to_absolute_cell_bounds(dd,ddbox,d); 
3455     }
3456 }
3457
3458
3459
3460 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3461                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3462                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3463                                   gmx_wallcycle_t wcycle)
3464 {
3465     gmx_domdec_comm_t *comm;
3466     int dim;
3467
3468     comm = dd->comm;
3469     
3470     if (bDoDLB)
3471     {
3472         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3473         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3474         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3475     }
3476     else if (bDynamicBox)
3477     {
3478         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3479     }
3480     
3481     /* Set the dimensions for which no DD is used */
3482     for(dim=0; dim<DIM; dim++) {
3483         if (dd->nc[dim] == 1) {
3484             comm->cell_x0[dim] = 0;
3485             comm->cell_x1[dim] = ddbox->box_size[dim];
3486             if (dim >= ddbox->nboundeddim)
3487             {
3488                 comm->cell_x0[dim] += ddbox->box0[dim];
3489                 comm->cell_x1[dim] += ddbox->box0[dim];
3490             }
3491         }
3492     }
3493 }
3494
3495 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3496 {
3497     int d,np,i;
3498     gmx_domdec_comm_dim_t *cd;
3499     
3500     for(d=0; d<dd->ndim; d++)
3501     {
3502         cd = &dd->comm->cd[d];
3503         np = npulse[dd->dim[d]];
3504         if (np > cd->np_nalloc)
3505         {
3506             if (debug)
3507             {
3508                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3509                         dim2char(dd->dim[d]),np);
3510             }
3511             if (DDMASTER(dd) && cd->np_nalloc > 0)
3512             {
3513                 fprintf(stderr,"\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3514             }
3515             srenew(cd->ind,np);
3516             for(i=cd->np_nalloc; i<np; i++)
3517             {
3518                 cd->ind[i].index  = NULL;
3519                 cd->ind[i].nalloc = 0;
3520             }
3521             cd->np_nalloc = np;
3522         }
3523         cd->np = np;
3524     }
3525 }
3526
3527
3528 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3529                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3530                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3531                               gmx_wallcycle_t wcycle)
3532 {
3533     gmx_domdec_comm_t *comm;
3534     int  d;
3535     ivec npulse;
3536     
3537     comm = dd->comm;
3538
3539     /* Copy the old cell boundaries for the cg displacement check */
3540     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3541     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3542     
3543     if (comm->bDynLoadBal)
3544     {
3545         if (DDMASTER(dd))
3546         {
3547             check_box_size(dd,ddbox);
3548         }
3549         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3550     }
3551     else
3552     {
3553         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3554         realloc_comm_ind(dd,npulse);
3555     }
3556     
3557     if (debug)
3558     {
3559         for(d=0; d<DIM; d++)
3560         {
3561             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3562                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3563         }
3564     }
3565 }
3566
3567 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3568                                   gmx_ddbox_t *ddbox,
3569                                   rvec cell_ns_x0,rvec cell_ns_x1,
3570                                   gmx_large_int_t step)
3571 {
3572     gmx_domdec_comm_t *comm;
3573     int dim_ind,dim;
3574     
3575     comm = dd->comm;
3576
3577     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3578     {
3579         dim = dd->dim[dim_ind];
3580         
3581         /* Without PBC we don't have restrictions on the outer cells */
3582         if (!(dim >= ddbox->npbcdim && 
3583               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3584             comm->bDynLoadBal &&
3585             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3586             comm->cellsize_min[dim])
3587         {
3588             char buf[22];
3589             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3590                       gmx_step_str(step,buf),dim2char(dim),
3591                       comm->cell_x1[dim] - comm->cell_x0[dim],
3592                       ddbox->skew_fac[dim],
3593                       dd->comm->cellsize_min[dim],
3594                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3595         }
3596     }
3597     
3598     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3599     {
3600         /* Communicate the boundaries and update cell_ns_x0/1 */
3601         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3602         if (dd->bGridJump && dd->ndim > 1)
3603         {
3604             check_grid_jump(step,dd,ddbox);
3605         }
3606     }
3607 }
3608
3609 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3610 {
3611     if (YY < npbcdim)
3612     {
3613         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3614     }
3615     else
3616     {
3617         tcm[YY][XX] = 0;
3618     }
3619     if (ZZ < npbcdim)
3620     {
3621         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3622         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3623     }
3624     else
3625     {
3626         tcm[ZZ][XX] = 0;
3627         tcm[ZZ][YY] = 0;
3628     }
3629 }
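/* The correction matrix is applied as
 *   pos_d = r[d] + sum_{j>d} r[j]*tcm[j][d]
 * (see distribute_cg and dd_redistribute_cg). For the lower-triangular
 * GROMACS box this gives the fractional coordinate along box vector d
 * scaled by box[d][d], so pos_d can be compared directly against box[d][d]
 * for pbc wrapping and against the cell boundaries along d.
 */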
3630
3631 static void check_screw_box(matrix box)
3632 {
3633     /* Mathematical limitation */
3634     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3635     {
3636         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3637     }
3638     
3639     /* Limitation due to the asymmetry of the eighth shell method */
3640     if (box[ZZ][YY] != 0)
3641     {
3642         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3643     }
3644 }
3645
3646 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3647                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3648                           gmx_domdec_t *dd)
3649 {
3650     gmx_domdec_master_t *ma;
3651     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3652     int  i,icg,j,k,k0,k1,d,npbcdim;
3653     matrix tcm;
3654     rvec box_size,cg_cm;
3655     ivec ind;
3656     real nrcg,inv_ncg,pos_d;
3657     atom_id *cgindex;
3658     gmx_bool bUnbounded,bScrew;
3659
3660     ma = dd->ma;
3661     
3662     if (tmp_ind == NULL)
3663     {
3664         snew(tmp_nalloc,dd->nnodes);
3665         snew(tmp_ind,dd->nnodes);
3666         for(i=0; i<dd->nnodes; i++)
3667         {
3668             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3669             snew(tmp_ind[i],tmp_nalloc[i]);
3670         }
3671     }
3672     
3673     /* Clear the count */
3674     for(i=0; i<dd->nnodes; i++)
3675     {
3676         ma->ncg[i] = 0;
3677         ma->nat[i] = 0;
3678     }
3679     
3680     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3681     
3682     cgindex = cgs->index;
3683     
3684     /* Compute the center of geometry for all charge groups */
3685     for(icg=0; icg<cgs->nr; icg++)
3686     {
3687         k0      = cgindex[icg];
3688         k1      = cgindex[icg+1];
3689         nrcg    = k1 - k0;
3690         if (nrcg == 1)
3691         {
3692             copy_rvec(pos[k0],cg_cm);
3693         }
3694         else
3695         {
3696             inv_ncg = 1.0/nrcg;
3697             
3698             clear_rvec(cg_cm);
3699             for(k=k0; (k<k1); k++)
3700             {
3701                 rvec_inc(cg_cm,pos[k]);
3702             }
3703             for(d=0; (d<DIM); d++)
3704             {
3705                 cg_cm[d] *= inv_ncg;
3706             }
3707         }
3708         /* Put the charge group in the box and determine the cell index */
3709         for(d=DIM-1; d>=0; d--) {
3710             pos_d = cg_cm[d];
3711             if (d < dd->npbcdim)
3712             {
3713                 bScrew = (dd->bScrewPBC && d == XX);
3714                 if (tric_dir[d] && dd->nc[d] > 1)
3715                 {
3716                     /* Use triclinic coordinates for this dimension */
3717                     for(j=d+1; j<DIM; j++)
3718                     {
3719                         pos_d += cg_cm[j]*tcm[j][d];
3720                     }
3721                 }
3722                 while(pos_d >= box[d][d])
3723                 {
3724                     pos_d -= box[d][d];
3725                     rvec_dec(cg_cm,box[d]);
3726                     if (bScrew)
3727                     {
3728                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3729                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3730                     }
3731                     for(k=k0; (k<k1); k++)
3732                     {
3733                         rvec_dec(pos[k],box[d]);
3734                         if (bScrew)
3735                         {
3736                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3737                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3738                         }
3739                     }
3740                 }
3741                 while(pos_d < 0)
3742                 {
3743                     pos_d += box[d][d];
3744                     rvec_inc(cg_cm,box[d]);
3745                     if (bScrew)
3746                     {
3747                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3748                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3749                     }
3750                     for(k=k0; (k<k1); k++)
3751                     {
3752                         rvec_inc(pos[k],box[d]);
3753                         if (bScrew) {
3754                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3755                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3756                         }
3757                     }
3758                 }
3759             }
3760             /* This could be done more efficiently */
3761             ind[d] = 0;
3762             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3763             {
3764                 ind[d]++;
3765             }
3766         }
3767         i = dd_index(dd->nc,ind);
3768         if (ma->ncg[i] == tmp_nalloc[i])
3769         {
3770             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3771             srenew(tmp_ind[i],tmp_nalloc[i]);
3772         }
3773         tmp_ind[i][ma->ncg[i]] = icg;
3774         ma->ncg[i]++;
3775         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3776     }
3777     
3778     k1 = 0;
3779     for(i=0; i<dd->nnodes; i++)
3780     {
3781         ma->index[i] = k1;
3782         for(k=0; k<ma->ncg[i]; k++)
3783         {
3784             ma->cg[k1++] = tmp_ind[i][k];
3785         }
3786     }
3787     ma->index[dd->nnodes] = k1;
3788     
3789     for(i=0; i<dd->nnodes; i++)
3790     {
3791         sfree(tmp_ind[i]);
3792     }
3793     sfree(tmp_ind);
3794     sfree(tmp_nalloc);
3795     
3796     if (fplog)
3797     {
3798         char buf[22];
3799         fprintf(fplog,"Charge group distribution at step %s:",
3800                 gmx_step_str(step,buf));
3801         for(i=0; i<dd->nnodes; i++)
3802         {
3803             fprintf(fplog," %d",ma->ncg[i]);
3804         }
3805         fprintf(fplog,"\n");
3806     }
3807 }
3808
3809 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3810                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3811                                 rvec pos[])
3812 {
3813     gmx_domdec_master_t *ma=NULL;
3814     ivec npulse;
3815     int  i,cg_gl;
3816     int  *ibuf,buf2[2] = { 0, 0 };
3817     
3818     if (DDMASTER(dd))
3819     {
3820         ma = dd->ma;
3821         
3822         if (dd->bScrewPBC)
3823         {
3824             check_screw_box(box);
3825         }
3826     
3827         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3828     
3829         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3830         for(i=0; i<dd->nnodes; i++)
3831         {
3832             ma->ibuf[2*i]   = ma->ncg[i];
3833             ma->ibuf[2*i+1] = ma->nat[i];
3834         }
3835         ibuf = ma->ibuf;
3836     }
3837     else
3838     {
3839         ibuf = NULL;
3840     }
3841     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3842     
3843     dd->ncg_home = buf2[0];
3844     dd->nat_home = buf2[1];
3845     dd->ncg_tot  = dd->ncg_home;
3846     dd->nat_tot  = dd->nat_home;
3847     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3848     {
3849         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3850         srenew(dd->index_gl,dd->cg_nalloc);
3851         srenew(dd->cgindex,dd->cg_nalloc+1);
3852     }
3853     if (DDMASTER(dd))
3854     {
3855         for(i=0; i<dd->nnodes; i++)
3856         {
3857             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3858             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3859         }
3860     }
3861     
3862     dd_scatterv(dd,
3863                 DDMASTER(dd) ? ma->ibuf : NULL,
3864                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3865                 DDMASTER(dd) ? ma->cg : NULL,
3866                 dd->ncg_home*sizeof(int),dd->index_gl);
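    /* Note that the send counts and displacements passed to dd_scatterv
     * above are in bytes (ma->ncg[i]*sizeof(int) and ma->index[i]*sizeof(int)),
     * so each node receives exactly its ma->ncg[i] global charge group indices.
     */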
3867     
3868     /* Determine the home charge group sizes */
3869     dd->cgindex[0] = 0;
3870     for(i=0; i<dd->ncg_home; i++)
3871     {
3872         cg_gl = dd->index_gl[i];
3873         dd->cgindex[i+1] =
3874             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3875     }
3876     
3877     if (debug)
3878     {
3879         fprintf(debug,"Home charge groups:\n");
3880         for(i=0; i<dd->ncg_home; i++)
3881         {
3882             fprintf(debug," %d",dd->index_gl[i]);
3883             if (i % 10 == 9) 
3884                 fprintf(debug,"\n");
3885         }
3886         fprintf(debug,"\n");
3887     }
3888 }
3889
3890 static int compact_and_copy_vec_at(int ncg,int *move,
3891                                    int *cgindex,
3892                                    int nvec,int vec,
3893                                    rvec *src,gmx_domdec_comm_t *comm,
3894                                    gmx_bool bCompact)
3895 {
3896     int m,icg,i,i0,i1,nrcg;
3897     int home_pos;
3898     int pos_vec[DIM*2];
3899     
3900     home_pos = 0;
3901
3902     for(m=0; m<DIM*2; m++)
3903     {
3904         pos_vec[m] = 0;
3905     }
3906     
3907     i0 = 0;
3908     for(icg=0; icg<ncg; icg++)
3909     {
3910         i1 = cgindex[icg+1];
3911         m = move[icg];
3912         if (m == -1)
3913         {
3914             if (bCompact)
3915             {
3916                 /* Compact the home array in place */
3917                 for(i=i0; i<i1; i++)
3918                 {
3919                     copy_rvec(src[i],src[home_pos++]);
3920                 }
3921             }
3922         }
3923         else
3924         {
3925             /* Copy to the communication buffer */
3926             nrcg = i1 - i0;
3927             pos_vec[m] += 1 + vec*nrcg;
3928             for(i=i0; i<i1; i++)
3929             {
3930                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
3931             }
3932             pos_vec[m] += (nvec - vec - 1)*nrcg;
3933         }
3934         if (!bCompact)
3935         {
3936             home_pos += i1 - i0;
3937         }
3938         i0 = i1;
3939     }
3940     
3941     return home_pos;
3942 }
3943
3944 static int compact_and_copy_vec_cg(int ncg,int *move,
3945                                    int *cgindex,
3946                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
3947                                    gmx_bool bCompact)
3948 {
3949     int m,icg,i0,i1,nrcg;
3950     int home_pos;
3951     int pos_vec[DIM*2];
3952     
3953     home_pos = 0;
3954     
3955     for(m=0; m<DIM*2; m++)
3956     {
3957         pos_vec[m] = 0;
3958     }
3959     
3960     i0 = 0;
3961     for(icg=0; icg<ncg; icg++)
3962     {
3963         i1 = cgindex[icg+1];
3964         m = move[icg];
3965         if (m == -1)
3966         {
3967             if (bCompact)
3968             {
3969                 /* Compact the home array in place */
3970                 copy_rvec(src[icg],src[home_pos++]);
3971             }
3972         }
3973         else
3974         {
3975             nrcg = i1 - i0;
3976             /* Copy to the communication buffer */
3977             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
3978             pos_vec[m] += 1 + nrcg*nvec;
3979         }
3980         i0 = i1;
3981     }
3982     if (!bCompact)
3983     {
3984         home_pos = ncg;
3985     }
3986     
3987     return home_pos;
3988 }
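/* Buffer layout used by compact_and_copy_vec_cg and compact_and_copy_vec_at:
 * for every charge group that moves to destination m, comm->cgcm_state[m]
 * holds a record of 1 + nrcg*nvec rvecs:
 *   [cg_cm][vector 0 for nrcg atoms][vector 1 ...]...[vector nvec-1 ...]
 * where the vectors are copied in the order used in dd_redistribute_cg
 * (x, then v, sd_X, cg_p when present). compact_and_copy_vec_cg fills slot 0
 * and skips the full record, while compact_and_copy_vec_at fills the
 * sub-block for vector 'vec', which is why pos_vec is advanced by
 * 1 + vec*nrcg before copying and by (nvec - vec - 1)*nrcg afterwards.
 */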
3989
3990 static int compact_ind(int ncg,int *move,
3991                        int *index_gl,int *cgindex,
3992                        int *gatindex,
3993                        gmx_ga2la_t ga2la,char *bLocalCG,
3994                        int *cginfo)
3995 {
3996     int cg,nat,a0,a1,a,a_gl;
3997     int home_pos;
3998
3999     home_pos = 0;
4000     nat = 0;
4001     for(cg=0; cg<ncg; cg++)
4002     {
4003         a0 = cgindex[cg];
4004         a1 = cgindex[cg+1];
4005         if (move[cg] == -1)
4006         {
4007             /* Compact the home arrays in place.
4008              * Anything that can be done here avoids access to global arrays.
4009              */
4010             cgindex[home_pos] = nat;
4011             for(a=a0; a<a1; a++)
4012             {
4013                 a_gl = gatindex[a];
4014                 gatindex[nat] = a_gl;
4015                 /* The cell number stays 0, so we don't need to set it */
4016                 ga2la_change_la(ga2la,a_gl,nat);
4017                 nat++;
4018             }
4019             index_gl[home_pos] = index_gl[cg];
4020             cginfo[home_pos]   = cginfo[cg];
4021             /* The charge group remains local, so bLocalCG does not change */
4022             home_pos++;
4023         }
4024         else
4025         {
4026             /* Clear the global indices */
4027             for(a=a0; a<a1; a++)
4028             {
4029                 ga2la_del(ga2la,gatindex[a]);
4030             }
4031             if (bLocalCG)
4032             {
4033                 bLocalCG[index_gl[cg]] = FALSE;
4034             }
4035         }
4036     }
4037     cgindex[home_pos] = nat;
4038     
4039     return home_pos;
4040 }
4041
4042 static void clear_and_mark_ind(int ncg,int *move,
4043                                int *index_gl,int *cgindex,int *gatindex,
4044                                gmx_ga2la_t ga2la,char *bLocalCG,
4045                                int *cell_index)
4046 {
4047     int cg,a0,a1,a;
4048     
4049     for(cg=0; cg<ncg; cg++)
4050     {
4051         if (move[cg] >= 0)
4052         {
4053             a0 = cgindex[cg];
4054             a1 = cgindex[cg+1];
4055             /* Clear the global indices */
4056             for(a=a0; a<a1; a++)
4057             {
4058                 ga2la_del(ga2la,gatindex[a]);
4059             }
4060             if (bLocalCG)
4061             {
4062                 bLocalCG[index_gl[cg]] = FALSE;
4063             }
4064             /* Signal that this cg has moved using the ns cell index.
4065              * Here we set it to -1.
4066              * fill_grid will change it from -1 to 4*grid->ncells.
4067              */
4068             cell_index[cg] = -1;
4069         }
4070     }
4071 }
4072
4073 static void print_cg_move(FILE *fplog,
4074                           gmx_domdec_t *dd,
4075                           gmx_large_int_t step,int cg,int dim,int dir,
4076                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4077                           rvec cm_old,rvec cm_new,real pos_d)
4078 {
4079     gmx_domdec_comm_t *comm;
4080     char buf[22];
4081
4082     comm = dd->comm;
4083
4084     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4085     if (bHaveLimitdAndCMOld)
4086     {
4087                 fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4088                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4089     }
4090     else
4091     {
4092                 fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4093                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4094     }
4095     fprintf(fplog,"distance out of cell %f\n",
4096             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4097     if (bHaveLimitdAndCMOld)
4098     {
4099         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4100                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4101     }
4102     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4103             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4104     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4105             dim2char(dim),
4106             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4107     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4108             dim2char(dim),
4109             comm->cell_x0[dim],comm->cell_x1[dim]);
4110 }
4111
4112 static void cg_move_error(FILE *fplog,
4113                           gmx_domdec_t *dd,
4114                           gmx_large_int_t step,int cg,int dim,int dir,
4115                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4116                           rvec cm_old,rvec cm_new,real pos_d)
4117 {
4118     if (fplog)
4119     {
4120         print_cg_move(fplog, dd,step,cg,dim,dir,
4121                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4122     }
4123     print_cg_move(stderr,dd,step,cg,dim,dir,
4124                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4125     gmx_fatal(FARGS,
4126               "A charge group moved too far between two domain decomposition steps\n"
4127               "This usually means that your system is not well equilibrated");
4128 }
4129
4130 static void rotate_state_atom(t_state *state,int a)
4131 {
4132     int est;
4133
4134     for(est=0; est<estNR; est++)
4135     {
4136         if (EST_DISTR(est) && (state->flags & (1<<est))) {
4137             switch (est) {
4138             case estX:
4139                 /* Rotate the complete state; for a rectangular box only */
4140                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4141                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4142                 break;
4143             case estV:
4144                 state->v[a][YY] = -state->v[a][YY];
4145                 state->v[a][ZZ] = -state->v[a][ZZ];
4146                 break;
4147             case estSDX:
4148                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4149                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4150                 break;
4151             case estCGP:
4152                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4153                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4154                 break;
4155             case estDISRE_INITF:
4156             case estDISRE_RM3TAV:
4157             case estORIRE_INITF:
4158             case estORIRE_DTAV:
4159                 /* These are distances, so not affected by rotation */
4160                 break;
4161             default:
4162                 gmx_incons("Unknown state entry encountered in rotate_state_atom");            
4163             }
4164         }
4165     }
4166 }
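/* For screw pbc, shifting a particle by the box vector along x is combined
 * with a rotation of 180 degrees around the x axis: positions map as
 * (y,z) -> (box_yy - y, box_zz - z), while velocity-like state vectors
 * (v, sd_X, cg_p) only get their y and z components negated, as done above.
 */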
4167
4168 static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4169                               gmx_domdec_t *dd,ivec tric_dir,
4170                               t_state *state,rvec **f,
4171                               t_forcerec *fr,t_mdatoms *md,
4172                               gmx_bool bCompact,
4173                               t_nrnb *nrnb)
4174 {
4175     int  *move;
4176     int  npbcdim;
4177     int  ncg[DIM*2],nat[DIM*2];
4178     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4179     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4180     int  sbuf[2],rbuf[2];
4181     int  home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
4182     int  flag;
4183     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4184     gmx_bool bScrew;
4185     ivec dev;
4186     real inv_ncg,pos_d;
4187     matrix tcm;
4188     rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4189     atom_id *cgindex;
4190     cginfo_mb_t *cginfo_mb;
4191     gmx_domdec_comm_t *comm;
4192     
4193     if (dd->bScrewPBC)
4194     {
4195         check_screw_box(state->box);
4196     }
4197     
4198     comm  = dd->comm;
4199     cg_cm = fr->cg_cm;
4200     
4201     for(i=0; i<estNR; i++)
4202     {
4203         if (EST_DISTR(i))
4204         {
4205             switch (i)
4206             {
4207             case estX:   /* Always present */            break;
4208             case estV:   bV   = (state->flags & (1<<i)); break;
4209             case estSDX: bSDX = (state->flags & (1<<i)); break;
4210             case estCGP: bCGP = (state->flags & (1<<i)); break;
4211             case estLD_RNG:
4212             case estLD_RNGI:
4213             case estDISRE_INITF:
4214             case estDISRE_RM3TAV:
4215             case estORIRE_INITF:
4216             case estORIRE_DTAV:
4217                 /* No processing required */
4218                 break;
4219             default:
4220             gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4221             }
4222         }
4223     }
4224     
4225     if (dd->ncg_tot > comm->nalloc_int)
4226     {
4227         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4228         srenew(comm->buf_int,comm->nalloc_int);
4229     }
4230     move = comm->buf_int;
4231     
4232     /* Clear the count */
4233     for(c=0; c<dd->ndim*2; c++)
4234     {
4235         ncg[c] = 0;
4236         nat[c] = 0;
4237     }
4238
4239     npbcdim = dd->npbcdim;
4240
4241     for(d=0; (d<DIM); d++)
4242     {
4243         limitd[d] = dd->comm->cellsize_min[d];
4244         if (d >= npbcdim && dd->ci[d] == 0)
4245         {
4246             cell_x0[d] = -GMX_FLOAT_MAX;
4247         }
4248         else
4249         {
4250             cell_x0[d] = comm->cell_x0[d];
4251         }
4252         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4253         {
4254             cell_x1[d] = GMX_FLOAT_MAX;
4255         }
4256         else
4257         {
4258             cell_x1[d] = comm->cell_x1[d];
4259         }
4260         if (d < npbcdim)
4261         {
4262             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4263             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4264         }
4265         else
4266         {
4267             /* We check after communication if a charge group moved
4268              * more than one cell. Set the pre-comm check limit to float_max.
4269              */
4270             limit0[d] = -GMX_FLOAT_MAX;
4271             limit1[d] =  GMX_FLOAT_MAX;
4272         }
4273     }
4274     
4275     make_tric_corr_matrix(npbcdim,state->box,tcm);
4276     
4277     cgindex = dd->cgindex;
4278     
4279     /* Compute the center of geometry for all home charge groups
4280      * and put them in the box and determine where they should go.
4281      */
4282     for(cg=0; cg<dd->ncg_home; cg++)
4283     {
4284         k0   = cgindex[cg];
4285         k1   = cgindex[cg+1];
4286         nrcg = k1 - k0;
4287         if (nrcg == 1)
4288         {
4289             copy_rvec(state->x[k0],cm_new);
4290         }
4291         else
4292         {
4293             inv_ncg = 1.0/nrcg;
4294             
4295             clear_rvec(cm_new);
4296             for(k=k0; (k<k1); k++)
4297             {
4298                 rvec_inc(cm_new,state->x[k]);
4299             }
4300             for(d=0; (d<DIM); d++)
4301             {
4302                 cm_new[d] = inv_ncg*cm_new[d];
4303             }
4304         }
4305         
4306         clear_ivec(dev);
4307         /* Do pbc and check DD cell boundary crossings */
4308         for(d=DIM-1; d>=0; d--)
4309         {
4310             if (dd->nc[d] > 1)
4311             {
4312                 bScrew = (dd->bScrewPBC && d == XX);
4313                 /* Determine the location of this cg in lattice coordinates */
4314                 pos_d = cm_new[d];
4315                 if (tric_dir[d])
4316                 {
4317                     for(d2=d+1; d2<DIM; d2++)
4318                     {
4319                         pos_d += cm_new[d2]*tcm[d2][d];
4320                     }
4321                 }
4322                 /* Put the charge group in the triclinic unit-cell */
4323                 if (pos_d >= cell_x1[d])
4324                 {
4325                     if (pos_d >= limit1[d])
4326                     {
4327                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4328                                       cg_cm[cg],cm_new,pos_d);
4329                     }
4330                     dev[d] = 1;
4331                     if (dd->ci[d] == dd->nc[d] - 1)
4332                     {
4333                         rvec_dec(cm_new,state->box[d]);
4334                         if (bScrew)
4335                         {
4336                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4337                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4338                         }
4339                         for(k=k0; (k<k1); k++)
4340                         {
4341                             rvec_dec(state->x[k],state->box[d]);
4342                             if (bScrew)
4343                             {
4344                                 rotate_state_atom(state,k);
4345                             }
4346                         }
4347                     }
4348                 }
4349                 else if (pos_d < cell_x0[d])
4350                 {
4351                     if (pos_d < limit0[d])
4352                     {
4353                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4354                                       cg_cm[cg],cm_new,pos_d);
4355                     }
4356                     dev[d] = -1;
4357                     if (dd->ci[d] == 0)
4358                     {
4359                         rvec_inc(cm_new,state->box[d]);
4360                         if (bScrew)
4361                         {
4362                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4363                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4364                         }
4365                         for(k=k0; (k<k1); k++)
4366                         {
4367                             rvec_inc(state->x[k],state->box[d]);
4368                             if (bScrew)
4369                             {
4370                                 rotate_state_atom(state,k);
4371                             }
4372                         }
4373                     }
4374                 }
4375             }
4376             else if (d < npbcdim)
4377             {
4378                 /* Put the charge group in the rectangular unit-cell */
4379                 while (cm_new[d] >= state->box[d][d])
4380                 {
4381                     rvec_dec(cm_new,state->box[d]);
4382                     for(k=k0; (k<k1); k++)
4383                     {
4384                         rvec_dec(state->x[k],state->box[d]);
4385                     }
4386                 }
4387                 while (cm_new[d] < 0)
4388                 {
4389                     rvec_inc(cm_new,state->box[d]);
4390                     for(k=k0; (k<k1); k++)
4391                     {
4392                         rvec_inc(state->x[k],state->box[d]);
4393                     }
4394                 }
4395             }
4396         }
4397     
4398         copy_rvec(cm_new,cg_cm[cg]);
4399         
4400         /* Determine where this cg should go */
4401         flag = 0;
4402         mc = -1;
4403         for(d=0; d<dd->ndim; d++)
4404         {
4405             dim = dd->dim[d];
4406             if (dev[dim] == 1)
4407             {
4408                 flag |= DD_FLAG_FW(d);
4409                 if (mc == -1)
4410                 {
4411                     mc = d*2;
4412                 }
4413             }
4414             else if (dev[dim] == -1)
4415             {
4416                 flag |= DD_FLAG_BW(d);
4417                 if (mc == -1) {
4418                     if (dd->nc[dim] > 2)
4419                     {
4420                         mc = d*2 + 1;
4421                     }
4422                     else
4423                     {
4424                         mc = d*2;
4425                     }
4426                 }
4427             }
4428         }
4429         move[cg] = mc;
4430         if (mc >= 0)
4431         {
4432             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4433             {
4434                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4435                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4436             }
4437             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4438             /* We store the cg size in the lower 16 bits
4439              * and the place where the charge group should go
4440              * in the next 6 bits. This saves some communication volume.
4441              */
4442             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4443             ncg[mc] += 1;
4444             nat[mc] += nrcg;
4445         }
4446     }
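    /* At this point move[cg] is -1 for charge groups that stay home, or the
     * destination buffer index mc for charge groups that leave: mc = d*2 for
     * the forward direction in DD dimension d and mc = d*2+1 for the backward
     * direction (backward reuses d*2 when there are only two cells, since
     * both directions then go to the same neighbor). ncg[mc] and nat[mc]
     * count the charge groups and atoms queued for each destination.
     */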
4447     
4448     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4449     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4450     
4451     nvec = 1;
4452     if (bV)
4453     {
4454         nvec++;
4455     }
4456     if (bSDX)
4457     {
4458         nvec++;
4459     }
4460     if (bCGP)
4461     {
4462         nvec++;
4463     }
4464     
4465     /* Make sure the communication buffers are large enough */
4466     for(mc=0; mc<dd->ndim*2; mc++)
4467     {
4468         nvr = ncg[mc] + nat[mc]*nvec;
4469         if (nvr > comm->cgcm_state_nalloc[mc])
4470         {
4471             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4472             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4473         }
4474     }
4475     
4476     /* Recalculating cg_cm might be cheaper than communicating,
4477      * but that could give rise to rounding issues.
4478      */
4479     home_pos_cg =
4480         compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4481                                 nvec,cg_cm,comm,bCompact);
4482     
4483     vec = 0;
4484     home_pos_at =
4485         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4486                                 nvec,vec++,state->x,comm,bCompact);
4487     if (bV)
4488     {
4489         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4490                                 nvec,vec++,state->v,comm,bCompact);
4491     }
4492     if (bSDX)
4493     {
4494         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4495                                 nvec,vec++,state->sd_X,comm,bCompact);
4496     }
4497     if (bCGP)
4498     {
4499         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4500                                 nvec,vec++,state->cg_p,comm,bCompact);
4501     }
4502     
4503     if (bCompact)
4504     {
4505         compact_ind(dd->ncg_home,move,
4506                     dd->index_gl,dd->cgindex,dd->gatindex,
4507                     dd->ga2la,comm->bLocalCG,
4508                     fr->cginfo);
4509     }
4510     else
4511     {
4512         clear_and_mark_ind(dd->ncg_home,move,
4513                            dd->index_gl,dd->cgindex,dd->gatindex,
4514                            dd->ga2la,comm->bLocalCG,
4515                            fr->ns.grid->cell_index);
4516     }
4517     
4518     cginfo_mb = fr->cginfo_mb;
4519
4520     ncg_stay_home = home_pos_cg;
4521     for(d=0; d<dd->ndim; d++)
4522     {
4523         dim = dd->dim[d];
4524         ncg_recv = 0;
4525         nat_recv = 0;
4526         nvr      = 0;
4527         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4528         {
4529             cdd = d*2 + dir;
4530             /* Communicate the cg and atom counts */
4531             sbuf[0] = ncg[cdd];
4532             sbuf[1] = nat[cdd];
4533             if (debug)
4534             {
4535                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4536                         d,dir,sbuf[0],sbuf[1]);
4537             }
4538             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4539             
4540             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4541             {
4542                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4543                 srenew(comm->buf_int,comm->nalloc_int);
4544             }
4545             
4546             /* Communicate the charge group indices, sizes and flags */
4547             dd_sendrecv_int(dd, d, dir,
4548                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4549                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4550             
4551             nvs = ncg[cdd] + nat[cdd]*nvec;
4552             i   = rbuf[0]  + rbuf[1] *nvec;
4553             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4554             
4555             /* Communicate cgcm and state */
4556             dd_sendrecv_rvec(dd, d, dir,
4557                              comm->cgcm_state[cdd], nvs,
4558                              comm->vbuf.v+nvr, i);
4559             ncg_recv += rbuf[0];
4560             nat_recv += rbuf[1];
4561             nvr      += i;
4562         }
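        /* The receive buffers now hold, per charge group, the same record
         * layout as the send buffers: DD_CGIBS integers in comm->buf_int
         * (the global index and the size|flags word) and 1 + nrcg*nvec rvecs
         * in comm->vbuf.v (cg_cm followed by the state vectors), which is
         * how buf_pos is advanced below.
         */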
4563         
4564         /* Process the received charge groups */
4565         buf_pos = 0;
4566         for(cg=0; cg<ncg_recv; cg++)
4567         {
4568             flag = comm->buf_int[cg*DD_CGIBS+1];
4569
4570             if (dim >= npbcdim && dd->nc[dim] > 2)
4571             {
4572                 /* No pbc in this dim and more than one domain boundary.
4573                  * We do a separate check here that a charge group did not move too far.
4574                  */
4575                 if (((flag & DD_FLAG_FW(d)) &&
4576                      comm->vbuf.v[buf_pos][d] > cell_x1[dim]) ||
4577                     ((flag & DD_FLAG_BW(d)) &&
4578                      comm->vbuf.v[buf_pos][d] < cell_x0[dim]))
4579                 {
4580                     cg_move_error(fplog,dd,step,cg,d,
4581                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4582                                    FALSE,0,
4583                                    comm->vbuf.v[buf_pos],
4584                                    comm->vbuf.v[buf_pos],
4585                                    comm->vbuf.v[buf_pos][d]);
4586                 }
4587             }
4588
4589             mc = -1;
4590             if (d < dd->ndim-1)
4591             {
4592                 /* Check which direction this cg should go */
4593                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4594                 {
4595                     if (dd->bGridJump)
4596                     {
4597                         /* The cell boundaries for dimension d2 are not equal
4598                          * for each cell row of the lower dimension(s),
4599                          * therefore we might need to redetermine where
4600                          * this cg should go.
4601                          */
4602                         dim2 = dd->dim[d2];
4603                         /* If this cg crosses the box boundary in dimension d2
4604                          * we can use the communicated flag, so we do not
4605                          * have to worry about pbc.
4606                          */
4607                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4608                                (flag & DD_FLAG_FW(d2))) ||
4609                               (dd->ci[dim2] == 0 &&
4610                                (flag & DD_FLAG_BW(d2)))))
4611                         {
4612                             /* Clear the two flags for this dimension */
4613                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4614                             /* Determine the location of this cg
4615                              * in lattice coordinates
4616                              */
4617                             pos_d = comm->vbuf.v[buf_pos][dim2];
4618                             if (tric_dir[dim2])
4619                             {
4620                                 for(d3=dim2+1; d3<DIM; d3++)
4621                                 {
4622                                     pos_d +=
4623                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4624                                 }
4625                             }
4626                             /* Check that we are not at the box edge.
4627                              * pbc is only handled in the first step above,
4628                              * but this check could move over pbc while
4629                              * the first step did not due to different rounding.
4630                              */
4631                             if (pos_d >= cell_x1[dim2] &&
4632                                 dd->ci[dim2] != dd->nc[dim2]-1)
4633                             {
4634                                 flag |= DD_FLAG_FW(d2);
4635                             }
4636                             else if (pos_d < cell_x0[dim2] &&
4637                                      dd->ci[dim2] != 0)
4638                             {
4639                                 flag |= DD_FLAG_BW(d2);
4640                             }
4641                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4642                         }
4643                     }
4644                     /* Set to which neighboring cell this cg should go */
4645                     if (flag & DD_FLAG_FW(d2))
4646                     {
4647                         mc = d2*2;
4648                     }
4649                     else if (flag & DD_FLAG_BW(d2))
4650                     {
4651                         if (dd->nc[dd->dim[d2]] > 2)
4652                         {
4653                             mc = d2*2+1;
4654                         }
4655                         else
4656                         {
4657                             mc = d2*2;
4658                         }
4659                     }
4660                 }
4661             }
4662             
4663             nrcg = flag & DD_FLAG_NRCG;
4664             if (mc == -1)
4665             {
4666                 if (home_pos_cg+1 > dd->cg_nalloc)
4667                 {
4668                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4669                     srenew(dd->index_gl,dd->cg_nalloc);
4670                     srenew(dd->cgindex,dd->cg_nalloc+1);
4671                 }
4672                 /* Set the global charge group index and size */
4673                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4674                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4675                 /* Copy the state from the buffer */
4676                 if (home_pos_cg >= fr->cg_nalloc)
4677                 {
4678                     dd_realloc_fr_cg(fr,home_pos_cg+1);
4679                     cg_cm = fr->cg_cm;
4680                 }
4681                 copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
4682                 /* Set the cginfo */
4683                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4684                                                    dd->index_gl[home_pos_cg]);
4685                 if (comm->bLocalCG)
4686                 {
4687                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4688                 }
4689
4690                 if (home_pos_at+nrcg > state->nalloc)
4691                 {
4692                     dd_realloc_state(state,f,home_pos_at+nrcg);
4693                 }
4694                 for(i=0; i<nrcg; i++)
4695                 {
4696                     copy_rvec(comm->vbuf.v[buf_pos++],
4697                               state->x[home_pos_at+i]);
4698                 }
4699                 if (bV)
4700                 {
4701                     for(i=0; i<nrcg; i++)
4702                     {
4703                         copy_rvec(comm->vbuf.v[buf_pos++],
4704                                   state->v[home_pos_at+i]);
4705                     }
4706                 }
4707                 if (bSDX)
4708                 {
4709                     for(i=0; i<nrcg; i++)
4710                     {
4711                         copy_rvec(comm->vbuf.v[buf_pos++],
4712                                   state->sd_X[home_pos_at+i]);
4713                     }
4714                 }
4715                 if (bCGP)
4716                 {
4717                     for(i=0; i<nrcg; i++)
4718                     {
4719                         copy_rvec(comm->vbuf.v[buf_pos++],
4720                                   state->cg_p[home_pos_at+i]);
4721                     }
4722                 }
4723                 home_pos_cg += 1;
4724                 home_pos_at += nrcg;
4725             }
4726             else
4727             {
4728                 /* Reallocate the buffers if necessary  */
4729                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4730                 {
4731                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4732                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4733                 }
4734                 nvr = ncg[mc] + nat[mc]*nvec;
4735                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4736                 {
4737                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4738                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4739                 }
4740                 /* Copy from the receive to the send buffers */
4741                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4742                        comm->buf_int + cg*DD_CGIBS,
4743                        DD_CGIBS*sizeof(int));
4744                 memcpy(comm->cgcm_state[mc][nvr],
4745                        comm->vbuf.v[buf_pos],
4746                        (1+nrcg*nvec)*sizeof(rvec));
4747                 buf_pos += 1 + nrcg*nvec;
4748                 ncg[mc] += 1;
4749                 nat[mc] += nrcg;
4750             }
4751         }
4752     }
4753     
4754     /* With sorting (!bCompact) the indices are now only partially up to date
4755      * and ncg_home and nat_home are not the real count, since there are
4756      * "holes" in the arrays for the charge groups that moved to neighbors.
4757      */
4758     dd->ncg_home = home_pos_cg;
4759     dd->nat_home = home_pos_at;
4760
4761     if (debug)
4762     {
4763         fprintf(debug,"Finished repartitioning\n");
4764     }
4765
4766     return ncg_stay_home;
4767 }
4768
4769 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4770 {
4771     dd->comm->cycl[ddCycl] += cycles;
4772     dd->comm->cycl_n[ddCycl]++;
4773     if (cycles > dd->comm->cycl_max[ddCycl])
4774     {
4775         dd->comm->cycl_max[ddCycl] = cycles;
4776     }
4777 }
4778
4779 static double force_flop_count(t_nrnb *nrnb)
4780 {
4781     int i;
4782     double sum;
4783     const char *name;
4784
4785     sum = 0;
4786     for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
4787     {
4788         /* To get closer to the real timings, we halve the count
4789          * for the normal loops and halve it again for the water loops.
4790          */
4791         name = nrnb_str(i);
4792         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4793         {
4794             sum += nrnb->n[i]*0.25*cost_nrnb(i);
4795         }
4796         else
4797         {
4798             sum += nrnb->n[i]*0.50*cost_nrnb(i);
4799         }
4800     }
4801     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
4802     {
4803         name = nrnb_str(i);
4804         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4805             sum += nrnb->n[i]*cost_nrnb(i);
4806     }
4807     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
4808     {
4809         sum += nrnb->n[i]*cost_nrnb(i);
4810     }
4811
4812     return sum;
4813 }
4814
4815 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
4816 {
4817     if (dd->comm->eFlop)
4818     {
4819         dd->comm->flop -= force_flop_count(nrnb);
4820     }
4821 }
4822 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
4823 {
4824     if (dd->comm->eFlop)
4825     {
4826         dd->comm->flop += force_flop_count(nrnb);
4827         dd->comm->flop_n++;
4828     }
4829 }  
4830
4831 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4832 {
4833     int i;
4834     
4835     for(i=0; i<ddCyclNr; i++)
4836     {
4837         dd->comm->cycl[i] = 0;
4838         dd->comm->cycl_n[i] = 0;
4839         dd->comm->cycl_max[i] = 0;
4840     }
4841     dd->comm->flop = 0;
4842     dd->comm->flop_n = 0;
4843 }
4844
4845 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
4846 {
4847     gmx_domdec_comm_t *comm;
4848     gmx_domdec_load_t *load;
4849     gmx_domdec_root_t *root=NULL;
4850     int  d,dim,cid,i,pos;
4851     float cell_frac=0,sbuf[DD_NLOAD_MAX];
4852     gmx_bool bSepPME;
4853     
4854     if (debug)
4855     {
4856         fprintf(debug,"get_load_distribution start\n");
4857     }
4858
4859     wallcycle_start(wcycle,ewcDDCOMMLOAD);
4860     
4861     comm = dd->comm;
4862     
4863     bSepPME = (dd->pme_nodeid >= 0);
4864     
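    /* Each rank contributes a small record of floats per DD dimension,
     * gathered at the row root below: the summed and maximum force load,
     * with dynamic load balancing also the relative cell volume, limit flags
     * and the staggering extremes cell_f_max0/cell_f_min1 (where applicable),
     * and with separate PME nodes the PP-during-PME and PME cycle counts;
     * see the sbuf packing below.
     */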
4865     for(d=dd->ndim-1; d>=0; d--)
4866     {
4867         dim = dd->dim[d];
4868         /* Check if we participate in the communication in this dimension */
4869         if (d == dd->ndim-1 || 
4870             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
4871         {
4872             load = &comm->load[d];
4873             if (dd->bGridJump)
4874             {
4875                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4876             }
4877             pos = 0;
4878             if (d == dd->ndim-1)
4879             {
4880                 sbuf[pos++] = dd_force_load(comm);
4881                 sbuf[pos++] = sbuf[0];
4882                 if (dd->bGridJump)
4883                 {
4884                     sbuf[pos++] = sbuf[0];
4885                     sbuf[pos++] = cell_frac;
4886                     if (d > 0)
4887                     {
4888                         sbuf[pos++] = comm->cell_f_max0[d];
4889                         sbuf[pos++] = comm->cell_f_min1[d];
4890                     }
4891                 }
4892                 if (bSepPME)
4893                 {
4894                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4895                     sbuf[pos++] = comm->cycl[ddCyclPME];
4896                 }
4897             }
4898             else
4899             {
4900                 sbuf[pos++] = comm->load[d+1].sum;
4901                 sbuf[pos++] = comm->load[d+1].max;
4902                 if (dd->bGridJump)
4903                 {
4904                     sbuf[pos++] = comm->load[d+1].sum_m;
4905                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4906                     sbuf[pos++] = comm->load[d+1].flags;
4907                     if (d > 0)
4908                     {
4909                         sbuf[pos++] = comm->cell_f_max0[d];
4910                         sbuf[pos++] = comm->cell_f_min1[d];
4911                     }
4912                 }
4913                 if (bSepPME)
4914                 {
4915                     sbuf[pos++] = comm->load[d+1].mdf;
4916                     sbuf[pos++] = comm->load[d+1].pme;
4917                 }
4918             }
4919             load->nload = pos;
4920             /* Communicate a row in DD direction d.
4921              * The communicators are set up such that the root always has rank 0.
4922              */
4923 #ifdef GMX_MPI
4924             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
4925                        load->load,load->nload*sizeof(float),MPI_BYTE,
4926                        0,comm->mpi_comm_load[d]);
4927 #endif
4928             if (dd->ci[dim] == dd->master_ci[dim])
4929             {
4930                 /* We are the root, process this row */
4931                 if (comm->bDynLoadBal)
4932                 {
4933                     root = comm->root[d];
4934                 }
4935                 load->sum = 0;
4936                 load->max = 0;
4937                 load->sum_m = 0;
4938                 load->cvol_min = 1;
4939                 load->flags = 0;
4940                 load->mdf = 0;
4941                 load->pme = 0;
4942                 pos = 0;
4943                 for(i=0; i<dd->nc[dim]; i++)
4944                 {
4945                     load->sum += load->load[pos++];
4946                     load->max = max(load->max,load->load[pos]);
4947                     pos++;
4948                     if (dd->bGridJump)
4949                     {
4950                         if (root->bLimited)
4951                         {
4952                             /* This direction could not be load balanced properly,
4953                              * therefore we need to use the maximum instead of the average load.
4954                              */
4955                             load->sum_m = max(load->sum_m,load->load[pos]);
4956                         }
4957                         else
4958                         {
4959                             load->sum_m += load->load[pos];
4960                         }
4961                         pos++;
4962                         load->cvol_min = min(load->cvol_min,load->load[pos]);
4963                         pos++;
4964                         if (d < dd->ndim-1)
4965                         {
4966                             load->flags = (int)(load->load[pos++] + 0.5);
4967                         }
4968                         if (d > 0)
4969                         {
4970                             root->cell_f_max0[i] = load->load[pos++];
4971                             root->cell_f_min1[i] = load->load[pos++];
4972                         }
4973                     }
4974                     if (bSepPME)
4975                     {
4976                         load->mdf = max(load->mdf,load->load[pos]);
4977                         pos++;
4978                         load->pme = max(load->pme,load->load[pos]);
4979                         pos++;
4980                     }
4981                 }
4982                 if (comm->bDynLoadBal && root->bLimited)
4983                 {
4984                     load->sum_m *= dd->nc[dim];
4985                     load->flags |= (1<<d);
4986                 }
4987             }
4988         }
4989     }
4990
4991     if (DDMASTER(dd))
4992     {
4993         comm->nload      += dd_load_count(comm);
4994         comm->load_step  += comm->cycl[ddCyclStep];
4995         comm->load_sum   += comm->load[0].sum;
4996         comm->load_max   += comm->load[0].max;
4997         if (comm->bDynLoadBal)
4998         {
4999             for(d=0; d<dd->ndim; d++)
5000             {
5001                 if (comm->load[0].flags & (1<<d))
5002                 {
5003                     comm->load_lim[d]++;
5004                 }
5005             }
5006         }
5007         if (bSepPME)
5008         {
5009             comm->load_mdf += comm->load[0].mdf;
5010             comm->load_pme += comm->load[0].pme;
5011         }
5012     }
5013
5014     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5015     
5016     if (debug)
5017     {
5018         fprintf(debug,"get_load_distribution finished\n");
5019     }
5020 }
5021
5022 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5023 {
5024     /* Return the relative performance loss on the total run time
5025      * due to the force calculation load imbalance.
5026      */
5027     if (dd->comm->nload > 0)
5028     {
5029         return
5030             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5031             (dd->comm->load_step*dd->nnodes);
5032     }
5033     else
5034     {
5035         return 0;
5036     }
5037 }
5038
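/* Print the average load imbalance, the DLB limitation statistics
 * and the PP/PME load ratio to the log file and stderr,
 * with notes when the estimated performance loss exceeds DD_PERF_LOSS.
 */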
5039 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5040 {
5041     char  buf[STRLEN];
5042     int   npp,npme,nnodes,d,limp;
5043     float imbal,pme_f_ratio,lossf,lossp=0;
5044     gmx_bool  bLim;
5045     gmx_domdec_comm_t *comm;
5046
5047     comm = dd->comm;
5048     if (DDMASTER(dd) && comm->nload > 0)
5049     {
5050         npp    = dd->nnodes;
5051         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5052         nnodes = npp + npme;
5053         imbal = comm->load_max*npp/comm->load_sum - 1;
5054         lossf = dd_force_imb_perf_loss(dd);
5055         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5056         fprintf(fplog,"%s",buf);
5057         fprintf(stderr,"\n");
5058         fprintf(stderr,"%s",buf);
5059         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5060         fprintf(fplog,"%s",buf);
5061         fprintf(stderr,"%s",buf);
5062         bLim = FALSE;
5063         if (comm->bDynLoadBal)
5064         {
5065             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5066             for(d=0; d<dd->ndim; d++)
5067             {
5068                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5069                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5070                 if (limp >= 50)
5071                 {
5072                     bLim = TRUE;
5073                 }
5074             }
5075             sprintf(buf+strlen(buf),"\n");
5076             fprintf(fplog,"%s",buf);
5077             fprintf(stderr,"%s",buf);
5078         }
5079         if (npme > 0)
5080         {
5081             pme_f_ratio = comm->load_pme/comm->load_mdf;
5082             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5083             if (lossp <= 0)
5084             {
5085                 lossp *= (float)npme/(float)nnodes;
5086             }
5087             else
5088             {
5089                 lossp *= (float)npp/(float)nnodes;
5090             }
5091             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5092             fprintf(fplog,"%s",buf);
5093             fprintf(stderr,"%s",buf);
5094             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5095             fprintf(fplog,"%s",buf);
5096             fprintf(stderr,"%s",buf);
5097         }
5098         fprintf(fplog,"\n");
5099         fprintf(stderr,"\n");
5100         
5101         if (lossf >= DD_PERF_LOSS)
5102         {
5103             sprintf(buf,
5104                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
5105                     "      in the domain decomposition.\n",lossf*100);
5106             if (!comm->bDynLoadBal)
5107             {
5108                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb).\n");
5109             }
5110             else if (bLim)
5111             {
5112                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5113             }
5114             fprintf(fplog,"%s\n",buf);
5115             fprintf(stderr,"%s\n",buf);
5116         }
5117         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5118         {
5119             sprintf(buf,
5120                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5121                     "      had %s work to do than the PP nodes.\n"
5122                     "      You might want to %s the number of PME nodes\n"
5123                     "      or %s the cut-off and the grid spacing.\n",
5124                     fabs(lossp*100),
5125                     (lossp < 0) ? "less"     : "more",
5126                     (lossp < 0) ? "decrease" : "increase",
5127                     (lossp < 0) ? "decrease" : "increase");
5128             fprintf(fplog,"%s\n",buf);
5129             fprintf(stderr,"%s\n",buf);
5130         }
5131     }
5132 }
5133
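/* Small helpers that report the load state collected by get_load_distribution() */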
5134 static float dd_vol_min(gmx_domdec_t *dd)
5135 {
5136     return dd->comm->load[0].cvol_min*dd->nnodes;
5137 }
5138
5139 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5140 {
5141     return dd->comm->load[0].flags;
5142 }
5143
5144 static float dd_f_imbal(gmx_domdec_t *dd)
5145 {
5146     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5147 }
5148
5149 static float dd_pme_f_ratio(gmx_domdec_t *dd)
5150 {
5151     return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5152 }
5153
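/* Write the per-step DD load balance report to the log file */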
5154 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5155 {
5156     int flags,d;
5157     char buf[22];
5158     
5159     flags = dd_load_flags(dd);
5160     if (flags)
5161     {
5162         fprintf(fplog,
5163                 "DD  load balancing is limited by minimum cell size in dimension");
5164         for(d=0; d<dd->ndim; d++)
5165         {
5166             if (flags & (1<<d))
5167             {
5168                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5169             }
5170         }
5171         fprintf(fplog,"\n");
5172     }
5173     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5174     if (dd->comm->bDynLoadBal)
5175     {
5176         fprintf(fplog,"  vol min/aver %5.3f%c",
5177                 dd_vol_min(dd),flags ? '!' : ' ');
5178     }
5179     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5180     if (dd->comm->cycl_n[ddCyclPME])
5181     {
5182         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5183     }
5184     fprintf(fplog,"\n\n");
5185 }
5186
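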
5187 static void dd_print_load_verbose(gmx_domdec_t *dd)
5188 {
5189     if (dd->comm->bDynLoadBal)
5190     {
5191         fprintf(stderr,"vol %4.2f%c ",
5192                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5193     }
5194     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5195     if (dd->comm->cycl_n[ddCyclPME])
5196     {
5197         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5198     }
5199 }
5200
5201 #ifdef GMX_MPI
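/* Create the MPI communicator for one row of cells along dimension dim_ind;
 * with dynamic load balancing the row root also allocates its bookkeeping.
 */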
5202 static void make_load_communicator(gmx_domdec_t *dd,MPI_Group g_all,
5203                                    int dim_ind,ivec loc)
5204 {
5205     MPI_Group g_row = MPI_GROUP_EMPTY;
5206     MPI_Comm  c_row;
5207     int  dim,i,*rank;
5208     ivec loc_c;
5209     gmx_domdec_root_t *root;
5210     gmx_bool bPartOfGroup = FALSE;
5211     
5212     dim = dd->dim[dim_ind];
5213     copy_ivec(loc,loc_c);
5214     snew(rank,dd->nc[dim]);
5215     for(i=0; i<dd->nc[dim]; i++)
5216     {
5217         loc_c[dim] = i;
5218         rank[i] = dd_index(dd->nc,loc_c);
5219         if (rank[i] == dd->rank)
5220         {
5221             /* This process is part of the group */
5222             bPartOfGroup = TRUE;
5223         }
5224     }
5225     if (bPartOfGroup)
5226     {
5227         MPI_Group_incl(g_all,dd->nc[dim],rank,&g_row);
5228     }
5229     MPI_Comm_create(dd->mpi_comm_all,g_row,&c_row);
5230     if (bPartOfGroup)
5231     {
5232         dd->comm->mpi_comm_load[dim_ind] = c_row;
5233         if (dd->comm->eDLB != edlbNO)
5234         {
5235             if (dd->ci[dim] == dd->master_ci[dim])
5236             {
5237                 /* This is the root process of this row */
5238                 snew(dd->comm->root[dim_ind],1);
5239                 root = dd->comm->root[dim_ind];
5240                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5241                 snew(root->old_cell_f,dd->nc[dim]+1);
5242                 snew(root->bCellMin,dd->nc[dim]);
5243                 if (dim_ind > 0)
5244                 {
5245                     snew(root->cell_f_max0,dd->nc[dim]);
5246                     snew(root->cell_f_min1,dd->nc[dim]);
5247                     snew(root->bound_min,dd->nc[dim]);
5248                     snew(root->bound_max,dd->nc[dim]);
5249                 }
5250                 snew(root->buf_ncd,dd->nc[dim]);
5251             }
5252             else
5253             {
5254                 /* This is not a root process, we only need to receive cell_f */
5255                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5256             }
5257         }
5258         if (dd->ci[dim] == dd->master_ci[dim])
5259         {
5260             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5261         }
5262     }
5263     sfree(rank);
5264 }
5265 #endif
5266
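/* Set up one load communicator per row of cells for each DD dimension
 * (a no-op without MPI).
 */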
5267 static void make_load_communicators(gmx_domdec_t *dd)
5268 {
5269 #ifdef GMX_MPI
5270     MPI_Group g_all;
5271     int  dim0,dim1,i,j;
5272     ivec loc;
5273
5274     if (debug)
5275         fprintf(debug,"Making load communicators\n");
5276
5277     MPI_Comm_group(dd->mpi_comm_all,&g_all);
5278
5279     snew(dd->comm->load,dd->ndim);
5280     snew(dd->comm->mpi_comm_load,dd->ndim);
5281
5282     clear_ivec(loc);
5283     make_load_communicator(dd,g_all,0,loc);
5284     if (dd->ndim > 1) {
5285         dim0 = dd->dim[0];
5286         for(i=0; i<dd->nc[dim0]; i++) {
5287             loc[dim0] = i;
5288             make_load_communicator(dd,g_all,1,loc);
5289         }
5290     }
5291     if (dd->ndim > 2) {
5292         dim0 = dd->dim[0];
5293         for(i=0; i<dd->nc[dim0]; i++) {
5294             loc[dim0] = i;
5295             dim1 = dd->dim[1];
5296             for(j=0; j<dd->nc[dim1]; j++) {
5297                 loc[dim1] = j;
5298                 make_load_communicator(dd,g_all,2,loc);
5299             }
5300         }
5301     }
5302
5303     MPI_Group_free(&g_all);
5304
5305     if (debug)
5306         fprintf(debug,"Finished making load communicators\n");
5307 #endif
5308 }
5309
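/* Determine the neighbor ranks, the communication zones and
 * the zone interaction ranges for the chosen decomposition grid.
 */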
5310 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5311 {
5312     gmx_bool bZYX;
5313     int  d,dim,i,j,m;
5314     ivec tmp,s;
5315     int  nzone,nzonep;
5316     ivec dd_zp[DD_MAXIZONE];
5317     gmx_domdec_zones_t *zones;
5318     gmx_domdec_ns_ranges_t *izone;
5319     
5320     for(d=0; d<dd->ndim; d++)
5321     {
5322         dim = dd->dim[d];
5323         copy_ivec(dd->ci,tmp);
5324         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5325         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5326         copy_ivec(dd->ci,tmp);
5327         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5328         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5329         if (debug)
5330         {
5331             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5332                     dd->rank,dim,
5333                     dd->neighbor[d][0],
5334                     dd->neighbor[d][1]);
5335         }
5336     }
5337     
5338     if (DDMASTER(dd))
5339     {
5340         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5341             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5342     }
5343     if (fplog)
5344     {
5345         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5346                 dd->ndim,
5347                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5348                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5349     }
5350     switch (dd->ndim)
5351     {
5352     case 3:
5353         nzone  = dd_z3n;
5354         nzonep = dd_zp3n;
5355         for(i=0; i<nzonep; i++)
5356         {
5357             copy_ivec(dd_zp3[i],dd_zp[i]);
5358         }
5359         break;
5360     case 2:
5361         nzone  = dd_z2n;
5362         nzonep = dd_zp2n;
5363         for(i=0; i<nzonep; i++)
5364         {
5365             copy_ivec(dd_zp2[i],dd_zp[i]);
5366         }
5367         break;
5368     case 1:
5369         nzone  = dd_z1n;
5370         nzonep = dd_zp1n;
5371         for(i=0; i<nzonep; i++)
5372         {
5373             copy_ivec(dd_zp1[i],dd_zp[i]);
5374         }
5375         break;
5376     default:
5377         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5378         nzone = 0;
5379         nzonep = 0;
5380     }
5381
5382     zones = &dd->comm->zones;
5383
5384     for(i=0; i<nzone; i++)
5385     {
5386         m = 0;
5387         clear_ivec(zones->shift[i]);
5388         for(d=0; d<dd->ndim; d++)
5389         {
5390             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5391         }
5392     }
5393     
5394     zones->n = nzone;
5395     for(i=0; i<nzone; i++)
5396     {
5397         for(d=0; d<DIM; d++)
5398         {
5399             s[d] = dd->ci[d] - zones->shift[i][d];
5400             if (s[d] < 0)
5401             {
5402                 s[d] += dd->nc[d];
5403             }
5404             else if (s[d] >= dd->nc[d])
5405             {
5406                 s[d] -= dd->nc[d];
5407             }
5408         }
5409     }
5410     zones->nizone = nzonep;
5411     for(i=0; i<zones->nizone; i++)
5412     {
5413         if (dd_zp[i][0] != i)
5414         {
5415             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5416         }
5417         izone = &zones->izone[i];
5418         izone->j0 = dd_zp[i][1];
5419         izone->j1 = dd_zp[i][2];
5420         for(dim=0; dim<DIM; dim++)
5421         {
5422             if (dd->nc[dim] == 1)
5423             {
5424                 /* All shifts should be allowed */
5425                 izone->shift0[dim] = -1;
5426                 izone->shift1[dim] = 1;
5427             }
5428             else
5429             {
5430                 /*
5431                   izone->shift0[d] = 0;
5432                   izone->shift1[d] = 0;
5433                   for(j=izone->j0; j<izone->j1; j++) {
5434                   if (dd->shift[j][d] > dd->shift[i][d])
5435                   izone->shift0[d] = -1;
5436                   if (dd->shift[j][d] < dd->shift[i][d])
5437                   izone->shift1[d] = 1;
5438                   }
5439                 */
5440                 
5441                 int shift_diff;
5442                 
5443                 /* Assume the shifts are not more than one cell */
5444                 izone->shift0[dim] = 1;
5445                 izone->shift1[dim] = -1;
5446                 for(j=izone->j0; j<izone->j1; j++)
5447                 {
5448                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5449                     if (shift_diff < izone->shift0[dim])
5450                     {
5451                         izone->shift0[dim] = shift_diff;
5452                     }
5453                     if (shift_diff > izone->shift1[dim])
5454                     {
5455                         izone->shift1[dim] = shift_diff;
5456                     }
5457                 }
5458             }
5459         }
5460     }
5461     
5462     if (dd->comm->eDLB != edlbNO)
5463     {
5464         snew(dd->comm->root,dd->ndim);
5465     }
5466     
5467     if (dd->comm->bRecordLoad)
5468     {
5469         make_load_communicators(dd);
5470     }
5471 }
5472
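/* Set up the communicator, rank and Cartesian coordinates
 * for the particle-particle part of the decomposition.
 */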
5473 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5474 {
5475     gmx_domdec_t *dd;
5476     gmx_domdec_comm_t *comm;
5477     int  i,rank,*buf;
5478     ivec periods;
5479 #ifdef GMX_MPI
5480     MPI_Comm comm_cart;
5481 #endif
5482     
5483     dd = cr->dd;
5484     comm = dd->comm;
5485     
5486 #ifdef GMX_MPI
5487     if (comm->bCartesianPP)
5488     {
5489         /* Set up cartesian communication for the particle-particle part */
5490         if (fplog)
5491         {
5492             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5493                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5494         }
5495         
5496         for(i=0; i<DIM; i++)
5497         {
5498             periods[i] = TRUE;
5499         }
5500         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5501                         &comm_cart);
5502         /* We overwrite the old communicator with the new cartesian one */
5503         cr->mpi_comm_mygroup = comm_cart;
5504     }
5505     
5506     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5507     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5508     
5509     if (comm->bCartesianPP_PME)
5510     {
5511         /* Since we want to use the original Cartesian setup for the simulation,
5512          * and not the one after the split, we need to make an index.
5513          */
5514         snew(comm->ddindex2ddnodeid,dd->nnodes);
5515         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5516         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5517         /* Get the rank of the DD master,
5518          * above we made sure that the master node is a PP node.
5519          */
5520         if (MASTER(cr))
5521         {
5522             rank = dd->rank;
5523         }
5524         else
5525         {
5526             rank = 0;
5527         }
5528         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5529     }
5530     else if (comm->bCartesianPP)
5531     {
5532         if (cr->npmenodes == 0)
5533         {
5534             /* The PP communicator is also
5535              * the communicator for this simulation
5536              */
5537             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5538         }
5539         cr->nodeid = dd->rank;
5540         
5541         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5542         
5543         /* We need to make an index to go from the coordinates
5544          * to the nodeid of this simulation.
5545          */
5546         snew(comm->ddindex2simnodeid,dd->nnodes);
5547         snew(buf,dd->nnodes);
5548         if (cr->duty & DUTY_PP)
5549         {
5550             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5551         }
5552         /* Communicate the ddindex to simulation nodeid index */
5553         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5554                       cr->mpi_comm_mysim);
5555         sfree(buf);
5556         
5557         /* Determine the master coordinates and rank.
5558          * The DD master should be the same node as the master of this sim.
5559          */
5560         for(i=0; i<dd->nnodes; i++)
5561         {
5562             if (comm->ddindex2simnodeid[i] == 0)
5563             {
5564                 ddindex2xyz(dd->nc,i,dd->master_ci);
5565                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5566             }
5567         }
5568         if (debug)
5569         {
5570             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5571         }
5572     }
5573     else
5574     {
5575         /* No Cartesian communicators */
5576         /* We use the rank in dd->mpi_comm_all as the DD index */
5577         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5578         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5579         dd->masterrank = 0;
5580         clear_ivec(dd->master_ci);
5581     }
5582 #endif
5583   
5584     if (fplog)
5585     {
5586         fprintf(fplog,
5587                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5588                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5589     }
5590     if (debug)
5591     {
5592         fprintf(debug,
5593                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5594                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5595     }
5596 }
5597
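/* Called on ranks that do not set up a PP communicator, so that all ranks
 * participate in the reduction that builds the ddindex2simnodeid table.
 */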
5598 static void receive_ddindex2simnodeid(t_commrec *cr)
5599 {
5600     gmx_domdec_t *dd;
5601     
5602     gmx_domdec_comm_t *comm;
5603     int  *buf;
5604     
5605     dd = cr->dd;
5606     comm = dd->comm;
5607     
5608 #ifdef GMX_MPI
5609     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5610     {
5611         snew(comm->ddindex2simnodeid,dd->nnodes);
5612         snew(buf,dd->nnodes);
5613         if (cr->duty & DUTY_PP)
5614         {
5615             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5616         }
5617 #ifdef GMX_MPI
5618         /* Communicate the ddindex to simulation nodeid index */
5619         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5620                       cr->mpi_comm_mysim);
5621 #endif
5622         sfree(buf);
5623     }
5624 #endif
5625 }
5626
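/* Allocate the master-only data used to scatter and gather
 * the global state over the DD nodes.
 */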
5627 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5628                                                      int ncg,int natoms)
5629 {
5630     gmx_domdec_master_t *ma;
5631     int i;
5632
5633     snew(ma,1);
5634     
5635     snew(ma->ncg,dd->nnodes);
5636     snew(ma->index,dd->nnodes+1);
5637     snew(ma->cg,ncg);
5638     snew(ma->nat,dd->nnodes);
5639     snew(ma->ibuf,dd->nnodes*2);
5640     snew(ma->cell_x,DIM);
5641     for(i=0; i<DIM; i++)
5642     {
5643         snew(ma->cell_x[i],dd->nc[i]+1);
5644     }
5645
5646     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5647     {
5648         ma->vbuf = NULL;
5649     }
5650     else
5651     {
5652         snew(ma->vbuf,natoms);
5653     }
5654
5655     return ma;
5656 }
5657
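/* Split the simulation communicator into PP and PME groups,
 * using a Cartesian PP+PME layout when the node counts allow it.
 */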
5658 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5659                                int reorder)
5660 {
5661     gmx_domdec_t *dd;
5662     gmx_domdec_comm_t *comm;
5663     int  i,rank;
5664     gmx_bool bDiv[DIM];
5665     ivec periods;
5666 #ifdef GMX_MPI
5667     MPI_Comm comm_cart;
5668 #endif
5669     
5670     dd = cr->dd;
5671     comm = dd->comm;
5672     
5673     if (comm->bCartesianPP)
5674     {
5675         for(i=1; i<DIM; i++)
5676         {
5677             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5678         }
5679         if (bDiv[YY] || bDiv[ZZ])
5680         {
5681             comm->bCartesianPP_PME = TRUE;
5682             /* If we have 2D PME decomposition, which is always in x+y,
5683              * we stack the PME only nodes in z.
5684              * Otherwise we choose the direction that provides the thinnest slab
5685              * of PME only nodes as this will have the least effect
5686              * on the PP communication.
5687              * But for the PME communication the opposite might be better.
5688              */
5689             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5690                              !bDiv[YY] ||
5691                              dd->nc[YY] > dd->nc[ZZ]))
5692             {
5693                 comm->cartpmedim = ZZ;
5694             }
5695             else
5696             {
5697                 comm->cartpmedim = YY;
5698             }
5699             comm->ntot[comm->cartpmedim]
5700                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5701         }
5702         else if (fplog)
5703         {
5704             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5705             fprintf(fplog,
5706                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5707         }
5708     }
5709     
5710 #ifdef GMX_MPI
5711     if (comm->bCartesianPP_PME)
5712     {
5713         if (fplog)
5714         {
5715             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5716         }
5717         
5718         for(i=0; i<DIM; i++)
5719         {
5720             periods[i] = TRUE;
5721         }
5722         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5723                         &comm_cart);
5724         
5725         MPI_Comm_rank(comm_cart,&rank);
5726         if (MASTERNODE(cr) && rank != 0)
5727         {
5728             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5729         }
5730         
5731         /* With this assignment we lose the link to the original communicator,
5732          * which will usually be MPI_COMM_WORLD, unless we run a multi-simulation.
5733          */
5734         cr->mpi_comm_mysim = comm_cart;
5735         cr->sim_nodeid = rank;
5736         
5737         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5738         
5739         if (fplog)
5740         {
5741             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5742                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5743         }
5744         
5745         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5746         {
5747             cr->duty = DUTY_PP;
5748         }
5749         if (cr->npmenodes == 0 ||
5750             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5751         {
5752             cr->duty = DUTY_PME;
5753         }
5754         
5755         /* Split the sim communicator into PP and PME only nodes */
5756         MPI_Comm_split(cr->mpi_comm_mysim,
5757                        cr->duty,
5758                        dd_index(comm->ntot,dd->ci),
5759                        &cr->mpi_comm_mygroup);
5760     }
5761     else
5762     {
5763         switch (dd_node_order)
5764         {
5765         case ddnoPP_PME:
5766             if (fplog)
5767             {
5768                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5769             }
5770             break;
5771         case ddnoINTERLEAVE:
5772             /* Interleave the PP-only and PME-only nodes,
5773              * as on clusters with dual-core machines this will double
5774              * the communication bandwidth of the PME processes
5775              * and thus speed up the PP <-> PME and inter PME communication.
5776              */
5777             if (fplog)
5778             {
5779                 fprintf(fplog,"Interleaving PP and PME nodes\n");
5780             }
5781             comm->pmenodes = dd_pmenodes(cr);
5782             break;
5783         case ddnoCARTESIAN:
5784             break;
5785         default:
5786             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
5787         }
5788     
5789         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
5790         {
5791             cr->duty = DUTY_PME;
5792         }
5793         else
5794         {
5795             cr->duty = DUTY_PP;
5796         }
5797         
5798         /* Split the sim communicator into PP and PME only nodes */
5799         MPI_Comm_split(cr->mpi_comm_mysim,
5800                        cr->duty,
5801                        cr->nodeid,
5802                        &cr->mpi_comm_mygroup);
5803         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
5804     }
5805 #endif
5806
5807     if (fplog)
5808     {
5809         fprintf(fplog,"This is a %s only node\n\n",
5810                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5811     }
5812 }
5813
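/* Create all communicators for the domain decomposition:
 * the PP/PME split, the PP communicator and the link to our PME node.
 */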
5814 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
5815 {
5816     gmx_domdec_t *dd;
5817     gmx_domdec_comm_t *comm;
5818     int CartReorder;
5819     
5820     dd = cr->dd;
5821     comm = dd->comm;
5822     
5823     copy_ivec(dd->nc,comm->ntot);
5824     
5825     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
5826     comm->bCartesianPP_PME = FALSE;
5827     
5828     /* Reorder the nodes by default. This might change the MPI ranks.
5829      * Real reordering is only supported on very few architectures;
5830      * Blue Gene is one of them.
5831      */
5832     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
5833     
5834     if (cr->npmenodes > 0)
5835     {
5836         /* Split the communicator into a PP and PME part */
5837         split_communicator(fplog,cr,dd_node_order,CartReorder);
5838         if (comm->bCartesianPP_PME)
5839         {
5840             /* We (possibly) reordered the nodes in split_communicator,
5841              * so it is no longer required in make_pp_communicator.
5842              */
5843             CartReorder = FALSE;
5844         }
5845     }
5846     else
5847     {
5848         /* All nodes do PP and PME */
5849 #ifdef GMX_MPI    
5850         /* We do not require separate communicators */
5851         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5852 #endif
5853     }
5854     
5855     if (cr->duty & DUTY_PP)
5856     {
5857         /* Copy or make a new PP communicator */
5858         make_pp_communicator(fplog,cr,CartReorder);
5859     }
5860     else
5861     {
5862         receive_ddindex2simnodeid(cr);
5863     }
5864     
5865     if (!(cr->duty & DUTY_PME))
5866     {
5867         /* Set up the communication to our PME node */
5868         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
5869         dd->pme_receive_vir_ener = receive_vir_ener(cr);
5870         if (debug)
5871         {
5872             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
5873                     dd->pme_nodeid,dd->pme_receive_vir_ener);
5874         }
5875     }
5876     else
5877     {
5878         dd->pme_nodeid = -1;
5879     }
5880
5881     if (DDMASTER(dd))
5882     {
5883         dd->ma = init_gmx_domdec_master_t(dd,
5884                                           comm->cgs_gl.nr,
5885                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
5886     }
5887 }
5888
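/* Parse the static load balancing cell sizes for one direction from
 * a user supplied string and normalize them; returns NULL when not used.
 */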
5889 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
5890 {
5891     real *slb_frac,tot;
5892     int  i,n;
5893     double dbl;
5894     
5895     slb_frac = NULL;
5896     if (nc > 1 && size_string != NULL)
5897     {
5898         if (fplog)
5899         {
5900             fprintf(fplog,"Using static load balancing for the %s direction\n",
5901                     dir);
5902         }
5903         snew(slb_frac,nc);
5904         tot = 0;
5905         for (i=0; i<nc; i++)
5906         {
5907             dbl = 0;
5908             sscanf(size_string,"%lf%n",&dbl,&n);
5909             if (dbl == 0)
5910             {
5911                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
5912             }
5913             slb_frac[i] = dbl;
5914             size_string += n;
5915             tot += slb_frac[i];
5916         }
5917         /* Normalize */
5918         if (fplog)
5919         {
5920             fprintf(fplog,"Relative cell sizes:");
5921         }
5922         for (i=0; i<nc; i++)
5923         {
5924             slb_frac[i] /= tot;
5925             if (fplog)
5926             {
5927                 fprintf(fplog," %5.3f",slb_frac[i]);
5928             }
5929         }
5930         if (fplog)
5931         {
5932             fprintf(fplog,"\n");
5933         }
5934     }
5935     
5936     return slb_frac;
5937 }
5938
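/* Count the bonded interactions that involve more than two atoms */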
5939 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
5940 {
5941     int n,nmol,ftype;
5942     gmx_mtop_ilistloop_t iloop;
5943     t_ilist *il;
5944     
5945     n = 0;
5946     iloop = gmx_mtop_ilistloop_init(mtop);
5947     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
5948     {
5949         for(ftype=0; ftype<F_NRE; ftype++)
5950         {
5951             if ((interaction_function[ftype].flags & IF_BOND) &&
5952                 NRAL(ftype) >  2)
5953             {
5954                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
5955             }
5956         }
5957     }
5958
5959     return n;
5960 }
5961
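/* Read an integer setting from an environment variable, falling back to def */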
5962 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
5963 {
5964     char *val;
5965     int  nst;
5966     
5967     nst = def;
5968     val = getenv(env_var);
5969     if (val)
5970     {
5971         if (sscanf(val,"%d",&nst) <= 0)
5972         {
5973             nst = 1;
5974         }
5975         if (fplog)
5976         {
5977             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
5978                     env_var,val,nst);
5979         }
5980     }
5981     
5982     return nst;
5983 }
5984
5985 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
5986 {
5987     if (MASTER(cr))
5988     {
5989         fprintf(stderr,"\n%s\n",warn_string);
5990     }
5991     if (fplog)
5992     {
5993         fprintf(fplog,"\n%s\n",warn_string);
5994     }
5995 }
5996
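/* Check for input settings that are incompatible with domain decomposition */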
5997 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
5998                                   t_inputrec *ir,FILE *fplog)
5999 {
6000     if (ir->ePBC == epbcSCREW &&
6001         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6002     {
6003         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6004     }
6005
6006     if (ir->ns_type == ensSIMPLE)
6007     {
6008         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6009     }
6010
6011     if (ir->nstlist == 0)
6012     {
6013         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6014     }
6015
6016     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6017     {
6018         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6019     }
6020 }
6021
6022 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6023 {
6024     int  di,d;
6025     real r;
6026
6027     r = ddbox->box_size[XX];
6028     for(di=0; di<dd->ndim; di++)
6029     {
6030         d = dd->dim[di];
6031         /* Check using the initial average cell size */
6032         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6033     }
6034
6035     return r;
6036 }
6037
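/* Determine whether dynamic load balancing can be used,
 * possibly overriding the requested setting with a note or warning.
 */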
6038 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6039                              const char *dlb_opt,gmx_bool bRecordLoad,
6040                              unsigned long Flags,t_inputrec *ir)
6041 {
6042     gmx_domdec_t *dd;
6043     int  eDLB=-1;
6044     char buf[STRLEN];
6045
6046     switch (dlb_opt[0])
6047     {
6048     case 'a': eDLB = edlbAUTO; break;
6049     case 'n': eDLB = edlbNO;   break;
6050     case 'y': eDLB = edlbYES;  break;
6051     default: gmx_incons("Unknown dlb_opt");
6052     }
6053
6054     if (Flags & MD_RERUN)
6055     {
6056         return edlbNO;
6057     }
6058
6059     if (!EI_DYNAMICS(ir->eI))
6060     {
6061         if (eDLB == edlbYES)
6062         {
6063             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6064             dd_warning(cr,fplog,buf);
6065         }
6066             
6067         return edlbNO;
6068     }
6069
6070     if (!bRecordLoad)
6071     {
6072         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6073
6074         return edlbNO;
6075     }
6076
6077     if (Flags & MD_REPRODUCIBLE)
6078     {
6079         switch (eDLB)
6080         {
6081         case edlbNO:
6082             break;
6083         case edlbAUTO:
6084             dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6085             eDLB = edlbNO;
6086             break;
6087         case edlbYES:
6088             dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6089             break;
6090         default:
6091             gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6092             break;
6093         }
6094     }
6095
6096     return eDLB;
6097 }
6098
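/* Set the order of the decomposition dimensions: x,y,z by default,
 * or z,y,x when GMX_DD_ORDER_ZYX is set.
 */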
6099 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6100 {
6101     int dim;
6102
6103     dd->ndim = 0;
6104     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6105     {
6106         /* Decomposition order z,y,x */
6107         if (fplog)
6108         {
6109             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6110         }
6111         for(dim=DIM-1; dim>=0; dim--)
6112         {
6113             if (dd->nc[dim] > 1)
6114             {
6115                 dd->dim[dd->ndim++] = dim;
6116             }
6117         }
6118     }
6119     else
6120     {
6121         /* Decomposition order x,y,z */
6122         for(dim=0; dim<DIM; dim++)
6123         {
6124             if (dd->nc[dim] > 1)
6125             {
6126                 dd->dim[dd->ndim++] = dim;
6127             }
6128         }
6129     }
6130 }
6131
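/* Allocate and zero-initialize the DD communication data structure */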
6132 static gmx_domdec_comm_t *init_dd_comm()
6133 {
6134     gmx_domdec_comm_t *comm;
6135     int  i;
6136
6137     snew(comm,1);
6138     snew(comm->cggl_flag,DIM*2);
6139     snew(comm->cgcm_state,DIM*2);
6140     for(i=0; i<DIM*2; i++)
6141     {
6142         comm->cggl_flag_nalloc[i]  = 0;
6143         comm->cgcm_state_nalloc[i] = 0;
6144     }
6145     
6146     comm->nalloc_int = 0;
6147     comm->buf_int    = NULL;
6148
6149     vec_rvec_init(&comm->vbuf);
6150
6151     comm->n_load_have    = 0;
6152     comm->n_load_collect = 0;
6153
6154     for(i=0; i<ddnatNR-ddnatZONE; i++)
6155     {
6156         comm->sum_nat[i] = 0;
6157     }
6158     comm->ndecomp = 0;
6159     comm->nload   = 0;
6160     comm->load_step = 0;
6161     comm->load_sum  = 0;
6162     comm->load_max  = 0;
6163     clear_ivec(comm->load_lim);
6164     comm->load_mdf  = 0;
6165     comm->load_pme  = 0;
6166
6167     return comm;
6168 }
6169
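/* Set up the domain decomposition: determine the cell size limits,
 * choose or check the DD grid and the number of PME nodes,
 * and initialize the communication data structures.
 */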
6170 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6171                                         unsigned long Flags,
6172                                         ivec nc,
6173                                         real comm_distance_min,real rconstr,
6174                                         const char *dlb_opt,real dlb_scale,
6175                                         const char *sizex,const char *sizey,const char *sizez,
6176                                         gmx_mtop_t *mtop,t_inputrec *ir,
6177                                         matrix box,rvec *x,
6178                                         gmx_ddbox_t *ddbox,
6179                                         int *npme_x,int *npme_y)
6180 {
6181     gmx_domdec_t *dd;
6182     gmx_domdec_comm_t *comm;
6183     int  recload;
6184     int  d,i,j;
6185     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6186     gmx_bool bC;
6187     char buf[STRLEN];
6188     
6189     if (fplog)
6190     {
6191         fprintf(fplog,
6192                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6193     }
6194     
6195     snew(dd,1);
6196
6197     dd->comm = init_dd_comm();
6198     comm = dd->comm;
6201
6202     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6203     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6204     
6205     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6206     comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6207     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6208     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6209     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6210     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6211     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6212     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6213
6214     dd->pme_recv_f_alloc = 0;
6215     dd->pme_recv_f_buf = NULL;
6216
6217     if (dd->bSendRecv2 && fplog)
6218     {
6219         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6220     }
6221     if (comm->eFlop)
6222     {
6223         if (fplog)
6224         {
6225             fprintf(fplog,"Will load balance based on FLOP count\n");
6226         }
6227         if (comm->eFlop > 1)
6228         {
6229             srand(1+cr->nodeid);
6230         }
6231         comm->bRecordLoad = TRUE;
6232     }
6233     else
6234     {
6235         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6236                              
6237     }
6238     
6239     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6240     
6241     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6242     if (fplog)
6243     {
6244         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6245     }
6246     dd->bGridJump = comm->bDynLoadBal;
6247     
6248     if (comm->nstSortCG)
6249     {
6250         if (fplog)
6251         {
6252             if (comm->nstSortCG == 1)
6253             {
6254                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6255             }
6256             else
6257             {
6258                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6259                         comm->nstSortCG);
6260             }
6261         }
6262         snew(comm->sort,1);
6263     }
6264     else
6265     {
6266         if (fplog)
6267         {
6268             fprintf(fplog,"Will not sort the charge groups\n");
6269         }
6270     }
6271     
6272     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6273     if (comm->bInterCGBondeds)
6274     {
6275         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6276     }
6277     else
6278     {
6279         comm->bInterCGMultiBody = FALSE;
6280     }
6281     
6282     dd->bInterCGcons = inter_charge_group_constraints(mtop);
6283
6284     if (ir->rlistlong == 0)
6285     {
6286         /* Set the cut-off to some very large value,
6287          * so we don't need if statements everywhere in the code.
6288          * We use sqrt, since the cut-off is squared in some places.
6289          */
6290         comm->cutoff   = GMX_CUTOFF_INF;
6291     }
6292     else
6293     {
6294         comm->cutoff   = ir->rlistlong;
6295     }
6296     comm->cutoff_mbody = 0;
6297     
6298     comm->cellsize_limit = 0;
6299     comm->bBondComm = FALSE;
6300
6301     if (comm->bInterCGBondeds)
6302     {
6303         if (comm_distance_min > 0)
6304         {
6305             comm->cutoff_mbody = comm_distance_min;
6306             if (Flags & MD_DDBONDCOMM)
6307             {
6308                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6309             }
6310             else
6311             {
6312                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6313             }
6314             r_bonded_limit = comm->cutoff_mbody;
6315         }
6316         else if (ir->bPeriodicMols)
6317         {
6318             /* Cannot easily determine the required cut-off */
6319             dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6320             comm->cutoff_mbody = comm->cutoff/2;
6321             r_bonded_limit = comm->cutoff_mbody;
6322         }
6323         else
6324         {
6325             if (MASTER(cr))
6326             {
6327                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6328                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6329             }
6330             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6331             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6332
6333             /* We use an initial margin of 10% for the minimum cell size,
6334              * except when we are just below the non-bonded cut-off.
6335              */
6336             if (Flags & MD_DDBONDCOMM)
6337             {
6338                 if (max(r_2b,r_mb) > comm->cutoff)
6339                 {
6340                     r_bonded       = max(r_2b,r_mb);
6341                     r_bonded_limit = 1.1*r_bonded;
6342                     comm->bBondComm = TRUE;
6343                 }
6344                 else
6345                 {
6346                     r_bonded       = r_mb;
6347                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6348                 }
6349                 /* We determine cutoff_mbody later */
6350             }
6351             else
6352             {
6353                 /* No special bonded communication,
6354                  * simply increase the DD cut-off.
6355                  */
6356                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6357                 comm->cutoff_mbody = r_bonded_limit;
6358                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6359             }
6360         }
6361         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6362         if (fplog)
6363         {
6364             fprintf(fplog,
6365                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6366                     comm->cellsize_limit);
6367         }
6368     }
6369
6370     if (dd->bInterCGcons && rconstr <= 0)
6371     {
6372         /* There is a cell size limit due to the constraints (P-LINCS) */
6373         rconstr = constr_r_max(fplog,mtop,ir);
6374         if (fplog)
6375         {
6376             fprintf(fplog,
6377                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6378                     rconstr);
6379             if (rconstr > comm->cellsize_limit)
6380             {
6381                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6382             }
6383         }
6384     }
6385     else if (rconstr > 0 && fplog)
6386     {
6387         /* Here we do not check for dd->bInterCGcons,
6388          * because one can also set a cell size limit for virtual sites only
6389          * and at this point we don't know yet if there are intercg v-sites.
6390          */
6391         fprintf(fplog,
6392                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6393                 rconstr);
6394     }
6395     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6396
6397     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6398
6399     if (nc[XX] > 0)
6400     {
6401         copy_ivec(nc,dd->nc);
6402         set_dd_dim(fplog,dd);
6403         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6404
6405         if (cr->npmenodes == -1)
6406         {
6407             cr->npmenodes = 0;
6408         }
6409         acs = average_cellsize_min(dd,ddbox);
6410         if (acs < comm->cellsize_limit)
6411         {
6412             if (fplog)
6413             {
6414                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6415             }
6416             gmx_fatal_collective(FARGS,cr,NULL,
6417                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6418                                  acs,comm->cellsize_limit);
6419         }
6420     }
6421     else
6422     {
6423         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6424
6425         /* We need to choose the optimal DD grid and possibly PME nodes */
6426         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6427                                comm->eDLB!=edlbNO,dlb_scale,
6428                                comm->cellsize_limit,comm->cutoff,
6429                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6430         
6431         if (dd->nc[XX] == 0)
6432         {
6433             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6434             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6435                     !bC ? "-rdd" : "-rcon",
6436                     comm->eDLB!=edlbNO ? " or -dds" : "",
6437                     bC ? " or your LINCS settings" : "");
6438
6439             gmx_fatal_collective(FARGS,cr,NULL,
6440                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6441                                  "%s\n"
6442                                  "Look in the log file for details on the domain decomposition",
6443                                  cr->nnodes-cr->npmenodes,limit,buf);
6444         }
6445         set_dd_dim(fplog,dd);
6446     }
6447
6448     if (fplog)
6449     {
6450         fprintf(fplog,
6451                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6452                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6453     }
6454     
6455     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6456     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6457     {
6458         gmx_fatal_collective(FARGS,cr,NULL,
6459                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6460                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6461     }
6462     if (cr->npmenodes > dd->nnodes)
6463     {
6464         gmx_fatal_collective(FARGS,cr,NULL,
6465                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6466     }
6467     if (cr->npmenodes > 0)
6468     {
6469         comm->npmenodes = cr->npmenodes;
6470     }
6471     else
6472     {
6473         comm->npmenodes = dd->nnodes;
6474     }
6475
6476     if (EEL_PME(ir->coulombtype))
6477     {
6478         /* The following choices should match those
6479          * in comm_cost_est in domdec_setup.c.
6480          * Note that here the checks have to take into account
6481          * that the decomposition might occur in a different order than xyz
6482          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6483          * in which case they will not match those in comm_cost_est,
6484          * but since that is mainly for testing purposes that's fine.
6485          */
6486         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6487             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6488             getenv("GMX_PMEONEDD") == NULL)
6489         {
6490             comm->npmedecompdim = 2;
6491             comm->npmenodes_x   = dd->nc[XX];
6492             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6493         }
6494         else
6495         {
6496             /* In case nc is 1 in both x and y we could still choose to
6497              * decompose pme in y instead of x, but we use x for simplicity.
6498              */
6499             comm->npmedecompdim = 1;
6500             if (dd->dim[0] == YY)
6501             {
6502                 comm->npmenodes_x = 1;
6503                 comm->npmenodes_y = comm->npmenodes;
6504             }
6505             else
6506             {
6507                 comm->npmenodes_x = comm->npmenodes;
6508                 comm->npmenodes_y = 1;
6509             }
6510         }    
6511         if (fplog)
6512         {
6513             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6514                     comm->npmenodes_x,comm->npmenodes_y,1);
6515         }
6516     }
6517     else
6518     {
6519         comm->npmedecompdim = 0;
6520         comm->npmenodes_x   = 0;
6521         comm->npmenodes_y   = 0;
6522     }
6523     
6524     /* Technically we don't need both of these,
6525      * but keeping both simplifies the code, since we then do not have to recalculate them.
6526      */
6527     *npme_x = comm->npmenodes_x;
6528     *npme_y = comm->npmenodes_y;
6529         
6530     snew(comm->slb_frac,DIM);
6531     if (comm->eDLB == edlbNO)
6532     {
6533         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6534         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6535         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6536     }
6537
6538     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6539     {
6540         if (comm->bBondComm || comm->eDLB != edlbNO)
6541         {
6542             /* Set the bonded communication distance to halfway
6543              * the minimum and the maximum,
6544              * since the extra communication cost is nearly zero.
6545              */
6546             acs = average_cellsize_min(dd,ddbox);
6547             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6548             if (comm->eDLB != edlbNO)
6549             {
6550                 /* Check if this does not limit the scaling */
6551                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6552             }
6553             if (!comm->bBondComm)
6554             {
6555                 /* Without bBondComm do not go beyond the n.b. cut-off */
6556                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6557                 if (comm->cellsize_limit >= comm->cutoff)
6558                 {
6559                     /* We don't lose a lot of efficiency
6560                      * when increasing it to the n.b. cut-off.
6561                      * It can even be slightly faster, because we need
6562                      * less checks for the communication setup.
6563                      */
6564                     comm->cutoff_mbody = comm->cutoff;
6565                 }
6566             }
6567             /* Check if we did not end up below our original limit */
6568             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6569
6570             if (comm->cutoff_mbody > comm->cellsize_limit)
6571             {
6572                 comm->cellsize_limit = comm->cutoff_mbody;
6573             }
6574         }
6575         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6576     }
6577
6578     if (debug)
6579     {
6580         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6581                 "cellsize limit %f\n",
6582                 comm->bBondComm,comm->cellsize_limit);
6583     }
6584     
6585     if (MASTER(cr))
6586     {
6587         check_dd_restrictions(cr,dd,ir,fplog);
6588     }
6589
6590     comm->globalcomm_step = INT_MIN;
6591     dd->ddp_count = 0;
6592
6593     clear_dd_cycle_counts(dd);
6594
6595     return dd;
6596 }
6597
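/* Switch the communication pulse counts and minimum cell sizes
 * to their dynamic load balancing values.
 */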
6598 static void set_dlb_limits(gmx_domdec_t *dd)
6599 {
6601     int d;
6602
6603     for(d=0; d<dd->ndim; d++)
6604     {
6605         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6606         dd->comm->cellsize_min[dd->dim[d]] =
6607             dd->comm->cellsize_min_dlb[dd->dim[d]];
6608     }
6609 }
6610
6611
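/* Turn on dynamic load balancing at the given step, unless the minimum
 * cell size is already within 5% of the cell size limit.
 */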
6612 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6613 {
6614     gmx_domdec_t *dd;
6615     gmx_domdec_comm_t *comm;
6616     real cellsize_min;
6617     int  d,nc,i;
6618     char buf[STRLEN];
6619     
6620     dd = cr->dd;
6621     comm = dd->comm;
6622     
6623     if (fplog)
6624     {
6625         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6626     }
6627
6628     cellsize_min = comm->cellsize_min[dd->dim[0]];
6629     for(d=1; d<dd->ndim; d++)
6630     {
6631         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6632     }
6633
6634     if (cellsize_min < comm->cellsize_limit*1.05)
6635     {
6636         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6637
6638         /* Change DLB from "auto" to "no". */
6639         comm->eDLB = edlbNO;
6640
6641         return;
6642     }
6643
6644     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6645     comm->bDynLoadBal = TRUE;
6646     dd->bGridJump = TRUE;
6647     
6648     set_dlb_limits(dd);
6649
6650     /* We can set the required cell size info here,
6651      * so we do not need to communicate this.
6652      * The grid is completely uniform.
6653      */
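    /* For example, with nc = 4 cells along a dimension the relative
     * boundaries stored in cell_f become 0, 0.25, 0.5, 0.75 and 1.
     */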
6654     for(d=0; d<dd->ndim; d++)
6655     {
6656         if (comm->root[d])
6657         {
6658             comm->load[d].sum_m = comm->load[d].sum;
6659
6660             nc = dd->nc[dd->dim[d]];
6661             for(i=0; i<nc; i++)
6662             {
6663                 comm->root[d]->cell_f[i]    = i/(real)nc;
6664                 if (d > 0)
6665                 {
6666                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6667                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6668                 }
6669             }
6670             comm->root[d]->cell_f[nc] = 1.0;
6671         }
6672     }
6673 }
6674
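/* Allocate a flag per global charge group, initialized to FALSE (not local) */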
6675 static char *init_bLocalCG(gmx_mtop_t *mtop)
6676 {
6677     int  ncg,cg;
6678     char *bLocalCG;
6679     
6680     ncg = ncg_mtop(mtop);
6681     snew(bLocalCG,ncg);
6682     for(cg=0; cg<ncg; cg++)
6683     {
6684         bLocalCG[cg] = FALSE;
6685     }
6686
6687     return bLocalCG;
6688 }
6689
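/* Initialize the data structures for bonded interactions with DD:
 * the reverse topology and, with bBondComm, the charge group links
 * and locality flags.
 */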
6690 void dd_init_bondeds(FILE *fplog,
6691                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6692                      gmx_vsite_t *vsite,gmx_constr_t constr,
6693                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6694 {
6695     gmx_domdec_comm_t *comm;
6696     gmx_bool bBondComm;
6697     int  d;
6698
6699     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6700
6701     comm = dd->comm;
6702
6703     if (comm->bBondComm)
6704     {
6705         /* Communicate atoms beyond the cut-off for bonded interactions */
6707
6708         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6709
6710         comm->bLocalCG = init_bLocalCG(mtop);
6711     }
6712     else
6713     {
6714         /* Only communicate atoms based on cut-off */
6715         comm->cglink   = NULL;
6716         comm->bLocalCG = NULL;
6717     }
6718 }
6719
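/* Print the DD communication pulse counts, cell size limits and
 * maximum interaction distances to the log file.
 */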
6720 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6721                               t_inputrec *ir,
6722                               gmx_bool bDynLoadBal,real dlb_scale,
6723                               gmx_ddbox_t *ddbox)
6724 {
6725     gmx_domdec_comm_t *comm;
6726     int  d;
6727     ivec np;
6728     real limit,shrink;
6729     char buf[64];
6730
6731     if (fplog == NULL)
6732     {
6733         return;
6734     }
6735
6736     comm = dd->comm;
6737
6738     if (bDynLoadBal)
6739     {
6740         fprintf(fplog,"The maximum number of communication pulses is:");
6741         for(d=0; d<dd->ndim; d++)
6742         {
6743             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6744         }
6745         fprintf(fplog,"\n");
6746         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6747         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6748         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6749         for(d=0; d<DIM; d++)
6750         {
6751             if (dd->nc[d] > 1)
6752             {
6753                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6754                 {
6755                     shrink = 0;
6756                 }
6757                 else
6758                 {
6759                     shrink =
6760                         comm->cellsize_min_dlb[d]/
6761                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6762                 }
6763                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6764             }
6765         }
6766         fprintf(fplog,"\n");
6767     }
6768     else
6769     {
6770         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6771         fprintf(fplog,"The initial number of communication pulses is:");
6772         for(d=0; d<dd->ndim; d++)
6773         {
6774             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
6775         }
6776         fprintf(fplog,"\n");
6777         fprintf(fplog,"The initial domain decomposition cell size is:");
6778         for(d=0; d<DIM; d++) {
6779             if (dd->nc[d] > 1)
6780             {
6781                 fprintf(fplog," %c %.2f nm",
6782                         dim2char(d),dd->comm->cellsize_min[d]);
6783             }
6784         }
6785         fprintf(fplog,"\n\n");
6786     }
6787     
6788     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
6789     {
6790         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
6791         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6792                 "non-bonded interactions","",comm->cutoff);
6793
6794         if (bDynLoadBal)
6795         {
6796             limit = dd->comm->cellsize_limit;
6797         }
6798         else
6799         {
6800             if (dynamic_dd_box(ddbox,ir))
6801             {
6802                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
6803             }
6804             limit = dd->comm->cellsize_min[XX];
6805             for(d=1; d<DIM; d++)
6806             {
6807                 limit = min(limit,dd->comm->cellsize_min[d]);
6808             }
6809         }
6810
6811         if (comm->bInterCGBondeds)
6812         {
6813             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6814                     "two-body bonded interactions","(-rdd)",
6815                     max(comm->cutoff,comm->cutoff_mbody));
6816             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6817                     "multi-body bonded interactions","(-rdd)",
6818                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
6819         }
6820         if (dd->vsite_comm)
6821         {
6822             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6823                     "virtual site constructions","(-rcon)",limit);
6824         }
6825         if (dd->constraint_comm)
6826         {
6827             sprintf(buf,"atoms separated by up to %d constraints",
6828                     1+ir->nProjOrder);
6829             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6830                     buf,"(-rcon)",limit);
6831         }
6832         fprintf(fplog,"\n");
6833     }
6834     
6835     fflush(fplog);
6836 }
6837
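/* Set the remaining DD parameters that depend on the input and force
 * records: the PME decomposition, molecular pbc and, with dynamic load
 * balancing, the pulse counts and cell size limits.
 */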
6838 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
6839                        t_inputrec *ir,t_forcerec *fr,
6840                        gmx_ddbox_t *ddbox)
6841 {
6842     gmx_domdec_comm_t *comm;
6843     int  d,dim,npulse,npulse_d_max,npulse_d;
6844     gmx_bool bNoCutOff;
6845     int  natoms_tot;
6846     real vol_frac;
6847
6848     comm = dd->comm;
6849
6850     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6851
6852     if (EEL_PME(ir->coulombtype))
6853     {
6854         init_ddpme(dd,&comm->ddpme[0],0);
6855         if (comm->npmedecompdim >= 2)
6856         {
6857             init_ddpme(dd,&comm->ddpme[1],1);
6858         }
6859     }
6860     else
6861     {
6862         comm->npmenodes = 0;
6863         if (dd->pme_nodeid >= 0)
6864         {
6865             gmx_fatal_collective(FARGS,NULL,dd,
6866                                  "Can not have separate PME nodes without PME electrostatics");
6867         }
6868     }
6869     
6870     /* If each molecule is a single charge group
6871      * or we use domain decomposition for each periodic dimension,
6872      * we do not need to take pbc into account for the bonded interactions.
6873      */
6874     if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
6875         (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY)))
6876     {
6877         fr->bMolPBC = FALSE;
6878     }
6879     else
6880     {
6881         fr->bMolPBC = TRUE;
6882     }
6883         
6884     if (debug)
6885     {
6886         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
6887     }
6888     if (comm->eDLB != edlbNO)
6889     {
6890         /* Determine the maximum number of comm. pulses in one dimension */
6891         
6892         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6893         
6894         /* Determine the maximum required number of grid pulses */
6895         if (comm->cellsize_limit >= comm->cutoff)
6896         {
6897             /* Only a single pulse is required */
6898             npulse = 1;
6899         }
6900         else if (!bNoCutOff && comm->cellsize_limit > 0)
6901         {
6902             /* We round down slightly here to avoid overhead due to the latency
6903              * of extra communication calls when the cut-off
6904              * would be only slightly longer than the cell size.
6905              * Later cellsize_limit is redetermined,
6906              * so we cannot miss interactions due to this rounding.
6907              */
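            /* For example, a cut-off only 2% longer than cellsize_limit
             * gives (int)(0.96 + 1.02) = 1 pulse instead of 2.
             */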
6908             npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
6909         }
6910         else
6911         {
6912             /* There is no cell size limit */
6913             npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
6914         }
6915
6916         if (!bNoCutOff && npulse > 1)
6917         {
6918             /* See if we can make do with fewer pulses, based on dlb_scale */
6919             npulse_d_max = 0;
6920             for(d=0; d<dd->ndim; d++)
6921             {
6922                 dim = dd->dim[d];
6923                 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6924                                  /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6925                 npulse_d_max = max(npulse_d_max,npulse_d);
6926             }
6927             npulse = min(npulse,npulse_d_max);
6928         }
6929         
6930         /* This env var can override npulse */
6931         d = dd_nst_env(fplog,"GMX_DD_NPULSE",0);
6932         if (d > 0)
6933         {
6934             npulse = d;
6935         }
6936
6937         comm->maxpulse = 1;
6938         comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
6939         for(d=0; d<dd->ndim; d++)
6940         {
6941             comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
6942             comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6943             snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
6944             comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
6945             if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6946             {
6947                 comm->bVacDLBNoLimit = FALSE;
6948             }
6949         }
6950         
6951         /* cellsize_limit is set for LINCS in init_domain_decomposition */
6952         if (!comm->bVacDLBNoLimit)
6953         {
6954             comm->cellsize_limit = max(comm->cellsize_limit,
6955                                        comm->cutoff/comm->maxpulse);
6956         }
6957         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6958         /* Set the minimum cell size for each DD dimension */
6959         for(d=0; d<dd->ndim; d++)
6960         {
6961             if (comm->bVacDLBNoLimit ||
6962                 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
6963             {
6964                 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
6965             }
6966             else
6967             {
6968                 comm->cellsize_min_dlb[dd->dim[d]] =
6969                     comm->cutoff/comm->cd[d].np_dlb;
6970             }
6971         }
6972         if (comm->cutoff_mbody <= 0)
6973         {
6974             comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
6975         }
6976         if (comm->bDynLoadBal)
6977         {
6978             set_dlb_limits(dd);
6979         }
6980     }
6981     
6982     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
6983     if (comm->eDLB == edlbAUTO)
6984     {
6985         if (fplog)
6986         {
6987             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
6988         }
6989         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
6990     }
6991
6992     if (ir->ePBC == epbcNONE)
6993     {
6994         vol_frac = 1 - 1/(double)dd->nnodes;
6995     }
6996     else
6997     {
6998         vol_frac =
6999             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7000     }
7001     if (debug)
7002     {
7003         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7004     }
7005     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7006    
7007     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7008 }
7009
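/* Merge the charge groups received in this pulse into the zone arrays;
 * entries stored by earlier pulses are shifted to make room and the
 * already stored send indices are corrected for the shift.
 */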
7010 static void merge_cg_buffers(int ncell,
7011                              gmx_domdec_comm_dim_t *cd, int pulse,
7012                              int  *ncg_cell,
7013                              int  *index_gl, int  *recv_i,
7014                              rvec *cg_cm,    rvec *recv_vr,
7015                              int *cgindex,
7016                              cginfo_mb_t *cginfo_mb,int *cginfo)
7017 {
7018     gmx_domdec_ind_t *ind,*ind_p;
7019     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7020     int shift,shift_at;
7021     
7022     ind = &cd->ind[pulse];
7023     
7024     /* First correct the already stored data */
7025     shift = ind->nrecv[ncell];
7026     for(cell=ncell-1; cell>=0; cell--)
7027     {
7028         shift -= ind->nrecv[cell];
7029         if (shift > 0)
7030         {
7031             /* Move the cg's already present from previous grid pulses */
7032             cg0 = ncg_cell[ncell+cell];
7033             cg1 = ncg_cell[ncell+cell+1];
7034             cgindex[cg1+shift] = cgindex[cg1];
7035             for(cg=cg1-1; cg>=cg0; cg--)
7036             {
7037                 index_gl[cg+shift] = index_gl[cg];
7038                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7039                 cgindex[cg+shift] = cgindex[cg];
7040                 cginfo[cg+shift] = cginfo[cg];
7041             }
7042             /* Correct the already stored send indices for the shift */
7043             for(p=1; p<=pulse; p++)
7044             {
7045                 ind_p = &cd->ind[p];
7046                 cg0 = 0;
7047                 for(c=0; c<cell; c++)
7048                 {
7049                     cg0 += ind_p->nsend[c];
7050                 }
7051                 cg1 = cg0 + ind_p->nsend[cell];
7052                 for(cg=cg0; cg<cg1; cg++)
7053                 {
7054                     ind_p->index[cg] += shift;
7055                 }
7056             }
7057         }
7058     }
7059
7060     /* Merge in the communicated buffers */
7061     shift = 0;
7062     shift_at = 0;
7063     cg0 = 0;
7064     for(cell=0; cell<ncell; cell++)
7065     {
7066         cg1 = ncg_cell[ncell+cell+1] + shift;
7067         if (shift_at > 0)
7068         {
7069             /* Correct the old cg indices */
7070             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7071             {
7072                 cgindex[cg+1] += shift_at;
7073             }
7074         }
7075         for(cg=0; cg<ind->nrecv[cell]; cg++)
7076         {
7077             /* Copy this charge group from the buffer */
7078             index_gl[cg1] = recv_i[cg0];
7079             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7080             /* Add it to the cgindex */
7081             cg_gl = index_gl[cg1];
7082             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7083             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7084             cgindex[cg1+1] = cgindex[cg1] + nat;
7085             cg0++;
7086             cg1++;
7087             shift_at += nat;
7088         }
7089         shift += ind->nrecv[cell];
7090         ncg_cell[ncell+cell+1] = cg1;
7091     }
7092 }
7093
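/* Store the per-pulse, per-zone atom block boundaries,
 * used for copying the buffers of non-in-place communication.
 */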
7094 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7095                                int nzone,int cg0,const int *cgindex)
7096 {
7097     int cg,zone,p;
7098     
7099     /* Store the atom block boundaries for easy copying of communication buffers
7100      */
7101     cg = cg0;
7102     for(zone=0; zone<nzone; zone++)
7103     {
7104         for(p=0; p<cd->np; p++) {
7105             cd->ind[p].cell2at0[zone] = cgindex[cg];
7106             cg += cd->ind[p].nrecv[zone];
7107             cd->ind[p].cell2at1[zone] = cgindex[cg];
7108         }
7109     }
7110 }
7111
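/* Return TRUE if charge group cg_gl has a link to a charge group
 * that is not marked as local in bLocalCG.
 */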
7112 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7113 {
7114     int  i;
7115     gmx_bool bMiss;
7116
7117     bMiss = FALSE;
7118     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7119     {
7120         if (!bLocalCG[link->a[i]])
7121         {
7122             bMiss = TRUE;
7123         }
7124     }
7125
7126     return bMiss;
7127 }
7128
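/* Set up the halo communication: for every DD dimension and pulse,
 * determine which charge groups need to be sent, communicate the counts,
 * global indices and centers, and update the zone charge group ranges.
 */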
7129 static void setup_dd_communication(gmx_domdec_t *dd,
7130                                    matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr)
7131 {
7132     int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot;
7133     int nzone,nzone_send,zone,zonei,cg0,cg1;
7134     int c,i,j,cg,cg_gl,nrcg;
7135     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7136     gmx_domdec_comm_t *comm;
7137     gmx_domdec_zones_t *zones;
7138     gmx_domdec_comm_dim_t *cd;
7139     gmx_domdec_ind_t *ind;
7140     cginfo_mb_t *cginfo_mb;
7141     gmx_bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew;
7142     real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh;
7143     rvec rb,rn;
7144     real corner[DIM][4],corner_round_0=0,corner_round_1[4];
7145     real bcorner[DIM],bcorner_round_1=0;
7146     ivec tric_dist;
7147     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7148     real skew_fac2_d,skew_fac_01;
7149     rvec sf2_round;
7150     int  nsend,nat;
7151     
7152     if (debug)
7153     {
7154         fprintf(debug,"Setting up DD communication\n");
7155     }
7156     
7157     comm  = dd->comm;
7158     cg_cm = fr->cg_cm;
7159
7160     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7161     {
7162         dim = dd->dim[dim_ind];
7163
7164         /* Check if we need to use triclinic distances */
7165         tric_dist[dim_ind] = 0;
7166         for(i=0; i<=dim_ind; i++)
7167         {
7168             if (ddbox->tric_dir[dd->dim[i]])
7169             {
7170                 tric_dist[dim_ind] = 1;
7171             }
7172         }
7173     }
7174
7175     bBondComm = comm->bBondComm;
7176
7177     /* Do we need to determine extra distances for multi-body bondeds? */
7178     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7179     
7180     /* Do we need to determine extra distances for only two-body bondeds? */
7181     bDist2B = (bBondComm && !bDistMB);
7182
7183     r_comm2  = sqr(comm->cutoff);
7184     r_bcomm2 = sqr(comm->cutoff_mbody);
7185
7186     if (debug)
7187     {
7188         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7189     }
7190
7191     zones = &comm->zones;
7192     
7193     dim0 = dd->dim[0];
7194     /* The first dimension is equal for all cells */
7195     corner[0][0] = comm->cell_x0[dim0];
7196     if (bDistMB)
7197     {
7198         bcorner[0] = corner[0][0];
7199     }
7200     if (dd->ndim >= 2)
7201     {
7202         dim1 = dd->dim[1];
7203         /* This cell row is only seen from the first row */
7204         corner[1][0] = comm->cell_x0[dim1];
7205         /* All rows can see this row */
7206         corner[1][1] = comm->cell_x0[dim1];
7207         if (dd->bGridJump)
7208         {
7209             corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7210             if (bDistMB)
7211             {
7212                 /* For the multi-body distance we need the maximum */
7213                 bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7214             }
7215         }
7216         /* Set the upper-right corner for rounding */
7217         corner_round_0 = comm->cell_x1[dim0];
7218         
7219         if (dd->ndim >= 3)
7220         {
7221             dim2 = dd->dim[2];
7222             for(j=0; j<4; j++)
7223             {
7224                 corner[2][j] = comm->cell_x0[dim2];
7225             }
7226             if (dd->bGridJump)
7227             {
7228                 /* Use the maximum of the i-cells that see a j-cell */
7229                 for(i=0; i<zones->nizone; i++)
7230                 {
7231                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7232                     {
7233                         if (j >= 4)
7234                         {
7235                             corner[2][j-4] =
7236                                 max(corner[2][j-4],
7237                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7238                         }
7239                     }
7240                 }
7241                 if (bDistMB)
7242                 {
7243                     /* For the multi-body distance we need the maximum */
7244                     bcorner[2] = comm->cell_x0[dim2];
7245                     for(i=0; i<2; i++)
7246                     {
7247                         for(j=0; j<2; j++)
7248                         {
7249                             bcorner[2] = max(bcorner[2],
7250                                              comm->zone_d2[i][j].p1_0);
7251                         }
7252                     }
7253                 }
7254             }
7255             
7256             /* Set the upper-right corner for rounding */
7257             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7258              * Only cell (0,0,0) can see cell 7 (1,1,1)
7259              */
7260             corner_round_1[0] = comm->cell_x1[dim1];
7261             corner_round_1[3] = comm->cell_x1[dim1];
7262             if (dd->bGridJump)
7263             {
7264                 corner_round_1[0] = max(comm->cell_x1[dim1],
7265                                         comm->zone_d1[1].mch1);
7266                 if (bDistMB)
7267                 {
7268                     /* For the multi-body distance we need the maximum */
7269                     bcorner_round_1 = max(comm->cell_x1[dim1],
7270                                           comm->zone_d1[1].p1_1);
7271                 }
7272             }
7273         }
7274     }
7275     
7276     /* Triclinic stuff */
7277     normal = ddbox->normal;
7278     skew_fac_01 = 0;
7279     if (dd->ndim >= 2)
7280     {
7281         v_0 = ddbox->v[dim0];
7282         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7283         {
7284             /* Determine the coupling coefficient for the distances
7285              * to the cell planes along dim0 and dim1 through dim2.
7286              * This is required for correct rounding.
7287              */
7288             skew_fac_01 =
7289                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7290             if (debug)
7291             {
7292                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7293             }
7294         }
7295     }
7296     if (dd->ndim >= 3)
7297     {
7298         v_1 = ddbox->v[dim1];
7299     }
7300     
7301     zone_cg_range = zones->cg_range;
7302     index_gl = dd->index_gl;
7303     cgindex  = dd->cgindex;
7304     cginfo_mb = fr->cginfo_mb;
7305     
7306     zone_cg_range[0]   = 0;
7307     zone_cg_range[1]   = dd->ncg_home;
7308     comm->zone_ncg1[0] = dd->ncg_home;
7309     pos_cg             = dd->ncg_home;
7310     
7311     nat_tot = dd->nat_home;
7312     nzone = 1;
7313     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7314     {
7315         dim = dd->dim[dim_ind];
7316         cd = &comm->cd[dim_ind];
7317         
7318         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7319         {
7320             /* No pbc in this dimension, the first node should not comm. */
7321             nzone_send = 0;
7322         }
7323         else
7324         {
7325             nzone_send = nzone;
7326         }
7327
7328         bScrew = (dd->bScrewPBC && dim == XX);
7329         
7330         v_d = ddbox->v[dim];
7331         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7332
7333         cd->bInPlace = TRUE;
7334         for(p=0; p<cd->np; p++)
7335         {
7336             /* Only atoms communicated in the first pulse are used
7337              * for multi-body bonded interactions or for bBondComm.
7338              */
7339             bDistBonded   = ((bDistMB || bDist2B) && p == 0);
7340             bDistMB_pulse = (bDistMB && bDistBonded);
7341
7342             ind = &cd->ind[p];
7343             nsend = 0;
7344             nat = 0;
7345             for(zone=0; zone<nzone_send; zone++)
7346             {
7347                 if (tric_dist[dim_ind] && dim_ind > 0)
7348                 {
7349                     /* Determine slightly more optimized skew_fac's
7350                      * for rounding.
7351                      * This reduces the number of communicated atoms
7352                      * by about 10% for 3D DD of rhombic dodecahedra.
7353                      */
7354                     for(dimd=0; dimd<dim; dimd++)
7355                     {
7356                         sf2_round[dimd] = 1;
7357                         if (ddbox->tric_dir[dimd])
7358                         {
7359                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7360                             {
7361                                 /* If we are shifted in dimension i
7362                                  * and the cell plane is tilted forward
7363                                  * in dimension i, skip this coupling.
7364                                  */
7365                                 if (!(zones->shift[nzone+zone][i] &&
7366                                       ddbox->v[dimd][i][dimd] >= 0))
7367                                 {
7368                                     sf2_round[dimd] +=
7369                                         sqr(ddbox->v[dimd][i][dimd]);
7370                                 }
7371                             }
7372                             sf2_round[dimd] = 1/sf2_round[dimd];
7373                         }
7374                     }
7375                 }
7376
7377                 zonei = zone_perm[dim_ind][zone];
7378                 if (p == 0)
7379                 {
7380                     /* Here we permute the zones to obtain a convenient order
7381                      * for neighbor searching
7382                      */
7383                     cg0 = zone_cg_range[zonei];
7384                     cg1 = zone_cg_range[zonei+1];
7385                 }
7386                 else
7387                 {
7388                     /* Look only at the cg's received in the previous grid pulse
7389                      */
7390                     cg1 = zone_cg_range[nzone+zone+1];
7391                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
7392                 }
7393                 ind->nsend[zone] = 0;
7394                 for(cg=cg0; cg<cg1; cg++)
7395                 {
7396                     r2  = 0;
7397                     rb2 = 0;
7398                     if (tric_dist[dim_ind] == 0)
7399                     {
7400                         /* Rectangular direction, easy */
7401                         r = cg_cm[cg][dim] - corner[dim_ind][zone];
7402                         if (r > 0)
7403                         {
7404                             r2 += r*r;
7405                         }
7406                         if (bDistMB_pulse)
7407                         {
7408                             r = cg_cm[cg][dim] - bcorner[dim_ind];
7409                             if (r > 0)
7410                             {
7411                                 rb2 += r*r;
7412                             }
7413                         }
7414                         /* Rounding gives at most a 16% reduction
7415                          * in communicated atoms
7416                          */
7417                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7418                         {
7419                             r = cg_cm[cg][dim0] - corner_round_0;
7420                             /* This is the first dimension, so always r >= 0 */
7421                             r2 += r*r;
7422                             if (bDistMB_pulse)
7423                             {
7424                                 rb2 += r*r;
7425                             }
7426                         }
7427                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7428                         {
7429                             r = cg_cm[cg][dim1] - corner_round_1[zone];
7430                             if (r > 0)
7431                             {
7432                                 r2 += r*r;
7433                             }
7434                             if (bDistMB_pulse)
7435                             {
7436                                 r = cg_cm[cg][dim1] - bcorner_round_1;
7437                                 if (r > 0)
7438                                 {
7439                                     rb2 += r*r;
7440                                 }
7441                             }
7442                         }
7443                     }
7444                     else
7445                     {
7446                         /* Triclinic direction, more complicated */
7447                         clear_rvec(rn);
7448                         clear_rvec(rb);
7449                         /* Rounding, conservative as the skew_fac multiplication
7450                          * will slightly underestimate the distance.
7451                          */
7452                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7453                         {
7454                             rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
7455                             for(i=dim0+1; i<DIM; i++)
7456                             {
7457                                 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7458                             }
7459                             r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7460                             if (bDistMB_pulse)
7461                             {
7462                                 rb[dim0] = rn[dim0];
7463                                 rb2 = r2;
7464                             }
7465                             /* Take care that the cell planes along dim0 might not
7466                              * be orthogonal to those along dim1 and dim2.
7467                              */
7468                             for(i=1; i<=dim_ind; i++)
7469                             {
7470                                 dimd = dd->dim[i];
7471                                 if (normal[dim0][dimd] > 0)
7472                                 {
7473                                     rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7474                                     if (bDistMB_pulse)
7475                                     {
7476                                         rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7477                                     }
7478                                 }
7479                             }
7480                         }
7481                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7482                         {
7483                             rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
7484                             tric_sh = 0;
7485                             for(i=dim1+1; i<DIM; i++)
7486                             {
7487                                 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7488                             }
7489                             rn[dim1] += tric_sh;
7490                             if (rn[dim1] > 0)
7491                             {
7492                                 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7493                                 /* Take care of coupling of the distances
7494                                  * to the planes along dim0 and dim1 through dim2.
7495                                  */
7496                                 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7497                                 /* Take care that the cell planes along dim1
7498                                  * might not be orthogonal to that along dim2.
7499                                  */
7500                                 if (normal[dim1][dim2] > 0)
7501                                 {
7502                                     rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7503                                 }
7504                             }
7505                             if (bDistMB_pulse)
7506                             {
7507                                 rb[dim1] +=
7508                                     cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
7509                                 if (rb[dim1] > 0)
7510                                 {
7511                                     rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7512                                     /* Take care of coupling of the distances
7513                                      * to the planes along dim0 and dim1 through dim2.
7514                                      */
7515                                     rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7516                                     /* Take care that the cell planes along dim1
7517                                      * might not be orthogonal to that along dim2.
7518                                      */
7519                                     if (normal[dim1][dim2] > 0)
7520                                     {
7521                                         rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7522                                     }
7523                                 }
7524                             }
7525                         }
7526                         /* The distance along the communication direction */
7527                         rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
7528                         tric_sh = 0;
7529                         for(i=dim+1; i<DIM; i++)
7530                         {
7531                             tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7532                         }
7533                         rn[dim] += tric_sh;
7534                         if (rn[dim] > 0)
7535                         {
7536                             r2 += rn[dim]*rn[dim]*skew_fac2_d;
7537                             /* Take care of coupling of the distances
7538                              * to the planes along dim0 and dim1 through dim2.
7539                              */
7540                             if (dim_ind == 1 && zonei == 1)
7541                             {
7542                                 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7543                             }
7544                         }
7545                         if (bDistMB_pulse)
7546                         {
7547                             clear_rvec(rb);
7548                             rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
7549                             if (rb[dim] > 0)
7550                             {
7551                                 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7552                                 /* Take care of coupling of the distances
7553                                  * to the planes along dim0 and dim1 through dim2.
7554                                  */
7555                                 if (dim_ind == 1 && zonei == 1)
7556                                 {
7557                                     rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7558                                 }
7559                             }
7560                         }
7561                     }
7562                     
7563                     if (r2 < r_comm2 ||
7564                         (bDistBonded &&
7565                          ((bDistMB && rb2 < r_bcomm2) ||
7566                           (bDist2B && r2  < r_bcomm2)) &&
7567                          (!bBondComm ||
7568                           (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
7569                            missing_link(comm->cglink,index_gl[cg],
7570                                         comm->bLocalCG)))))
7571                     {
7572                         /* Make an index to the local charge groups */
7573                         if (nsend+1 > ind->nalloc)
7574                         {
7575                             ind->nalloc = over_alloc_large(nsend+1);
7576                             srenew(ind->index,ind->nalloc);
7577                         }
7578                         if (nsend+1 > comm->nalloc_int)
7579                         {
7580                             comm->nalloc_int = over_alloc_large(nsend+1);
7581                             srenew(comm->buf_int,comm->nalloc_int);
7582                         }
7583                         ind->index[nsend] = cg;
7584                         comm->buf_int[nsend] = index_gl[cg];
7585                         ind->nsend[zone]++;
7586                         vec_rvec_check_alloc(&comm->vbuf,nsend+1);
7587
7588                         if (dd->ci[dim] == 0)
7589                         {
7590                             /* Correct cg_cm for pbc */
7591                             rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]);
7592                             if (bScrew)
7593                             {
7594                                 comm->vbuf.v[nsend][YY] =
7595                                     box[YY][YY]-comm->vbuf.v[nsend][YY];
7596                                 comm->vbuf.v[nsend][ZZ] =
7597                                     box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
7598                             }
7599                         }
7600                         else
7601                         {
7602                             copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]);
7603                         }
7604                         nsend++;
7605                         nat += cgindex[cg+1] - cgindex[cg];
7606                     }
7607                 }
7608             }
7609             /* Clear the counts in case we do not have pbc */
7610             for(zone=nzone_send; zone<nzone; zone++)
7611             {
7612                 ind->nsend[zone] = 0;
7613             }
7614             ind->nsend[nzone]   = nsend;
7615             ind->nsend[nzone+1] = nat;
7616             /* Communicate the number of cg's and atoms to receive */
7617             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7618                             ind->nsend, nzone+2,
7619                             ind->nrecv, nzone+2);
7620             
7621             /* The rvec buffer is also required for atom buffers of size nsend
7622              * in dd_move_x and dd_move_f.
7623              */
7624             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
7625
7626             if (p > 0)
7627             {
7628                 /* We can receive in place if only the last zone is not empty */
7629                 for(zone=0; zone<nzone-1; zone++)
7630                 {
7631                     if (ind->nrecv[zone] > 0)
7632                     {
7633                         cd->bInPlace = FALSE;
7634                     }
7635                 }
7636                 if (!cd->bInPlace)
7637                 {
7638                     /* The int buffer is only required here for the cg indices */
7639                     if (ind->nrecv[nzone] > comm->nalloc_int2)
7640                     {
7641                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
7642                         srenew(comm->buf_int2,comm->nalloc_int2);
7643                     }
7644                     /* The rvec buffer is also required for atom buffers
7645                      * of size nrecv in dd_move_x and dd_move_f.
7646                      */
7647                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
7648                     vec_rvec_check_alloc(&comm->vbuf2,i);
7649                 }
7650             }
7651             
7652             /* Make space for the global cg indices */
7653             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
7654                 || dd->cg_nalloc == 0)
7655             {
7656                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
7657                 srenew(index_gl,dd->cg_nalloc);
7658                 srenew(cgindex,dd->cg_nalloc+1);
7659             }
7660             /* Communicate the global cg indices */
7661             if (cd->bInPlace)
7662             {
7663                 recv_i = index_gl + pos_cg;
7664             }
7665             else
7666             {
7667                 recv_i = comm->buf_int2;
7668             }
7669             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7670                             comm->buf_int, nsend,
7671                             recv_i,        ind->nrecv[nzone]);
7672
7673             /* Make space for cg_cm */
7674             if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc)
7675             {
7676                 dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]);
7677                 cg_cm = fr->cg_cm;
7678             }
7679             /* Communicate cg_cm */
7680             if (cd->bInPlace)
7681             {
7682                 recv_vr = cg_cm + pos_cg;
7683             }
7684             else
7685             {
7686                 recv_vr = comm->vbuf2.v;
7687             }
7688             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
7689                              comm->vbuf.v, nsend,
7690                              recv_vr,      ind->nrecv[nzone]);
7691             
7692             /* Make the charge group index */
7693             if (cd->bInPlace)
7694             {
7695                 zone = (p == 0 ? 0 : nzone - 1);
7696                 while (zone < nzone)
7697                 {
7698                     for(cg=0; cg<ind->nrecv[zone]; cg++)
7699                     {
7700                         cg_gl = index_gl[pos_cg];
7701                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
7702                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
7703                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
7704                         if (bBondComm)
7705                         {
7706                             /* Update the charge group presence,
7707                              * so we can use it in the next pass of the loop.
7708                              */
7709                             comm->bLocalCG[cg_gl] = TRUE;
7710                         }
7711                         pos_cg++;
7712                     }
7713                     if (p == 0)
7714                     {
7715                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
7716                     }
7717                     zone++;
7718                     zone_cg_range[nzone+zone] = pos_cg;
7719                 }
7720             }
7721             else
7722             {
7723                 /* This part of the code is never executed with bBondComm. */
7724                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
7725                                  index_gl,recv_i,cg_cm,recv_vr,
7726                                  cgindex,fr->cginfo_mb,fr->cginfo);
7727                 pos_cg += ind->nrecv[nzone];
7728             }
7729             nat_tot += ind->nrecv[nzone+1];
7730         }
7731         if (!cd->bInPlace)
7732         {
7733             /* Store the atom block for easy copying of communication buffers */
7734             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
7735         }
7736         nzone += nzone;
7737     }
7738     dd->index_gl = index_gl;
7739     dd->cgindex  = cgindex;
7740     
7741     dd->ncg_tot = zone_cg_range[zones->n];
7742     dd->nat_tot = nat_tot;
7743     comm->nat[ddnatHOME] = dd->nat_home;
7744     for(i=ddnatZONE; i<ddnatNR; i++)
7745     {
7746         comm->nat[i] = dd->nat_tot;
7747     }
7748
7749     if (!bBondComm)
7750     {
7751         /* We don't need to update cginfo, since that was already done above.
7752          * So we pass NULL for the forcerec.
7753          */
7754         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
7755                       NULL,comm->bLocalCG);
7756     }
7757
7758     if (debug)
7759     {
7760         fprintf(debug,"Finished setting up DD communication, zones:");
7761         for(c=0; c<zones->n; c++)
7762         {
7763             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
7764         }
7765         fprintf(debug,"\n");
7766     }
7767 }
7768
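/* Set the i- and j-charge group boundaries of each i-zone
 * from the zone charge group ranges.
 */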
7769 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
7770 {
7771     int c;
7772     
7773     for(c=0; c<zones->nizone; c++)
7774     {
7775         zones->izone[c].cg1  = zones->cg_range[c+1];
7776         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
7777         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
7778     }
7779 }
7780
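/* qsort comparison function: order on ns grid cell index first,
 * then on global charge group index.
 */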
7781 static int comp_cgsort(const void *a,const void *b)
7782 {
7783     int comp;
7784     
7785     gmx_cgsort_t *cga,*cgb;
7786     cga = (gmx_cgsort_t *)a;
7787     cgb = (gmx_cgsort_t *)b;
7788     
7789     comp = cga->nsc - cgb->nsc;
7790     if (comp == 0)
7791     {
7792         comp = cga->ind_gl - cgb->ind_gl;
7793     }
7794     
7795     return comp;
7796 }
7797
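/* Apply the sort order to integer array a, using buf as scratch space */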
7798 static void order_int_cg(int n,gmx_cgsort_t *sort,
7799                          int *a,int *buf)
7800 {
7801     int i;
7802     
7803     /* Order the data */
7804     for(i=0; i<n; i++)
7805     {
7806         buf[i] = a[sort[i].ind];
7807     }
7808     
7809     /* Copy back to the original array */
7810     for(i=0; i<n; i++)
7811     {
7812         a[i] = buf[i];
7813     }
7814 }
7815
7816 static void order_vec_cg(int n,gmx_cgsort_t *sort,
7817                          rvec *v,rvec *buf)
7818 {
7819     int i;
7820     
7821     /* Order the data */
7822     for(i=0; i<n; i++)
7823     {
7824         copy_rvec(v[sort[i].ind],buf[i]);
7825     }
7826     
7827     /* Copy back to the original array */
7828     for(i=0; i<n; i++)
7829     {
7830         copy_rvec(buf[i],v[i]);
7831     }
7832 }
7833
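/* Apply the charge group sort order to a per-atom rvec array,
 * using buf as scratch space.
 */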
7834 static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort,
7835                            rvec *v,rvec *buf)
7836 {
7837     int a,atot,cg,cg0,cg1,i;
7838     
7839     /* Order the data */
7840     a = 0;
7841     for(cg=0; cg<ncg; cg++)
7842     {
7843         cg0 = cgindex[sort[cg].ind];
7844         cg1 = cgindex[sort[cg].ind+1];
7845         for(i=cg0; i<cg1; i++)
7846         {
7847             copy_rvec(v[i],buf[a]);
7848             a++;
7849         }
7850     }
7851     atot = a;
7852     
7853     /* Copy back to the original array */
7854     for(a=0; a<atot; a++)
7855     {
7856         copy_rvec(buf[a],v[a]);
7857     }
7858 }
7859
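/* Merge two lists into sort1: sort2 is already ordered,
 * the new entries are qsorted here first.
 */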
7860 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
7861                          int nsort_new,gmx_cgsort_t *sort_new,
7862                          gmx_cgsort_t *sort1)
7863 {
7864     int i1,i2,i_new;
7865     
7866     /* The new indices are not very ordered, so we qsort them */
7867     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
7868     
7869     /* sort2 is already ordered, so now we can merge the two arrays */
7870     i1 = 0;
7871     i2 = 0;
7872     i_new = 0;
7873     while(i2 < nsort2 || i_new < nsort_new)
7874     {
7875         if (i2 == nsort2)
7876         {
7877             sort1[i1++] = sort_new[i_new++];
7878         }
7879         else if (i_new == nsort_new)
7880         {
7881             sort1[i1++] = sort2[i2++];
7882         }
7883         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
7884                  (sort2[i2].nsc == sort_new[i_new].nsc &&
7885                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
7886         {
7887             sort1[i1++] = sort2[i2++];
7888         }
7889         else
7890         {
7891             sort1[i1++] = sort_new[i_new++];
7892         }
7893     }
7894 }
7895
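/* Sort the home charge groups on ns grid cell index and reorder
 * the state vectors, cgcm, the global indices, cginfo and the local
 * cg index accordingly.
 */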
7896 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
7897                           rvec *cgcm,t_forcerec *fr,t_state *state,
7898                           int ncg_home_old)
7899 {
7900     gmx_domdec_sort_t *sort;
7901     gmx_cgsort_t *cgsort,*sort_i;
7902     int  ncg_new,nsort2,nsort_new,i,cell_index,*ibuf,cgsize;
7903     rvec *vbuf;
7904     
7905     sort = dd->comm->sort;
7906     
7907     if (dd->ncg_home > sort->sort_nalloc)
7908     {
7909         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
7910         srenew(sort->sort1,sort->sort_nalloc);
7911         srenew(sort->sort2,sort->sort_nalloc);
7912     }
7913     
7914     if (ncg_home_old >= 0)
7915     {
7916         /* The charge groups that remained in the same ns grid cell
7917          * are completely ordered. So we can sort efficiently by sorting only
7918          * the charge groups that moved and merging them into the stationary list.
7919          */
7920         ncg_new = 0;
7921         nsort2 = 0;
7922         nsort_new = 0;
7923         for(i=0; i<dd->ncg_home; i++)
7924         {
7925             /* Check if this cg did not move to another node */
7926             cell_index = fr->ns.grid->cell_index[i];
7927             if (cell_index !=  4*fr->ns.grid->ncells)
7928             {
7929                 if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
7930                 {
7931                     /* This cg is new on this node or moved to another ns grid cell */
7932                     if (nsort_new >= sort->sort_new_nalloc)
7933                     {
7934                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
7935                         srenew(sort->sort_new,sort->sort_new_nalloc);
7936                     }
7937                     sort_i = &(sort->sort_new[nsort_new++]);
7938                 }
7939                 else
7940                 {
7941                     /* This cg did not move */
7942                     sort_i = &(sort->sort2[nsort2++]);
7943                 }
7944                 /* Sort on the ns grid cell indices
7945                  * and the global topology index
7946                  */
7947                 sort_i->nsc    = cell_index;
7948                 sort_i->ind_gl = dd->index_gl[i];
7949                 sort_i->ind    = i;
7950                 ncg_new++;
7951             }
7952         }
7953         if (debug)
7954         {
7955             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
7956                     nsort2,nsort_new);
7957         }
7958         /* Sort efficiently */
7959         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1);
7960     }
7961     else
7962     {
7963         cgsort = sort->sort1;
7964         ncg_new = 0;
7965         for(i=0; i<dd->ncg_home; i++)
7966         {
7967             /* Sort on the ns grid cell indices
7968              * and the global topology index
7969              */
7970             cgsort[i].nsc    = fr->ns.grid->cell_index[i];
7971             cgsort[i].ind_gl = dd->index_gl[i];
7972             cgsort[i].ind    = i;
7973             if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
7974             {
7975                 ncg_new++;
7976             }
7977         }
7978         if (debug)
7979         {
7980             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
7981         }
7982         /* Determine the order of the charge groups using qsort */
7983         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
7984     }
7985     cgsort = sort->sort1;
7986     
7987     /* We allocate with the old size, since cgindex still refers to the old ordering */
7988     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
7989     vbuf = dd->comm->vbuf.v;
7990     
7991     /* Remove the charge groups which are no longer at home here */
7992     dd->ncg_home = ncg_new;
7993     
7994     /* Reorder the state */
7995     for(i=0; i<estNR; i++)
7996     {
7997         if (EST_DISTR(i) && (state->flags & (1<<i)))
7998         {
7999             switch (i)
8000             {
8001             case estX:
8002                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->x,vbuf);
8003                 break;
8004             case estV:
8005                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf);
8006                 break;
8007             case estSDX:
8008                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf);
8009                 break;
8010             case estCGP:
8011                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf);
8012                 break;
8013             case estLD_RNG:
8014             case estLD_RNGI:
8015             case estDISRE_INITF:
8016             case estDISRE_RM3TAV:
8017             case estORIRE_INITF:
8018             case estORIRE_DTAV:
8019                 /* No ordering required */
8020                 break;
8021             default:
8022                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8023                 break;
8024             }
8025         }
8026     }
8027     /* Reorder cgcm */
8028     order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8029     
8030     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8031     {
8032         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8033         srenew(sort->ibuf,sort->ibuf_nalloc);
8034     }
8035     ibuf = sort->ibuf;
8036     /* Reorder the global cg index */
8037     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8038     /* Reorder the cginfo */
8039     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8040     /* Rebuild the local cg index */
8041     ibuf[0] = 0;
8042     for(i=0; i<dd->ncg_home; i++)
8043     {
8044         cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8045         ibuf[i+1] = ibuf[i] + cgsize;
8046     }
8047     for(i=0; i<dd->ncg_home+1; i++)
8048     {
8049         dd->cgindex[i] = ibuf[i];
8050     }
8051     /* Set the home atom number */
8052     dd->nat_home = dd->cgindex[dd->ncg_home];
8053     
8054     /* Copy the sorted ns cell indices back to the ns grid struct */
8055     for(i=0; i<dd->ncg_home; i++)
8056     {
8057         fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8058     }
8059     fr->ns.grid->nr = dd->ncg_home;
8060 }
8061
8062 static void add_dd_statistics(gmx_domdec_t *dd)
8063 {
8064     gmx_domdec_comm_t *comm;
8065     int ddnat;
8066     
8067     comm = dd->comm;
8068     
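     /* comm->nat[] holds cumulative atom counts per communication class
      * (zone, vsite, constraint), so the difference with the previous
      * entry gives the number of atoms for this class.
      */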
8069     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8070     {
8071         comm->sum_nat[ddnat-ddnatZONE] +=
8072             comm->nat[ddnat] - comm->nat[ddnat-1];
8073     }
8074     comm->ndecomp++;
8075 }
8076
8077 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8078 {
8079     gmx_domdec_comm_t *comm;
8080     int ddnat;
8081     
8082     comm = dd->comm;
8083
8084     /* Reset all the statistics and counters for total run counting */
8085     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8086     {
8087         comm->sum_nat[ddnat-ddnatZONE] = 0;
8088     }
8089     comm->ndecomp = 0;
8090     comm->nload = 0;
8091     comm->load_step = 0;
8092     comm->load_sum = 0;
8093     comm->load_max = 0;
8094     clear_ivec(comm->load_lim);
8095     comm->load_mdf = 0;
8096     comm->load_pme = 0;
8097 }
8098
8099 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8100 {
8101     gmx_domdec_comm_t *comm;
8102     int ddnat;
8103     double av;
8104    
8105     comm = cr->dd->comm;
8106     
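     /* Sum the accumulated atom counts over all DD nodes */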
8107     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8108     
8109     if (fplog == NULL)
8110     {
8111         return;
8112     }
8113     
8114     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8115             
8116     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8117     {
8118         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8119         switch(ddnat)
8120         {
8121         case ddnatZONE:
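                 /* Coordinates are sent out and forces are sent back,
                  * so each atom is communicated twice per step.
                  */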
8122             fprintf(fplog,
8123                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8124                     2,av);
8125             break;
8126         case ddnatVSITE:
8127             if (cr->dd->vsite_comm)
8128             {
8129                 fprintf(fplog,
8130                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8131                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8132                         av);
8133             }
8134             break;
8135         case ddnatCON:
8136             if (cr->dd->constraint_comm)
8137             {
8138                 fprintf(fplog,
8139                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
8140                         1 + ir->nLincsIter,av);
8141             }
8142             break;
8143         default:
8144             gmx_incons(" Unknown type for DD statistics");
8145         }
8146     }
8147     fprintf(fplog,"\n");
8148     
8149     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8150     {
8151         print_dd_load_av(fplog,cr->dd);
8152     }
8153 }
8154
8155 void dd_partition_system(FILE            *fplog,
8156                          gmx_large_int_t      step,
8157                          t_commrec       *cr,
8158                          gmx_bool            bMasterState,
8159                          int             nstglobalcomm,
8160                          t_state         *state_global,
8161                          gmx_mtop_t      *top_global,
8162                          t_inputrec      *ir,
8163                          t_state         *state_local,
8164                          rvec            **f,
8165                          t_mdatoms       *mdatoms,
8166                          gmx_localtop_t  *top_local,
8167                          t_forcerec      *fr,
8168                          gmx_vsite_t     *vsite,
8169                          gmx_shellfc_t   shellfc,
8170                          gmx_constr_t    constr,
8171                          t_nrnb          *nrnb,
8172                          gmx_wallcycle_t wcycle,
8173                          gmx_bool            bVerbose)
8174 {
8175     gmx_domdec_t *dd;
8176     gmx_domdec_comm_t *comm;
8177     gmx_ddbox_t ddbox={0};
8178     t_block *cgs_gl;
8179     gmx_large_int_t step_pcoupl;
8180     rvec cell_ns_x0,cell_ns_x1;
8181     int  i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
8182     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8183     gmx_bool bRedist,bSortCG,bResortAll;
8184     ivec ncells_old,np;
8185     real grid_density;
8186     char sbuf[22];
8187         
8188     dd = cr->dd;
8189     comm = dd->comm;
8190
8191     bBoxChanged = (bMasterState || DEFORM(*ir));
8192     if (ir->epc != epcNO)
8193     {
8194         /* With nstpcouple > 1, pressure coupling happens
8195          * one step after the pressure has been calculated.
8196          * Box scaling happens at the end of the MD step,
8197          * after the DD partitioning.
8198          * We therefore have to do DLB in the first partitioning
8199          * after an MD step in which P-coupling occurred.
8200          * We need to determine the last step at which P-coupling occurred.
8201          * MRS -- need to validate this for vv?
8202          */
8203         n = ir->nstpcouple;
8204         if (n == 1)
8205         {
8206             step_pcoupl = step - 1;
8207         }
8208         else
8209         {
8210             step_pcoupl = ((step - 1)/n)*n + 1;
8211         }
8212         if (step_pcoupl >= comm->globalcomm_step)
8213         {
8214             bBoxChanged = TRUE;
8215         }
8216     }
8217
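     /* Is this a step on which we do global communication? */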
8218     bNStGlobalComm = (step >= comm->globalcomm_step + nstglobalcomm);
8219
8220     if (!comm->bDynLoadBal)
8221     {
8222         bDoDLB = FALSE;
8223     }
8224     else
8225     {
8226         /* Should we do dynamic load balancing this step?
8227          * Since it requires (possibly expensive) global communication,
8228          * we might want to do DLB less frequently.
8229          */
8230         if (bBoxChanged || ir->epc != epcNO)
8231         {
8232             bDoDLB = bBoxChanged;
8233         }
8234         else
8235         {
8236             bDoDLB = bNStGlobalComm;
8237         }
8238     }
8239
8240     /* Check if we have recorded loads on the nodes */
8241     if (comm->bRecordLoad && dd_load_count(comm))
8242     {
8243         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
8244         {
8245             /* Check if we should use DLB at the second partitioning
8246              * and every 100 partitionings,
8247              * so the extra communication cost is negligible.
8248              */
8249             n = max(100,nstglobalcomm);
8250             bCheckDLB = (comm->n_load_collect == 0 ||
8251                          comm->n_load_have % n == n-1);
8252         }
8253         else
8254         {
8255             bCheckDLB = FALSE;
8256         }
8257         
8258         /* Print the load to the log file every nstlog steps and at the first and last step */
8259         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
8260                     comm->n_load_collect == 0 ||
8261                     (ir->nsteps >= 0 &&
8262                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
8263
8264         /* Avoid extra communication due to verbose screen output
8265          * when nstglobalcomm is set.
8266          */
8267         if (bDoDLB || bLogLoad || bCheckDLB ||
8268             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
8269         {
8270             get_load_distribution(dd,wcycle);
8271             if (DDMASTER(dd))
8272             {
8273                 if (bLogLoad)
8274                 {
8275                     dd_print_load(fplog,dd,step-1);
8276                 }
8277                 if (bVerbose)
8278                 {
8279                     dd_print_load_verbose(dd);
8280                 }
8281             }
8282             comm->n_load_collect++;
8283
8284             if (bCheckDLB)
                 {
8285                 /* Since the timings are node dependent, the master decides */
8286                 if (DDMASTER(dd))
8287                 {
8288                     bTurnOnDLB =
8289                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
8290                     if (debug)
8291                     {
8292                         fprintf(debug,"step %s, imb loss %f\n",
8293                                 gmx_step_str(step,sbuf),
8294                                 dd_force_imb_perf_loss(dd));
8295                     }
8296                 }
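                     /* Broadcast the master's DLB decision to all DD nodes */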
8297                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
8298                 if (bTurnOnDLB)
8299                 {
8300                     turn_on_dlb(fplog,cr,step);
8301                     bDoDLB = TRUE;
8302                 }
8303             }
8304         }
8305         comm->n_load_have++;
8306     }
8307
8308     cgs_gl = &comm->cgs_gl;
8309
8310     bRedist = FALSE;
8311     if (bMasterState)
8312     {
8313         /* Clear the old state */
8314         clear_dd_indices(dd,0,0);
8315
8316         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
8317                   TRUE,cgs_gl,state_global->x,&ddbox);
8318     
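             /* Determine the charge group distribution over the DD cells
              * and send each node its home charge groups.
              */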
8319         get_cg_distribution(fplog,step,dd,cgs_gl,
8320                             state_global->box,&ddbox,state_global->x);
8321         
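             /* Scatter the global state (x, v, ...) to the local states */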
8322         dd_distribute_state(dd,cgs_gl,
8323                             state_global,state_local,f);
8324         
8325         dd_make_local_cgs(dd,&top_local->cgs);
8326         
8327         if (dd->ncg_home > fr->cg_nalloc)
8328         {
8329             dd_realloc_fr_cg(fr,dd->ncg_home);
8330         }
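             /* Compute the charge group COMs (cg_cm) for the home charge groups */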
8331         calc_cgcm(fplog,0,dd->ncg_home,
8332                   &top_local->cgs,state_local->x,fr->cg_cm);
8333         
8334         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8335         
8336         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8337
8338         cg0 = 0;
8339     }
8340     else if (state_local->ddp_count != dd->ddp_count)
8341     {
8342         if (state_local->ddp_count > dd->ddp_count)
8343         {
8344             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
8345         }
8346         
8347         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
8348         {
8349             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
8350         }
8351         
8352         /* Clear the old state */
8353         clear_dd_indices(dd,0,0);
8354         
8355         /* Build the new indices */
8356         rebuild_cgindex(dd,cgs_gl->index,state_local);
8357         make_dd_indices(dd,cgs_gl->index,0);
8358         
8359         /* Redetermine the cg COMs */
8360         calc_cgcm(fplog,0,dd->ncg_home,
8361                   &top_local->cgs,state_local->x,fr->cg_cm);
8362         
8363         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8364
8365         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8366
8367         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8368                   TRUE,&top_local->cgs,state_local->x,&ddbox);
8369
8370         bRedist = comm->bDynLoadBal;
8371     }
8372     else
8373     {
8374         /* We have the full state, only redistribute the cgs */
8375
8376         /* Clear the non-home indices */
8377         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
8378
8379         /* Avoid global communication for dimensions without pbc when -gcom is used */
8380         if (!bNStGlobalComm)
8381         {
8382             copy_rvec(comm->box0    ,ddbox.box0    );
8383             copy_rvec(comm->box_size,ddbox.box_size);
8384         }
8385         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8386                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
8387
8388         bBoxChanged = TRUE;
8389         bRedist = TRUE;
8390     }
8391     /* Store the box info for dimensions without pbc, used with -gcom */
8392     copy_rvec(ddbox.box0    ,comm->box0    );
8393     copy_rvec(ddbox.box_size,comm->box_size);
8394     
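     /* Set the DD cell boundaries, possibly adjusted by dynamic load balancing */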
8395     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
8396                       step,wcycle);
8397     
8398     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
8399     {
8400         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
8401     }
8402     
8403     /* Check if we should sort the charge groups */
8404     if (comm->nstSortCG > 0)
8405     {
8406         bSortCG = (bMasterState ||
8407                    (bRedist && (step % comm->nstSortCG == 0)));
8408     }
8409     else
8410     {
8411         bSortCG = FALSE;
8412     }
8413
8414     ncg_home_old = dd->ncg_home;
8415
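     /* Move charge groups that have left the home cell to their new home nodes */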
8416     if (bRedist)
8417     {
8418         cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
8419                                  state_local,f,fr,mdatoms,
8420                                  !bSortCG,nrnb);
8421     }
8422     
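     /* Determine the boundaries of the neighbor-search grid and estimate
      * the grid density from the home charge group positions.
      */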
8423     get_nsgrid_boundaries(fr->ns.grid,dd,
8424                           state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
8425                           dd->ncg_home,fr->cg_cm,
8426                           cell_ns_x0,cell_ns_x1,&grid_density);
8427
8428     if (bBoxChanged)
8429     {
8430         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
8431     }
8432
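     /* Store the old ns grid dimensions; if they change we have to
      * resort all charge groups (see bResortAll below).
      */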
8433     copy_ivec(fr->ns.grid->n,ncells_old);
8434     grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
8435                state_local->box,cell_ns_x0,cell_ns_x1,
8436                fr->rlistlong,grid_density);
8437     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
8438     copy_ivec(ddbox.tric_dir,comm->tric_dir);
8439
8440     if (bSortCG)
8441     {
8442         /* Sort the state on charge group position.
8443          * This enables exact restarts from this step.
8444          * It also improves performance by about 15% with larger numbers
8445          * of atoms per node.
8446          */
8447         
8448         /* Fill the ns grid with the home charge groups,
8449          * so we can sort on the grid cell indices.
8450          */
8451         set_zones_ncg_home(dd);
8452         fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
8453                   0,dd->ncg_home,fr->cg_cm);
8454         
8455         /* Check if we can use the old order and ns grid cell indices
8456          * of the charge groups to sort the charge groups efficiently.
8457          */
8458         bResortAll = (bMasterState ||
8459                       fr->ns.grid->n[XX] != ncells_old[XX] ||
8460                       fr->ns.grid->n[YY] != ncells_old[YY] ||
8461                       fr->ns.grid->n[ZZ] != ncells_old[ZZ]);
8462
8463         if (debug)
8464         {
8465             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
8466                     gmx_step_str(step,sbuf),dd->ncg_home);
8467         }
8468         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
8469                       bResortAll ? -1 : ncg_home_old);
8470         /* Rebuild all the indices */
8471         cg0 = 0;
8472         ga2la_clear(dd->ga2la);
8473     }
8474     
8475     /* Set up the communication and communicate the coordinates */
8476     setup_dd_communication(dd,state_local->box,&ddbox,fr);
8477     
8478     /* Set the indices */
8479     make_dd_indices(dd,cgs_gl->index,cg0);
8480
8481     /* Set the charge group boundaries for neighbor searching */
8482     set_cg_boundaries(&comm->zones);
8483     
8484     /*
8485     write_dd_pdb("dd_home",step,"dump",top_global,cr,
8486                  -1,state_local->x,state_local->box);
8487     */
8488     
8489     /* Extract a local topology from the global topology */
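     /* Gather the number of communication pulses for each decomposed
      * dimension; dd_make_local_top needs these below.
      */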
8490     for(i=0; i<dd->ndim; i++)
8491     {
8492         np[dd->dim[i]] = comm->cd[i].np;
8493     }
8494     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
8495                       comm->cellsize_min,np,
8496                       fr,vsite,top_global,top_local);
8497     
8498     /* Set up the special atom communication */
8499     n = comm->nat[ddnatZONE];
8500     for(i=ddnatZONE+1; i<ddnatNR; i++)
8501     {
8502         switch(i)
8503         {
8504         case ddnatVSITE:
8505             if (vsite && vsite->n_intercg_vsite)
8506             {
8507                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
8508             }
8509             break;
8510         case ddnatCON:
8511             if (dd->bInterCGcons)
8512             {
8513                 /* Special code is only needed for inter-cg constraints */
8514                 n = dd_make_local_constraints(dd,n,top_global,
8515                                               constr,ir->nProjOrder,
8516                                               &top_local->idef.il[F_CONSTR]);
8517             }
8518             break;
8519         default:
8520             gmx_incons("Unknown special atom type setup");
8521         }
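             /* Store the cumulative atom count up to and including this class */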
8522         comm->nat[i] = n;
8523     }
8524     
8525     /* Make space for the extra coordinates for virtual site
8526      * or constraint communication.
8527      */
8528     state_local->natoms = comm->nat[ddnatNR-1];
8529     if (state_local->natoms > state_local->nalloc)
8530     {
8531         dd_realloc_state(state_local,f,state_local->natoms);
8532     }
8533
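     /* Determine up to which atom the force array without virial
      * summation (f_novirsum) needs to extend.
      */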
8534     if (fr->bF_NoVirSum)
8535     {
8536         if (vsite && vsite->n_intercg_vsite)
8537         {
8538             nat_f_novirsum = comm->nat[ddnatVSITE];
8539         }
8540         else
8541         {
8542             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
8543             {
8544                 nat_f_novirsum = dd->nat_tot;
8545             }
8546             else
8547             {
8548                 nat_f_novirsum = dd->nat_home;
8549             }
8550         }
8551     }
8552     else
8553     {
8554         nat_f_novirsum = 0;
8555     }
8556
8557     /* Set the number of atoms required for the force calculation.
8558      * Forces need to be constrained when using a twin-range setup
8559      * or with energy minimization. For simple simulations we could
8560      * avoid some allocation, zeroing and copying, but this is
8561          * probably not worth the complications and checking.
8562      */
8563     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
8564                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
8565
8566     /* We set up all mdatoms up to nat_tot_con.
8567      * We could save some work by only setting invmass
8568      * between nat_tot and nat_tot_con.
8569      */
8570     /* This call also sets the new number of home particles to dd->nat_home */
8571     atoms2md(top_global,ir,
8572              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
8573
8574     /* Now that we have the charges, we can sort the FE interactions */
8575     dd_sort_local_top(dd,mdatoms,top_local);
8576
8577     if (shellfc)
8578     {
8579         /* Make the local shell data; currently no communication is done */
8580         make_local_shells(cr,mdatoms,shellfc);
8581     }
8582     
8583     if (ir->implicit_solvent)
8584     {
8585         make_local_gb(cr,fr->born,ir->gb_algorithm);
8586     }
8587         
8588     if (!(cr->duty & DUTY_PME))
8589     {
8590         /* Send the charges to our PME-only node */
8591         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
8592                        mdatoms->chargeA,mdatoms->chargeB,
8593                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
8594     }
8595     
8596     if (constr)
8597     {
8598         set_constraints(constr,top_local,ir,mdatoms,cr);
8599     }
8600     
8601     if (ir->ePull != epullNO)
8602     {
8603         /* Update the local pull groups */
8604         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
8605     }
8606     
8607     if (ir->bRot)
8608     {
8609         /* Update the local rotation groups */
8610         dd_make_local_rotation_groups(dd,ir->rot);
8611     }
8612
8613
8614     add_dd_statistics(dd);
8615     
8616     /* Make sure we only count the cycles for this DD partitioning */
8617     clear_dd_cycle_counts(dd);
8618     
8619     /* Because the order of the atoms might have changed since
8620      * the last vsite construction, we need to communicate the constructing
8621      * atom coordinates again (for spreading the forces this MD step).
8622      */
8623     dd_move_x_vsites(dd,state_local->box,state_local->x);
8624     
8625     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
8626     {
8627         dd_move_x(dd,state_local->box,state_local->x);
8628         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
8629                      -1,state_local->x,state_local->box);
8630     }
8631
8632     if (bNStGlobalComm)
8633     {
8634         /* Store the global communication step */
8635         comm->globalcomm_step = step;
8636     }
8637     
8638     /* Increase the DD partitioning counter */
8639     dd->ddp_count++;
8640     /* The state currently matches this DD partitioning count, store it */
8641     state_local->ddp_count = dd->ddp_count;
8642     if (bMasterState)
8643     {
8644         /* The DD master node knows the complete cg distribution,
8645          * store the count so we can possibly skip the cg info communication.
8646          */
8647         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
8648     }
8649
8650     if (comm->DD_debug > 0)
8651     {
8652         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
8653         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
8654                                 "after partitioning");
8655     }
8656 }