1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  * 
4  * This file is part of Gromacs        Copyright (c) 1991-2008
5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * To help us fund GROMACS development, we humbly ask that you cite
13  * the research papers on the package. Check out http://www.gromacs.org
14  * 
15  * And Hey:
16  * Gnomes, ROck Monsters And Chili Sauce
17  */
18
19 #ifdef HAVE_CONFIG_H
20 #include <config.h>
21 #endif
22
23 #include <stdio.h>
24 #include <time.h>
25 #include <math.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include "typedefs.h"
29 #include "smalloc.h"
30 #include "vec.h"
31 #include "domdec.h"
32 #include "domdec_network.h"
33 #include "nrnb.h"
34 #include "pbc.h"
35 #include "chargegroup.h"
36 #include "constr.h"
37 #include "mdatoms.h"
38 #include "names.h"
39 #include "pdbio.h"
40 #include "futil.h"
41 #include "force.h"
42 #include "pme.h"
43 #include "pull.h"
44 #include "pull_rotation.h"
45 #include "gmx_wallcycle.h"
46 #include "mdrun.h"
47 #include "nsgrid.h"
48 #include "shellfc.h"
49 #include "mtop_util.h"
50 #include "gmxfio.h"
51 #include "gmx_ga2la.h"
52 #include "gmx_sort.h"
53 #include "macros.h"
54
55 #ifdef GMX_LIB_MPI
56 #include <mpi.h>
57 #endif
58 #ifdef GMX_THREAD_MPI
59 #include "tmpi.h"
60 #endif
61
62 #define DDRANK(dd,rank)    (rank)
63 #define DDMASTERRANK(dd)   (dd->masterrank)
64
65 typedef struct gmx_domdec_master
66 {
67     /* The cell boundaries */
68     real **cell_x;
69     /* The global charge group division */
70     int  *ncg;     /* Number of home charge groups for each node */
71     int  *index;   /* Index (size nnodes+1) into cg */
72     int  *cg;      /* Global charge group index */
73     int  *nat;     /* Number of home atoms for each node. */
74     int  *ibuf;    /* Buffer for communication */
75     rvec *vbuf;    /* Buffer for state scattering and gathering */
76 } gmx_domdec_master_t;
77
78 typedef struct
79 {
80     /* The numbers of charge groups to send and receive for each cell
81      * that requires communication; the last entry contains the total
82      * number of atoms that needs to be communicated.
83      */
84     int nsend[DD_MAXIZONE+2];
85     int nrecv[DD_MAXIZONE+2];
86     /* The charge groups to send */
87     int *index;
88     int nalloc;
89     /* The atom range for non-in-place communication */
90     int cell2at0[DD_MAXIZONE];
91     int cell2at1[DD_MAXIZONE];
92 } gmx_domdec_ind_t;
93
94 typedef struct
95 {
96     int  np;                   /* Number of grid pulses in this dimension */
97     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
98     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
99     int  np_nalloc;
100     gmx_bool bInPlace;             /* Can we communicate in place?            */
101 } gmx_domdec_comm_dim_t;
102
103 typedef struct
104 {
105     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
106     real *cell_f;      /* State var.: cell boundaries, box relative      */
107     real *old_cell_f;  /* Temp. var.: old cell size                      */
108     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
109     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
110     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
111     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
112     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
113     real *buf_ncd;     /* Temp. var.                                     */
114 } gmx_domdec_root_t;
115
116 #define DD_NLOAD_MAX 9
117
118 /* Here floats are accurate enough, since these variables
119  * only influence the load balancing, not the actual MD results.
120  */
121 typedef struct
122 {
123     int  nload;
124     float *load;
125     float sum;
126     float max;
127     float sum_m;
128     float cvol_min;
129     float mdf;
130     float pme;
131     int   flags;
132 } gmx_domdec_load_t;
133
134 typedef struct
135 {
136     int  nsc;
137     int  ind_gl;
138     int  ind;
139 } gmx_cgsort_t;
140
141 typedef struct
142 {
143     gmx_cgsort_t *sort1,*sort2;
144     int  sort_nalloc;
145     gmx_cgsort_t *sort_new;
146     int  sort_new_nalloc;
147     int  *ibuf;
148     int  ibuf_nalloc;
149 } gmx_domdec_sort_t;
150
151 typedef struct
152 {
153     rvec *v;
154     int  nalloc;
155 } vec_rvec_t;
156
157 /* This enum determines the order of the coordinates.
158  * ddnatHOME and ddnatZONE should be first and second,
159  * the others can be ordered as wanted.
160  */
161 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
162
163 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
164 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
165
166 typedef struct
167 {
168     int  dim;      /* The dimension                                          */
169     gmx_bool dim_match; /* Tells if DD and PME dims match                        */
170     int  nslab;    /* The number of PME slabs in this dimension              */
171     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
172     int  *pp_min;  /* The minimum pp node location, size nslab               */
173     int  *pp_max;  /* The maximum pp node location, size nslab               */
174     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
175 } gmx_ddpme_t;
176
177 typedef struct
178 {
179     real min0;    /* The minimum bottom of this zone                        */
180     real max1;    /* The maximum top of this zone                           */
181     real mch0;    /* The maximum bottom communication height for this zone  */
182     real mch1;    /* The maximum top communication height for this zone     */
183     real p1_0;    /* The bottom value of the first cell in this zone        */
184     real p1_1;    /* The top value of the first cell in this zone           */
185 } gmx_ddzone_t;
186
187 typedef struct gmx_domdec_comm
188 {
189     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
190      * unless stated otherwise.
191      */
192
193     /* The number of decomposition dimensions for PME, 0: no PME */
194     int  npmedecompdim;
195     /* The number of nodes doing PME (PP/PME or only PME) */
196     int  npmenodes;
197     int  npmenodes_x;
198     int  npmenodes_y;
199     /* The communication setup including the PME only nodes */
200     gmx_bool bCartesianPP_PME;
201     ivec ntot;
202     int  cartpmedim;
203     int  *pmenodes;          /* size npmenodes                         */
204     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
205                               * but with bCartesianPP_PME              */
206     gmx_ddpme_t ddpme[2];
207     
208     /* The DD particle-particle nodes only */
209     gmx_bool bCartesianPP;
210     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
211     
212     /* The global charge groups */
213     t_block cgs_gl;
214
215     /* Should we sort the cgs */
216     int  nstSortCG;
217     gmx_domdec_sort_t *sort;
218     
219     /* Are there bonded and multi-body interactions between charge groups? */
220     gmx_bool bInterCGBondeds;
221     gmx_bool bInterCGMultiBody;
222
223     /* Data for the optional bonded interaction atom communication range */
224     gmx_bool bBondComm;
225     t_blocka *cglink;
226     char *bLocalCG;
227
228     /* The DLB option */
229     int  eDLB;
230     /* Are we actually using DLB? */
231     gmx_bool bDynLoadBal;
232
233     /* Cell sizes for static load balancing, first index cartesian */
234     real **slb_frac;
235     
236     /* The width of the communicated boundaries */
237     real cutoff_mbody;
238     real cutoff;
239     /* The minimum cell size (including triclinic correction) */
240     rvec cellsize_min;
241     /* For dlb, for use with edlbAUTO */
242     rvec cellsize_min_dlb;
243     /* The lower limit for the DD cell size with DLB */
244     real cellsize_limit;
245     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
246     gmx_bool bVacDLBNoLimit;
247
248     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
249     ivec tric_dir;
250     /* box0 and box_size are required with dimensions without pbc and -gcom */
251     rvec box0;
252     rvec box_size;
253     
254     /* The cell boundaries */
255     rvec cell_x0;
256     rvec cell_x1;
257
258     /* The old location of the cell boundaries, to check cg displacements */
259     rvec old_cell_x0;
260     rvec old_cell_x1;
261
262     /* The communication setup and charge group boundaries for the zones */
263     gmx_domdec_zones_t zones;
264     
265     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
266      * cell boundaries of neighboring cells for dynamic load balancing.
267      */
268     gmx_ddzone_t zone_d1[2];
269     gmx_ddzone_t zone_d2[2][2];
270     
271     /* The coordinate/force communication setup and indices */
272     gmx_domdec_comm_dim_t cd[DIM];
273     /* The maximum number of cells to communicate with in one dimension */
274     int  maxpulse;
275     
276     /* Which cg distribution is stored on the master node */
277     int master_cg_ddp_count;
278     
279     /* The number of cg's received from the direct neighbors */
280     int  zone_ncg1[DD_MAXZONE];
281     
282     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
283     int  nat[ddnatNR];
284     
285     /* Communication buffer for general use */
286     int  *buf_int;
287     int  nalloc_int;
288
289     /* Communication buffer of rvecs for general use */
290     vec_rvec_t vbuf;
291     
292     /* Communication buffers only used with multiple grid pulses */
293     int  *buf_int2;
294     int  nalloc_int2;
295     vec_rvec_t vbuf2;
296     
297     /* Communication buffers for local redistribution */
298     int  **cggl_flag;
299     int  cggl_flag_nalloc[DIM*2];
300     rvec **cgcm_state;
301     int  cgcm_state_nalloc[DIM*2];
302     
303     /* Cell sizes for dynamic load balancing */
304     gmx_domdec_root_t **root;
305     real *cell_f_row;
306     real cell_f0[DIM];
307     real cell_f1[DIM];
308     real cell_f_max0[DIM];
309     real cell_f_min1[DIM];
310     
311     /* Stuff for load communication */
312     gmx_bool bRecordLoad;
313     gmx_domdec_load_t *load;
314 #ifdef GMX_MPI
315     MPI_Comm *mpi_comm_load;
316 #endif
317
318     /* Maximum DLB scaling per load balancing step in percent */
319     int dlb_scale_lim;
320
321     /* Cycle counters */
322     float cycl[ddCyclNr];
323     int   cycl_n[ddCyclNr];
324     float cycl_max[ddCyclNr];
325     /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
326     int eFlop;
327     double flop;
328     int    flop_n;
329     /* How often have we had load measurements */
330     int    n_load_have;
331     /* How often have we collected the load measurements */
332     int    n_load_collect;
333     
334     /* Statistics */
335     double sum_nat[ddnatNR-ddnatZONE];
336     int    ndecomp;
337     int    nload;
338     double load_step;
339     double load_sum;
340     double load_max;
341     ivec   load_lim;
342     double load_mdf;
343     double load_pme;
344
345     /* The last partition step */
346     gmx_large_int_t globalcomm_step;
347
348     /* Debugging */
349     int  nstDDDump;
350     int  nstDDDumpGrid;
351     int  DD_debug;
352 } gmx_domdec_comm_t;
353
354 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
355 #define DD_CGIBS 2
356
357 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
358 #define DD_FLAG_NRCG  65535
359 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
360 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
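/* Each per-charge-group entry in cggl_flag consists of DD_CGIBS ints:
 * the global charge group index followed by a flag word.  The low 16 bits
 * of the flag (DD_FLAG_NRCG) hold the number of atoms in the charge group;
 * bits 16+2*d (DD_FLAG_FW) and 17+2*d (DD_FLAG_BW) mark whether the group
 * moves forward or backward along DD dimension d during redistribution.
 */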
361
362 /* Zone permutation required to obtain consecutive charge groups
363  * for neighbor searching.
364  */
365 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
366
367 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
368  * components see only j zones with that component 0.
369  */
370
371 /* The DD zone order */
372 static const ivec dd_zo[DD_MAXZONE] =
373   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
374
375 /* The 3D setup */
376 #define dd_z3n  8
377 #define dd_zp3n 4
378 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
379
380 /* The 2D setup */
381 #define dd_z2n  4
382 #define dd_zp2n 2
383 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
384
385 /* The 1D setup */
386 #define dd_z1n  2
387 #define dd_zp1n 1
388 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
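/* Each dd_zp* entry lists {i-zone, first j-zone, j-zone end (exclusive)},
 * i.e. the range of j zones that charge groups in the given i zone are
 * paired with during neighbor searching.
 */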
389
390 /* Factors used to avoid problems due to rounding issues */
391 #define DD_CELL_MARGIN       1.0001
392 #define DD_CELL_MARGIN2      1.00005
393 /* Factor to account for pressure scaling during nstlist steps */
394 #define DD_PRES_SCALE_MARGIN 1.02
395
396 /* Allowed performance loss before we turn on DLB or warn */
397 #define DD_PERF_LOSS 0.05
398
399 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
400
401 /* Use separate MPI send and receive commands
402  * when nnodes <= GMX_DD_NNODES_SENDRECV.
403  * This saves memory (and some copying for small nnodes).
404  * For high parallelization scatter and gather calls are used.
405  */
406 #define GMX_DD_NNODES_SENDRECV 4
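/* The dispatch on this threshold can be seen below in dd_collect_vec()
 * and dd_distribute_vec(), which switch between the sendrecv and the
 * gatherv/scatterv implementations.
 */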
407
408
409 /*
410 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
411
412 static void index2xyz(ivec nc,int ind,ivec xyz)
413 {
414   xyz[XX] = ind % nc[XX];
415   xyz[YY] = (ind / nc[XX]) % nc[YY];
416   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
417 }
418 */
419
420 /* This order is required to minimize the coordinate communication in PME
421  * which uses decomposition in the x direction.
422  */
423 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
424
425 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
426 {
427     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
428     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
429     xyz[ZZ] = ind % nc[ZZ];
430 }
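/* Worked example: with nc = {2,3,4} the node at coordinates c = {1,2,3}
 * gets dd_index = ((1*3 + 2)*4) + 3 = 23, and ddindex2xyz(nc,23,xyz)
 * recovers xyz = {1,2,3}.  The x coordinate varies slowest, so consecutive
 * DD indices stay within the same PME slab along x.
 */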
431
432 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
433 {
434     int ddindex;
435     int ddnodeid=-1;
436     
437     ddindex = dd_index(dd->nc,c);
438     if (dd->comm->bCartesianPP_PME)
439     {
440         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
441     }
442     else if (dd->comm->bCartesianPP)
443     {
444 #ifdef GMX_MPI
445         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
446 #endif
447     }
448     else
449     {
450         ddnodeid = ddindex;
451     }
452     
453     return ddnodeid;
454 }
455
456 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
457 {
458     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
459 }
460
461 int ddglatnr(gmx_domdec_t *dd,int i)
462 {
463     int atnr;
464     
465     if (dd == NULL)
466     {
467         atnr = i + 1;
468     }
469     else
470     {
471         if (i >= dd->comm->nat[ddnatNR-1])
472         {
473             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
474         }
475         atnr = dd->gatindex[i] + 1;
476     }
477     
478     return atnr;
479 }
480
481 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
482 {
483     return &dd->comm->cgs_gl;
484 }
485
486 static void vec_rvec_init(vec_rvec_t *v)
487 {
488     v->nalloc = 0;
489     v->v      = NULL;
490 }
491
492 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
493 {
494     if (n > v->nalloc)
495     {
496         v->nalloc = over_alloc_dd(n);
497         srenew(v->v,v->nalloc);
498     }
499 }
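/* over_alloc_dd() can return the requested size with some extra headroom,
 * so that buffers which grow gradually during the simulation are not
 * reallocated on every small increase.
 */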
500
501 void dd_store_state(gmx_domdec_t *dd,t_state *state)
502 {
503     int i;
504     
505     if (state->ddp_count != dd->ddp_count)
506     {
507         gmx_incons("The state does not match the domain decomposition state");
508     }
509     
510     state->ncg_gl = dd->ncg_home;
511     if (state->ncg_gl > state->cg_gl_nalloc)
512     {
513         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
514         srenew(state->cg_gl,state->cg_gl_nalloc);
515     }
516     for(i=0; i<state->ncg_gl; i++)
517     {
518         state->cg_gl[i] = dd->index_gl[i];
519     }
520     
521     state->ddp_count_cg_gl = dd->ddp_count;
522 }
523
524 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
525 {
526     return &dd->comm->zones;
527 }
528
529 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
530                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
531 {
532     gmx_domdec_zones_t *zones;
533     int izone,d,dim;
534
535     zones = &dd->comm->zones;
536
537     izone = 0;
538     while (icg >= zones->izone[izone].cg1)
539     {
540         izone++;
541     }
542     
543     if (izone == 0)
544     {
545         *jcg0 = icg;
546     }
547     else if (izone < zones->nizone)
548     {
549         *jcg0 = zones->izone[izone].jcg0;
550     }
551     else
552     {
553         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
554                   icg,izone,zones->nizone);
555     }
556         
557     *jcg1 = zones->izone[izone].jcg1;
558     
559     for(d=0; d<dd->ndim; d++)
560     {
561         dim = dd->dim[d];
562         shift0[dim] = zones->izone[izone].shift0[dim];
563         shift1[dim] = zones->izone[izone].shift1[dim];
564         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
565         {
566             /* A conservative approach; this can be optimized */
567             shift0[dim] -= 1;
568             shift1[dim] += 1;
569         }
570     }
571 }
572
573 int dd_natoms_vsite(gmx_domdec_t *dd)
574 {
575     return dd->comm->nat[ddnatVSITE];
576 }
577
578 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
579 {
580     *at_start = dd->comm->nat[ddnatCON-1];
581     *at_end   = dd->comm->nat[ddnatCON];
582 }
583
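/* dd_move_x() communicates the coordinates of home atoms to the
 * neighboring domains that need them, looping over the DD dimensions and
 * over the communication pulses within each dimension.  Ranks at the lower
 * cell boundary apply the periodic shift; with screw PBC along x the y and
 * z coordinates are additionally mirrored.  Received coordinates are
 * appended after the nat_home home atoms, or placed via the cell2at ranges
 * when the communication is not in place.
 */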
584 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
585 {
586     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
587     int  *index,*cgindex;
588     gmx_domdec_comm_t *comm;
589     gmx_domdec_comm_dim_t *cd;
590     gmx_domdec_ind_t *ind;
591     rvec shift={0,0,0},*buf,*rbuf;
592     gmx_bool bPBC,bScrew;
593     
594     comm = dd->comm;
595     
596     cgindex = dd->cgindex;
597     
598     buf = comm->vbuf.v;
599
600     nzone = 1;
601     nat_tot = dd->nat_home;
602     for(d=0; d<dd->ndim; d++)
603     {
604         bPBC   = (dd->ci[dd->dim[d]] == 0);
605         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
606         if (bPBC)
607         {
608             copy_rvec(box[dd->dim[d]],shift);
609         }
610         cd = &comm->cd[d];
611         for(p=0; p<cd->np; p++)
612         {
613             ind = &cd->ind[p];
614             index = ind->index;
615             n = 0;
616             if (!bPBC)
617             {
618                 for(i=0; i<ind->nsend[nzone]; i++)
619                 {
620                     at0 = cgindex[index[i]];
621                     at1 = cgindex[index[i]+1];
622                     for(j=at0; j<at1; j++)
623                     {
624                         copy_rvec(x[j],buf[n]);
625                         n++;
626                     }
627                 }
628             }
629             else if (!bScrew)
630             {
631                 for(i=0; i<ind->nsend[nzone]; i++)
632                 {
633                     at0 = cgindex[index[i]];
634                     at1 = cgindex[index[i]+1];
635                     for(j=at0; j<at1; j++)
636                     {
637                         /* We need to shift the coordinates */
638                         rvec_add(x[j],shift,buf[n]);
639                         n++;
640                     }
641                 }
642             }
643             else
644             {
645                 for(i=0; i<ind->nsend[nzone]; i++)
646                 {
647                     at0 = cgindex[index[i]];
648                     at1 = cgindex[index[i]+1];
649                     for(j=at0; j<at1; j++)
650                     {
651                         /* Shift x */
652                         buf[n][XX] = x[j][XX] + shift[XX];
653                         /* Rotate y and z.
654                          * This operation requires a special shift force
655                          * treatment, which is performed in calc_vir.
656                          */
657                         buf[n][YY] = box[YY][YY] - x[j][YY];
658                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
659                         n++;
660                     }
661                 }
662             }
663             
664             if (cd->bInPlace)
665             {
666                 rbuf = x + nat_tot;
667             }
668             else
669             {
670                 rbuf = comm->vbuf2.v;
671             }
672             /* Send and receive the coordinates */
673             dd_sendrecv_rvec(dd, d, dddirBackward,
674                              buf,  ind->nsend[nzone+1],
675                              rbuf, ind->nrecv[nzone+1]);
676             if (!cd->bInPlace)
677             {
678                 j = 0;
679                 for(zone=0; zone<nzone; zone++)
680                 {
681                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
682                     {
683                         copy_rvec(rbuf[j],x[i]);
684                         j++;
685                     }
686                 }
687             }
688             nat_tot += ind->nrecv[nzone+1];
689         }
690         nzone += nzone;
691     }
692 }
693
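/* dd_move_f() is the reverse of dd_move_x(): the forces computed on
 * communicated atoms are sent back and added to the forces of the home
 * atoms, traversing the dimensions and pulses in the opposite order.
 * Where a periodic shift was applied to the coordinates, the contribution
 * is also accumulated into the corresponding shift force, and with screw
 * PBC the y and z force components are negated to undo the rotation.
 */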
694 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
695 {
696     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
697     int  *index,*cgindex;
698     gmx_domdec_comm_t *comm;
699     gmx_domdec_comm_dim_t *cd;
700     gmx_domdec_ind_t *ind;
701     rvec *buf,*sbuf;
702     ivec vis;
703     int  is;
704     gmx_bool bPBC,bScrew;
705     
706     comm = dd->comm;
707     
708     cgindex = dd->cgindex;
709
710     buf = comm->vbuf.v;
711
712     n = 0;
713     nzone = comm->zones.n/2;
714     nat_tot = dd->nat_tot;
715     for(d=dd->ndim-1; d>=0; d--)
716     {
717         bPBC   = (dd->ci[dd->dim[d]] == 0);
718         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
719         if (fshift == NULL && !bScrew)
720         {
721             bPBC = FALSE;
722         }
723         /* Determine which shift vector we need */
724         clear_ivec(vis);
725         vis[dd->dim[d]] = 1;
726         is = IVEC2IS(vis);
727         
728         cd = &comm->cd[d];
729         for(p=cd->np-1; p>=0; p--) {
730             ind = &cd->ind[p];
731             nat_tot -= ind->nrecv[nzone+1];
732             if (cd->bInPlace)
733             {
734                 sbuf = f + nat_tot;
735             }
736             else
737             {
738                 sbuf = comm->vbuf2.v;
739                 j = 0;
740                 for(zone=0; zone<nzone; zone++)
741                 {
742                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
743                     {
744                         copy_rvec(f[i],sbuf[j]);
745                         j++;
746                     }
747                 }
748             }
749             /* Communicate the forces */
750             dd_sendrecv_rvec(dd, d, dddirForward,
751                              sbuf, ind->nrecv[nzone+1],
752                              buf,  ind->nsend[nzone+1]);
753             index = ind->index;
754             /* Add the received forces */
755             n = 0;
756             if (!bPBC)
757             {
758                 for(i=0; i<ind->nsend[nzone]; i++)
759                 {
760                     at0 = cgindex[index[i]];
761                     at1 = cgindex[index[i]+1];
762                     for(j=at0; j<at1; j++)
763                     {
764                         rvec_inc(f[j],buf[n]);
765                         n++;
766                     }
767                 } 
768             }
769             else if (!bScrew)
770             {
771                 for(i=0; i<ind->nsend[nzone]; i++)
772                 {
773                     at0 = cgindex[index[i]];
774                     at1 = cgindex[index[i]+1];
775                     for(j=at0; j<at1; j++)
776                     {
777                         rvec_inc(f[j],buf[n]);
778                         /* Add this force to the shift force */
779                         rvec_inc(fshift[is],buf[n]);
780                         n++;
781                     }
782                 }
783             }
784             else
785             {
786                 for(i=0; i<ind->nsend[nzone]; i++)
787                 {
788                     at0 = cgindex[index[i]];
789                     at1 = cgindex[index[i]+1];
790                     for(j=at0; j<at1; j++)
791                     {
792                         /* Rotate the force */
793                         f[j][XX] += buf[n][XX];
794                         f[j][YY] -= buf[n][YY];
795                         f[j][ZZ] -= buf[n][ZZ];
796                         if (fshift)
797                         {
798                             /* Add this force to the shift force */
799                             rvec_inc(fshift[is],buf[n]);
800                         }
801                         n++;
802                     }
803                 }
804             }
805         }
806         nzone /= 2;
807     }
808 }
809
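/* dd_atom_spread_real() and dd_atom_sum_real() below are the scalar
 * analogues of dd_move_x() and dd_move_f(): they spread, respectively sum,
 * one real value per atom over the same communication pattern.
 */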
810 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
811 {
812     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
813     int  *index,*cgindex;
814     gmx_domdec_comm_t *comm;
815     gmx_domdec_comm_dim_t *cd;
816     gmx_domdec_ind_t *ind;
817     real *buf,*rbuf;
818     
819     comm = dd->comm;
820     
821     cgindex = dd->cgindex;
822     
823     buf = &comm->vbuf.v[0][0];
824
825     nzone = 1;
826     nat_tot = dd->nat_home;
827     for(d=0; d<dd->ndim; d++)
828     {
829         cd = &comm->cd[d];
830         for(p=0; p<cd->np; p++)
831         {
832             ind = &cd->ind[p];
833             index = ind->index;
834             n = 0;
835             for(i=0; i<ind->nsend[nzone]; i++)
836             {
837                 at0 = cgindex[index[i]];
838                 at1 = cgindex[index[i]+1];
839                 for(j=at0; j<at1; j++)
840                 {
841                     buf[n] = v[j];
842                     n++;
843                 }
844             }
845             
846             if (cd->bInPlace)
847             {
848                 rbuf = v + nat_tot;
849             }
850             else
851             {
852                 rbuf = &comm->vbuf2.v[0][0];
853             }
854             /* Send and receive the values */
855             dd_sendrecv_real(dd, d, dddirBackward,
856                              buf,  ind->nsend[nzone+1],
857                              rbuf, ind->nrecv[nzone+1]);
858             if (!cd->bInPlace)
859             {
860                 j = 0;
861                 for(zone=0; zone<nzone; zone++)
862                 {
863                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
864                     {
865                         v[i] = rbuf[j];
866                         j++;
867                     }
868                 }
869             }
870             nat_tot += ind->nrecv[nzone+1];
871         }
872         nzone += nzone;
873     }
874 }
875
876 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
877 {
878     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
879     int  *index,*cgindex;
880     gmx_domdec_comm_t *comm;
881     gmx_domdec_comm_dim_t *cd;
882     gmx_domdec_ind_t *ind;
883     real *buf,*sbuf;
884     
885     comm = dd->comm;
886     
887     cgindex = dd->cgindex;
888
889     buf = &comm->vbuf.v[0][0];
890
891     n = 0;
892     nzone = comm->zones.n/2;
893     nat_tot = dd->nat_tot;
894     for(d=dd->ndim-1; d>=0; d--)
895     {
896         cd = &comm->cd[d];
897         for(p=cd->np-1; p>=0; p--) {
898             ind = &cd->ind[p];
899             nat_tot -= ind->nrecv[nzone+1];
900             if (cd->bInPlace)
901             {
902                 sbuf = v + nat_tot;
903             }
904             else
905             {
906                 sbuf = &comm->vbuf2.v[0][0];
907                 j = 0;
908                 for(zone=0; zone<nzone; zone++)
909                 {
910                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
911                     {
912                         sbuf[j] = v[i];
913                         j++;
914                     }
915                 }
916             }
917             /* Communicate the values */
918             dd_sendrecv_real(dd, d, dddirForward,
919                              sbuf, ind->nrecv[nzone+1],
920                              buf,  ind->nsend[nzone+1]);
921             index = ind->index;
922             /* Add the received values */
923             n = 0;
924             for(i=0; i<ind->nsend[nzone]; i++)
925             {
926                 at0 = cgindex[index[i]];
927                 at1 = cgindex[index[i]+1];
928                 for(j=at0; j<at1; j++)
929                 {
930                     v[j] += buf[n];
931                     n++;
932                 }
933             } 
934         }
935         nzone /= 2;
936     }
937 }
938
939 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
940 {
941     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
942             d,i,j,
943             zone->min0,zone->max1,
944             zone->mch0,zone->mch1,
945             zone->p1_0,zone->p1_1);
946 }
947
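/* dd_sendrecv_ddzone() packs the six reals of each gmx_ddzone_t into two
 * rvecs so that the zone limits can be exchanged with the generic
 * dd_sendrecv_rvec() routine, and unpacks them again on the receiving side.
 */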
948 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
949                                int ddimind,int direction,
950                                gmx_ddzone_t *buf_s,int n_s,
951                                gmx_ddzone_t *buf_r,int n_r)
952 {
953     rvec vbuf_s[5*2],vbuf_r[5*2];
954     int i;
955
956     for(i=0; i<n_s; i++)
957     {
958         vbuf_s[i*2  ][0] = buf_s[i].min0;
959         vbuf_s[i*2  ][1] = buf_s[i].max1;
960         vbuf_s[i*2  ][2] = buf_s[i].mch0;
961         vbuf_s[i*2+1][0] = buf_s[i].mch1;
962         vbuf_s[i*2+1][1] = buf_s[i].p1_0;
963         vbuf_s[i*2+1][2] = buf_s[i].p1_1;
964     }
965
966     dd_sendrecv_rvec(dd, ddimind, direction,
967                      vbuf_s, n_s*2,
968                      vbuf_r, n_r*2);
969
970     for(i=0; i<n_r; i++)
971     {
972         buf_r[i].min0 = vbuf_r[i*2  ][0];
973         buf_r[i].max1 = vbuf_r[i*2  ][1];
974         buf_r[i].mch0 = vbuf_r[i*2  ][2];
975         buf_r[i].mch1 = vbuf_r[i*2+1][0];
976         buf_r[i].p1_0 = vbuf_r[i*2+1][1];
977         buf_r[i].p1_1 = vbuf_r[i*2+1][2];
978     }
979 }
980
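/* With dynamic load balancing, dd_move_cellx() exchanges the zone limits
 * of neighboring cells along the higher DD dimensions.  Each cell learns
 * the extreme boundaries (min0/max1) and the maximum communication heights
 * (mch0/mch1) it has to cover, reduced where the distance along the
 * communicated dimension already accounts for part of the cut-off.  The
 * result is returned as the enlarged neighbor-search bounding box
 * cell_ns_x0/cell_ns_x1 and as comm->cell_f_max0/cell_f_min1.
 */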
981 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
982                           rvec cell_ns_x0,rvec cell_ns_x1)
983 {
984     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
985     gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
986     rvec extr_s[2],extr_r[2];
987     rvec dh;
988     real dist_d,c=0,det;
989     gmx_domdec_comm_t *comm;
990     gmx_bool bPBC,bUse;
991
992     comm = dd->comm;
993
994     for(d=1; d<dd->ndim; d++)
995     {
996         dim = dd->dim[d];
997         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
998         zp->min0 = cell_ns_x0[dim];
999         zp->max1 = cell_ns_x1[dim];
1000         zp->mch0 = cell_ns_x0[dim];
1001         zp->mch1 = cell_ns_x1[dim];
1002         zp->p1_0 = cell_ns_x0[dim];
1003         zp->p1_1 = cell_ns_x1[dim];
1004     }
1005     
1006     for(d=dd->ndim-2; d>=0; d--)
1007     {
1008         dim  = dd->dim[d];
1009         bPBC = (dim < ddbox->npbcdim);
1010
1011         /* Use an rvec to store two reals */
1012         extr_s[d][0] = comm->cell_f0[d+1];
1013         extr_s[d][1] = comm->cell_f1[d+1];
1014         extr_s[d][2] = 0;
1015
1016         pos = 0;
1017         /* Store the extremes in the backward sending buffer,
1018          * so they get updated separately from the forward communication.
1019          */
1020         for(d1=d; d1<dd->ndim-1; d1++)
1021         {
1022             /* We invert the order to be able to use the same loop for buf_e */
1023             buf_s[pos].min0 = extr_s[d1][1];
1024             buf_s[pos].max1 = extr_s[d1][0];
1025             buf_s[pos].mch0 = 0;
1026             buf_s[pos].mch1 = 0;
1027             /* Store the cell corner of the dimension we communicate along */
1028             buf_s[pos].p1_0 = comm->cell_x0[dim];
1029             buf_s[pos].p1_1 = 0;
1030             pos++;
1031         }
1032
1033         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1034         pos++;
1035
1036         if (dd->ndim == 3 && d == 0)
1037         {
1038             buf_s[pos] = comm->zone_d2[0][1];
1039             pos++;
1040             buf_s[pos] = comm->zone_d1[0];
1041             pos++;
1042         }
1043
1044         /* We only need to communicate the extremes
1045          * in the forward direction
1046          */
1047         npulse = comm->cd[d].np;
1048         if (bPBC)
1049         {
1050             /* Take the minimum to avoid double communication */
1051             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1052         }
1053         else
1054         {
1055             /* Without PBC we should really not communicate over
1056              * the boundaries, but implementing that complicates
1057              * the communication setup and therefore we simply
1058              * do all communication, but ignore some data.
1059              */
1060             npulse_min = npulse;
1061         }
1062         for(p=0; p<npulse_min; p++)
1063         {
1064             /* Communicate the extremes forward */
1065             bUse = (bPBC || dd->ci[dim] > 0);
1066
1067             dd_sendrecv_rvec(dd, d, dddirForward,
1068                              extr_s+d, dd->ndim-d-1,
1069                              extr_r+d, dd->ndim-d-1);
1070
1071             if (bUse)
1072             {
1073                 for(d1=d; d1<dd->ndim-1; d1++)
1074                 {
1075                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1076                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1077                 }
1078             }
1079         }
1080
1081         buf_size = pos;
1082         for(p=0; p<npulse; p++)
1083         {
1084             /* Communicate all the zone information backward */
1085             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1086
1087             dd_sendrecv_ddzone(dd, d, dddirBackward,
1088                                buf_s, buf_size,
1089                                buf_r, buf_size);
1090
1091             clear_rvec(dh);
1092             if (p > 0)
1093             {
1094                 for(d1=d+1; d1<dd->ndim; d1++)
1095                 {
1096                     /* Determine the decrease of maximum required
1097                      * communication height along d1 due to the distance along d;
1098                      * this avoids a lot of useless atom communication.
1099                      */
1100                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1101
1102                     if (ddbox->tric_dir[dim])
1103                     {
1104                         /* c is the off-diagonal coupling between the cell planes
1105                          * along directions d and d1.
1106                          */
1107                         c = ddbox->v[dim][dd->dim[d1]][dim];
1108                     }
1109                     else
1110                     {
1111                         c = 0;
1112                     }
1113                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1114                     if (det > 0)
1115                     {
1116                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1117                     }
1118                     else
1119                     {
1120                         /* A negative value signals out of range */
1121                         dh[d1] = -1;
1122                     }
1123                 }
1124             }
1125
1126             /* Accumulate the extremes over all pulses */
1127             for(i=0; i<buf_size; i++)
1128             {
1129                 if (p == 0)
1130                 {
1131                     buf_e[i] = buf_r[i];
1132                 }
1133                 else
1134                 {
1135                     if (bUse)
1136                     {
1137                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1138                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1139                     }
1140
1141                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1142                     {
1143                         d1 = 1;
1144                     }
1145                     else
1146                     {
1147                         d1 = d + 1;
1148                     }
1149                     if (bUse && dh[d1] >= 0)
1150                     {
1151                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1152                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1153                     }
1154                 }
1155                 /* Copy the received buffer to the send buffer,
1156                  * to pass the data through with the next pulse.
1157                  */
1158                 buf_s[i] = buf_r[i];
1159             }
1160             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1161                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1162             {
1163                 /* Store the extremes */ 
1164                 pos = 0;
1165
1166                 for(d1=d; d1<dd->ndim-1; d1++)
1167                 {
1168                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1169                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1170                     pos++;
1171                 }
1172
1173                 if (d == 1 || (d == 0 && dd->ndim == 3))
1174                 {
1175                     for(i=d; i<2; i++)
1176                     {
1177                         comm->zone_d2[1-d][i] = buf_e[pos];
1178                         pos++;
1179                     }
1180                 }
1181                 if (d == 0)
1182                 {
1183                     comm->zone_d1[1] = buf_e[pos];
1184                     pos++;
1185                 }
1186             }
1187         }
1188     }
1189     
1190     if (dd->ndim >= 2)
1191     {
1192         dim = dd->dim[1];
1193         for(i=0; i<2; i++)
1194         {
1195             if (debug)
1196             {
1197                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1198             }
1199             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1200             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1201         }
1202     }
1203     if (dd->ndim >= 3)
1204     {
1205         dim = dd->dim[2];
1206         for(i=0; i<2; i++)
1207         {
1208             for(j=0; j<2; j++)
1209             {
1210                 if (debug)
1211                 {
1212                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1213                 }
1214                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1215                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1216             }
1217         }
1218     }
1219     for(d=1; d<dd->ndim; d++)
1220     {
1221         comm->cell_f_max0[d] = extr_s[d-1][0];
1222         comm->cell_f_min1[d] = extr_s[d-1][1];
1223         if (debug)
1224         {
1225             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1226                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1227         }
1228     }
1229 }
1230
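/* dd_collect_cg() gathers the current charge group distribution (the
 * counts and global cg indices of all nodes) on the master, so that
 * collected vectors can be placed back into global order.  The gather is
 * skipped when the master already holds the distribution for this
 * ddp_count.
 */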
1231 static void dd_collect_cg(gmx_domdec_t *dd,
1232                           t_state *state_local)
1233 {
1234     gmx_domdec_master_t *ma=NULL;
1235     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1236     t_block *cgs_gl;
1237
1238     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1239     {
1240         /* The master has the correct distribution */
1241         return;
1242     }
1243     
1244     if (state_local->ddp_count == dd->ddp_count)
1245     {
1246         ncg_home = dd->ncg_home;
1247         cg       = dd->index_gl;
1248         nat_home = dd->nat_home;
1249     } 
1250     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1251     {
1252         cgs_gl = &dd->comm->cgs_gl;
1253
1254         ncg_home = state_local->ncg_gl;
1255         cg       = state_local->cg_gl;
1256         nat_home = 0;
1257         for(i=0; i<ncg_home; i++)
1258         {
1259             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1260         }
1261     }
1262     else
1263     {
1264         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1265     }
1266     
1267     buf2[0] = dd->ncg_home;
1268     buf2[1] = dd->nat_home;
1269     if (DDMASTER(dd))
1270     {
1271         ma = dd->ma;
1272         ibuf = ma->ibuf;
1273     }
1274     else
1275     {
1276         ibuf = NULL;
1277     }
1278     /* Collect the charge group and atom counts on the master */
1279     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1280     
1281     if (DDMASTER(dd))
1282     {
1283         ma->index[0] = 0;
1284         for(i=0; i<dd->nnodes; i++)
1285         {
1286             ma->ncg[i] = ma->ibuf[2*i];
1287             ma->nat[i] = ma->ibuf[2*i+1];
1288             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1289             
1290         }
1291         /* Make byte counts and indices */
1292         for(i=0; i<dd->nnodes; i++)
1293         {
1294             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1295             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1296         }
1297         if (debug)
1298         {
1299             fprintf(debug,"Initial charge group distribution: ");
1300             for(i=0; i<dd->nnodes; i++)
1301                 fprintf(debug," %d",ma->ncg[i]);
1302             fprintf(debug,"\n");
1303         }
1304     }
1305     
1306     /* Collect the charge group indices on the master */
1307     dd_gatherv(dd,
1308                dd->ncg_home*sizeof(int),dd->index_gl,
1309                DDMASTER(dd) ? ma->ibuf : NULL,
1310                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1311                DDMASTER(dd) ? ma->cg : NULL);
1312     
1313     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1314 }
1315
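/* dd_collect_vec_sendrecv() collects a distributed rvec array on the
 * master with point-to-point messages: the master copies its own home
 * atoms directly and receives every other node's block into a temporary
 * buffer, scattering it into the global array via the gathered charge
 * group indices.
 */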
1316 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1317                                     rvec *lv,rvec *v)
1318 {
1319     gmx_domdec_master_t *ma;
1320     int  n,i,c,a,nalloc=0;
1321     rvec *buf=NULL;
1322     t_block *cgs_gl;
1323
1324     ma = dd->ma;
1325     
1326     if (!DDMASTER(dd))
1327     {
1328 #ifdef GMX_MPI
1329         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1330                  dd->rank,dd->mpi_comm_all);
1331 #endif
1332     } else {
1333         /* Copy the master coordinates to the global array */
1334         cgs_gl = &dd->comm->cgs_gl;
1335
1336         n = DDMASTERRANK(dd);
1337         a = 0;
1338         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1339         {
1340             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1341             {
1342                 copy_rvec(lv[a++],v[c]);
1343             }
1344         }
1345         
1346         for(n=0; n<dd->nnodes; n++)
1347         {
1348             if (n != dd->rank)
1349             {
1350                 if (ma->nat[n] > nalloc)
1351                 {
1352                     nalloc = over_alloc_dd(ma->nat[n]);
1353                     srenew(buf,nalloc);
1354                 }
1355 #ifdef GMX_MPI
1356                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1357                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1358 #endif
1359                 a = 0;
1360                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1361                 {
1362                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1363                     {
1364                         copy_rvec(buf[a++],v[c]);
1365                     }
1366                 }
1367             }
1368         }
1369         sfree(buf);
1370     }
1371 }
1372
1373 static void get_commbuffer_counts(gmx_domdec_t *dd,
1374                                   int **counts,int **disps)
1375 {
1376     gmx_domdec_master_t *ma;
1377     int n;
1378
1379     ma = dd->ma;
1380     
1381     /* Make the rvec count and displacement arrays */
1382     *counts  = ma->ibuf;
1383     *disps   = ma->ibuf + dd->nnodes;
1384     for(n=0; n<dd->nnodes; n++)
1385     {
1386         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1387         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1388     }
1389 }
1390
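/* dd_collect_vec_gatherv() is the variant used above GMX_DD_NNODES_SENDRECV
 * nodes: all home vectors are gathered into ma->vbuf with a single gatherv
 * call and then reordered into the global array on the master.
 */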
1391 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1392                                    rvec *lv,rvec *v)
1393 {
1394     gmx_domdec_master_t *ma;
1395     int  *rcounts=NULL,*disps=NULL;
1396     int  n,i,c,a;
1397     rvec *buf=NULL;
1398     t_block *cgs_gl;
1399     
1400     ma = dd->ma;
1401     
1402     if (DDMASTER(dd))
1403     {
1404         get_commbuffer_counts(dd,&rcounts,&disps);
1405
1406         buf = ma->vbuf;
1407     }
1408     
1409     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1410
1411     if (DDMASTER(dd))
1412     {
1413         cgs_gl = &dd->comm->cgs_gl;
1414
1415         a = 0;
1416         for(n=0; n<dd->nnodes; n++)
1417         {
1418             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1419             {
1420                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1421                 {
1422                     copy_rvec(buf[a++],v[c]);
1423                 }
1424             }
1425         }
1426     }
1427 }
1428
1429 void dd_collect_vec(gmx_domdec_t *dd,
1430                     t_state *state_local,rvec *lv,rvec *v)
1431 {
1432     gmx_domdec_master_t *ma;
1433     int  n,i,c,a,nalloc=0;
1434     rvec *buf=NULL;
1435     
1436     dd_collect_cg(dd,state_local);
1437
1438     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1439     {
1440         dd_collect_vec_sendrecv(dd,lv,v);
1441     }
1442     else
1443     {
1444         dd_collect_vec_gatherv(dd,lv,v);
1445     }
1446 }
1447
1448
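/* dd_collect_state() assembles the global t_state on the master: the
 * non-distributed entries (lambda, box, thermostat variables, ...) are
 * copied directly, while every distributed entry (x, v, sd_X, cg_p and the
 * RNG state) is collected with dd_collect_vec() or dd_gather().
 */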
1449 void dd_collect_state(gmx_domdec_t *dd,
1450                       t_state *state_local,t_state *state)
1451 {
1452     int est,i,j,nh;
1453
1454     nh = state->nhchainlength;
1455
1456     if (DDMASTER(dd))
1457     {
1458         for (i=0;i<efptNR;i++) {
1459             state->lambda[i] = state_local->lambda[i];
1460         }
1461         state->fep_state = state_local->fep_state;
1462         state->veta = state_local->veta;
1463         state->vol0 = state_local->vol0;
1464         copy_mat(state_local->box,state->box);
1465         copy_mat(state_local->boxv,state->boxv);
1466         copy_mat(state_local->svir_prev,state->svir_prev);
1467         copy_mat(state_local->fvir_prev,state->fvir_prev);
1468         copy_mat(state_local->pres_prev,state->pres_prev);
1469
1470
1471         for(i=0; i<state_local->ngtc; i++)
1472         {
1473             for(j=0; j<nh; j++) {
1474                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1475                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1476             }
1477             state->therm_integral[i] = state_local->therm_integral[i];            
1478         }
1479         for(i=0; i<state_local->nnhpres; i++) 
1480         {
1481             for(j=0; j<nh; j++) {
1482                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1483                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1484             }
1485         }
1486     }
1487     for(est=0; est<estNR; est++)
1488     {
1489         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1490         {
1491             switch (est) {
1492             case estX:
1493                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1494                 break;
1495             case estV:
1496                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1497                 break;
1498             case estSDX:
1499                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1500                 break;
1501             case estCGP:
1502                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1503                 break;
1504             case estLD_RNG:
1505                 if (state->nrngi == 1)
1506                 {
1507                     if (DDMASTER(dd))
1508                     {
1509                         for(i=0; i<state_local->nrng; i++)
1510                         {
1511                             state->ld_rng[i] = state_local->ld_rng[i];
1512                         }
1513                     }
1514                 }
1515                 else
1516                 {
1517                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1518                               state_local->ld_rng,state->ld_rng);
1519                 }
1520                 break;
1521             case estLD_RNGI:
1522                 if (state->nrngi == 1)
1523                 {
1524                    if (DDMASTER(dd))
1525                     {
1526                         state->ld_rngi[0] = state_local->ld_rngi[0];
1527                     } 
1528                 }
1529                 else
1530                 {
1531                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1532                               state_local->ld_rngi,state->ld_rngi);
1533                 }
1534                 break;
1535             case estDISRE_INITF:
1536             case estDISRE_RM3TAV:
1537             case estORIRE_INITF:
1538             case estORIRE_DTAV:
1539                 break;
1540             default:
1541                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1542             }
1543         }
1544     }
1545 }
1546
1547 static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
1548 {
1549     if (debug)
1550     {
1551         fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1552     }
1553     fr->cg_nalloc = over_alloc_dd(nalloc);
1554     srenew(fr->cg_cm,fr->cg_nalloc);
1555     srenew(fr->cginfo,fr->cg_nalloc);
1556 }
1557
1558 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1559 {
1560     int est;
1561
1562     if (debug)
1563     {
1564         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1565     }
1566
1567     state->nalloc = over_alloc_dd(nalloc);
1568     
1569     for(est=0; est<estNR; est++)
1570     {
1571         if (EST_DISTR(est) && (state->flags & (1<<est)))
1572         {
1573             switch(est) {
1574             case estX:
1575                 srenew(state->x,state->nalloc);
1576                 break;
1577             case estV:
1578                 srenew(state->v,state->nalloc);
1579                 break;
1580             case estSDX:
1581                 srenew(state->sd_X,state->nalloc);
1582                 break;
1583             case estCGP:
1584                 srenew(state->cg_p,state->nalloc);
1585                 break;
1586             case estLD_RNG:
1587             case estLD_RNGI:
1588             case estDISRE_INITF:
1589             case estDISRE_RM3TAV:
1590             case estORIRE_INITF:
1591             case estORIRE_DTAV:
1592                 /* No reallocation required */
1593                 break;
1594             default:
1595                 gmx_incons("Unknown state entry encountered in dd_realloc_state");            
1596             }
1597         }
1598     }
1599     
1600     if (f != NULL)
1601     {
1602         srenew(*f,state->nalloc);
1603     }
1604 }
1605
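/* dd_distribute_vec_sendrecv() is the mirror image of
 * dd_collect_vec_sendrecv(): the master packs each node's charge groups
 * from the global array into a buffer and sends it, while the other nodes
 * simply receive their nat_home rvecs.
 */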
1606 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1607                                        rvec *v,rvec *lv)
1608 {
1609     gmx_domdec_master_t *ma;
1610     int  n,i,c,a,nalloc=0;
1611     rvec *buf=NULL;
1612     
1613     if (DDMASTER(dd))
1614     {
1615         ma  = dd->ma;
1616         
1617         for(n=0; n<dd->nnodes; n++)
1618         {
1619             if (n != dd->rank)
1620             {
1621                 if (ma->nat[n] > nalloc)
1622                 {
1623                     nalloc = over_alloc_dd(ma->nat[n]);
1624                     srenew(buf,nalloc);
1625                 }
1626                 /* Pack this node's part of v into the send buffer */
1627                 a = 0;
1628                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1629                 {
1630                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1631                     {
1632                         copy_rvec(v[c],buf[a++]);
1633                     }
1634                 }
1635                 if (a != ma->nat[n])
1636                 {
1637                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1638                               a,ma->nat[n]);
1639                 }
1640                 
1641 #ifdef GMX_MPI
1642                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1643                          DDRANK(dd,n),n,dd->mpi_comm_all);
1644 #endif
1645             }
1646         }
1647         sfree(buf);
1648         n = DDMASTERRANK(dd);
1649         a = 0;
1650         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1651         {
1652             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1653             {
1654                 copy_rvec(v[c],lv[a++]);
1655             }
1656         }
1657     }
1658     else
1659     {
1660 #ifdef GMX_MPI
1661         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1662                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1663 #endif
1664     }
1665 }
1666
1667 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1668                                        rvec *v,rvec *lv)
1669 {
1670     gmx_domdec_master_t *ma;
1671     int  *scounts=NULL,*disps=NULL;
1672     int  n,i,c,a,nalloc=0;
1673     rvec *buf=NULL;
1674     
1675     if (DDMASTER(dd))
1676     {
1677         ma  = dd->ma;
1678      
1679         get_commbuffer_counts(dd,&scounts,&disps);
1680
1681         buf = ma->vbuf;
1682         a = 0;
1683         for(n=0; n<dd->nnodes; n++)
1684         {
1685             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1686             {
1687                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1688                 {
1689                     copy_rvec(v[c],buf[a++]);
1690                 }
1691             }
1692         }
1693     }
1694
1695     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1696 }
1697
1698 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1699 {
1700     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1701     {
1702         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1703     }
1704     else
1705     {
1706         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1707     }
1708 }
1709
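/* dd_distribute_state() broadcasts the non-distributed state entries from
 * the master to all nodes and distributes the per-atom arrays according to
 * the charge group assignment, reallocating the local state first when
 * dd->nat_home exceeds the current allocation.
 */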
1710 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1711                                 t_state *state,t_state *state_local,
1712                                 rvec **f)
1713 {
1714     int  i,j,nh;
1715
1716     nh = state->nhchainlength;
1717
1718     if (DDMASTER(dd))
1719     {
1720         for(i=0;i<efptNR;i++)
1721         {
1722             state_local->lambda[i] = state->lambda[i];
1723         }
1724         state_local->fep_state = state->fep_state;
1725         state_local->veta   = state->veta;
1726         state_local->vol0   = state->vol0;
1727         copy_mat(state->box,state_local->box);
1728         copy_mat(state->box_rel,state_local->box_rel);
1729         copy_mat(state->boxv,state_local->boxv);
1730         copy_mat(state->svir_prev,state_local->svir_prev);
1731         copy_mat(state->fvir_prev,state_local->fvir_prev);
1732         for(i=0; i<state_local->ngtc; i++)
1733         {
1734             for(j=0; j<nh; j++) {
1735                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1736                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1737             }
1738             state_local->therm_integral[i] = state->therm_integral[i];
1739         }
1740         for(i=0; i<state_local->nnhpres; i++)
1741         {
1742             for(j=0; j<nh; j++) {
1743                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1744                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1745             }
1746         }
1747     }
1748     dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
1749     dd_bcast(dd,sizeof(int),&state_local->fep_state);
1750     dd_bcast(dd,sizeof(real),&state_local->veta);
1751     dd_bcast(dd,sizeof(real),&state_local->vol0);
1752     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1753     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1754     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1755     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1756     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1757     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1758     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1759     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1760     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1761     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1762
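         /* The broadcasts above replicate the small, global parts of the state
          * on every node; the per-atom arrays handled below are instead split
          * over the nodes according to their home charge groups.
          */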
1763     if (dd->nat_home > state_local->nalloc)
1764     {
1765         dd_realloc_state(state_local,f,dd->nat_home);
1766     }
1767     for(i=0; i<estNR; i++)
1768     {
1769         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1770         {
1771             switch (i) {
1772             case estX:
1773                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1774                 break;
1775             case estV:
1776                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1777                 break;
1778             case estSDX:
1779                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1780                 break;
1781             case estCGP:
1782                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1783                 break;
1784             case estLD_RNG:
1785                 if (state->nrngi == 1)
1786                 {
1787                     dd_bcastc(dd,
1788                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1789                               state->ld_rng,state_local->ld_rng);
1790                 }
1791                 else
1792                 {
1793                     dd_scatter(dd,
1794                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1795                                state->ld_rng,state_local->ld_rng);
1796                 }
1797                 break;
1798             case estLD_RNGI:
1799                 if (state->nrngi == 1)
1800                 {
1801                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1802                               state->ld_rngi,state_local->ld_rngi);
1803                 }
1804                 else
1805                 {
1806                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1807                                state->ld_rngi,state_local->ld_rngi);
1808                 }   
1809                 break;
1810             case estDISRE_INITF:
1811             case estDISRE_RM3TAV:
1812             case estORIRE_INITF:
1813             case estORIRE_DTAV:
1814                 /* Not implemented yet */
1815                 break;
1816             default:
1817                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1818             }
1819         }
1820     }
1821 }
1822
1823 static char dim2char(int dim)
1824 {
1825     char c='?';
1826     
1827     switch (dim)
1828     {
1829     case XX: c = 'X'; break;
1830     case YY: c = 'Y'; break;
1831     case ZZ: c = 'Z'; break;
1832     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1833     }
1834     
1835     return c;
1836 }
1837
1838 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1839                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1840 {
1841     rvec grid_s[2],*grid_r=NULL,cx,r;
1842     char fname[STRLEN],format[STRLEN],buf[22];
1843     FILE *out;
1844     int  a,i,d,z,y,x;
1845     matrix tric;
1846     real vol;
1847
1848     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1849     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1850     
1851     if (DDMASTER(dd))
1852     {
1853         snew(grid_r,2*dd->nnodes);
1854     }
1855     
1856     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1857     
1858     if (DDMASTER(dd))
1859     {
1860         for(d=0; d<DIM; d++)
1861         {
1862             for(i=0; i<DIM; i++)
1863             {
1864                 if (d == i)
1865                 {
1866                     tric[d][i] = 1;
1867                 }
1868                 else
1869                 {
1870                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1871                     {
1872                         tric[d][i] = box[i][d]/box[i][i];
1873                     }
1874                     else
1875                     {
1876                         tric[d][i] = 0;
1877                     }
1878                 }
1879             }
1880         }
1881         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1882         sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
1883         out = gmx_fio_fopen(fname,"w");
1884         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1885         a = 1;
1886         for(i=0; i<dd->nnodes; i++)
1887         {
1888             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1889             for(d=0; d<DIM; d++)
1890             {
1891                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1892             }
1893             for(z=0; z<2; z++)
1894             {
1895                 for(y=0; y<2; y++)
1896                 {
1897                     for(x=0; x<2; x++)
1898                     {
1899                         cx[XX] = grid_r[i*2+x][XX];
1900                         cx[YY] = grid_r[i*2+y][YY];
1901                         cx[ZZ] = grid_r[i*2+z][ZZ];
1902                         mvmul(tric,cx,r);
1903                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1904                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1905                     }
1906                 }
1907             }
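                 /* The eight corners of this cell were written in z,y,x order,
                  * so corner (x,y,z) has serial number 1 + i*8 + 4*z + 2*y + x.
                  * For each dimension d the switch below picks the four corners
                  * on the low side of d and CONECT links each of them to the
                  * corner shifted by 1<<d, drawing the twelve cell edges.
                  */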
1908             for(d=0; d<DIM; d++)
1909             {
1910                 for(x=0; x<4; x++)
1911                 {
1912                     switch(d)
1913                     {
1914                     case 0: y = 1 + i*8 + 2*x; break;
1915                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1916                     case 2: y = 1 + i*8 + x; break;
1917                     }
1918                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1919                 }
1920             }
1921         }
1922         gmx_fio_fclose(out);
1923         sfree(grid_r);
1924     }
1925 }
1926
1927 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1928                   gmx_mtop_t *mtop,t_commrec *cr,
1929                   int natoms,rvec x[],matrix box)
1930 {
1931     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1932     FILE *out;
1933     int  i,ii,resnr,c;
1934     char *atomname,*resname;
1935     real b;
1936     gmx_domdec_t *dd;
1937     
1938     dd = cr->dd;
1939     if (natoms == -1)
1940     {
1941         natoms = dd->comm->nat[ddnatVSITE];
1942     }
1943     
1944     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
1945     
1946     sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
1947     sprintf(format4,"%s%s\n",get_pdbformat4(),"%6.2f%6.2f");
1948     
1949     out = gmx_fio_fopen(fname,"w");
1950     
1951     fprintf(out,"TITLE     %s\n",title);
1952     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1953     for(i=0; i<natoms; i++)
1954     {
1955         ii = dd->gatindex[i];
1956         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
1957         if (i < dd->comm->nat[ddnatZONE])
1958         {
1959             c = 0;
1960             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1961             {
1962                 c++;
1963             }
1964             b = c;
1965         }
1966         else if (i < dd->comm->nat[ddnatVSITE])
1967         {
1968             b = dd->comm->zones.n;
1969         }
1970         else
1971         {
1972             b = dd->comm->zones.n + 1;
1973         }
1974         fprintf(out,strlen(atomname)<4 ? format : format4,
1975                 "ATOM",(ii+1)%100000,
1976                 atomname,resname,' ',resnr%10000,' ',
1977                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
1978     }
1979     fprintf(out,"TER\n");
1980     
1981     gmx_fio_fclose(out);
1982 }
1983
1984 real dd_cutoff_mbody(gmx_domdec_t *dd)
1985 {
1986     gmx_domdec_comm_t *comm;
1987     int  di;
1988     real r;
1989
1990     comm = dd->comm;
1991
1992     r = -1;
1993     if (comm->bInterCGBondeds)
1994     {
1995         if (comm->cutoff_mbody > 0)
1996         {
1997             r = comm->cutoff_mbody;
1998         }
1999         else
2000         {
2001             /* cutoff_mbody=0 means we do not have DLB */
2002             r = comm->cellsize_min[dd->dim[0]];
2003             for(di=1; di<dd->ndim; di++)
2004             {
2005                 r = min(r,comm->cellsize_min[dd->dim[di]]);
2006             }
2007             if (comm->bBondComm)
2008             {
2009                 r = max(r,comm->cutoff_mbody);
2010             }
2011             else
2012             {
2013                 r = min(r,comm->cutoff);
2014             }
2015         }
2016     }
2017
2018     return r;
2019 }
2020
2021 real dd_cutoff_twobody(gmx_domdec_t *dd)
2022 {
2023     real r_mb;
2024
2025     r_mb = dd_cutoff_mbody(dd);
2026
2027     return max(dd->comm->cutoff,r_mb);
2028 }
2029
2030
2031 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2032 {
2033     int nc,ntot;
2034     
2035     nc   = dd->nc[dd->comm->cartpmedim];
2036     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2037     copy_ivec(coord,coord_pme);
2038     coord_pme[dd->comm->cartpmedim] =
2039         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2040 }
2041
2042 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2043 {
2044     /* Here we assign a PME node to communicate with this DD node
2045      * by assuming that the major index of both is x.
2046      * We add npme/2 to obtain an even distribution.
2047      */
2048     return (ddindex*npme + npme/2)/ndd;
2049 }
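     /* Illustrative example of the mapping above: with ndd=8 DD nodes and
      * npme=3 PME nodes, ddindex 0,1,2 -> PME 0, ddindex 3,4 -> PME 1 and
      * ddindex 5,6,7 -> PME 2, i.e. the npme/2 offset centers the integer
      * rounding so each PME node serves a contiguous, nearly equal block of
      * DD nodes.
      */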
2050
2051 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2052 {
2053     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2054 }
2055
2056 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2057 {
2058     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2059 }
2060
2061 static int *dd_pmenodes(t_commrec *cr)
2062 {
2063     int *pmenodes;
2064     int n,i,p0,p1;
2065     
2066     snew(pmenodes,cr->npmenodes);
2067     n = 0;
2068     for(i=0; i<cr->dd->nnodes; i++) {
2069         p0 = cr_ddindex2pmeindex(cr,i);
2070         p1 = cr_ddindex2pmeindex(cr,i+1);
2071         if (i+1 == cr->dd->nnodes || p1 > p0) {
2072             if (debug)
2073                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2074             pmenodes[n] = i + 1 + n;
2075             n++;
2076         }
2077     }
2078
2079     return pmenodes;
2080 }
2081
2082 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2083 {
2084     gmx_domdec_t *dd;
2085     ivec coords,coords_pme,nc;
2086     int  slab;
2087     
2088     dd = cr->dd;
2089     /*
2090       if (dd->comm->bCartesian) {
2091       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2092       dd_coords2pmecoords(dd,coords,coords_pme);
2093       copy_ivec(dd->ntot,nc);
2094       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2095       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2096       
2097       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2098       } else {
2099       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2100       }
2101     */
2102     coords[XX] = x;
2103     coords[YY] = y;
2104     coords[ZZ] = z;
2105     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2106     
2107     return slab;
2108 }
2109
2110 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2111 {
2112     gmx_domdec_comm_t *comm;
2113     ivec coords;
2114     int  ddindex,nodeid=-1;
2115     
2116     comm = cr->dd->comm;
2117     
2118     coords[XX] = x;
2119     coords[YY] = y;
2120     coords[ZZ] = z;
2121     if (comm->bCartesianPP_PME)
2122     {
2123 #ifdef GMX_MPI
2124         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2125 #endif
2126     }
2127     else
2128     {
2129         ddindex = dd_index(cr->dd->nc,coords);
2130         if (comm->bCartesianPP)
2131         {
2132             nodeid = comm->ddindex2simnodeid[ddindex];
2133         }
2134         else
2135         {
2136             if (comm->pmenodes)
2137             {
2138                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2139             }
2140             else
2141             {
2142                 nodeid = ddindex;
2143             }
2144         }
2145     }
2146   
2147     return nodeid;
2148 }
2149
2150 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2151 {
2152     gmx_domdec_t *dd;
2153     gmx_domdec_comm_t *comm;
2154     ivec coord,coord_pme;
2155     int  i;
2156     int  pmenode=-1;
2157     
2158     dd = cr->dd;
2159     comm = dd->comm;
2160     
2161     /* This assumes a uniform x domain decomposition grid cell size */
2162     if (comm->bCartesianPP_PME)
2163     {
2164 #ifdef GMX_MPI
2165         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2166         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2167         {
2168             /* This is a PP node */
2169             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2170             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2171         }
2172 #endif
2173     }
2174     else if (comm->bCartesianPP)
2175     {
2176         if (sim_nodeid < dd->nnodes)
2177         {
2178             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2179         }
2180     }
2181     else
2182     {
2183         /* This assumes DD cells with identical x coordinates
2184          * are numbered sequentially.
2185          */
2186         if (dd->comm->pmenodes == NULL)
2187         {
2188             if (sim_nodeid < dd->nnodes)
2189             {
2190                 /* The DD index equals the nodeid */
2191                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2192             }
2193         }
2194         else
2195         {
2196             i = 0;
2197             while (sim_nodeid > dd->comm->pmenodes[i])
2198             {
2199                 i++;
2200             }
2201             if (sim_nodeid < dd->comm->pmenodes[i])
2202             {
2203                 pmenode = dd->comm->pmenodes[i];
2204             }
2205         }
2206     }
2207     
2208     return pmenode;
2209 }
2210
2211 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2212 {
2213     gmx_bool bPMEOnlyNode;
2214     
2215     if (DOMAINDECOMP(cr))
2216     {
2217         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2218     }
2219     else
2220     {
2221         bPMEOnlyNode = FALSE;
2222     }
2223     
2224     return bPMEOnlyNode;
2225 }
2226
2227 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2228                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2229 {
2230     gmx_domdec_t *dd;
2231     int x,y,z;
2232     ivec coord,coord_pme;
2233     
2234     dd = cr->dd;
2235     
2236     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2237     
2238     *nmy_ddnodes = 0;
2239     for(x=0; x<dd->nc[XX]; x++)
2240     {
2241         for(y=0; y<dd->nc[YY]; y++)
2242         {
2243             for(z=0; z<dd->nc[ZZ]; z++)
2244             {
2245                 if (dd->comm->bCartesianPP_PME)
2246                 {
2247                     coord[XX] = x;
2248                     coord[YY] = y;
2249                     coord[ZZ] = z;
2250                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2251                     if (dd->ci[XX] == coord_pme[XX] &&
2252                         dd->ci[YY] == coord_pme[YY] &&
2253                         dd->ci[ZZ] == coord_pme[ZZ])
2254                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2255                 }
2256                 else
2257                 {
2258                     /* The slab corresponds to the nodeid in the PME group */
2259                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2260                     {
2261                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2262                     }
2263                 }
2264             }
2265         }
2266     }
2267     
2268     /* The last PP-only node is the peer node */
2269     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2270     
2271     if (debug)
2272     {
2273         fprintf(debug,"Receive coordinates from PP nodes:");
2274         for(x=0; x<*nmy_ddnodes; x++)
2275         {
2276             fprintf(debug," %d",(*my_ddnodes)[x]);
2277         }
2278         fprintf(debug,"\n");
2279     }
2280 }
2281
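     /* Return whether this PP node should receive the virial and energy from
      * its PME node: TRUE unless a following PP node is assigned to the same
      * PME node, in which case that later node is the last one and receives
      * instead.
      */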
2282 static gmx_bool receive_vir_ener(t_commrec *cr)
2283 {
2284     gmx_domdec_comm_t *comm;
2285     int  pmenode,coords[DIM],rank;
2286     gmx_bool bReceive;
2287     
2288     bReceive = TRUE;
2289     if (cr->npmenodes < cr->dd->nnodes)
2290     {
2291         comm = cr->dd->comm;
2292         if (comm->bCartesianPP_PME)
2293         {
2294             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2295 #ifdef GMX_MPI
2296             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2297             coords[comm->cartpmedim]++;
2298             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2299             {
2300                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2301                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2302                 {
2303                     /* This is not the last PP node for pmenode */
2304                     bReceive = FALSE;
2305                 }
2306             }
2307 #endif  
2308         }
2309         else
2310         {
2311             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2312             if (cr->sim_nodeid+1 < cr->nnodes &&
2313                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2314             {
2315                 /* This is not the last PP node for pmenode */
2316                 bReceive = FALSE;
2317             }
2318         }
2319     }
2320     
2321     return bReceive;
2322 }
2323
2324 static void set_zones_ncg_home(gmx_domdec_t *dd)
2325 {
2326     gmx_domdec_zones_t *zones;
2327     int i;
2328
2329     zones = &dd->comm->zones;
2330
2331     zones->cg_range[0] = 0;
2332     for(i=1; i<zones->n+1; i++)
2333     {
2334         zones->cg_range[i] = dd->ncg_home;
2335     }
2336 }
2337
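     /* Rebuild the local charge-group bookkeeping from the global charge-group
      * list stored in the state (state->cg_gl): dd->index_gl gets the global
      * index of each home charge group, dd->cgindex the local atom offsets,
      * and ncg_home/nat_home are updated accordingly.
      */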
2338 static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
2339 {
2340     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2341     
2342     ind = state->cg_gl;
2343     dd_cg_gl = dd->index_gl;
2344     cgindex  = dd->cgindex;
2345     nat = 0;
2346     cgindex[0] = nat;
2347     for(i=0; i<state->ncg_gl; i++)
2348     {
2349         cgindex[i] = nat;
2350         cg_gl = ind[i];
2351         dd_cg_gl[i] = cg_gl;
2352         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2353     }
2354     cgindex[i] = nat;
2355     
2356     dd->ncg_home = state->ncg_gl;
2357     dd->nat_home = nat;
2358
2359     set_zones_ncg_home(dd);
2360 }
2361
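     /* Look up the cginfo entry for global charge group cg: cginfo_mb stores
      * one entry block per molecule block and keeps only cg_mod entries per
      * block, so the modulo maps cg onto its representative entry within the
      * first repeat of that block.
      */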
2362 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2363 {
2364     while (cg >= cginfo_mb->cg_end)
2365     {
2366         cginfo_mb++;
2367     }
2368
2369     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2370 }
2371
2372 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2373                           t_forcerec *fr,char *bLocalCG)
2374 {
2375     cginfo_mb_t *cginfo_mb;
2376     int *cginfo;
2377     int cg;
2378
2379     if (fr != NULL)
2380     {
2381         cginfo_mb = fr->cginfo_mb;
2382         cginfo    = fr->cginfo;
2383
2384         for(cg=cg0; cg<cg1; cg++)
2385         {
2386             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2387         }
2388     }
2389
2390     if (bLocalCG != NULL)
2391     {
2392         for(cg=cg0; cg<cg1; cg++)
2393         {
2394             bLocalCG[index_gl[cg]] = TRUE;
2395         }
2396     }
2397 }
2398
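     /* (Re)build the atom index tables for charge groups from cg_start on:
      * dd->gatindex maps local to global atom indices and ga2la provides the
      * reverse lookup, storing each atom's zone (offset by nzone for charge
      * groups that are more than one zone away, as signalled below).
      */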
2399 static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
2400 {
2401     int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
2402     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2403     gmx_ga2la_t *ga2la;
2404     char *bLocalCG;
2405
2406     bLocalCG = dd->comm->bLocalCG;
2407
2408     if (dd->nat_tot > dd->gatindex_nalloc)
2409     {
2410         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2411         srenew(dd->gatindex,dd->gatindex_nalloc);
2412     }
2413
2414     nzone      = dd->comm->zones.n;
2415     zone2cg    = dd->comm->zones.cg_range;
2416     zone_ncg1  = dd->comm->zone_ncg1;
2417     index_gl   = dd->index_gl;
2418     gatindex   = dd->gatindex;
2419
2420     if (zone2cg[1] != dd->ncg_home)
2421     {
2422         gmx_incons("dd->ncg_zone is not up to date");
2423     }
2424     
2425     /* Make the local to global and global to local atom index */
2426     a = dd->cgindex[cg_start];
2427     for(zone=0; zone<nzone; zone++)
2428     {
2429         if (zone == 0)
2430         {
2431             cg0 = cg_start;
2432         }
2433         else
2434         {
2435             cg0 = zone2cg[zone];
2436         }
2437         for(cg=cg0; cg<zone2cg[zone+1]; cg++)
2438         {
2439             zone1 = zone;
2440             if (cg - cg0 >= zone_ncg1[zone])
2441             {
2442                 /* Signal that this cg is from more than one zone away */
2443                 zone1 += nzone;
2444             }
2445             cg_gl = index_gl[cg];
2446             for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2447             {
2448                 gatindex[a] = a_gl;
2449                 ga2la_set(dd->ga2la,a_gl,a,zone1);
2450                 a++;
2451             }
2452         }
2453     }
2454 }
2455
2456 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2457                           const char *where)
2458 {
2459     int ncg,i,ngl,nerr;
2460
2461     nerr = 0;
2462     if (bLocalCG == NULL)
2463     {
2464         return nerr;
2465     }
2466     for(i=0; i<dd->ncg_tot; i++)
2467     {
2468         if (!bLocalCG[dd->index_gl[i]])
2469         {
2470             fprintf(stderr,
2471                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2472             nerr++;
2473         }
2474     }
2475     ngl = 0;
2476     for(i=0; i<ncg_sys; i++)
2477     {
2478         if (bLocalCG[i])
2479         {
2480             ngl++;
2481         }
2482     }
2483     if (ngl != dd->ncg_tot)
2484     {
2485         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2486         nerr++;
2487     }
2488
2489     return nerr;
2490 }
2491
2492 static void check_index_consistency(gmx_domdec_t *dd,
2493                                     int natoms_sys,int ncg_sys,
2494                                     const char *where)
2495 {
2496     int  nerr,ngl,i,a,cell;
2497     int  *have;
2498
2499     nerr = 0;
2500
2501     if (dd->comm->DD_debug > 1)
2502     {
2503         snew(have,natoms_sys);
2504         for(a=0; a<dd->nat_tot; a++)
2505         {
2506             if (have[dd->gatindex[a]] > 0)
2507             {
2508                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2509             }
2510             else
2511             {
2512                 have[dd->gatindex[a]] = a + 1;
2513             }
2514         }
2515         sfree(have);
2516     }
2517
2518     snew(have,dd->nat_tot);
2519
2520     ngl  = 0;
2521     for(i=0; i<natoms_sys; i++)
2522     {
2523         if (ga2la_get(dd->ga2la,i,&a,&cell))
2524         {
2525             if (a >= dd->nat_tot)
2526             {
2527                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2528                 nerr++;
2529             }
2530             else
2531             {
2532                 have[a] = 1;
2533                 if (dd->gatindex[a] != i)
2534                 {
2535                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2536                     nerr++;
2537                 }
2538             }
2539             ngl++;
2540         }
2541     }
2542     if (ngl != dd->nat_tot)
2543     {
2544         fprintf(stderr,
2545                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2546                 dd->rank,where,ngl,dd->nat_tot);
2547     }
2548     for(a=0; a<dd->nat_tot; a++)
2549     {
2550         if (have[a] == 0)
2551         {
2552             fprintf(stderr,
2553                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2554                     dd->rank,where,a+1,dd->gatindex[a]+1);
2555         }
2556     }
2557     sfree(have);
2558
2559     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2560
2561     if (nerr > 0) {
2562         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2563                   dd->rank,where,nerr);
2564     }
2565 }
2566
2567 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2568 {
2569     int  i;
2570     char *bLocalCG;
2571
2572     if (a_start == 0)
2573     {
2574         /* Clear the whole list without searching */
2575         ga2la_clear(dd->ga2la);
2576     }
2577     else
2578     {
2579         for(i=a_start; i<dd->nat_tot; i++)
2580         {
2581             ga2la_del(dd->ga2la,dd->gatindex[i]);
2582         }
2583     }
2584
2585     bLocalCG = dd->comm->bLocalCG;
2586     if (bLocalCG)
2587     {
2588         for(i=cg_start; i<dd->ncg_tot; i++)
2589         {
2590             bLocalCG[dd->index_gl[i]] = FALSE;
2591         }
2592     }
2593
2594     dd_clear_local_vsite_indices(dd);
2595     
2596     if (dd->constraints)
2597     {
2598         dd_clear_local_constraint_indices(dd);
2599     }
2600 }
2601
2602 static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
2603 {
2604     real grid_jump_limit;
2605
2606     /* The distance between the boundaries of cells at distance
2607      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2608      * and by the fact that cells should not be shifted by more than
2609      * half their size, such that charge groups only shift by one cell
2610      * at redecomposition.
2611      */
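         /* In effect the limit is the larger of the minimum cell size and
          * cutoff/np for this dimension (unless bVacDLBNoLimit applies): with
          * np communication pulses each pulse only needs to span cutoff/np, so
          * staggered boundaries of neighboring cells may approach each other
          * down to that distance.
          */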
2612     grid_jump_limit = comm->cellsize_limit;
2613     if (!comm->bVacDLBNoLimit)
2614     {
2615         grid_jump_limit = max(grid_jump_limit,
2616                               comm->cutoff/comm->cd[dim_ind].np);
2617     }
2618
2619     return grid_jump_limit;
2620 }
2621
2622 static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2623 {
2624     gmx_domdec_comm_t *comm;
2625     int  d,dim;
2626     real limit,bfac;
2627     
2628     comm = dd->comm;
2629     
2630     for(d=1; d<dd->ndim; d++)
2631     {
2632         dim = dd->dim[d];
2633         limit = grid_jump_limit(comm,d);
2634         bfac = ddbox->box_size[dim];
2635         if (ddbox->tric_dir[dim])
2636         {
2637             bfac *= ddbox->skew_fac[dim];
2638         }
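             /* bfac converts the fractional boundary differences below into an
              * absolute distance along this dimension, including the triclinic
              * skew correction, which must stay at least grid_jump_limit.
              */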
2639         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2640             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2641         {
2642             char buf[22];
2643             gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
2644                       gmx_step_str(step,buf),
2645                       dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2646         }
2647     }
2648 }
2649
2650 static int dd_load_count(gmx_domdec_comm_t *comm)
2651 {
2652     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2653 }
2654
2655 static float dd_force_load(gmx_domdec_comm_t *comm)
2656 {
2657     float load;
2658     
2659     if (comm->eFlop)
2660     {
2661         load = comm->flop;
2662         if (comm->eFlop > 1)
2663         {
2664             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2665         }
2666     } 
2667     else
2668     {
2669         load = comm->cycl[ddCyclF];
2670         if (comm->cycl_n[ddCyclF] > 1)
2671         {
2672             /* Subtract the maximum of the last n cycle counts
2673              * to get rid of possible high counts due to other sources,
2674              * for instance system activity, that would otherwise
2675              * affect the dynamic load balancing.
2676              */
2677             load -= comm->cycl_max[ddCyclF];
2678         }
2679     }
2680     
2681     return load;
2682 }
2683
2684 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2685 {
2686     gmx_domdec_comm_t *comm;
2687     int i;
2688     
2689     comm = dd->comm;
2690     
2691     snew(*dim_f,dd->nc[dim]+1);
2692     (*dim_f)[0] = 0;
2693     for(i=1; i<dd->nc[dim]; i++)
2694     {
2695         if (comm->slb_frac[dim])
2696         {
2697             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2698         }
2699         else
2700         {
2701             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2702         }
2703     }
2704     (*dim_f)[dd->nc[dim]] = 1;
2705 }
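     /* Illustrative example for the boundaries built above: with dd->nc[dim]=4
      * and slb_frac[dim] = {0.3, 0.3, 0.2, 0.2} (fractions assumed to sum to 1),
      * dim_f becomes {0, 0.3, 0.6, 0.8, 1}; without slb_frac the boundaries are
      * simply the uniform fractions i/nc.
      */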
2706
2707 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2708 {
2709     int  pmeindex,slab,nso,i;
2710     ivec xyz;
2711     
2712     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2713     {
2714         ddpme->dim = YY;
2715     }
2716     else
2717     {
2718         ddpme->dim = dimind;
2719     }
2720     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2721     
2722     ddpme->nslab = (ddpme->dim == 0 ?
2723                     dd->comm->npmenodes_x :
2724                     dd->comm->npmenodes_y);
2725
2726     if (ddpme->nslab <= 1)
2727     {
2728         return;
2729     }
2730
2731     nso = dd->comm->npmenodes/ddpme->nslab;
2732     /* Determine for each PME slab the PP location range for dimension dim */
2733     snew(ddpme->pp_min,ddpme->nslab);
2734     snew(ddpme->pp_max,ddpme->nslab);
2735     for(slab=0; slab<ddpme->nslab; slab++) {
2736         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2737         ddpme->pp_max[slab] = 0;
2738     }
2739     for(i=0; i<dd->nnodes; i++) {
2740         ddindex2xyz(dd->nc,i,xyz);
2741         /* For y only use our y/z slab.
2742          * This assumes that the PME x grid size matches the DD grid size.
2743          */
2744         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2745             pmeindex = ddindex2pmeindex(dd,i);
2746             if (dimind == 0) {
2747                 slab = pmeindex/nso;
2748             } else {
2749                 slab = pmeindex % ddpme->nslab;
2750             }
2751             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2752             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2753         }
2754     }
2755
2756     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2757 }
2758
2759 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2760 {
2761     if (dd->comm->ddpme[0].dim == XX)
2762     {
2763         return dd->comm->ddpme[0].maxshift;
2764     }
2765     else
2766     {
2767         return 0;
2768     }
2769 }
2770
2771 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2772 {
2773     if (dd->comm->ddpme[0].dim == YY)
2774     {
2775         return dd->comm->ddpme[0].maxshift;
2776     }
2777     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2778     {
2779         return dd->comm->ddpme[1].maxshift;
2780     }
2781     else
2782     {
2783         return 0;
2784     }
2785 }
2786
2787 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2788                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2789 {
2790     gmx_domdec_comm_t *comm;
2791     int  nc,ns,s;
2792     int  *xmin,*xmax;
2793     real range,pme_boundary;
2794     int  sh;
2795     
2796     comm = dd->comm;
2797     nc  = dd->nc[ddpme->dim];
2798     ns  = ddpme->nslab;
2799     
2800     if (!ddpme->dim_match)
2801     {
2802         /* PP decomposition is not along dim: the worst situation */
2803         sh = ns/2;
2804     }
2805     else if (ns <= 3 || (bUniform && ns == nc))
2806     {
2807         /* The optimal situation */
2808         sh = 1;
2809     }
2810     else
2811     {
2812         /* For each PME node, determine which PP nodes it
2813          * could possibly need to communicate with.
2814          */
2815         xmin = ddpme->pp_min;
2816         xmax = ddpme->pp_max;
2817         /* Allow for atoms to be maximally 2/3 times the cut-off
2818          * out of their DD cell. This is a reasonable balance between
2819          * performance and support for most charge-group/cut-off
2820          * combinations.
2821          */
2822         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2823         /* Avoid extra communication when we are exactly at a boundary */
2824         range *= 0.999;
2825         
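             /* The loops below grow sh as long as a PME slab could still
              * receive atoms from a PP cell that is sh+1 slabs away: a cell
              * contributes to slab s when its boundary, broadened by 'range',
              * crosses the slab boundary at s/ns or (s+1)/ns; the +-ns terms
              * handle the periodic wrap-around.
              */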
2826         sh = 1;
2827         for(s=0; s<ns; s++)
2828         {
2829             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2830             pme_boundary = (real)s/ns;
2831             while (sh+1 < ns &&
2832                    ((s-(sh+1) >= 0 &&
2833                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2834                     (s-(sh+1) <  0 &&
2835                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2836             {
2837                 sh++;
2838             }
2839             pme_boundary = (real)(s+1)/ns;
2840             while (sh+1 < ns &&
2841                    ((s+(sh+1) <  ns &&
2842                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2843                     (s+(sh+1) >= ns &&
2844                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2845             {
2846                 sh++;
2847             }
2848         }
2849     }
2850     
2851     ddpme->maxshift = sh;
2852     
2853     if (debug)
2854     {
2855         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2856                 ddpme->dim,ddpme->maxshift);
2857     }
2858 }
2859
2860 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2861 {
2862     int d,dim;
2863     
2864     for(d=0; d<dd->ndim; d++)
2865     {
2866         dim = dd->dim[d];
2867         if (dim < ddbox->nboundeddim &&
2868             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2869             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2870         {
2871             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2872                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2873                       dd->nc[dim],dd->comm->cellsize_limit);
2874         }
2875     }
2876 }
2877
2878 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2879                                   gmx_bool bMaster,ivec npulse)
2880 {
2881     gmx_domdec_comm_t *comm;
2882     int  d,j;
2883     rvec cellsize_min;
2884     real *cell_x,cell_dx,cellsize;
2885     
2886     comm = dd->comm;
2887     
2888     for(d=0; d<DIM; d++)
2889     {
2890         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2891         npulse[d] = 1;
2892         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2893         {
2894             /* Uniform grid */
2895             cell_dx = ddbox->box_size[d]/dd->nc[d];
2896             if (bMaster)
2897             {
2898                 for(j=0; j<dd->nc[d]+1; j++)
2899                 {
2900                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2901                 }
2902             }
2903             else
2904             {
2905                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
2906                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2907             }
2908             cellsize = cell_dx*ddbox->skew_fac[d];
2909             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
2910             {
2911                 npulse[d]++;
2912             }
2913             cellsize_min[d] = cellsize;
2914         }
2915         else
2916         {
2917             /* Statically load balanced grid */
2918             /* Even when we are not doing a master distribution, we determine
2919              * all cell borders in a loop to obtain values identical to
2920              * the master distribution case and to determine npulse.
2921              */
2922             if (bMaster)
2923             {
2924                 cell_x = dd->ma->cell_x[d];
2925             }
2926             else
2927             {
2928                 snew(cell_x,dd->nc[d]+1);
2929             }
2930             cell_x[0] = ddbox->box0[d];
2931             for(j=0; j<dd->nc[d]; j++)
2932             {
2933                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2934                 cell_x[j+1] = cell_x[j] + cell_dx;
2935                 cellsize = cell_dx*ddbox->skew_fac[d];
2936                 while (cellsize*npulse[d] < comm->cutoff &&
2937                        npulse[d] < dd->nc[d]-1)
2938                 {
2939                     npulse[d]++;
2940                 }
2941                 cellsize_min[d] = min(cellsize_min[d],cellsize);
2942             }
2943             if (!bMaster)
2944             {
2945                 comm->cell_x0[d] = cell_x[dd->ci[d]];
2946                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2947                 sfree(cell_x);
2948             }
2949         }
2950         /* The following limitation prevents a cell from receiving
2951          * some of its own home charge groups back over the periodic boundary,
2952          * as duplicated charge groups cause trouble with the global indices.
2953          */
2954         if (d < ddbox->npbcdim &&
2955             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2956         {
2957             gmx_fatal_collective(FARGS,NULL,dd,
2958                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2959                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
2960                                  comm->cutoff,
2961                                  dd->nc[d],dd->nc[d],
2962                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
2963         }
2964     }
2965     
2966     if (!comm->bDynLoadBal)
2967     {
2968         copy_rvec(cellsize_min,comm->cellsize_min);
2969     }
2970    
2971     for(d=0; d<comm->npmedecompdim; d++)
2972     {
2973         set_pme_maxshift(dd,&comm->ddpme[d],
2974                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
2975                          comm->ddpme[d].slb_dim_f);
2976     }
2977 }
2978
2979
2980 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2981                                        int d,int dim,gmx_domdec_root_t *root,
2982                                        gmx_ddbox_t *ddbox,
2983                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
2984 {
2985     gmx_domdec_comm_t *comm;
2986     int  ncd,i,j,nmin,nmin_old;
2987     gmx_bool bLimLo,bLimHi;
2988     real *cell_size;
2989     real fac,halfway,cellsize_limit_f_i,region_size;
2990     gmx_bool bPBC,bLastHi=FALSE;
2991     int nrange[]={range[0],range[1]};
2992
2993     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
2994
2995     comm = dd->comm;
2996
2997     ncd = dd->nc[dim];
2998
2999     bPBC = (dim < ddbox->npbcdim);
3000
3001     cell_size = root->buf_ncd;
3002
3003     if (debug) 
3004     {
3005         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
3006     }
3007
3008     /* First we need to check if the scaling does not make cells
3009      * smaller than the smallest allowed size.
3010      * We need to do this iteratively, since if a cell is too small,
3011      * it needs to be enlarged, which makes all the other cells smaller,
3012      * which could in turn make another cell smaller than allowed.
3013      */
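         /* Illustrative example: for a region of size 1.0 with relative cell
          * sizes {0.40, 0.35, 0.15, 0.10} and cellsize_limit_f = 0.2, the first
          * pass pins the last two cells at 0.2; the remaining weight 0.75 is
          * then rescaled to fill 1.0 - 2*0.2 = 0.6, giving
          * {0.32, 0.28, 0.20, 0.20}, after which no cell drops below the limit
          * and the iteration stops.
          */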
3014     for(i=range[0]; i<range[1]; i++)
3015     {
3016         root->bCellMin[i] = FALSE;
3017     }
3018     nmin = 0;
3019     do
3020     {
3021         nmin_old = nmin;
3022         /* We need the total for normalization */
3023         fac = 0;
3024         for(i=range[0]; i<range[1]; i++)
3025         {
3026             if (root->bCellMin[i] == FALSE)
3027             {
3028                 fac += cell_size[i];
3029             }
3030         }
3031         fac = (region_size - nmin*cellsize_limit_f)/fac; /* subtract the cells already set to cellsize_limit_f */
3032         /* Determine the cell boundaries */
3033         for(i=range[0]; i<range[1]; i++)
3034         {
3035             if (root->bCellMin[i] == FALSE)
3036             {
3037                 cell_size[i] *= fac;
3038                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3039                 {
3040                     cellsize_limit_f_i = 0;
3041                 }
3042                 else
3043                 {
3044                     cellsize_limit_f_i = cellsize_limit_f;
3045                 }
3046                 if (cell_size[i] < cellsize_limit_f_i)
3047                 {
3048                     root->bCellMin[i] = TRUE;
3049                     cell_size[i] = cellsize_limit_f_i;
3050                     nmin++;
3051                 }
3052             }
3053             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3054         }
3055     }
3056     while (nmin > nmin_old);
3057     
3058     i=range[1]-1;
3059     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3060     /* For this check we should not use DD_CELL_MARGIN,
3061      * but a slightly smaller factor,
3062      * since rounding could get us below the limit.
3063      */
3064     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3065     {
3066         char buf[22];
3067         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3068                   gmx_step_str(step,buf),
3069                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3070                   ncd,comm->cellsize_min[dim]);
3071     }
3072     
3073     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3074     
3075     if (!bUniform)
3076     {
3077         /* Check if the boundary did not displace more than halfway
3078          * each of the cells it bounds, as this could cause problems,
3079          * especially when the differences between cell sizes are large.
3080          * If changes are applied, they will not make cells smaller
3081          * than the cut-off, as we check all the boundaries which
3082          * might be affected by a change and if the old state was ok,
3083          * the cells will at most be shrunk back to their old size.
3084          */
3085         for(i=range[0]+1; i<range[1]; i++)
3086         {
3087             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3088             if (root->cell_f[i] < halfway)
3089             {
3090                 root->cell_f[i] = halfway;
3091                 /* Check if the change also causes shifts of the next boundaries */
3092                 for(j=i+1; j<range[1]; j++)
3093                 {
3094                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3095                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3096                 }
3097             }
3098             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3099             if (root->cell_f[i] > halfway)
3100             {
3101                 root->cell_f[i] = halfway;
3102                 /* Check if the change also causes shifts of the next boundaries */
3103                 for(j=i-1; j>=range[0]+1; j--)
3104                 {
3105                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3106                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3107                 }
3108             }
3109         }
3110     }
3111     
3112     /* nrange is the [lower, upper) range for the next call to enforce_limits */
3113     /* Find the highest LimLo violation (a) and the lowest LimHi violation following it (b),
3114      * then call enforce_limits for (oldb,a) and (a,b); the next step handles (b,nexta).
3115      * oldb and nexta can be the range boundaries; nrange is used to pass a and b. */
3116     if (d > 0)
3117     {
3118         /* Take care of the staggering of the cell boundaries */
3119         if (bUniform)
3120         {
3121             for(i=range[0]; i<range[1]; i++)
3122             {
3123                 root->cell_f_max0[i] = root->cell_f[i];
3124                 root->cell_f_min1[i] = root->cell_f[i+1];
3125             }
3126         }
3127         else
3128         {
3129             for(i=range[0]+1; i<range[1]; i++)
3130             {
3131                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3132                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3133                 if (bLimLo && bLimHi)
3134                 {
3135                     /* Both limits violated, try the best we can */
3136                     /* In this case we split the original range in two parts and handle the remaining limitations in the next iteration. */
3137                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3138                     nrange[0]=range[0];
3139                     nrange[1]=i;
3140                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3141
3142                     nrange[0]=i;
3143                     nrange[1]=range[1];
3144                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3145
3146                     return;
3147                 }
3148                 else if (bLimLo)
3149                 {
3150                     /* root->cell_f[i] = root->bound_min[i]; */
3151                     nrange[1]=i;  /* only store the violation location; there could be another LimLo violation following at a higher index */
3152                     bLastHi=FALSE;
3153                 }
3154                 else if (bLimHi && !bLastHi)
3155                 {
3156                     bLastHi=TRUE;
3157                     if (nrange[1] < range[1])   /* found a LimLo before */
3158                     {
3159                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3160                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3161                         nrange[0]=nrange[1];
3162                     }
3163                     root->cell_f[i] = root->bound_max[i];
3164                     nrange[1]=i; 
3165                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3166                     nrange[0]=i;
3167                     nrange[1]=range[1];
3168                 }
3169             }
3170             if (nrange[1] < range[1])   /* a LimLo was found last */
3171             {
3172                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3173                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3174                 nrange[0]=nrange[1];
3175                 nrange[1]=range[1];
3176                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3177             } 
3178             else if (nrange[0] > range[0]) /* found at least one LimHi */
3179             {
3180                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3181             }
3182         }
3183     }
3184 }
3185
3186
3187 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3188                                        int d,int dim,gmx_domdec_root_t *root,
3189                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3190                                        gmx_bool bUniform,gmx_large_int_t step)
3191 {
3192     gmx_domdec_comm_t *comm;
3193     int  ncd,d1,i,j,pos;
3194     real *cell_size;
3195     real load_aver,load_i,imbalance,change,change_max,sc;
3196     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3197     real change_limit;
3198     real relax = 0.5;
3199     gmx_bool bPBC;
3200     int range[] = { 0, 0 };
3201
3202     comm = dd->comm;
3203
3204     /* Convert the maximum change from the input percentage to a fraction */
3205     change_limit = comm->dlb_scale_lim*0.01;
3206
3207     ncd = dd->nc[dim];
3208
3209     bPBC = (dim < ddbox->npbcdim);
3210
3211     cell_size = root->buf_ncd;
3212
3213     /* Store the original boundaries */
3214     for(i=0; i<ncd+1; i++)
3215     {
3216         root->old_cell_f[i] = root->cell_f[i];
3217     }
3218     if (bUniform) {
3219         for(i=0; i<ncd; i++)
3220         {
3221             cell_size[i] = 1.0/ncd;
3222         }
3223     }
3224     else if (dd_load_count(comm))
3225     {
3226         load_aver = comm->load[d].sum_m/ncd;
3227         change_max = 0;
3228         for(i=0; i<ncd; i++)
3229         {
3230             /* Determine the relative imbalance of cell i */
3231             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3232             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3233             /* Determine the change of the cell size using underrelaxation */
3234             change = -relax*imbalance;
3235             change_max = max(change_max,max(change,-change));
3236         }
3237         /* Limit the amount of scaling.
3238          * We need to use the same rescaling for all cells in one row,
3239          * otherwise the load balancing might not converge.
3240          */
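             /* Illustrative example: with relax = 0.5 a cell carrying 20% more
              * load than the row average (imbalance = 0.2) gets change = -0.1,
              * i.e. it is shrunk by 10%; if the largest |change| in the row
              * exceeds change_limit, sc scales all changes down by the same
              * factor.
              */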
3241         sc = relax;
3242         if (change_max > change_limit)
3243         {
3244             sc *= change_limit/change_max;
3245         }
3246         for(i=0; i<ncd; i++)
3247         {
3248             /* Determine the relative imbalance of cell i */
3249             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3250             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3251             /* Determine the change of the cell size using underrelaxation */
3252             change = -sc*imbalance;
3253             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3254         }
3255     }
3256     
3257     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3258     cellsize_limit_f *= DD_CELL_MARGIN;
3259     dist_min_f_hard        = grid_jump_limit(comm,d)/ddbox->box_size[dim];
3260     dist_min_f       = dist_min_f_hard * DD_CELL_MARGIN;
3261     if (ddbox->tric_dir[dim])
3262     {
3263         cellsize_limit_f /= ddbox->skew_fac[dim];
3264         dist_min_f       /= ddbox->skew_fac[dim];
3265     }
3266     if (bDynamicBox && d > 0)
3267     {
3268         dist_min_f *= DD_PRES_SCALE_MARGIN;
3269     }
3270     if (d > 0 && !bUniform)
3271     {
3272         /* Make sure that the grid is not shifted too much */
3273         for(i=1; i<ncd; i++) {
3274             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
3275             {
3276                 gmx_incons("Inconsistent DD boundary staggering limits!");
3277             }
3278             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3279             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3280             if (space > 0) {
3281                 root->bound_min[i] += 0.5*space;
3282             }
3283             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3284             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3285             if (space < 0) {
3286                 root->bound_max[i] += 0.5*space;
3287             }
3288             if (debug)
3289             {
3290                 fprintf(debug,
3291                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3292                         d,i,
3293                         root->cell_f_max0[i-1] + dist_min_f,
3294                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3295                         root->cell_f_min1[i] - dist_min_f);
3296             }
3297         }
3298     }
3299     range[1]=ncd;
3300     root->cell_f[0] = 0;
3301     root->cell_f[ncd] = 1;
3302     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3303
3304
3305     /* After the checks above, the cells should obey the cut-off
3306      * restrictions, but it does not hurt to check.
3307      */
3308     for(i=0; i<ncd; i++)
3309     {
3310         if (debug)
3311         {
3312             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3313                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3314         }
3315
3316         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3317             root->cell_f[i+1] - root->cell_f[i] <
3318             cellsize_limit_f/DD_CELL_MARGIN)
3319         {
3320             char buf[22];
3321             fprintf(stderr,
3322                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3323                     gmx_step_str(step,buf),dim2char(dim),i,
3324                     (root->cell_f[i+1] - root->cell_f[i])
3325                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3326         }
3327     }
3328     
3329     pos = ncd + 1;
3330     /* Store the cell boundaries of the lower dimensions at the end */
3331     for(d1=0; d1<d; d1++)
3332     {
3333         root->cell_f[pos++] = comm->cell_f0[d1];
3334         root->cell_f[pos++] = comm->cell_f1[d1];
3335     }
3336     
3337     if (d < comm->npmedecompdim)
3338     {
3339         /* The master determines the maximum shift for
3340          * the coordinate communication between separate PME nodes.
3341          */
3342         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3343     }
3344     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3345     if (d >= 1)
3346     {
3347         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3348     }
3349 }    
3350
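/* Convert the relative (box-fraction) cell boundaries comm->cell_f0/cell_f1
 * of decomposition index dimind to absolute coordinates comm->cell_x0/cell_x1
 * using the current box size; dimensions that are not bounded are shifted by
 * the box origin.
 */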
3351 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3352                                              gmx_ddbox_t *ddbox,int dimind)
3353 {
3354     gmx_domdec_comm_t *comm;
3355     int dim;
3356
3357     comm = dd->comm;
3358
3359     /* Set the cell dimensions */
3360     dim = dd->dim[dimind];
3361     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3362     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3363     if (dim >= ddbox->nboundeddim)
3364     {
3365         comm->cell_x0[dim] += ddbox->box0[dim];
3366         comm->cell_x1[dim] += ddbox->box0[dim];
3367     }
3368 }
3369
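/* Broadcast the row of relative cell boundaries for dimension d from rank 0
 * of the row communicator, extract the fractions that apply to this rank
 * (including the lower decomposition dimensions), convert them to absolute
 * cell boundaries and read back the communicated PME maxshift values.
 */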
3370 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3371                                          int d,int dim,real *cell_f_row,
3372                                          gmx_ddbox_t *ddbox)
3373 {
3374     gmx_domdec_comm_t *comm;
3375     int d1,dim1,pos;
3376
3377     comm = dd->comm;
3378
3379 #ifdef GMX_MPI
3380     /* Each node would only need to know two fractions,
3381      * but it is probably cheaper to broadcast the whole array.
3382      */
3383     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3384               0,comm->mpi_comm_load[d]);
3385 #endif
3386     /* Copy the fractions for this dimension from the buffer */
3387     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3388     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3389     /* The whole array was communicated, so set the buffer position */
3390     pos = dd->nc[dim] + 1;
3391     for(d1=0; d1<=d; d1++)
3392     {
3393         if (d1 < d)
3394         {
3395             /* Copy the cell fractions of the lower dimensions */
3396             comm->cell_f0[d1] = cell_f_row[pos++];
3397             comm->cell_f1[d1] = cell_f_row[pos++];
3398         }
3399         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3400     }
3401     /* Convert the communicated maxshift from real to int */
3402     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3403     if (d >= 1)
3404     {
3405         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3406     }
3407 }
3408
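/* Dynamic load balancing: for each decomposition dimension, the root of each
 * cell row recomputes the cell boundaries and the result is distributed to
 * the other ranks in that row.
 */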
3409 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3410                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3411                                          gmx_bool bUniform,gmx_large_int_t step)
3412 {
3413     gmx_domdec_comm_t *comm;
3414     int d,dim,d1;
3415     gmx_bool bRowMember,bRowRoot;
3416     real *cell_f_row;
3417     
3418     comm = dd->comm;
3419
3420     for(d=0; d<dd->ndim; d++)
3421     {
3422         dim = dd->dim[d];
3423         bRowMember = TRUE;
3424         bRowRoot = TRUE;
3425         for(d1=d; d1<dd->ndim; d1++)
3426         {
3427             if (dd->ci[dd->dim[d1]] > 0)
3428             {
3429                 if (d1 > d)
3430                 {
3431                     bRowMember = FALSE;
3432                 }
3433                 bRowRoot = FALSE;
3434             }
3435         }
3436         if (bRowMember)
3437         {
3438             if (bRowRoot)
3439             {
3440                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3441                                            ddbox,bDynamicBox,bUniform,step);
3442                 cell_f_row = comm->root[d]->cell_f;
3443             }
3444             else
3445             {
3446                 cell_f_row = comm->cell_f_row;
3447             }
3448             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3449         }
3450     }
3451 }    
3452
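/* Update only the absolute cell boundaries from the stored relative ones,
 * for use when the cell fractions did not change but the box did.
 */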
3453 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3454 {
3455     int d;
3456
3457     /* The relative cell boundaries have not changed here, but the box
3458      * may have, so we only convert the stored relative boundaries to
3459      * absolute coordinates for the current box.
3460      */
3461     for(d=0; d<dd->ndim; d++)
3462     {
3463         relative_to_absolute_cell_bounds(dd,ddbox,d); 
3464     }
3465 }
3466
3467
3468
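/* Set the DD cell sizes with dynamic load balancing: either recompute and
 * communicate the cell boundaries (bDoDLB), or with a dynamic box only
 * rescale the stored relative boundaries to the new box; dimensions without
 * decomposition are set to the full box.
 */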
3469 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3470                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3471                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3472                                   gmx_wallcycle_t wcycle)
3473 {
3474     gmx_domdec_comm_t *comm;
3475     int dim;
3476
3477     comm = dd->comm;
3478     
3479     if (bDoDLB)
3480     {
3481         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3482         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3483         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3484     }
3485     else if (bDynamicBox)
3486     {
3487         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3488     }
3489     
3490     /* Set the dimensions for which no DD is used */
3491     for(dim=0; dim<DIM; dim++) {
3492         if (dd->nc[dim] == 1) {
3493             comm->cell_x0[dim] = 0;
3494             comm->cell_x1[dim] = ddbox->box_size[dim];
3495             if (dim >= ddbox->nboundeddim)
3496             {
3497                 comm->cell_x0[dim] += ddbox->box0[dim];
3498                 comm->cell_x1[dim] += ddbox->box0[dim];
3499             }
3500         }
3501     }
3502 }
3503
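/* Make sure the communication index arrays can hold npulse[dim] pulses for
 * each decomposition dimension, reallocating them when needed.
 */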
3504 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3505 {
3506     int d,np,i;
3507     gmx_domdec_comm_dim_t *cd;
3508     
3509     for(d=0; d<dd->ndim; d++)
3510     {
3511         cd = &dd->comm->cd[d];
3512         np = npulse[dd->dim[d]];
3513         if (np > cd->np_nalloc)
3514         {
3515             if (debug)
3516             {
3517                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3518                         dim2char(dd->dim[d]),np);
3519             }
3520             if (DDMASTER(dd) && cd->np_nalloc > 0)
3521             {
3522                 fprintf(stderr,"\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3523             }
3524             srenew(cd->ind,np);
3525             for(i=cd->np_nalloc; i<np; i++)
3526             {
3527                 cd->ind[i].index  = NULL;
3528                 cd->ind[i].nalloc = 0;
3529             }
3530             cd->np_nalloc = np;
3531         }
3532         cd->np = np;
3533     }
3534 }
3535
3536
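/* Set the absolute cell boundaries for this step, storing the old boundaries
 * for the charge group displacement check, using either dynamic load
 * balancing or the static cell sizes (set_dd_cell_sizes_slb).
 */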
3537 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3538                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3539                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3540                               gmx_wallcycle_t wcycle)
3541 {
3542     gmx_domdec_comm_t *comm;
3543     int  d;
3544     ivec npulse;
3545     
3546     comm = dd->comm;
3547
3548     /* Copy the old cell boundaries for the cg displacement check */
3549     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3550     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3551     
3552     if (comm->bDynLoadBal)
3553     {
3554         if (DDMASTER(dd))
3555         {
3556             check_box_size(dd,ddbox);
3557         }
3558         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3559     }
3560     else
3561     {
3562         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3563         realloc_comm_ind(dd,npulse);
3564     }
3565     
3566     if (debug)
3567     {
3568         for(d=0; d<DIM; d++)
3569         {
3570             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3571                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3572         }
3573     }
3574 }
3575
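/* Check that with dynamic load balancing no cell has become smaller than the
 * minimum allowed size and, when staggered grids or unbounded dimensions
 * require it, communicate the cell boundaries used for neighbor searching
 * (cell_ns_x0/cell_ns_x1) and check for too large grid jumps.
 */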
3576 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3577                                   gmx_ddbox_t *ddbox,
3578                                   rvec cell_ns_x0,rvec cell_ns_x1,
3579                                   gmx_large_int_t step)
3580 {
3581     gmx_domdec_comm_t *comm;
3582     int dim_ind,dim;
3583     
3584     comm = dd->comm;
3585
3586     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3587     {
3588         dim = dd->dim[dim_ind];
3589         
3590         /* Without PBC we don't have restrictions on the outer cells */
3591         if (!(dim >= ddbox->npbcdim && 
3592               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3593             comm->bDynLoadBal &&
3594             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3595             comm->cellsize_min[dim])
3596         {
3597             char buf[22];
3598             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3599                       gmx_step_str(step,buf),dim2char(dim),
3600                       comm->cell_x1[dim] - comm->cell_x0[dim],
3601                       ddbox->skew_fac[dim],
3602                       dd->comm->cellsize_min[dim],
3603                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3604         }
3605     }
3606     
3607     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3608     {
3609         /* Communicate the boundaries and update cell_ns_x0/1 */
3610         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3611         if (dd->bGridJump && dd->ndim > 1)
3612         {
3613             check_grid_jump(step,dd,ddbox);
3614         }
3615     }
3616 }
3617
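/* Fill the triclinic correction matrix tcm: for a Cartesian position r,
 * r[d] + sum_{j>d} r[j]*tcm[j][d] equals s_d*box[d][d], with s_d the
 * coefficient of box vector d in r = sum_d s_d*box[d].  Entries for
 * non-periodic dimensions are set to zero.
 */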
3618 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3619 {
3620     if (YY < npbcdim)
3621     {
3622         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3623     }
3624     else
3625     {
3626         tcm[YY][XX] = 0;
3627     }
3628     if (ZZ < npbcdim)
3629     {
3630         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3631         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3632     }
3633     else
3634     {
3635         tcm[ZZ][XX] = 0;
3636         tcm[ZZ][YY] = 0;
3637     }
3638 }
3639
3640 static void check_screw_box(matrix box)
3641 {
3642     /* Mathematical limitation */
3643     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3644     {
3645         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3646     }
3647     
3648     /* Limitation due to the asymmetry of the eighth shell method */
3649     if (box[ZZ][YY] != 0)
3650     {
3651         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3652     }
3653 }
3654
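/* Master only: compute the center of geometry of every charge group, put it
 * in the box and assign the group to a DD cell using the master cell
 * boundaries; the resulting per-node charge group lists and atom counts are
 * stored in dd->ma and the distribution is reported to the log file.
 */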
3655 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3656                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3657                           gmx_domdec_t *dd)
3658 {
3659     gmx_domdec_master_t *ma;
3660     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3661     int  i,icg,j,k,k0,k1,d,npbcdim;
3662     matrix tcm;
3663     rvec box_size,cg_cm;
3664     ivec ind;
3665     real nrcg,inv_ncg,pos_d;
3666     atom_id *cgindex;
3667     gmx_bool bUnbounded,bScrew;
3668
3669     ma = dd->ma;
3670     
3671     if (tmp_ind == NULL)
3672     {
3673         snew(tmp_nalloc,dd->nnodes);
3674         snew(tmp_ind,dd->nnodes);
3675         for(i=0; i<dd->nnodes; i++)
3676         {
3677             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3678             snew(tmp_ind[i],tmp_nalloc[i]);
3679         }
3680     }
3681     
3682     /* Clear the count */
3683     for(i=0; i<dd->nnodes; i++)
3684     {
3685         ma->ncg[i] = 0;
3686         ma->nat[i] = 0;
3687     }
3688     
3689     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3690     
3691     cgindex = cgs->index;
3692     
3693     /* Compute the center of geometry for all charge groups */
3694     for(icg=0; icg<cgs->nr; icg++)
3695     {
3696         k0      = cgindex[icg];
3697         k1      = cgindex[icg+1];
3698         nrcg    = k1 - k0;
3699         if (nrcg == 1)
3700         {
3701             copy_rvec(pos[k0],cg_cm);
3702         }
3703         else
3704         {
3705             inv_ncg = 1.0/nrcg;
3706             
3707             clear_rvec(cg_cm);
3708             for(k=k0; (k<k1); k++)
3709             {
3710                 rvec_inc(cg_cm,pos[k]);
3711             }
3712             for(d=0; (d<DIM); d++)
3713             {
3714                 cg_cm[d] *= inv_ncg;
3715             }
3716         }
3717         /* Put the charge group in the box and determine the cell index */
3718         for(d=DIM-1; d>=0; d--) {
3719             pos_d = cg_cm[d];
3720             if (d < dd->npbcdim)
3721             {
3722                 bScrew = (dd->bScrewPBC && d == XX);
3723                 if (tric_dir[d] && dd->nc[d] > 1)
3724                 {
3725                     /* Use triclinic coordinates for this dimension */
3726                     for(j=d+1; j<DIM; j++)
3727                     {
3728                         pos_d += cg_cm[j]*tcm[j][d];
3729                     }
3730                 }
3731                 while(pos_d >= box[d][d])
3732                 {
3733                     pos_d -= box[d][d];
3734                     rvec_dec(cg_cm,box[d]);
3735                     if (bScrew)
3736                     {
3737                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3738                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3739                     }
3740                     for(k=k0; (k<k1); k++)
3741                     {
3742                         rvec_dec(pos[k],box[d]);
3743                         if (bScrew)
3744                         {
3745                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3746                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3747                         }
3748                     }
3749                 }
3750                 while(pos_d < 0)
3751                 {
3752                     pos_d += box[d][d];
3753                     rvec_inc(cg_cm,box[d]);
3754                     if (bScrew)
3755                     {
3756                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3757                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3758                     }
3759                     for(k=k0; (k<k1); k++)
3760                     {
3761                         rvec_inc(pos[k],box[d]);
3762                         if (bScrew) {
3763                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3764                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3765                         }
3766                     }
3767                 }
3768             }
3769             /* This could be done more efficiently */
3770             ind[d] = 0;
3771             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3772             {
3773                 ind[d]++;
3774             }
3775         }
3776         i = dd_index(dd->nc,ind);
3777         if (ma->ncg[i] == tmp_nalloc[i])
3778         {
3779             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3780             srenew(tmp_ind[i],tmp_nalloc[i]);
3781         }
3782         tmp_ind[i][ma->ncg[i]] = icg;
3783         ma->ncg[i]++;
3784         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3785     }
3786     
3787     k1 = 0;
3788     for(i=0; i<dd->nnodes; i++)
3789     {
3790         ma->index[i] = k1;
3791         for(k=0; k<ma->ncg[i]; k++)
3792         {
3793             ma->cg[k1++] = tmp_ind[i][k];
3794         }
3795     }
3796     ma->index[dd->nnodes] = k1;
3797     
3798     for(i=0; i<dd->nnodes; i++)
3799     {
3800         sfree(tmp_ind[i]);
3801     }
3802     sfree(tmp_ind);
3803     sfree(tmp_nalloc);
3804     
3805     if (fplog)
3806     {
3807         char buf[22];
3808         fprintf(fplog,"Charge group distribution at step %s:",
3809                 gmx_step_str(step,buf));
3810         for(i=0; i<dd->nnodes; i++)
3811         {
3812             fprintf(fplog," %d",ma->ncg[i]);
3813         }
3814         fprintf(fplog,"\n");
3815     }
3816 }
3817
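/* Compute the initial charge group distribution on the master node and
 * scatter the per-node counts and global charge group indices to all DD
 * nodes, setting up the home charge group index arrays.
 */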
3818 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3819                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3820                                 rvec pos[])
3821 {
3822     gmx_domdec_master_t *ma=NULL;
3823     ivec npulse;
3824     int  i,cg_gl;
3825     int  *ibuf,buf2[2] = { 0, 0 };
3826     gmx_bool bMaster = DDMASTER(dd);
3827     if (bMaster)
3828     {
3829         ma = dd->ma;
3830         
3831         if (dd->bScrewPBC)
3832         {
3833             check_screw_box(box);
3834         }
3835     
3836         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3837     
3838         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3839         for(i=0; i<dd->nnodes; i++)
3840         {
3841             ma->ibuf[2*i]   = ma->ncg[i];
3842             ma->ibuf[2*i+1] = ma->nat[i];
3843         }
3844         ibuf = ma->ibuf;
3845     }
3846     else
3847     {
3848         ibuf = NULL;
3849     }
3850     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3851     
3852     dd->ncg_home = buf2[0];
3853     dd->nat_home = buf2[1];
3854     dd->ncg_tot  = dd->ncg_home;
3855     dd->nat_tot  = dd->nat_home;
3856     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3857     {
3858         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3859         srenew(dd->index_gl,dd->cg_nalloc);
3860         srenew(dd->cgindex,dd->cg_nalloc+1);
3861     }
3862     if (bMaster)
3863     {
3864         for(i=0; i<dd->nnodes; i++)
3865         {
3866             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3867             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3868         }
3869     }
3870     
3871     dd_scatterv(dd,
3872                 DDMASTER(dd) ? ma->ibuf : NULL,
3873                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3874                 DDMASTER(dd) ? ma->cg : NULL,
3875                 dd->ncg_home*sizeof(int),dd->index_gl);
3876     
3877     /* Determine the home charge group sizes */
3878     dd->cgindex[0] = 0;
3879     for(i=0; i<dd->ncg_home; i++)
3880     {
3881         cg_gl = dd->index_gl[i];
3882         dd->cgindex[i+1] =
3883             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3884     }
3885     
3886     if (debug)
3887     {
3888         fprintf(debug,"Home charge groups:\n");
3889         for(i=0; i<dd->ncg_home; i++)
3890         {
3891             fprintf(debug," %d",dd->index_gl[i]);
3892             if (i % 10 == 9) 
3893                 fprintf(debug,"\n");
3894         }
3895         fprintf(debug,"\n");
3896     }
3897 }
3898
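/* For per-atom state vector number vec (out of nvec vectors): compact the
 * entries of the home charge groups in place (when bCompact) and copy the
 * entries of moving charge groups into their slots in the send buffers;
 * returns the resulting home atom count.
 */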
3899 static int compact_and_copy_vec_at(int ncg,int *move,
3900                                    int *cgindex,
3901                                    int nvec,int vec,
3902                                    rvec *src,gmx_domdec_comm_t *comm,
3903                                    gmx_bool bCompact)
3904 {
3905     int m,icg,i,i0,i1,nrcg;
3906     int home_pos;
3907     int pos_vec[DIM*2];
3908     
3909     home_pos = 0;
3910
3911     for(m=0; m<DIM*2; m++)
3912     {
3913         pos_vec[m] = 0;
3914     }
3915     
3916     i0 = 0;
3917     for(icg=0; icg<ncg; icg++)
3918     {
3919         i1 = cgindex[icg+1];
3920         m = move[icg];
3921         if (m == -1)
3922         {
3923             if (bCompact)
3924             {
3925                 /* Compact the home array in place */
3926                 for(i=i0; i<i1; i++)
3927                 {
3928                     copy_rvec(src[i],src[home_pos++]);
3929                 }
3930             }
3931         }
3932         else
3933         {
3934             /* Copy to the communication buffer */
3935             nrcg = i1 - i0;
3936             pos_vec[m] += 1 + vec*nrcg;
3937             for(i=i0; i<i1; i++)
3938             {
3939                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
3940             }
3941             pos_vec[m] += (nvec - vec - 1)*nrcg;
3942         }
3943         if (!bCompact)
3944         {
3945             home_pos += i1 - i0;
3946         }
3947         i0 = i1;
3948     }
3949     
3950     return home_pos;
3951 }
3952
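/* As compact_and_copy_vec_at, but for the per-charge-group vector cg_cm:
 * compact the home entries in place (when bCompact) and copy the entries of
 * moving charge groups to the start of their slots in the send buffers;
 * returns the resulting home charge group count.
 */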
3953 static int compact_and_copy_vec_cg(int ncg,int *move,
3954                                    int *cgindex,
3955                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
3956                                    gmx_bool bCompact)
3957 {
3958     int m,icg,i0,i1,nrcg;
3959     int home_pos;
3960     int pos_vec[DIM*2];
3961     
3962     home_pos = 0;
3963     
3964     for(m=0; m<DIM*2; m++)
3965     {
3966         pos_vec[m] = 0;
3967     }
3968     
3969     i0 = 0;
3970     for(icg=0; icg<ncg; icg++)
3971     {
3972         i1 = cgindex[icg+1];
3973         m = move[icg];
3974         if (m == -1)
3975         {
3976             if (bCompact)
3977             {
3978                 /* Compact the home array in place */
3979                 copy_rvec(src[icg],src[home_pos++]);
3980             }
3981         }
3982         else
3983         {
3984             nrcg = i1 - i0;
3985             /* Copy to the communication buffer */
3986             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
3987             pos_vec[m] += 1 + nrcg*nvec;
3988         }
3989         i0 = i1;
3990     }
3991     if (!bCompact)
3992     {
3993         home_pos = ncg;
3994     }
3995     
3996     return home_pos;
3997 }
3998
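/* Compact the home charge group and atom index arrays in place, removing the
 * groups that moved away, update the global-to-local atom index and clear
 * bLocalCG for the removed groups; returns the new number of home groups.
 */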
3999 static int compact_ind(int ncg,int *move,
4000                        int *index_gl,int *cgindex,
4001                        int *gatindex,
4002                        gmx_ga2la_t ga2la,char *bLocalCG,
4003                        int *cginfo)
4004 {
4005     int cg,nat,a0,a1,a,a_gl;
4006     int home_pos;
4007
4008     home_pos = 0;
4009     nat = 0;
4010     for(cg=0; cg<ncg; cg++)
4011     {
4012         a0 = cgindex[cg];
4013         a1 = cgindex[cg+1];
4014         if (move[cg] == -1)
4015         {
4016             /* Compact the home arrays in place.
4017              * Anything that can be done here avoids access to global arrays.
4018              */
4019             cgindex[home_pos] = nat;
4020             for(a=a0; a<a1; a++)
4021             {
4022                 a_gl = gatindex[a];
4023                 gatindex[nat] = a_gl;
4024                 /* The cell number stays 0, so we don't need to set it */
4025                 ga2la_change_la(ga2la,a_gl,nat);
4026                 nat++;
4027             }
4028             index_gl[home_pos] = index_gl[cg];
4029             cginfo[home_pos]   = cginfo[cg];
4030             /* The charge group remains local, so bLocalCG does not change */
4031             home_pos++;
4032         }
4033         else
4034         {
4035             /* Clear the global indices */
4036             for(a=a0; a<a1; a++)
4037             {
4038                 ga2la_del(ga2la,gatindex[a]);
4039             }
4040             if (bLocalCG)
4041             {
4042                 bLocalCG[index_gl[cg]] = FALSE;
4043             }
4044         }
4045     }
4046     cgindex[home_pos] = nat;
4047     
4048     return home_pos;
4049 }
4050
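/* For charge groups that moved away: clear their global-to-local atom
 * indices and bLocalCG entries and mark them in the ns grid cell index,
 * without compacting the home arrays.
 */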
4051 static void clear_and_mark_ind(int ncg,int *move,
4052                                int *index_gl,int *cgindex,int *gatindex,
4053                                gmx_ga2la_t ga2la,char *bLocalCG,
4054                                int *cell_index)
4055 {
4056     int cg,a0,a1,a;
4057     
4058     for(cg=0; cg<ncg; cg++)
4059     {
4060         if (move[cg] >= 0)
4061         {
4062             a0 = cgindex[cg];
4063             a1 = cgindex[cg+1];
4064             /* Clear the global indices */
4065             for(a=a0; a<a1; a++)
4066             {
4067                 ga2la_del(ga2la,gatindex[a]);
4068             }
4069             if (bLocalCG)
4070             {
4071                 bLocalCG[index_gl[cg]] = FALSE;
4072             }
4073             /* Signal that this cg has moved using the ns cell index.
4074              * Here we set it to -1.
4075              * fill_grid will change it from -1 to 4*grid->ncells.
4076              */
4077             cell_index[cg] = -1;
4078         }
4079     }
4080 }
4081
4082 static void print_cg_move(FILE *fplog,
4083                           gmx_domdec_t *dd,
4084                           gmx_large_int_t step,int cg,int dim,int dir,
4085                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4086                           rvec cm_old,rvec cm_new,real pos_d)
4087 {
4088     gmx_domdec_comm_t *comm;
4089     char buf[22];
4090
4091     comm = dd->comm;
4092
4093     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4094     if (bHaveLimitdAndCMOld)
4095     {
4096         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition (%f) in direction %c\n",
4097                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4098     }
4099     else
4100     {
4101         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4102                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4103     }
4104     fprintf(fplog,"distance out of cell %f\n",
4105             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4106     if (bHaveLimitdAndCMOld)
4107     {
4108         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4109                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4110     }
4111     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4112             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4113     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4114             dim2char(dim),
4115             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4116     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4117             dim2char(dim),
4118             comm->cell_x0[dim],comm->cell_x1[dim]);
4119 }
4120
4121 static void cg_move_error(FILE *fplog,
4122                           gmx_domdec_t *dd,
4123                           gmx_large_int_t step,int cg,int dim,int dir,
4124                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4125                           rvec cm_old,rvec cm_new,real pos_d)
4126 {
4127     if (fplog)
4128     {
4129         print_cg_move(fplog, dd,step,cg,dim,dir,
4130                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4131     }
4132     print_cg_move(stderr,dd,step,cg,dim,dir,
4133                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4134     gmx_fatal(FARGS,
4135               "A charge group moved too far between two domain decomposition steps\n"
4136               "This usually means that your system is not well equilibrated");
4137 }
4138
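/* Apply the 180 degree screw-pbc rotation around x to all distributed state
 * entries of atom a: positions are mirrored in y and z within the box and
 * the y and z components of velocity-like vectors are negated; as noted
 * below, this is only valid for a rectangular box.
 */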
4139 static void rotate_state_atom(t_state *state,int a)
4140 {
4141     int est;
4142
4143     for(est=0; est<estNR; est++)
4144     {
4145         if (EST_DISTR(est) && (state->flags & (1<<est))) {
4146             switch (est) {
4147             case estX:
4148                 /* Rotate the complete state; for a rectangular box only */
4149                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4150                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4151                 break;
4152             case estV:
4153                 state->v[a][YY] = -state->v[a][YY];
4154                 state->v[a][ZZ] = -state->v[a][ZZ];
4155                 break;
4156             case estSDX:
4157                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4158                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4159                 break;
4160             case estCGP:
4161                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4162                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4163                 break;
4164             case estDISRE_INITF:
4165             case estDISRE_RM3TAV:
4166             case estORIRE_INITF:
4167             case estORIRE_DTAV:
4168                 /* These are distances, so not affected by rotation */
4169                 break;
4170             default:
4171                 gmx_incons("Unknown state entry encountered in rotate_state_atom");            
4172             }
4173         }
4174     }
4175 }
4176
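/* Redistribute the charge groups over the DD cells after the particles have
 * moved: determine for each home group whether it leaves the local cell,
 * communicate the moving groups and their state to the neighboring cells in
 * one pass per decomposition dimension, and add the received groups to the
 * home arrays; returns the number of charge groups that stayed home.
 */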
4177 static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4178                               gmx_domdec_t *dd,ivec tric_dir,
4179                               t_state *state,rvec **f,
4180                               t_forcerec *fr,t_mdatoms *md,
4181                               gmx_bool bCompact,
4182                               t_nrnb *nrnb)
4183 {
4184     int  *move;
4185     int  npbcdim;
4186     int  ncg[DIM*2],nat[DIM*2];
4187     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4188     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4189     int  sbuf[2],rbuf[2];
4190     int  home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
4191     int  flag;
4192     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4193     gmx_bool bScrew;
4194     ivec dev;
4195     real inv_ncg,pos_d;
4196     matrix tcm;
4197     rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4198     atom_id *cgindex;
4199     cginfo_mb_t *cginfo_mb;
4200     gmx_domdec_comm_t *comm;
4201     
4202     if (dd->bScrewPBC)
4203     {
4204         check_screw_box(state->box);
4205     }
4206     
4207     comm  = dd->comm;
4208     cg_cm = fr->cg_cm;
4209     
4210     for(i=0; i<estNR; i++)
4211     {
4212         if (EST_DISTR(i))
4213         {
4214             switch (i)
4215             {
4216             case estX:   /* Always present */            break;
4217             case estV:   bV   = (state->flags & (1<<i)); break;
4218             case estSDX: bSDX = (state->flags & (1<<i)); break;
4219             case estCGP: bCGP = (state->flags & (1<<i)); break;
4220             case estLD_RNG:
4221             case estLD_RNGI:
4222             case estDISRE_INITF:
4223             case estDISRE_RM3TAV:
4224             case estORIRE_INITF:
4225             case estORIRE_DTAV:
4226                 /* No processing required */
4227                 break;
4228             default:
4229                 gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4230             }
4231         }
4232     }
4233     
4234     if (dd->ncg_tot > comm->nalloc_int)
4235     {
4236         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4237         srenew(comm->buf_int,comm->nalloc_int);
4238     }
4239     move = comm->buf_int;
4240     
4241     /* Clear the count */
4242     for(c=0; c<dd->ndim*2; c++)
4243     {
4244         ncg[c] = 0;
4245         nat[c] = 0;
4246     }
4247
4248     npbcdim = dd->npbcdim;
4249
4250     for(d=0; (d<DIM); d++)
4251     {
4252         limitd[d] = dd->comm->cellsize_min[d];
4253         if (d >= npbcdim && dd->ci[d] == 0)
4254         {
4255             cell_x0[d] = -GMX_FLOAT_MAX;
4256         }
4257         else
4258         {
4259             cell_x0[d] = comm->cell_x0[d];
4260         }
4261         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4262         {
4263             cell_x1[d] = GMX_FLOAT_MAX;
4264         }
4265         else
4266         {
4267             cell_x1[d] = comm->cell_x1[d];
4268         }
4269         if (d < npbcdim)
4270         {
4271             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4272             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4273         }
4274         else
4275         {
4276             /* We check after communication if a charge group moved
4277              * more than one cell. Set the pre-comm check limit to float_max.
4278              */
4279             limit0[d] = -GMX_FLOAT_MAX;
4280             limit1[d] =  GMX_FLOAT_MAX;
4281         }
4282     }
4283     
4284     make_tric_corr_matrix(npbcdim,state->box,tcm);
4285     
4286     cgindex = dd->cgindex;
4287     
4288     /* Compute the center of geometry for all home charge groups
4289      * and put them in the box and determine where they should go.
4290      */
4291     for(cg=0; cg<dd->ncg_home; cg++)
4292     {
4293         k0   = cgindex[cg];
4294         k1   = cgindex[cg+1];
4295         nrcg = k1 - k0;
4296         if (nrcg == 1)
4297         {
4298             copy_rvec(state->x[k0],cm_new);
4299         }
4300         else
4301         {
4302             inv_ncg = 1.0/nrcg;
4303             
4304             clear_rvec(cm_new);
4305             for(k=k0; (k<k1); k++)
4306             {
4307                 rvec_inc(cm_new,state->x[k]);
4308             }
4309             for(d=0; (d<DIM); d++)
4310             {
4311                 cm_new[d] = inv_ncg*cm_new[d];
4312             }
4313         }
4314         
4315         clear_ivec(dev);
4316         /* Do pbc and check DD cell boundary crossings */
4317         for(d=DIM-1; d>=0; d--)
4318         {
4319             if (dd->nc[d] > 1)
4320             {
4321                 bScrew = (dd->bScrewPBC && d == XX);
4322                 /* Determine the location of this cg in lattice coordinates */
4323                 pos_d = cm_new[d];
4324                 if (tric_dir[d])
4325                 {
4326                     for(d2=d+1; d2<DIM; d2++)
4327                     {
4328                         pos_d += cm_new[d2]*tcm[d2][d];
4329                     }
4330                 }
4331                 /* Put the charge group in the triclinic unit-cell */
4332                 if (pos_d >= cell_x1[d])
4333                 {
4334                     if (pos_d >= limit1[d])
4335                     {
4336                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4337                                       cg_cm[cg],cm_new,pos_d);
4338                     }
4339                     dev[d] = 1;
4340                     if (dd->ci[d] == dd->nc[d] - 1)
4341                     {
4342                         rvec_dec(cm_new,state->box[d]);
4343                         if (bScrew)
4344                         {
4345                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4346                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4347                         }
4348                         for(k=k0; (k<k1); k++)
4349                         {
4350                             rvec_dec(state->x[k],state->box[d]);
4351                             if (bScrew)
4352                             {
4353                                 rotate_state_atom(state,k);
4354                             }
4355                         }
4356                     }
4357                 }
4358                 else if (pos_d < cell_x0[d])
4359                 {
4360                     if (pos_d < limit0[d])
4361                     {
4362                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4363                                       cg_cm[cg],cm_new,pos_d);
4364                     }
4365                     dev[d] = -1;
4366                     if (dd->ci[d] == 0)
4367                     {
4368                         rvec_inc(cm_new,state->box[d]);
4369                         if (bScrew)
4370                         {
4371                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4372                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4373                         }
4374                         for(k=k0; (k<k1); k++)
4375                         {
4376                             rvec_inc(state->x[k],state->box[d]);
4377                             if (bScrew)
4378                             {
4379                                 rotate_state_atom(state,k);
4380                             }
4381                         }
4382                     }
4383                 }
4384             }
4385             else if (d < npbcdim)
4386             {
4387                 /* Put the charge group in the rectangular unit-cell */
4388                 while (cm_new[d] >= state->box[d][d])
4389                 {
4390                     rvec_dec(cm_new,state->box[d]);
4391                     for(k=k0; (k<k1); k++)
4392                     {
4393                         rvec_dec(state->x[k],state->box[d]);
4394                     }
4395                 }
4396                 while (cm_new[d] < 0)
4397                 {
4398                     rvec_inc(cm_new,state->box[d]);
4399                     for(k=k0; (k<k1); k++)
4400                     {
4401                         rvec_inc(state->x[k],state->box[d]);
4402                     }
4403                 }
4404             }
4405         }
4406     
4407         copy_rvec(cm_new,cg_cm[cg]);
4408         
4409         /* Determine where this cg should go */
4410         flag = 0;
4411         mc = -1;
4412         for(d=0; d<dd->ndim; d++)
4413         {
4414             dim = dd->dim[d];
4415             if (dev[dim] == 1)
4416             {
4417                 flag |= DD_FLAG_FW(d);
4418                 if (mc == -1)
4419                 {
4420                     mc = d*2;
4421                 }
4422             }
4423             else if (dev[dim] == -1)
4424             {
4425                 flag |= DD_FLAG_BW(d);
4426                 if (mc == -1) {
4427                     if (dd->nc[dim] > 2)
4428                     {
4429                         mc = d*2 + 1;
4430                     }
4431                     else
4432                     {
4433                         mc = d*2;
4434                     }
4435                 }
4436             }
4437         }
4438         move[cg] = mc;
4439         if (mc >= 0)
4440         {
4441             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4442             {
4443                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4444                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4445             }
4446             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4447             /* We store the cg size in the lower 16 bits
4448              * and the place where the charge group should go
4449              * in the next 6 bits. This saves some communication volume.
4450              */
4451             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4452             ncg[mc] += 1;
4453             nat[mc] += nrcg;
4454         }
4455     }
4456     
4457     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4458     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4459     
4460     nvec = 1;
4461     if (bV)
4462     {
4463         nvec++;
4464     }
4465     if (bSDX)
4466     {
4467         nvec++;
4468     }
4469     if (bCGP)
4470     {
4471         nvec++;
4472     }
4473     
4474     /* Make sure the communication buffers are large enough */
4475     for(mc=0; mc<dd->ndim*2; mc++)
4476     {
4477         nvr = ncg[mc] + nat[mc]*nvec;
4478         if (nvr > comm->cgcm_state_nalloc[mc])
4479         {
4480             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4481             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4482         }
4483     }
4484     
4485     /* Recalculating cg_cm might be cheaper than communicating,
4486      * but that could give rise to rounding issues.
4487      */
4488     home_pos_cg =
4489         compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4490                                 nvec,cg_cm,comm,bCompact);
4491     
4492     vec = 0;
4493     home_pos_at =
4494         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4495                                 nvec,vec++,state->x,comm,bCompact);
4496     if (bV)
4497     {
4498         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4499                                 nvec,vec++,state->v,comm,bCompact);
4500     }
4501     if (bSDX)
4502     {
4503         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4504                                 nvec,vec++,state->sd_X,comm,bCompact);
4505     }
4506     if (bCGP)
4507     {
4508         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4509                                 nvec,vec++,state->cg_p,comm,bCompact);
4510     }
4511     
4512     if (bCompact)
4513     {
4514         compact_ind(dd->ncg_home,move,
4515                     dd->index_gl,dd->cgindex,dd->gatindex,
4516                     dd->ga2la,comm->bLocalCG,
4517                     fr->cginfo);
4518     }
4519     else
4520     {
4521         clear_and_mark_ind(dd->ncg_home,move,
4522                            dd->index_gl,dd->cgindex,dd->gatindex,
4523                            dd->ga2la,comm->bLocalCG,
4524                            fr->ns.grid->cell_index);
4525     }
4526     
4527     cginfo_mb = fr->cginfo_mb;
4528
4529     ncg_stay_home = home_pos_cg;
4530     for(d=0; d<dd->ndim; d++)
4531     {
4532         dim = dd->dim[d];
4533         ncg_recv = 0;
4534         nat_recv = 0;
4535         nvr      = 0;
4536         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4537         {
4538             cdd = d*2 + dir;
4539             /* Communicate the cg and atom counts */
4540             sbuf[0] = ncg[cdd];
4541             sbuf[1] = nat[cdd];
4542             if (debug)
4543             {
4544                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4545                         d,dir,sbuf[0],sbuf[1]);
4546             }
4547             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4548             
4549             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4550             {
4551                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4552                 srenew(comm->buf_int,comm->nalloc_int);
4553             }
4554             
4555             /* Communicate the charge group indices, sizes and flags */
4556             dd_sendrecv_int(dd, d, dir,
4557                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4558                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4559             
4560             nvs = ncg[cdd] + nat[cdd]*nvec;
4561             i   = rbuf[0]  + rbuf[1] *nvec;
4562             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4563             
4564             /* Communicate cgcm and state */
4565             dd_sendrecv_rvec(dd, d, dir,
4566                              comm->cgcm_state[cdd], nvs,
4567                              comm->vbuf.v+nvr, i);
4568             ncg_recv += rbuf[0];
4569             nat_recv += rbuf[1];
4570             nvr      += i;
4571         }
4572         
4573         /* Process the received charge groups */
4574         buf_pos = 0;
4575         for(cg=0; cg<ncg_recv; cg++)
4576         {
4577             flag = comm->buf_int[cg*DD_CGIBS+1];
4578
4579             if (dim >= npbcdim && dd->nc[dim] > 2)
4580             {
4581                 /* No pbc in this dim and more than one domain boundary.
4582                  * We do a separate check that the charge group did not move too far.
4583                  */
4584                 if (((flag & DD_FLAG_FW(d)) &&
4585                      comm->vbuf.v[buf_pos][d] > cell_x1[dim]) ||
4586                     ((flag & DD_FLAG_BW(d)) &&
4587                      comm->vbuf.v[buf_pos][d] < cell_x0[dim]))
4588                 {
4589                     cg_move_error(fplog,dd,step,cg,d,
4590                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4591                                    FALSE,0,
4592                                    comm->vbuf.v[buf_pos],
4593                                    comm->vbuf.v[buf_pos],
4594                                    comm->vbuf.v[buf_pos][d]);
4595                 }
4596             }
4597
4598             mc = -1;
4599             if (d < dd->ndim-1)
4600             {
4601                 /* Check which direction this cg should go */
4602                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4603                 {
4604                     if (dd->bGridJump)
4605                     {
4606                         /* The cell boundaries for dimension d2 are not equal
4607                          * for each cell row of the lower dimension(s),
4608                          * therefore we might need to redetermine where
4609                          * this cg should go.
4610                          */
4611                         dim2 = dd->dim[d2];
4612                         /* If this cg crosses the box boundary in dimension d2
4613                          * we can use the communicated flag, so we do not
4614                          * have to worry about pbc.
4615                          */
4616                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4617                                (flag & DD_FLAG_FW(d2))) ||
4618                               (dd->ci[dim2] == 0 &&
4619                                (flag & DD_FLAG_BW(d2)))))
4620                         {
4621                             /* Clear the two flags for this dimension */
4622                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4623                             /* Determine the location of this cg
4624                              * in lattice coordinates
4625                              */
4626                             pos_d = comm->vbuf.v[buf_pos][dim2];
4627                             if (tric_dir[dim2])
4628                             {
4629                                 for(d3=dim2+1; d3<DIM; d3++)
4630                                 {
4631                                     pos_d +=
4632                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4633                                 }
4634                             }
4635                             /* Check that we are not at the box edge.
4636                              * pbc is only handled in the first step above,
4637                              * but this check could move the group over pbc
4638                              * while the first step did not, due to different rounding.
4639                              */
4640                             if (pos_d >= cell_x1[dim2] &&
4641                                 dd->ci[dim2] != dd->nc[dim2]-1)
4642                             {
4643                                 flag |= DD_FLAG_FW(d2);
4644                             }
4645                             else if (pos_d < cell_x0[dim2] &&
4646                                      dd->ci[dim2] != 0)
4647                             {
4648                                 flag |= DD_FLAG_BW(d2);
4649                             }
4650                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4651                         }
4652                     }
4653                     /* Set to which neighboring cell this cg should go */
4654                     if (flag & DD_FLAG_FW(d2))
4655                     {
4656                         mc = d2*2;
4657                     }
4658                     else if (flag & DD_FLAG_BW(d2))
4659                     {
4660                         if (dd->nc[dd->dim[d2]] > 2)
4661                         {
4662                             mc = d2*2+1;
4663                         }
4664                         else
4665                         {
4666                             mc = d2*2;
4667                         }
4668                     }
4669                 }
4670             }
4671             
4672             nrcg = flag & DD_FLAG_NRCG;
4673             if (mc == -1)
4674             {
4675                 if (home_pos_cg+1 > dd->cg_nalloc)
4676                 {
4677                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4678                     srenew(dd->index_gl,dd->cg_nalloc);
4679                     srenew(dd->cgindex,dd->cg_nalloc+1);
4680                 }
4681                 /* Set the global charge group index and size */
4682                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4683                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4684                 /* Copy the state from the buffer */
4685                 if (home_pos_cg >= fr->cg_nalloc)
4686                 {
4687                     dd_realloc_fr_cg(fr,home_pos_cg+1);
4688                     cg_cm = fr->cg_cm;
4689                 }
4690                 copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
4691                 /* Set the cginfo */
4692                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4693                                                    dd->index_gl[home_pos_cg]);
4694                 if (comm->bLocalCG)
4695                 {
4696                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4697                 }
4698
4699                 if (home_pos_at+nrcg > state->nalloc)
4700                 {
4701                     dd_realloc_state(state,f,home_pos_at+nrcg);
4702                 }
4703                 for(i=0; i<nrcg; i++)
4704                 {
4705                     copy_rvec(comm->vbuf.v[buf_pos++],
4706                               state->x[home_pos_at+i]);
4707                 }
4708                 if (bV)
4709                 {
4710                     for(i=0; i<nrcg; i++)
4711                     {
4712                         copy_rvec(comm->vbuf.v[buf_pos++],
4713                                   state->v[home_pos_at+i]);
4714                     }
4715                 }
4716                 if (bSDX)
4717                 {
4718                     for(i=0; i<nrcg; i++)
4719                     {
4720                         copy_rvec(comm->vbuf.v[buf_pos++],
4721                                   state->sd_X[home_pos_at+i]);
4722                     }
4723                 }
4724                 if (bCGP)
4725                 {
4726                     for(i=0; i<nrcg; i++)
4727                     {
4728                         copy_rvec(comm->vbuf.v[buf_pos++],
4729                                   state->cg_p[home_pos_at+i]);
4730                     }
4731                 }
4732                 home_pos_cg += 1;
4733                 home_pos_at += nrcg;
4734             }
4735             else
4736             {
4737                 /* Reallocate the buffers if necessary  */
4738                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4739                 {
4740                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4741                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4742                 }
4743                 nvr = ncg[mc] + nat[mc]*nvec;
4744                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4745                 {
4746                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4747                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4748                 }
4749                 /* Copy from the receive to the send buffers */
4750                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4751                        comm->buf_int + cg*DD_CGIBS,
4752                        DD_CGIBS*sizeof(int));
4753                 memcpy(comm->cgcm_state[mc][nvr],
4754                        comm->vbuf.v[buf_pos],
4755                        (1+nrcg*nvec)*sizeof(rvec));
4756                 buf_pos += 1 + nrcg*nvec;
4757                 ncg[mc] += 1;
4758                 nat[mc] += nrcg;
4759             }
4760         }
4761     }
4762     
4763     /* With sorting (!bCompact) the indices are now only partially up to date
4764      * and ncg_home and nat_home are not the real count, since there are
4765      * "holes" in the arrays for the charge groups that moved to neighbors.
4766      */
4767     dd->ncg_home = home_pos_cg;
4768     dd->nat_home = home_pos_at;
4769
4770     if (debug)
4771     {
4772         fprintf(debug,"Finished repartitioning\n");
4773     }
4774
4775     return ncg_stay_home;
4776 }
4777
4778 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4779 {
4780     dd->comm->cycl[ddCycl] += cycles;
4781     dd->comm->cycl_n[ddCycl]++;
4782     if (cycles > dd->comm->cycl_max[ddCycl])
4783     {
4784         dd->comm->cycl_max[ddCycl] = cycles;
4785     }
4786 }
4787
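/* Return a weighted flop count for the force calculation from the nrnb
 * counters, used as a relative load measure for load balancing; the
 * non-bonded kernels are down-weighted to get closer to the real timings.
 */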
4788 static double force_flop_count(t_nrnb *nrnb)
4789 {
4790     int i;
4791     double sum;
4792     const char *name;
4793
4794     sum = 0;
4795     for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
4796     {
4797         /* To get closer to the real timings, we halve the count
4798          * for the normal loops and halve it again for water loops.
4799          */
4800         name = nrnb_str(i);
4801         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4802         {
4803             sum += nrnb->n[i]*0.25*cost_nrnb(i);
4804         }
4805         else
4806         {
4807             sum += nrnb->n[i]*0.50*cost_nrnb(i);
4808         }
4809     }
4810     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
4811     {
4812         name = nrnb_str(i);
4813         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4814             sum += nrnb->n[i]*cost_nrnb(i);
4815     }
4816     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
4817     {
4818         sum += nrnb->n[i]*cost_nrnb(i);
4819     }
4820
4821     return sum;
4822 }
4823
4824 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
4825 {
4826     if (dd->comm->eFlop)
4827     {
4828         dd->comm->flop -= force_flop_count(nrnb);
4829     }
4830 }
4831 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
4832 {
4833     if (dd->comm->eFlop)
4834     {
4835         dd->comm->flop += force_flop_count(nrnb);
4836         dd->comm->flop_n++;
4837     }
4838 }  
4839
4840 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4841 {
4842     int i;
4843     
4844     for(i=0; i<ddCyclNr; i++)
4845     {
4846         dd->comm->cycl[i] = 0;
4847         dd->comm->cycl_n[i] = 0;
4848         dd->comm->cycl_max[i] = 0;
4849     }
4850     dd->comm->flop = 0;
4851     dd->comm->flop_n = 0;
4852 }
4853
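/* Gather the load measurements along each DD dimension onto the row roots
 * and reduce them hierarchically; with dynamic load balancing the row roots
 * also store the staggering limits (cell_f_max0/cell_f_min1) needed for the
 * next boundary update, and the DD master accumulates the global load
 * statistics.
 */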
4854 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
4855 {
4856     gmx_domdec_comm_t *comm;
4857     gmx_domdec_load_t *load;
4858     gmx_domdec_root_t *root=NULL;
4859     int  d,dim,cid,i,pos;
4860     float cell_frac=0,sbuf[DD_NLOAD_MAX];
4861     gmx_bool bSepPME;
4862     
4863     if (debug)
4864     {
4865         fprintf(debug,"get_load_distribution start\n");
4866     }
4867
4868     wallcycle_start(wcycle,ewcDDCOMMLOAD);
4869     
4870     comm = dd->comm;
4871     
4872     bSepPME = (dd->pme_nodeid >= 0);
4873     
4874     for(d=dd->ndim-1; d>=0; d--)
4875     {
4876         dim = dd->dim[d];
4877         /* Check if we participate in the communication in this dimension */
4878         if (d == dd->ndim-1 || 
4879             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
4880         {
4881             load = &comm->load[d];
4882             if (dd->bGridJump)
4883             {
4884                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4885             }
4886             pos = 0;
4887             if (d == dd->ndim-1)
4888             {
4889                 sbuf[pos++] = dd_force_load(comm);
4890                 sbuf[pos++] = sbuf[0];
4891                 if (dd->bGridJump)
4892                 {
4893                     sbuf[pos++] = sbuf[0];
4894                     sbuf[pos++] = cell_frac;
4895                     if (d > 0)
4896                     {
4897                         sbuf[pos++] = comm->cell_f_max0[d];
4898                         sbuf[pos++] = comm->cell_f_min1[d];
4899                     }
4900                 }
4901                 if (bSepPME)
4902                 {
4903                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4904                     sbuf[pos++] = comm->cycl[ddCyclPME];
4905                 }
4906             }
4907             else
4908             {
4909                 sbuf[pos++] = comm->load[d+1].sum;
4910                 sbuf[pos++] = comm->load[d+1].max;
4911                 if (dd->bGridJump)
4912                 {
4913                     sbuf[pos++] = comm->load[d+1].sum_m;
4914                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4915                     sbuf[pos++] = comm->load[d+1].flags;
4916                     if (d > 0)
4917                     {
4918                         sbuf[pos++] = comm->cell_f_max0[d];
4919                         sbuf[pos++] = comm->cell_f_min1[d];
4920                     }
4921                 }
4922                 if (bSepPME)
4923                 {
4924                     sbuf[pos++] = comm->load[d+1].mdf;
4925                     sbuf[pos++] = comm->load[d+1].pme;
4926                 }
4927             }
4928             load->nload = pos;
4929             /* Communicate a row in DD direction d.
4930              * The communicators are set up such that the root always has rank 0.
4931              */
4932 #ifdef GMX_MPI
4933             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
4934                        load->load,load->nload*sizeof(float),MPI_BYTE,
4935                        0,comm->mpi_comm_load[d]);
4936 #endif
4937             if (dd->ci[dim] == dd->master_ci[dim])
4938             {
4939                 /* We are the root, process this row */
4940                 if (comm->bDynLoadBal)
4941                 {
4942                     root = comm->root[d];
4943                 }
4944                 load->sum = 0;
4945                 load->max = 0;
4946                 load->sum_m = 0;
4947                 load->cvol_min = 1;
4948                 load->flags = 0;
4949                 load->mdf = 0;
4950                 load->pme = 0;
4951                 pos = 0;
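                /* Walk through the gathered buffer in the packing order used above */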
4952                 for(i=0; i<dd->nc[dim]; i++)
4953                 {
4954                     load->sum += load->load[pos++];
4955                     load->max = max(load->max,load->load[pos]);
4956                     pos++;
4957                     if (dd->bGridJump)
4958                     {
4959                         if (root->bLimited)
4960                         {
4961                             /* This direction could not be load balanced properly,
4962                              * therefore we need to use the maximum instead of the average load.
4963                              */
4964                             load->sum_m = max(load->sum_m,load->load[pos]);
4965                         }
4966                         else
4967                         {
4968                             load->sum_m += load->load[pos];
4969                         }
4970                         pos++;
4971                         load->cvol_min = min(load->cvol_min,load->load[pos]);
4972                         pos++;
4973                         if (d < dd->ndim-1)
4974                         {
4975                             load->flags = (int)(load->load[pos++] + 0.5);
4976                         }
4977                         if (d > 0)
4978                         {
4979                             root->cell_f_max0[i] = load->load[pos++];
4980                             root->cell_f_min1[i] = load->load[pos++];
4981                         }
4982                     }
4983                     if (bSepPME)
4984                     {
4985                         load->mdf = max(load->mdf,load->load[pos]);
4986                         pos++;
4987                         load->pme = max(load->pme,load->load[pos]);
4988                         pos++;
4989                     }
4990                 }
4991                 if (comm->bDynLoadBal && root->bLimited)
4992                 {
4993                     load->sum_m *= dd->nc[dim];
4994                     load->flags |= (1<<d);
4995                 }
4996             }
4997         }
4998     }
4999
5000     if (DDMASTER(dd))
5001     {
5002         comm->nload      += dd_load_count(comm);
5003         comm->load_step  += comm->cycl[ddCyclStep];
5004         comm->load_sum   += comm->load[0].sum;
5005         comm->load_max   += comm->load[0].max;
5006         if (comm->bDynLoadBal)
5007         {
5008             for(d=0; d<dd->ndim; d++)
5009             {
5010                 if (comm->load[0].flags & (1<<d))
5011                 {
5012                     comm->load_lim[d]++;
5013                 }
5014             }
5015         }
5016         if (bSepPME)
5017         {
5018             comm->load_mdf += comm->load[0].mdf;
5019             comm->load_pme += comm->load[0].pme;
5020         }
5021     }
5022
5023     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5024     
5025     if (debug)
5026     {
5027         fprintf(debug,"get_load_distribution finished\n");
5028     }
5029 }
5030
5031 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5032 {
5033     /* Return the relative performance loss on the total run time
5034      * due to the force calculation load imbalance.
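     * The wasted time is estimated as nnodes*load_max - load_sum, i.e. each
     * rank effectively waits for the most loaded one, normalized by the
     * total cycle count nnodes*load_step over the recorded steps.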
5035      */
5036     if (dd->comm->nload > 0)
5037     {
5038         return
5039             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5040             (dd->comm->load_step*dd->nnodes);
5041     }
5042     else
5043     {
5044         return 0;
5045     }
5046 }
5047
5048 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5049 {
5050     char  buf[STRLEN];
5051     int   npp,npme,nnodes,d,limp;
5052     float imbal,pme_f_ratio,lossf,lossp=0;
5053     gmx_bool  bLim;
5054     gmx_domdec_comm_t *comm;
5055
5056     comm = dd->comm;
5057     if (DDMASTER(dd) && comm->nload > 0)
5058     {
5059         npp    = dd->nnodes;
5060         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5061         nnodes = npp + npme;
5062         imbal = comm->load_max*npp/comm->load_sum - 1;
5063         lossf = dd_force_imb_perf_loss(dd);
5064         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5065         fprintf(fplog,"%s",buf);
5066         fprintf(stderr,"\n");
5067         fprintf(stderr,"%s",buf);
5068         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5069         fprintf(fplog,"%s",buf);
5070         fprintf(stderr,"%s",buf);
5071         bLim = FALSE;
5072         if (comm->bDynLoadBal)
5073         {
5074             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5075             for(d=0; d<dd->ndim; d++)
5076             {
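                /* The percentage of recorded load steps in which dimension d
                 * limited the dynamic load balancing.
                 */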
5077                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5078                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5079                 if (limp >= 50)
5080                 {
5081                     bLim = TRUE;
5082                 }
5083             }
5084             sprintf(buf+strlen(buf),"\n");
5085             fprintf(fplog,"%s",buf);
5086             fprintf(stderr,"%s",buf);
5087         }
5088         if (npme > 0)
5089         {
5090             pme_f_ratio = comm->load_pme/comm->load_mdf;
5091             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5092             if (lossp <= 0)
5093             {
5094                 lossp *= (float)npme/(float)nnodes;
5095             }
5096             else
5097             {
5098                 lossp *= (float)npp/(float)nnodes;
5099             }
5100             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5101             fprintf(fplog,"%s",buf);
5102             fprintf(stderr,"%s",buf);
5103             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5104             fprintf(fplog,"%s",buf);
5105             fprintf(stderr,"%s",buf);
5106         }
5107         fprintf(fplog,"\n");
5108         fprintf(stderr,"\n");
5109         
5110         if (lossf >= DD_PERF_LOSS)
5111         {
5112             sprintf(buf,
5113                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
5114                     "      in the domain decomposition.\n",lossf*100);
5115             if (!comm->bDynLoadBal)
5116             {
5117                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb).\n");
5118             }
5119             else if (bLim)
5120             {
5121                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5122             }
5123             fprintf(fplog,"%s\n",buf);
5124             fprintf(stderr,"%s\n",buf);
5125         }
5126         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5127         {
5128             sprintf(buf,
5129                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5130                     "      had %s work to do than the PP nodes.\n"
5131                     "      You might want to %s the number of PME nodes\n"
5132                     "      or %s the cut-off and the grid spacing.\n",
5133                     fabs(lossp*100),
5134                     (lossp < 0) ? "less"     : "more",
5135                     (lossp < 0) ? "decrease" : "increase",
5136                     (lossp < 0) ? "decrease" : "increase");
5137             fprintf(fplog,"%s\n",buf);
5138             fprintf(stderr,"%s\n",buf);
5139         }
5140     }
5141 }
5142
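/* Minimum cell volume over all DD cells relative to the average cell volume */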
5143 static float dd_vol_min(gmx_domdec_t *dd)
5144 {
5145     return dd->comm->load[0].cvol_min*dd->nnodes;
5146 }
5147
5148 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5149 {
5150     return dd->comm->load[0].flags;
5151 }
5152
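/* Relative force-load imbalance: the maximum load over the DD ranks divided by the average load, minus one */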
5153 static float dd_f_imbal(gmx_domdec_t *dd)
5154 {
5155     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5156 }
5157
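/* Ratio of the PME mesh load to the particle-particle force load */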
5158 static float dd_pme_f_ratio(gmx_domdec_t *dd)
5159 {
5160     return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5161 }
5162
5163 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5164 {
5165     int flags,d;
5166     char buf[22];
5167     
5168     flags = dd_load_flags(dd);
5169     if (flags)
5170     {
5171         fprintf(fplog,
5172                 "DD  load balancing is limited by minimum cell size in dimension");
5173         for(d=0; d<dd->ndim; d++)
5174         {
5175             if (flags & (1<<d))
5176             {
5177                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5178             }
5179         }
5180         fprintf(fplog,"\n");
5181     }
5182     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5183     if (dd->comm->bDynLoadBal)
5184     {
5185         fprintf(fplog,"  vol min/aver %5.3f%c",
5186                 dd_vol_min(dd),flags ? '!' : ' ');
5187     }
5188     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5189     if (dd->comm->cycl_n[ddCyclPME])
5190     {
5191         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5192     }
5193     fprintf(fplog,"\n\n");
5194 }
5195
5196 static void dd_print_load_verbose(gmx_domdec_t *dd)
5197 {
5198     if (dd->comm->bDynLoadBal)
5199     {
5200         fprintf(stderr,"vol %4.2f%c ",
5201                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5202     }
5203     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5204     if (dd->comm->cycl_n[ddCyclPME])
5205     {
5206         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5207     }
5208 }
5209
5210 #ifdef GMX_MPI
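/* Set up the MPI communicator for one row of DD cells along dimension dim_ind;
 * loc gives the (fixed) cell coordinates in the other dimensions. Ranks outside
 * the row pass an empty group to the collective MPI_Comm_create call, and the
 * row root additionally allocates the load and DLB bookkeeping arrays.
 */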
5211 static void make_load_communicator(gmx_domdec_t *dd,MPI_Group g_all,
5212                                    int dim_ind,ivec loc)
5213 {
5214     MPI_Group g_row = MPI_GROUP_EMPTY;
5215     MPI_Comm  c_row;
5216     int  dim,i,*rank;
5217     ivec loc_c;
5218     gmx_domdec_root_t *root;
5219     gmx_bool bPartOfGroup = FALSE;
5220     
5221     dim = dd->dim[dim_ind];
5222     copy_ivec(loc,loc_c);
5223     snew(rank,dd->nc[dim]);
5224     for(i=0; i<dd->nc[dim]; i++)
5225     {
5226         loc_c[dim] = i;
5227         rank[i] = dd_index(dd->nc,loc_c);
5228         if (rank[i] == dd->rank)
5229         {
5230             /* This process is part of the group */
5231             bPartOfGroup = TRUE;
5232         }
5233     }
5234     if (bPartOfGroup)
5235     {
5236         MPI_Group_incl(g_all,dd->nc[dim],rank,&g_row);
5237     }
5238     MPI_Comm_create(dd->mpi_comm_all,g_row,&c_row);
5239     if (bPartOfGroup)
5240     {
5241         dd->comm->mpi_comm_load[dim_ind] = c_row;
5242         if (dd->comm->eDLB != edlbNO)
5243         {
5244             if (dd->ci[dim] == dd->master_ci[dim])
5245             {
5246                 /* This is the root process of this row */
5247                 snew(dd->comm->root[dim_ind],1);
5248                 root = dd->comm->root[dim_ind];
5249                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5250                 snew(root->old_cell_f,dd->nc[dim]+1);
5251                 snew(root->bCellMin,dd->nc[dim]);
5252                 if (dim_ind > 0)
5253                 {
5254                     snew(root->cell_f_max0,dd->nc[dim]);
5255                     snew(root->cell_f_min1,dd->nc[dim]);
5256                     snew(root->bound_min,dd->nc[dim]);
5257                     snew(root->bound_max,dd->nc[dim]);
5258                 }
5259                 snew(root->buf_ncd,dd->nc[dim]);
5260             }
5261             else
5262             {
5263                 /* This is not a root process, we only need to receive cell_f */
5264                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5265             }
5266         }
5267         if (dd->ci[dim] == dd->master_ci[dim])
5268         {
5269             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5270         }
5271     }
5272     sfree(rank);
5273 }
5274 #endif
5275
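/* Create the load communicators used for hierarchical load collection:
 * for the last DD dimension every row of cells gets one, while for an
 * earlier dimension d only the rows with coordinate 0 in all later
 * dimensions are used, matching the participation check in
 * get_load_distribution().
 */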
5276 static void make_load_communicators(gmx_domdec_t *dd)
5277 {
5278 #ifdef GMX_MPI
5279     MPI_Group g_all;
5280     int  dim0,dim1,i,j;
5281     ivec loc;
5282
5283     if (debug)
5284         fprintf(debug,"Making load communicators\n");
5285
5286     MPI_Comm_group(dd->mpi_comm_all,&g_all);
5287
5288     snew(dd->comm->load,dd->ndim);
5289     snew(dd->comm->mpi_comm_load,dd->ndim);
5290
5291     clear_ivec(loc);
5292     make_load_communicator(dd,g_all,0,loc);
5293     if (dd->ndim > 1) {
5294         dim0 = dd->dim[0];
5295         for(i=0; i<dd->nc[dim0]; i++) {
5296             loc[dim0] = i;
5297             make_load_communicator(dd,g_all,1,loc);
5298         }
5299     }
5300     if (dd->ndim > 2) {
5301         dim0 = dd->dim[0];
5302         for(i=0; i<dd->nc[dim0]; i++) {
5303             loc[dim0] = i;
5304             dim1 = dd->dim[1];
5305             for(j=0; j<dd->nc[dim1]; j++) {
5306                 loc[dim1] = j;
5307                 make_load_communicator(dd,g_all,2,loc);
5308             }
5309         }
5310     }
5311
5312     MPI_Group_free(&g_all);
5313
5314     if (debug)
5315         fprintf(debug,"Finished making load communicators\n");
5316 #endif
5317 }
5318
5319 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5320 {
5321     gmx_bool bZYX;
5322     int  d,dim,i,j,m;
5323     ivec tmp,s;
5324     int  nzone,nzonep;
5325     ivec dd_zp[DD_MAXIZONE];
5326     gmx_domdec_zones_t *zones;
5327     gmx_domdec_ns_ranges_t *izone;
5328     
5329     for(d=0; d<dd->ndim; d++)
5330     {
5331         dim = dd->dim[d];
5332         copy_ivec(dd->ci,tmp);
5333         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5334         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5335         copy_ivec(dd->ci,tmp);
5336         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5337         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5338         if (debug)
5339         {
5340             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5341                     dd->rank,dim,
5342                     dd->neighbor[d][0],
5343                     dd->neighbor[d][1]);
5344         }
5345     }
5346     
5347     if (DDMASTER(dd))
5348     {
5349         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5350             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5351     }
5352     if (fplog)
5353     {
5354         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5355                 dd->ndim,
5356                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5357                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5358     }
5359     switch (dd->ndim)
5360     {
5361     case 3:
5362         nzone  = dd_z3n;
5363         nzonep = dd_zp3n;
5364         for(i=0; i<nzonep; i++)
5365         {
5366             copy_ivec(dd_zp3[i],dd_zp[i]);
5367         }
5368         break;
5369     case 2:
5370         nzone  = dd_z2n;
5371         nzonep = dd_zp2n;
5372         for(i=0; i<nzonep; i++)
5373         {
5374             copy_ivec(dd_zp2[i],dd_zp[i]);
5375         }
5376         break;
5377     case 1:
5378         nzone  = dd_z1n;
5379         nzonep = dd_zp1n;
5380         for(i=0; i<nzonep; i++)
5381         {
5382             copy_ivec(dd_zp1[i],dd_zp[i]);
5383         }
5384         break;
5385     default:
5386         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5387         nzone = 0;
5388         nzonep = 0;
5389     }
5390
5391     zones = &dd->comm->zones;
5392
5393     for(i=0; i<nzone; i++)
5394     {
5395         m = 0;
5396         clear_ivec(zones->shift[i]);
5397         for(d=0; d<dd->ndim; d++)
5398         {
5399             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5400         }
5401     }
5402     
5403     zones->n = nzone;
5404     for(i=0; i<nzone; i++)
5405     {
5406         for(d=0; d<DIM; d++)
5407         {
5408             s[d] = dd->ci[d] - zones->shift[i][d];
5409             if (s[d] < 0)
5410             {
5411                 s[d] += dd->nc[d];
5412             }
5413             else if (s[d] >= dd->nc[d])
5414             {
5415                 s[d] -= dd->nc[d];
5416             }
5417         }
5418     }
5419     zones->nizone = nzonep;
5420     for(i=0; i<zones->nizone; i++)
5421     {
5422         if (dd_zp[i][0] != i)
5423         {
5424             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5425         }
5426         izone = &zones->izone[i];
5427         izone->j0 = dd_zp[i][1];
5428         izone->j1 = dd_zp[i][2];
5429         for(dim=0; dim<DIM; dim++)
5430         {
5431             if (dd->nc[dim] == 1)
5432             {
5433                 /* All shifts should be allowed */
5434                 izone->shift0[dim] = -1;
5435                 izone->shift1[dim] = 1;
5436             }
5437             else
5438             {
5439                 /*
5440                   izone->shift0[d] = 0;
5441                   izone->shift1[d] = 0;
5442                   for(j=izone->j0; j<izone->j1; j++) {
5443                   if (dd->shift[j][d] > dd->shift[i][d])
5444                   izone->shift0[d] = -1;
5445                   if (dd->shift[j][d] < dd->shift[i][d])
5446                   izone->shift1[d] = 1;
5447                   }
5448                 */
5449                 
5450                 int shift_diff;
5451                 
5452                 /* Assume the shifts are not more than 1 cell */
5453                 izone->shift0[dim] = 1;
5454                 izone->shift1[dim] = -1;
5455                 for(j=izone->j0; j<izone->j1; j++)
5456                 {
5457                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5458                     if (shift_diff < izone->shift0[dim])
5459                     {
5460                         izone->shift0[dim] = shift_diff;
5461                     }
5462                     if (shift_diff > izone->shift1[dim])
5463                     {
5464                         izone->shift1[dim] = shift_diff;
5465                     }
5466                 }
5467             }
5468         }
5469     }
5470     
5471     if (dd->comm->eDLB != edlbNO)
5472     {
5473         snew(dd->comm->root,dd->ndim);
5474     }
5475     
5476     if (dd->comm->bRecordLoad)
5477     {
5478         make_load_communicators(dd);
5479     }
5480 }
5481
5482 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5483 {
5484     gmx_domdec_t *dd;
5485     gmx_domdec_comm_t *comm;
5486     int  i,rank,*buf;
5487     ivec periods;
5488 #ifdef GMX_MPI
5489     MPI_Comm comm_cart;
5490 #endif
5491     
5492     dd = cr->dd;
5493     comm = dd->comm;
5494     
5495 #ifdef GMX_MPI
5496     if (comm->bCartesianPP)
5497     {
5498         /* Set up cartesian communication for the particle-particle part */
5499         if (fplog)
5500         {
5501             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5502                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5503         }
5504         
5505         for(i=0; i<DIM; i++)
5506         {
5507             periods[i] = TRUE;
5508         }
5509         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5510                         &comm_cart);
5511         /* We overwrite the old communicator with the new cartesian one */
5512         cr->mpi_comm_mygroup = comm_cart;
5513     }
5514     
5515     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5516     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5517     
5518     if (comm->bCartesianPP_PME)
5519     {
5520         /* Since we want to use the original cartesian setup for sim,
5521          * and not the one after split, we need to make an index.
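         * Each rank stores its own rank at its DD index; the sum over all
         * ranks then fills in the complete table.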
5522          */
5523         snew(comm->ddindex2ddnodeid,dd->nnodes);
5524         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5525         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5526         /* Get the rank of the DD master,
5527          * above we made sure that the master node is a PP node.
5528          */
5529         if (MASTER(cr))
5530         {
5531             rank = dd->rank;
5532         }
5533         else
5534         {
5535             rank = 0;
5536         }
5537         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5538     }
5539     else if (comm->bCartesianPP)
5540     {
5541         if (cr->npmenodes == 0)
5542         {
5543             /* The PP communicator is also
5544              * the communicator for this simulation
5545              */
5546             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5547         }
5548         cr->nodeid = dd->rank;
5549         
5550         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5551         
5552         /* We need to make an index to go from the coordinates
5553          * to the nodeid of this simulation.
5554          */
5555         snew(comm->ddindex2simnodeid,dd->nnodes);
5556         snew(buf,dd->nnodes);
5557         if (cr->duty & DUTY_PP)
5558         {
5559             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5560         }
5561         /* Communicate the ddindex to simulation nodeid index */
5562         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5563                       cr->mpi_comm_mysim);
5564         sfree(buf);
5565         
5566         /* Determine the master coordinates and rank.
5567          * The DD master should be the same node as the master of this sim.
5568          */
5569         for(i=0; i<dd->nnodes; i++)
5570         {
5571             if (comm->ddindex2simnodeid[i] == 0)
5572             {
5573                 ddindex2xyz(dd->nc,i,dd->master_ci);
5574                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5575             }
5576         }
5577         if (debug)
5578         {
5579             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5580         }
5581     }
5582     else
5583     {
5584         /* No Cartesian communicators */
5585         /* We use the rank in dd->comm->all as DD index */
5586         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5587         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5588         dd->masterrank = 0;
5589         clear_ivec(dd->master_ci);
5590     }
5591 #endif
5592   
5593     if (fplog)
5594     {
5595         fprintf(fplog,
5596                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5597                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5598     }
5599     if (debug)
5600     {
5601         fprintf(debug,
5602                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5603                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5604     }
5605 }
5606
5607 static void receive_ddindex2simnodeid(t_commrec *cr)
5608 {
5609     gmx_domdec_t *dd;
5610     
5611     gmx_domdec_comm_t *comm;
5612     int  *buf;
5613     
5614     dd = cr->dd;
5615     comm = dd->comm;
5616     
5617 #ifdef GMX_MPI
5618     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5619     {
5620         snew(comm->ddindex2simnodeid,dd->nnodes);
5621         snew(buf,dd->nnodes);
5622         if (cr->duty & DUTY_PP)
5623         {
5624             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5625         }
5626 #ifdef GMX_MPI
5627         /* Communicate the ddindex to simulation nodeid index */
5628         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5629                       cr->mpi_comm_mysim);
5630 #endif
5631         sfree(buf);
5632     }
5633 #endif
5634 }
5635
5636 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5637                                                      int ncg,int natoms)
5638 {
5639     gmx_domdec_master_t *ma;
5640     int i;
5641
5642     snew(ma,1);
5643     
5644     snew(ma->ncg,dd->nnodes);
5645     snew(ma->index,dd->nnodes+1);
5646     snew(ma->cg,ncg);
5647     snew(ma->nat,dd->nnodes);
5648     snew(ma->ibuf,dd->nnodes*2);
5649     snew(ma->cell_x,DIM);
5650     for(i=0; i<DIM; i++)
5651     {
5652         snew(ma->cell_x[i],dd->nc[i]+1);
5653     }
5654
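    /* The packing buffer for state scattering/gathering is only needed for
     * the collective (scatterv/gatherv) communication path, which is used
     * when there are more than GMX_DD_NNODES_SENDRECV nodes.
     */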
5655     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5656     {
5657         ma->vbuf = NULL;
5658     }
5659     else
5660     {
5661         snew(ma->vbuf,natoms);
5662     }
5663
5664     return ma;
5665 }
5666
5667 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5668                                int reorder)
5669 {
5670     gmx_domdec_t *dd;
5671     gmx_domdec_comm_t *comm;
5672     int  i,rank;
5673     gmx_bool bDiv[DIM];
5674     ivec periods;
5675 #ifdef GMX_MPI
5676     MPI_Comm comm_cart;
5677 #endif
5678     
5679     dd = cr->dd;
5680     comm = dd->comm;
5681     
5682     if (comm->bCartesianPP)
5683     {
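        /* Check along which dimensions the separate PME nodes can be added
         * as a whole number of extra planes of the grid; only y and z are
         * considered, PME nodes are never stacked along x.
         */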
5684         for(i=1; i<DIM; i++)
5685         {
5686             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5687         }
5688         if (bDiv[YY] || bDiv[ZZ])
5689         {
5690             comm->bCartesianPP_PME = TRUE;
5691             /* If we have 2D PME decomposition, which is always in x+y,
5692              * we stack the PME only nodes in z.
5693              * Otherwise we choose the direction that provides the thinnest slab
5694              * of PME only nodes as this will have the least effect
5695              * on the PP communication.
5696              * But for the PME communication the opposite might be better.
5697              */
5698             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5699                              !bDiv[YY] ||
5700                              dd->nc[YY] > dd->nc[ZZ]))
5701             {
5702                 comm->cartpmedim = ZZ;
5703             }
5704             else
5705             {
5706                 comm->cartpmedim = YY;
5707             }
5708             comm->ntot[comm->cartpmedim]
5709                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5710         }
5711         else if (fplog)
5712         {
5713             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5714             fprintf(fplog,
5715                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5716         }
5717     }
5718     
5719 #ifdef GMX_MPI
5720     if (comm->bCartesianPP_PME)
5721     {
5722         if (fplog)
5723         {
5724             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5725         }
5726         
5727         for(i=0; i<DIM; i++)
5728         {
5729             periods[i] = TRUE;
5730         }
5731         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5732                         &comm_cart);
5733         
5734         MPI_Comm_rank(comm_cart,&rank);
5735         if (MASTERNODE(cr) && rank != 0)
5736         {
5737             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5738         }
5739         
5740         /* With this assignment we lose the link to the original communicator,
5741          * which will usually be MPI_COMM_WORLD, unless we run a multi-simulation.
5742          */
5743         cr->mpi_comm_mysim = comm_cart;
5744         cr->sim_nodeid = rank;
5745         
5746         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5747         
5748         if (fplog)
5749         {
5750             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5751                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5752         }
5753         
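        /* Ranks within the original PP grid extent along cartpmedim do the
         * particle-particle work; the extra planes appended beyond it become
         * the PME-only ranks.
         */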
5754         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5755         {
5756             cr->duty = DUTY_PP;
5757         }
5758         if (cr->npmenodes == 0 ||
5759             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5760         {
5761             cr->duty = DUTY_PME;
5762         }
5763         
5764         /* Split the sim communicator into PP and PME only nodes */
5765         MPI_Comm_split(cr->mpi_comm_mysim,
5766                        cr->duty,
5767                        dd_index(comm->ntot,dd->ci),
5768                        &cr->mpi_comm_mygroup);
5769     }
5770     else
5771     {
5772         switch (dd_node_order)
5773         {
5774         case ddnoPP_PME:
5775             if (fplog)
5776             {
5777                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5778             }
5779             break;
5780         case ddnoINTERLEAVE:
5781             /* Interleave the PP-only and PME-only nodes,
5782              * as on clusters with dual-core machines this will double
5783              * the communication bandwidth of the PME processes
5784              * and thus speed up the PP <-> PME and inter PME communication.
5785              */
5786             if (fplog)
5787             {
5788                 fprintf(fplog,"Interleaving PP and PME nodes\n");
5789             }
5790             comm->pmenodes = dd_pmenodes(cr);
5791             break;
5792         case ddnoCARTESIAN:
5793             break;
5794         default:
5795             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
5796         }
5797     
5798         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
5799         {
5800             cr->duty = DUTY_PME;
5801         }
5802         else
5803         {
5804             cr->duty = DUTY_PP;
5805         }
5806         
5807         /* Split the sim communicator into PP and PME only nodes */
5808         MPI_Comm_split(cr->mpi_comm_mysim,
5809                        cr->duty,
5810                        cr->nodeid,
5811                        &cr->mpi_comm_mygroup);
5812         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
5813     }
5814 #endif
5815
5816     if (fplog)
5817     {
5818         fprintf(fplog,"This is a %s only node\n\n",
5819                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5820     }
5821 }
5822
5823 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
5824 {
5825     gmx_domdec_t *dd;
5826     gmx_domdec_comm_t *comm;
5827     int CartReorder;
5828     
5829     dd = cr->dd;
5830     comm = dd->comm;
5831     
5832     copy_ivec(dd->nc,comm->ntot);
5833     
5834     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
5835     comm->bCartesianPP_PME = FALSE;
5836     
5837     /* Reorder the nodes by default. This might change the MPI ranks.
5838      * Real reordering is only supported on very few architectures;
5839      * Blue Gene is one of them.
5840      */
5841     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
5842     
5843     if (cr->npmenodes > 0)
5844     {
5845         /* Split the communicator into a PP and PME part */
5846         split_communicator(fplog,cr,dd_node_order,CartReorder);
5847         if (comm->bCartesianPP_PME)
5848         {
5849             /* We (possibly) reordered the nodes in split_communicator,
5850              * so it is no longer required in make_pp_communicator.
5851              */
5852             CartReorder = FALSE;
5853         }
5854     }
5855     else
5856     {
5857         /* All nodes do PP and PME */
5858 #ifdef GMX_MPI    
5859         /* We do not require separate communicators */
5860         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5861 #endif
5862     }
5863     
5864     if (cr->duty & DUTY_PP)
5865     {
5866         /* Copy or make a new PP communicator */
5867         make_pp_communicator(fplog,cr,CartReorder);
5868     }
5869     else
5870     {
5871         receive_ddindex2simnodeid(cr);
5872     }
5873     
5874     if (!(cr->duty & DUTY_PME))
5875     {
5876         /* Set up the communication to our PME node */
5877         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
5878         dd->pme_receive_vir_ener = receive_vir_ener(cr);
5879         if (debug)
5880         {
5881             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
5882                     dd->pme_nodeid,dd->pme_receive_vir_ener);
5883         }
5884     }
5885     else
5886     {
5887         dd->pme_nodeid = -1;
5888     }
5889
5890     if (DDMASTER(dd))
5891     {
5892         dd->ma = init_gmx_domdec_master_t(dd,
5893                                           comm->cgs_gl.nr,
5894                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
5895     }
5896 }
5897
5898 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
5899 {
5900     real *slb_frac,tot;
5901     int  i,n;
5902     double dbl;
5903     
5904     slb_frac = NULL;
5905     if (nc > 1 && size_string != NULL)
5906     {
5907         if (fplog)
5908         {
5909             fprintf(fplog,"Using static load balancing for the %s direction\n",
5910                     dir);
5911         }
5912         snew(slb_frac,nc);
5913         tot = 0;
5914         for (i=0; i<nc; i++)
5915         {
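            /* Read the next relative size; %n stores the number of characters
             * consumed so we can advance along the string afterwards.
             */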
5916             dbl = 0;
5917             sscanf(size_string,"%lf%n",&dbl,&n);
5918             if (dbl == 0)
5919             {
5920                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
5921             }
5922             slb_frac[i] = dbl;
5923             size_string += n;
5924             tot += slb_frac[i];
5925         }
5926         /* Normalize */
5927         if (fplog)
5928         {
5929             fprintf(fplog,"Relative cell sizes:");
5930         }
5931         for (i=0; i<nc; i++)
5932         {
5933             slb_frac[i] /= tot;
5934             if (fplog)
5935             {
5936                 fprintf(fplog," %5.3f",slb_frac[i]);
5937             }
5938         }
5939         if (fplog)
5940         {
5941             fprintf(fplog,"\n");
5942         }
5943     }
5944     
5945     return slb_frac;
5946 }
5947
5948 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
5949 {
5950     int n,nmol,ftype;
5951     gmx_mtop_ilistloop_t iloop;
5952     t_ilist *il;
5953     
5954     n = 0;
5955     iloop = gmx_mtop_ilistloop_init(mtop);
5956     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
5957     {
5958         for(ftype=0; ftype<F_NRE; ftype++)
5959         {
5960             if ((interaction_function[ftype].flags & IF_BOND) &&
5961                 NRAL(ftype) >  2)
5962             {
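                /* Each interaction of this type occupies 1 + NRAL(ftype)
                 * entries in the ilist: the parameter type index plus the
                 * atom indices.
                 */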
5963                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
5964             }
5965         }
5966     }
5967
5968     return n;
5969 }
5970
5971 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
5972 {
5973     char *val;
5974     int  nst;
5975     
5976     nst = def;
5977     val = getenv(env_var);
5978     if (val)
5979     {
5980         if (sscanf(val,"%d",&nst) <= 0)
5981         {
5982             nst = 1;
5983         }
5984         if (fplog)
5985         {
5986             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
5987                     env_var,val,nst);
5988         }
5989     }
5990     
5991     return nst;
5992 }
5993
5994 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
5995 {
5996     if (MASTER(cr))
5997     {
5998         fprintf(stderr,"\n%s\n",warn_string);
5999     }
6000     if (fplog)
6001     {
6002         fprintf(fplog,"\n%s\n",warn_string);
6003     }
6004 }
6005
6006 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
6007                                   t_inputrec *ir,FILE *fplog)
6008 {
6009     if (ir->ePBC == epbcSCREW &&
6010         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6011     {
6012         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6013     }
6014
6015     if (ir->ns_type == ensSIMPLE)
6016     {
6017         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6018     }
6019
6020     if (ir->nstlist == 0)
6021     {
6022         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6023     }
6024
6025     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6026     {
6027         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6028     }
6029 }
6030
6031 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6032 {
6033     int  di,d;
6034     real r;
6035
6036     r = ddbox->box_size[XX];
6037     for(di=0; di<dd->ndim; di++)
6038     {
6039         d = dd->dim[di];
6040         /* Check using the initial average cell size */
6041         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6042     }
6043
6044     return r;
6045 }
6046
6047 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6048                              const char *dlb_opt,gmx_bool bRecordLoad,
6049                              unsigned long Flags,t_inputrec *ir)
6050 {
6051     gmx_domdec_t *dd;
6052     int  eDLB=-1;
6053     char buf[STRLEN];
6054
6055     switch (dlb_opt[0])
6056     {
6057     case 'a': eDLB = edlbAUTO; break;
6058     case 'n': eDLB = edlbNO;   break;
6059     case 'y': eDLB = edlbYES;  break;
6060     default: gmx_incons("Unknown dlb_opt");
6061     }
6062
6063     if (Flags & MD_RERUN)
6064     {
6065         return edlbNO;
6066     }
6067
6068     if (!EI_DYNAMICS(ir->eI))
6069     {
6070         if (eDLB == edlbYES)
6071         {
6072             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6073             dd_warning(cr,fplog,buf);
6074         }
6075             
6076         return edlbNO;
6077     }
6078
6079     if (!bRecordLoad)
6080     {
6081         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6082
6083         return edlbNO;
6084     }
6085
6086     if (Flags & MD_REPRODUCIBLE)
6087     {
6088         switch (eDLB)
6089         {
6090         case edlbNO:
6091             break;
6092         case edlbAUTO:
6093             dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6094             eDLB = edlbNO;
6095             break;
6096         case edlbYES:
6097             dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6098             break;
6099         default:
6100             gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6101             break;
6102         }
6103     }
6104
6105     return eDLB;
6106 }
6107
6108 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6109 {
6110     int dim;
6111
6112     dd->ndim = 0;
6113     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6114     {
6115         /* Decomposition order z,y,x */
6116         if (fplog)
6117         {
6118             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6119         }
6120         for(dim=DIM-1; dim>=0; dim--)
6121         {
6122             if (dd->nc[dim] > 1)
6123             {
6124                 dd->dim[dd->ndim++] = dim;
6125             }
6126         }
6127     }
6128     else
6129     {
6130         /* Decomposition order x,y,z */
6131         for(dim=0; dim<DIM; dim++)
6132         {
6133             if (dd->nc[dim] > 1)
6134             {
6135                 dd->dim[dd->ndim++] = dim;
6136             }
6137         }
6138     }
6139 }
6140
6141 static gmx_domdec_comm_t *init_dd_comm()
6142 {
6143     gmx_domdec_comm_t *comm;
6144     int  i;
6145
6146     snew(comm,1);
6147     snew(comm->cggl_flag,DIM*2);
6148     snew(comm->cgcm_state,DIM*2);
6149     for(i=0; i<DIM*2; i++)
6150     {
6151         comm->cggl_flag_nalloc[i]  = 0;
6152         comm->cgcm_state_nalloc[i] = 0;
6153     }
6154     
6155     comm->nalloc_int = 0;
6156     comm->buf_int    = NULL;
6157
6158     vec_rvec_init(&comm->vbuf);
6159
6160     comm->n_load_have    = 0;
6161     comm->n_load_collect = 0;
6162
6163     for(i=0; i<ddnatNR-ddnatZONE; i++)
6164     {
6165         comm->sum_nat[i] = 0;
6166     }
6167     comm->ndecomp = 0;
6168     comm->nload   = 0;
6169     comm->load_step = 0;
6170     comm->load_sum  = 0;
6171     comm->load_max  = 0;
6172     clear_ivec(comm->load_lim);
6173     comm->load_mdf  = 0;
6174     comm->load_pme  = 0;
6175
6176     return comm;
6177 }
6178
6179 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6180                                         unsigned long Flags,
6181                                         ivec nc,
6182                                         real comm_distance_min,real rconstr,
6183                                         const char *dlb_opt,real dlb_scale,
6184                                         const char *sizex,const char *sizey,const char *sizez,
6185                                         gmx_mtop_t *mtop,t_inputrec *ir,
6186                                         matrix box,rvec *x,
6187                                         gmx_ddbox_t *ddbox,
6188                                         int *npme_x,int *npme_y)
6189 {
6190     gmx_domdec_t *dd;
6191     gmx_domdec_comm_t *comm;
6192     int  recload;
6193     int  d,i,j;
6194     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6195     gmx_bool bC;
6196     char buf[STRLEN];
6197     
6198     if (fplog)
6199     {
6200         fprintf(fplog,
6201                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6202     }
6203     
6204     snew(dd,1);
6205
6206     dd->comm = init_dd_comm();
6207     comm = dd->comm;
6210
6211     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6212     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6213     
6214     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6215     comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6216     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6217     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6218     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6219     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6220     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6221     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6222
6223     dd->pme_recv_f_alloc = 0;
6224     dd->pme_recv_f_buf = NULL;
6225
6226     if (dd->bSendRecv2 && fplog)
6227     {
6228         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6229     }
6230     if (comm->eFlop)
6231     {
6232         if (fplog)
6233         {
6234             fprintf(fplog,"Will load balance based on FLOP count\n");
6235         }
6236         if (comm->eFlop > 1)
6237         {
6238             srand(1+cr->nodeid);
6239         }
6240         comm->bRecordLoad = TRUE;
6241     }
6242     else
6243     {
6244         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6245                              
6246     }
6247     
6248     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6249     
6250     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6251     if (fplog)
6252     {
6253         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6254     }
6255     dd->bGridJump = comm->bDynLoadBal;
6256     
6257     if (comm->nstSortCG)
6258     {
6259         if (fplog)
6260         {
6261             if (comm->nstSortCG == 1)
6262             {
6263                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6264             }
6265             else
6266             {
6267                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6268                         comm->nstSortCG);
6269             }
6270         }
6271         snew(comm->sort,1);
6272     }
6273     else
6274     {
6275         if (fplog)
6276         {
6277             fprintf(fplog,"Will not sort the charge groups\n");
6278         }
6279     }
6280     
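    /* If there are more charge groups than molecules, at least one molecule
     * consists of several charge groups, so bonded interactions can connect
     * different charge groups.
     */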
6281     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6282     if (comm->bInterCGBondeds)
6283     {
6284         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6285     }
6286     else
6287     {
6288         comm->bInterCGMultiBody = FALSE;
6289     }
6290     
6291     dd->bInterCGcons = inter_charge_group_constraints(mtop);
6292
6293     if (ir->rlistlong == 0)
6294     {
6295         /* Set the cut-off to some very large value,
6296          * so we don't need if statements everywhere in the code.
6297          * We use sqrt, since the cut-off is squared in some places.
6298          */
6299         comm->cutoff   = GMX_CUTOFF_INF;
6300     }
6301     else
6302     {
6303         comm->cutoff   = ir->rlistlong;
6304     }
6305     comm->cutoff_mbody = 0;
6306     
6307     comm->cellsize_limit = 0;
6308     comm->bBondComm = FALSE;
6309
6310     if (comm->bInterCGBondeds)
6311     {
6312         if (comm_distance_min > 0)
6313         {
6314             comm->cutoff_mbody = comm_distance_min;
6315             if (Flags & MD_DDBONDCOMM)
6316             {
6317                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6318             }
6319             else
6320             {
6321                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6322             }
6323             r_bonded_limit = comm->cutoff_mbody;
6324         }
6325         else if (ir->bPeriodicMols)
6326         {
6327             /* Can not easily determine the required cut-off */
6328             dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6329             comm->cutoff_mbody = comm->cutoff/2;
6330             r_bonded_limit = comm->cutoff_mbody;
6331         }
6332         else
6333         {
6334             if (MASTER(cr))
6335             {
6336                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6337                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6338             }
6339             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6340             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6341
6342             /* We use an initial margin of 10% for the minimum cell size,
6343              * except when we are just below the non-bonded cut-off.
6344              */
6345             if (Flags & MD_DDBONDCOMM)
6346             {
6347                 if (max(r_2b,r_mb) > comm->cutoff)
6348                 {
6349                     r_bonded       = max(r_2b,r_mb);
6350                     r_bonded_limit = 1.1*r_bonded;
6351                     comm->bBondComm = TRUE;
6352                 }
6353                 else
6354                 {
6355                     r_bonded       = r_mb;
6356                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6357                 }
6358                 /* We determine cutoff_mbody later */
6359             }
6360             else
6361             {
6362                 /* No special bonded communication,
6363                  * simply increase the DD cut-off.
6364                  */
6365                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6366                 comm->cutoff_mbody = r_bonded_limit;
6367                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6368             }
6369         }
6370         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6371         if (fplog)
6372         {
6373             fprintf(fplog,
6374                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6375                     comm->cellsize_limit);
6376         }
6377     }
6378
6379     if (dd->bInterCGcons && rconstr <= 0)
6380     {
6381         /* There is a cell size limit due to the constraints (P-LINCS) */
6382         rconstr = constr_r_max(fplog,mtop,ir);
6383         if (fplog)
6384         {
6385             fprintf(fplog,
6386                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6387                     rconstr);
6388             if (rconstr > comm->cellsize_limit)
6389             {
6390                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6391             }
6392         }
6393     }
6394     else if (rconstr > 0 && fplog)
6395     {
6396         /* Here we do not check for dd->bInterCGcons,
6397          * because one can also set a cell size limit for virtual sites only
6398          * and at this point we don't know yet if there are intercg v-sites.
6399          */
6400         fprintf(fplog,
6401                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6402                 rconstr);
6403     }
6404     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6405
6406     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6407
6408     if (nc[XX] > 0)
6409     {
6410         copy_ivec(nc,dd->nc);
6411         set_dd_dim(fplog,dd);
6412         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6413
6414         if (cr->npmenodes == -1)
6415         {
6416             cr->npmenodes = 0;
6417         }
6418         acs = average_cellsize_min(dd,ddbox);
6419         if (acs < comm->cellsize_limit)
6420         {
6421             if (fplog)
6422             {
6423                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6424             }
6425             gmx_fatal_collective(FARGS,cr,NULL,
6426                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6427                                  acs,comm->cellsize_limit);
6428         }
6429     }
6430     else
6431     {
6432         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6433
6434         /* We need to choose the optimal DD grid and possibly PME nodes */
6435         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6436                                comm->eDLB!=edlbNO,dlb_scale,
6437                                comm->cellsize_limit,comm->cutoff,
6438                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6439         
6440         if (dd->nc[XX] == 0)
6441         {
6442             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6443             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6444                     !bC ? "-rdd" : "-rcon",
6445                     comm->eDLB!=edlbNO ? " or -dds" : "",
6446                     bC ? " or your LINCS settings" : "");
6447
6448             gmx_fatal_collective(FARGS,cr,NULL,
6449                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6450                                  "%s\n"
6451                                  "Look in the log file for details on the domain decomposition",
6452                                  cr->nnodes-cr->npmenodes,limit,buf);
6453         }
6454         set_dd_dim(fplog,dd);
6455     }
6456
6457     if (fplog)
6458     {
6459         fprintf(fplog,
6460                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6461                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6462     }
6463     
6464     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6465     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6466     {
6467         gmx_fatal_collective(FARGS,cr,NULL,
6468                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6469                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6470     }
6471     if (cr->npmenodes > dd->nnodes)
6472     {
6473         gmx_fatal_collective(FARGS,cr,NULL,
6474                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6475     }
6476     if (cr->npmenodes > 0)
6477     {
6478         comm->npmenodes = cr->npmenodes;
6479     }
6480     else
6481     {
6482         comm->npmenodes = dd->nnodes;
6483     }
6484
6485     if (EEL_PME(ir->coulombtype))
6486     {
6487         /* The following choices should match those
6488          * in comm_cost_est in domdec_setup.c.
6489          * Note that here the checks have to take into account
6490          * that the decomposition might occur in a different order than xyz
6491          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6492          * in which case they will not match those in comm_cost_est,
6493          * but since that is mainly for testing purposes that's fine.
6494          */
6495         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6496             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6497             getenv("GMX_PMEONEDD") == NULL)
6498         {
6499             comm->npmedecompdim = 2;
6500             comm->npmenodes_x   = dd->nc[XX];
6501             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6502         }
6503         else
6504         {
6505             /* In case nc is 1 in both x and y we could still choose to
6506              * decompose pme in y instead of x, but we use x for simplicity.
6507              */
6508             comm->npmedecompdim = 1;
6509             if (dd->dim[0] == YY)
6510             {
6511                 comm->npmenodes_x = 1;
6512                 comm->npmenodes_y = comm->npmenodes;
6513             }
6514             else
6515             {
6516                 comm->npmenodes_x = comm->npmenodes;
6517                 comm->npmenodes_y = 1;
6518             }
6519         }    
6520         if (fplog)
6521         {
6522             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6523                     comm->npmenodes_x,comm->npmenodes_y,1);
6524         }
6525     }
6526     else
6527     {
6528         comm->npmedecompdim = 0;
6529         comm->npmenodes_x   = 0;
6530         comm->npmenodes_y   = 0;
6531     }
6532     
6533     /* Technically we don't need both of these,
6534      * but it simplifies the code not to have to recalculate them.
6535      */
6536     *npme_x = comm->npmenodes_x;
6537     *npme_y = comm->npmenodes_y;
6538         
6539     snew(comm->slb_frac,DIM);
6540     if (comm->eDLB == edlbNO)
6541     {
6542         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6543         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6544         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6545     }
6546
6547     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6548     {
6549         if (comm->bBondComm || comm->eDLB != edlbNO)
6550         {
6551             /* Set the bonded communication distance to halfway
6552              * the minimum and the maximum,
6553              * since the extra communication cost is nearly zero.
6554              */
6555             acs = average_cellsize_min(dd,ddbox);
6556             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6557             if (comm->eDLB != edlbNO)
6558             {
6559                 /* Check if this does not limit the scaling */
6560                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6561             }
6562             if (!comm->bBondComm)
6563             {
6564                 /* Without bBondComm do not go beyond the n.b. cut-off */
6565                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6566                 if (comm->cellsize_limit >= comm->cutoff)
6567                 {
6568                     /* We don't lose much efficiency
6569                      * by increasing it to the n.b. cut-off.
6570                      * It can even be slightly faster, because we need
6571                      * fewer checks for the communication setup.
6572                      */
6573                     comm->cutoff_mbody = comm->cutoff;
6574                 }
6575             }
6576             /* Check if we did not end up below our original limit */
6577             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6578
6579             if (comm->cutoff_mbody > comm->cellsize_limit)
6580             {
6581                 comm->cellsize_limit = comm->cutoff_mbody;
6582             }
6583         }
6584         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6585     }
6586
6587     if (debug)
6588     {
6589         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6590                 "cellsize limit %f\n",
6591                 comm->bBondComm,comm->cellsize_limit);
6592     }
6593     
6594     if (MASTER(cr))
6595     {
6596         check_dd_restrictions(cr,dd,ir,fplog);
6597     }
6598
6599     comm->globalcomm_step = INT_MIN;
6600     dd->ddp_count = 0;
6601
6602     clear_dd_cycle_counts(dd);
6603
6604     return dd;
6605 }
6606
6607 static void set_dlb_limits(gmx_domdec_t *dd)
6608 {
6610     int d;
6611
6612     for(d=0; d<dd->ndim; d++)
6613     {
6614         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6615         dd->comm->cellsize_min[dd->dim[d]] =
6616             dd->comm->cellsize_min_dlb[dd->dim[d]];
6617     }
6618 }
6619
6620
6621 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6622 {
6623     gmx_domdec_t *dd;
6624     gmx_domdec_comm_t *comm;
6625     real cellsize_min;
6626     int  d,nc,i;
6627     char buf[STRLEN];
6628     
6629     dd = cr->dd;
6630     comm = dd->comm;
6631     
6632     if (fplog)
6633     {
6634         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6635     }
6636
6637     cellsize_min = comm->cellsize_min[dd->dim[0]];
6638     for(d=1; d<dd->ndim; d++)
6639     {
6640         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6641     }
6642
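         /* Only turn on DLB when the current minimum cell size has at least
          * a 5% margin above the cell size limit.
          */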
6643     if (cellsize_min < comm->cellsize_limit*1.05)
6644     {
6645         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6646
6647         /* Change DLB from "auto" to "no". */
6648         comm->eDLB = edlbNO;
6649
6650         return;
6651     }
6652
6653     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6654     comm->bDynLoadBal = TRUE;
6655     dd->bGridJump = TRUE;
6656     
6657     set_dlb_limits(dd);
6658
6659     /* We can set the required cell size info here,
6660      * so we do not need to communicate this.
6661      * The grid is completely uniform.
6662      */
6663     for(d=0; d<dd->ndim; d++)
6664     {
6665         if (comm->root[d])
6666         {
6667             comm->load[d].sum_m = comm->load[d].sum;
6668
6669             nc = dd->nc[dd->dim[d]];
6670             for(i=0; i<nc; i++)
6671             {
6672                 comm->root[d]->cell_f[i]    = i/(real)nc;
6673                 if (d > 0)
6674                 {
6675                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6676                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6677                 }
6678             }
6679             comm->root[d]->cell_f[nc] = 1.0;
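                 /* Illustration: for nc = 4 the uniform boundaries are
                  * cell_f = { 0, 0.25, 0.5, 0.75, 1.0 }.
                  */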
6680         }
6681     }
6682 }
6683
6684 static char *init_bLocalCG(gmx_mtop_t *mtop)
6685 {
6686     int  ncg,cg;
6687     char *bLocalCG;
6688     
6689     ncg = ncg_mtop(mtop);
6690     snew(bLocalCG,ncg);
6691     for(cg=0; cg<ncg; cg++)
6692     {
6693         bLocalCG[cg] = FALSE;
6694     }
6695
6696     return bLocalCG;
6697 }
6698
6699 void dd_init_bondeds(FILE *fplog,
6700                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6701                      gmx_vsite_t *vsite,gmx_constr_t constr,
6702                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6703 {
6704     gmx_domdec_comm_t *comm;
6705     gmx_bool bBondComm;
6706     int  d;
6707
6708     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6709
6710     comm = dd->comm;
6711
6712     if (comm->bBondComm)
6713     {
6714         /* Communicate atoms beyond the cut-off for bonded interactions */
6716
6717         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6718
6719         comm->bLocalCG = init_bLocalCG(mtop);
6720     }
6721     else
6722     {
6723         /* Only communicate atoms based on cut-off */
6724         comm->cglink   = NULL;
6725         comm->bLocalCG = NULL;
6726     }
6727 }
6728
6729 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6730                               t_inputrec *ir,
6731                               gmx_bool bDynLoadBal,real dlb_scale,
6732                               gmx_ddbox_t *ddbox)
6733 {
6734     gmx_domdec_comm_t *comm;
6735     int  d;
6736     ivec np;
6737     real limit,shrink;
6738     char buf[64];
6739
6740     if (fplog == NULL)
6741     {
6742         return;
6743     }
6744
6745     comm = dd->comm;
6746
6747     if (bDynLoadBal)
6748     {
6749         fprintf(fplog,"The maximum number of communication pulses is:");
6750         for(d=0; d<dd->ndim; d++)
6751         {
6752             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6753         }
6754         fprintf(fplog,"\n");
6755         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6756         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6757         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6758         for(d=0; d<DIM; d++)
6759         {
6760             if (dd->nc[d] > 1)
6761             {
6762                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6763                 {
6764                     shrink = 0;
6765                 }
6766                 else
6767                 {
6768                     shrink =
6769                         comm->cellsize_min_dlb[d]/
6770                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6771                 }
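                     /* shrink is the minimum allowed cell size relative to
                      * the average (uniform) cell size along this dimension.
                      */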
6772                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6773             }
6774         }
6775         fprintf(fplog,"\n");
6776     }
6777     else
6778     {
6779         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6780         fprintf(fplog,"The initial number of communication pulses is:");
6781         for(d=0; d<dd->ndim; d++)
6782         {
6783             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
6784         }
6785         fprintf(fplog,"\n");
6786         fprintf(fplog,"The initial domain decomposition cell size is:");
6787         for(d=0; d<DIM; d++)
             {
6788             if (dd->nc[d] > 1)
6789             {
6790                 fprintf(fplog," %c %.2f nm",
6791                         dim2char(d),dd->comm->cellsize_min[d]);
6792             }
6793         }
6794         fprintf(fplog,"\n\n");
6795     }
6796     
6797     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
6798     {
6799         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
6800         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6801                 "non-bonded interactions","",comm->cutoff);
6802
6803         if (bDynLoadBal)
6804         {
6805             limit = dd->comm->cellsize_limit;
6806         }
6807         else
6808         {
6809             if (dynamic_dd_box(ddbox,ir))
6810             {
6811                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
6812             }
6813             limit = dd->comm->cellsize_min[XX];
6814             for(d=1; d<DIM; d++)
6815             {
6816                 limit = min(limit,dd->comm->cellsize_min[d]);
6817             }
6818         }
6819
6820         if (comm->bInterCGBondeds)
6821         {
6822             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6823                     "two-body bonded interactions","(-rdd)",
6824                     max(comm->cutoff,comm->cutoff_mbody));
6825             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6826                     "multi-body bonded interactions","(-rdd)",
6827                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
6828         }
6829         if (dd->vsite_comm)
6830         {
6831             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6832                     "virtual site constructions","(-rcon)",limit);
6833         }
6834         if (dd->constraint_comm)
6835         {
6836             sprintf(buf,"atoms separated by up to %d constraints",
6837                     1+ir->nProjOrder);
6838             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6839                     buf,"(-rcon)",limit);
6840         }
6841         fprintf(fplog,"\n");
6842     }
6843     
6844     fflush(fplog);
6845 }
6846
6847 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
6848                        t_inputrec *ir,t_forcerec *fr,
6849                        gmx_ddbox_t *ddbox)
6850 {
6851     gmx_domdec_comm_t *comm;
6852     int  d,dim,npulse,npulse_d_max,npulse_d;
6853     gmx_bool bNoCutOff;
6854     int  natoms_tot;
6855     real vol_frac;
6856
6857     comm = dd->comm;
6858
6859     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6860
6861     if (EEL_PME(ir->coulombtype))
6862     {
6863         init_ddpme(dd,&comm->ddpme[0],0);
6864         if (comm->npmedecompdim >= 2)
6865         {
6866             init_ddpme(dd,&comm->ddpme[1],1);
6867         }
6868     }
6869     else
6870     {
6871         comm->npmenodes = 0;
6872         if (dd->pme_nodeid >= 0)
6873         {
6874             gmx_fatal_collective(FARGS,NULL,dd,
6875                                  "Cannot have separate PME nodes without PME electrostatics");
6876         }
6877     }
6878     
6879     /* If each molecule is a single charge group
6880      * or we use domain decomposition for each periodic dimension,
6881      * we do not need to take pbc into account for the bonded interactions.
6882      */
6883     if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
6884         (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY)))
6885     {
6886         fr->bMolPBC = FALSE;
6887     }
6888     else
6889     {
6890         fr->bMolPBC = TRUE;
6891     }
6892         
6893     if (debug)
6894     {
6895         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
6896     }
6897     if (comm->eDLB != edlbNO)
6898     {
6899         /* Determine the maximum number of comm. pulses in one dimension */
6900         
6901         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6902         
6903         /* Determine the maximum required number of grid pulses */
6904         if (comm->cellsize_limit >= comm->cutoff)
6905         {
6906             /* Only a single pulse is required */
6907             npulse = 1;
6908         }
6909         else if (!bNoCutOff && comm->cellsize_limit > 0)
6910         {
6911             /* We round down slightly here to avoid overhead due to the latency
6912              * of extra communication calls when the cut-off
6913              * would be only slightly longer than the cell size.
6914              * Later cellsize_limit is redetermined,
6915              * so we cannot miss interactions due to this rounding.
6916              */
6917             npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
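                 /* Illustration: cutoff/cellsize_limit = 2.03 gives
                  * (int)(0.96 + 2.03) = 2 pulses instead of ceil(2.03) = 3,
                  * while 2.4 still gives (int)(0.96 + 2.4) = 3 pulses.
                  */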
6918         }
6919         else
6920         {
6921             /* There is no cell size limit */
6922             npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
6923         }
6924
6925         if (!bNoCutOff && npulse > 1)
6926         {
6927             /* See if we can do with less pulses, based on dlb_scale */
6928             npulse_d_max = 0;
6929             for(d=0; d<dd->ndim; d++)
6930             {
6931                 dim = dd->dim[d];
6932                 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6933                                  /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6934                 npulse_d_max = max(npulse_d_max,npulse_d);
6935             }
6936             npulse = min(npulse,npulse_d_max);
6937         }
6938         
6939         /* This env var can override npulse */
6940         d = dd_nst_env(fplog,"GMX_DD_NPULSE",0);
6941         if (d > 0)
6942         {
6943             npulse = d;
6944         }
6945
6946         comm->maxpulse = 1;
6947         comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
6948         for(d=0; d<dd->ndim; d++)
6949         {
6950             comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
6951             comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6952             snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
6953             comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
6954             if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6955             {
6956                 comm->bVacDLBNoLimit = FALSE;
6957             }
6958         }
6959         
6960         /* cellsize_limit is set for LINCS in init_domain_decomposition */
6961         if (!comm->bVacDLBNoLimit)
6962         {
6963             comm->cellsize_limit = max(comm->cellsize_limit,
6964                                        comm->cutoff/comm->maxpulse);
6965         }
6966         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6967         /* Set the minimum cell size for each DD dimension */
6968         for(d=0; d<dd->ndim; d++)
6969         {
6970             if (comm->bVacDLBNoLimit ||
6971                 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
6972             {
6973                 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
6974             }
6975             else
6976             {
6977                 comm->cellsize_min_dlb[dd->dim[d]] =
6978                     comm->cutoff/comm->cd[d].np_dlb;
6979             }
6980         }
6981         if (comm->cutoff_mbody <= 0)
6982         {
6983             comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
6984         }
6985         if (comm->bDynLoadBal)
6986         {
6987             set_dlb_limits(dd);
6988         }
6989     }
6990     
6991     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
6992     if (comm->eDLB == edlbAUTO)
6993     {
6994         if (fplog)
6995         {
6996             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
6997         }
6998         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
6999     }
7000
7001     if (ir->ePBC == epbcNONE)
7002     {
7003         vol_frac = 1 - 1/(double)dd->nnodes;
7004     }
7005     else
7006     {
7007         vol_frac =
7008             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7009     }
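         /* vol_frac estimates the fraction of the system volume covered by
          * the home zone plus the communicated zones of one node; it is used
          * below to size the global-to-local atom lookup.
          */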
7010     if (debug)
7011     {
7012         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7013     }
7014     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7015    
7016     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7017 }
7018
7019 static void merge_cg_buffers(int ncell,
7020                              gmx_domdec_comm_dim_t *cd, int pulse,
7021                              int  *ncg_cell,
7022                              int  *index_gl, int  *recv_i,
7023                              rvec *cg_cm,    rvec *recv_vr,
7024                              int *cgindex,
7025                              cginfo_mb_t *cginfo_mb,int *cginfo)
7026 {
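         /* Merge the charge groups received out of place in this pulse into
          * the home arrays: first shift the data stored by previous pulses to
          * make room, then append the newly received global indices, positions
          * and cg info, and update the zone cg counts.
          */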
7027     gmx_domdec_ind_t *ind,*ind_p;
7028     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7029     int shift,shift_at;
7030     
7031     ind = &cd->ind[pulse];
7032     
7033     /* First correct the already stored data */
7034     shift = ind->nrecv[ncell];
7035     for(cell=ncell-1; cell>=0; cell--)
7036     {
7037         shift -= ind->nrecv[cell];
7038         if (shift > 0)
7039         {
7040             /* Move the cg's already present from previous grid pulses */
7041             cg0 = ncg_cell[ncell+cell];
7042             cg1 = ncg_cell[ncell+cell+1];
7043             cgindex[cg1+shift] = cgindex[cg1];
7044             for(cg=cg1-1; cg>=cg0; cg--)
7045             {
7046                 index_gl[cg+shift] = index_gl[cg];
7047                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7048                 cgindex[cg+shift] = cgindex[cg];
7049                 cginfo[cg+shift] = cginfo[cg];
7050             }
7051             /* Correct the already stored send indices for the shift */
7052             for(p=1; p<=pulse; p++)
7053             {
7054                 ind_p = &cd->ind[p];
7055                 cg0 = 0;
7056                 for(c=0; c<cell; c++)
7057                 {
7058                     cg0 += ind_p->nsend[c];
7059                 }
7060                 cg1 = cg0 + ind_p->nsend[cell];
7061                 for(cg=cg0; cg<cg1; cg++)
7062                 {
7063                     ind_p->index[cg] += shift;
7064                 }
7065             }
7066         }
7067     }
7068
7069     /* Merge in the communicated buffers */
7070     shift = 0;
7071     shift_at = 0;
7072     cg0 = 0;
7073     for(cell=0; cell<ncell; cell++)
7074     {
7075         cg1 = ncg_cell[ncell+cell+1] + shift;
7076         if (shift_at > 0)
7077         {
7078             /* Correct the old cg indices */
7079             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7080             {
7081                 cgindex[cg+1] += shift_at;
7082             }
7083         }
7084         for(cg=0; cg<ind->nrecv[cell]; cg++)
7085         {
7086             /* Copy this charge group from the buffer */
7087             index_gl[cg1] = recv_i[cg0];
7088             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7089             /* Add it to the cgindex */
7090             cg_gl = index_gl[cg1];
7091             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7092             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7093             cgindex[cg1+1] = cgindex[cg1] + nat;
7094             cg0++;
7095             cg1++;
7096             shift_at += nat;
7097         }
7098         shift += ind->nrecv[cell];
7099         ncg_cell[ncell+cell+1] = cg1;
7100     }
7101 }
7102
7103 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7104                                int nzone,int cg0,const int *cgindex)
7105 {
7106     int cg,zone,p;
7107     
7108     /* Store the atom block boundaries for easy copying of communication buffers
7109      */
7110     cg = cg0;
7111     for(zone=0; zone<nzone; zone++)
7112     {
7113         for(p=0; p<cd->np; p++)
             {
7114             cd->ind[p].cell2at0[zone] = cgindex[cg];
7115             cg += cd->ind[p].nrecv[zone];
7116             cd->ind[p].cell2at1[zone] = cgindex[cg];
7117         }
7118     }
7119 }
7120
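     /* Returns TRUE when at least one charge group linked to cg_gl
      * is not marked as local in bLocalCG.
      */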
7121 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7122 {
7123     int  i;
7124     gmx_bool bMiss;
7125
7126     bMiss = FALSE;
7127     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7128     {
7129         if (!bLocalCG[link->a[i]])
7130         {
7131             bMiss = TRUE;
7132         }
7133     }
7134
7135     return bMiss;
7136 }
7137
7138 static void setup_dd_communication(gmx_domdec_t *dd,
7139                                    matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr)
7140 {
7141     int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot;
7142     int nzone,nzone_send,zone,zonei,cg0,cg1;
7143     int c,i,j,cg,cg_gl,nrcg;
7144     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7145     gmx_domdec_comm_t *comm;
7146     gmx_domdec_zones_t *zones;
7147     gmx_domdec_comm_dim_t *cd;
7148     gmx_domdec_ind_t *ind;
7149     cginfo_mb_t *cginfo_mb;
7150     gmx_bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew;
7151     real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh;
7152     rvec rb,rn;
7153     real corner[DIM][4],corner_round_0=0,corner_round_1[4];
7154     real bcorner[DIM],bcorner_round_1=0;
7155     ivec tric_dist;
7156     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7157     real skew_fac2_d,skew_fac_01;
7158     rvec sf2_round;
7159     int  nsend,nat;
7160     
7161     if (debug)
7162     {
7163         fprintf(debug,"Setting up DD communication\n");
7164     }
7165     
7166     comm  = dd->comm;
7167     cg_cm = fr->cg_cm;
7168
7169     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7170     {
7171         dim = dd->dim[dim_ind];
7172
7173         /* Check if we need to use triclinic distances */
7174         tric_dist[dim_ind] = 0;
7175         for(i=0; i<=dim_ind; i++)
7176         {
7177             if (ddbox->tric_dir[dd->dim[i]])
7178             {
7179                 tric_dist[dim_ind] = 1;
7180             }
7181         }
7182     }
7183
7184     bBondComm = comm->bBondComm;
7185
7186     /* Do we need to determine extra distances for multi-body bondeds? */
7187     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7188     
7189     /* Do we need to determine extra distances for only two-body bondeds? */
7190     bDist2B = (bBondComm && !bDistMB);
7191
7192     r_comm2  = sqr(comm->cutoff);
7193     r_bcomm2 = sqr(comm->cutoff_mbody);
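         /* Compare squared distances below, so no sqrt is needed in the inner loops */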
7194
7195     if (debug)
7196     {
7197         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7198     }
7199
7200     zones = &comm->zones;
7201     
7202     dim0 = dd->dim[0];
7203     /* The first dimension is equal for all cells */
7204     corner[0][0] = comm->cell_x0[dim0];
7205     if (bDistMB)
7206     {
7207         bcorner[0] = corner[0][0];
7208     }
7209     if (dd->ndim >= 2)
7210     {
7211         dim1 = dd->dim[1];
7212         /* This cell row is only seen from the first row */
7213         corner[1][0] = comm->cell_x0[dim1];
7214         /* All rows can see this row */
7215         corner[1][1] = comm->cell_x0[dim1];
7216         if (dd->bGridJump)
7217         {
7218             corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7219             if (bDistMB)
7220             {
7221                 /* For the multi-body distance we need the maximum */
7222                 bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7223             }
7224         }
7225         /* Set the upper-right corner for rounding */
7226         corner_round_0 = comm->cell_x1[dim0];
7227         
7228         if (dd->ndim >= 3)
7229         {
7230             dim2 = dd->dim[2];
7231             for(j=0; j<4; j++)
7232             {
7233                 corner[2][j] = comm->cell_x0[dim2];
7234             }
7235             if (dd->bGridJump)
7236             {
7237                 /* Use the maximum of the i-cells that see a j-cell */
7238                 for(i=0; i<zones->nizone; i++)
7239                 {
7240                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7241                     {
7242                         if (j >= 4)
7243                         {
7244                             corner[2][j-4] =
7245                                 max(corner[2][j-4],
7246                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7247                         }
7248                     }
7249                 }
7250                 if (bDistMB)
7251                 {
7252                     /* For the multi-body distance we need the maximum */
7253                     bcorner[2] = comm->cell_x0[dim2];
7254                     for(i=0; i<2; i++)
7255                     {
7256                         for(j=0; j<2; j++)
7257                         {
7258                             bcorner[2] = max(bcorner[2],
7259                                              comm->zone_d2[i][j].p1_0);
7260                         }
7261                     }
7262                 }
7263             }
7264             
7265             /* Set the upper-right corner for rounding */
7266             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7267              * Only cell (0,0,0) can see cell 7 (1,1,1)
7268              */
7269             corner_round_1[0] = comm->cell_x1[dim1];
7270             corner_round_1[3] = comm->cell_x1[dim1];
7271             if (dd->bGridJump)
7272             {
7273                 corner_round_1[0] = max(comm->cell_x1[dim1],
7274                                         comm->zone_d1[1].mch1);
7275                 if (bDistMB)
7276                 {
7277                     /* For the multi-body distance we need the maximum */
7278                     bcorner_round_1 = max(comm->cell_x1[dim1],
7279                                           comm->zone_d1[1].p1_1);
7280                 }
7281             }
7282         }
7283     }
7284     
7285     /* Triclinic stuff */
7286     normal = ddbox->normal;
7287     skew_fac_01 = 0;
7288     if (dd->ndim >= 2)
7289     {
7290         v_0 = ddbox->v[dim0];
7291         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7292         {
7293             /* Determine the coupling coefficient for the distances
7294              * to the cell planes along dim0 and dim1 through dim2.
7295              * This is required for correct rounding.
7296              */
7297             skew_fac_01 =
7298                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7299             if (debug)
7300             {
7301                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7302             }
7303         }
7304     }
7305     if (dd->ndim >= 3)
7306     {
7307         v_1 = ddbox->v[dim1];
7308     }
7309     
7310     zone_cg_range = zones->cg_range;
7311     index_gl = dd->index_gl;
7312     cgindex  = dd->cgindex;
7313     cginfo_mb = fr->cginfo_mb;
7314     
7315     zone_cg_range[0]   = 0;
7316     zone_cg_range[1]   = dd->ncg_home;
7317     comm->zone_ncg1[0] = dd->ncg_home;
7318     pos_cg             = dd->ncg_home;
7319     
7320     nat_tot = dd->nat_home;
7321     nzone = 1;
7322     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7323     {
7324         dim = dd->dim[dim_ind];
7325         cd = &comm->cd[dim_ind];
7326         
7327         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7328         {
7329             /* No pbc in this dimension, the first node should not communicate */
7330             nzone_send = 0;
7331         }
7332         else
7333         {
7334             nzone_send = nzone;
7335         }
7336
7337         bScrew = (dd->bScrewPBC && dim == XX);
7338         
7339         v_d = ddbox->v[dim];
7340         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7341
7342         cd->bInPlace = TRUE;
7343         for(p=0; p<cd->np; p++)
7344         {
7345             /* Only atoms communicated in the first pulse are used
7346              * for multi-body bonded interactions or for bBondComm.
7347              */
7348             bDistBonded   = ((bDistMB || bDist2B) && p == 0);
7349             bDistMB_pulse = (bDistMB && bDistBonded);
7350
7351             ind = &cd->ind[p];
7352             nsend = 0;
7353             nat = 0;
7354             for(zone=0; zone<nzone_send; zone++)
7355             {
7356                 if (tric_dist[dim_ind] && dim_ind > 0)
7357                 {
7358                     /* Determine slightly more optimized skew_fac's
7359                      * for rounding.
7360                      * This reduces the number of communicated atoms
7361                      * by about 10% for 3D DD of rhombic dodecahedra.
7362                      */
7363                     for(dimd=0; dimd<dim; dimd++)
7364                     {
7365                         sf2_round[dimd] = 1;
7366                         if (ddbox->tric_dir[dimd])
7367                         {
7368                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7369                             {
7370                                 /* If we are shifted in dimension i
7371                                  * and the cell plane is tilted forward
7372                                  * in dimension i, skip this coupling.
7373                                  */
7374                                 if (!(zones->shift[nzone+zone][i] &&
7375                                       ddbox->v[dimd][i][dimd] >= 0))
7376                                 {
7377                                     sf2_round[dimd] +=
7378                                         sqr(ddbox->v[dimd][i][dimd]);
7379                                 }
7380                             }
7381                             sf2_round[dimd] = 1/sf2_round[dimd];
7382                         }
7383                     }
7384                 }
7385
7386                 zonei = zone_perm[dim_ind][zone];
7387                 if (p == 0)
7388                 {
7389                     /* Here we permute the zones to obtain a convenient order
7390                      * for neighbor searching
7391                      */
7392                     cg0 = zone_cg_range[zonei];
7393                     cg1 = zone_cg_range[zonei+1];
7394                 }
7395                 else
7396                 {
7397                     /* Look only at the cg's received in the previous grid pulse
7398                      */
7399                     cg1 = zone_cg_range[nzone+zone+1];
7400                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
7401                 }
7402                 ind->nsend[zone] = 0;
7403                 for(cg=cg0; cg<cg1; cg++)
7404                 {
7405                     r2  = 0;
7406                     rb2 = 0;
7407                     if (tric_dist[dim_ind] == 0)
7408                     {
7409                         /* Rectangular direction, easy */
7410                         r = cg_cm[cg][dim] - corner[dim_ind][zone];
7411                         if (r > 0)
7412                         {
7413                             r2 += r*r;
7414                         }
7415                         if (bDistMB_pulse)
7416                         {
7417                             r = cg_cm[cg][dim] - bcorner[dim_ind];
7418                             if (r > 0)
7419                             {
7420                                 rb2 += r*r;
7421                             }
7422                         }
7423                         /* Rounding gives at most a 16% reduction
7424                          * in communicated atoms
7425                          */
7426                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7427                         {
7428                             r = cg_cm[cg][dim0] - corner_round_0;
7429                             /* This is the first dimension, so always r >= 0 */
7430                             r2 += r*r;
7431                             if (bDistMB_pulse)
7432                             {
7433                                 rb2 += r*r;
7434                             }
7435                         }
7436                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7437                         {
7438                             r = cg_cm[cg][dim1] - corner_round_1[zone];
7439                             if (r > 0)
7440                             {
7441                                 r2 += r*r;
7442                             }
7443                             if (bDistMB_pulse)
7444                             {
7445                                 r = cg_cm[cg][dim1] - bcorner_round_1;
7446                                 if (r > 0)
7447                                 {
7448                                     rb2 += r*r;
7449                                 }
7450                             }
7451                         }
7452                     }
7453                     else
7454                     {
7455                         /* Triclinic direction, more complicated */
7456                         clear_rvec(rn);
7457                         clear_rvec(rb);
7458                         /* Rounding, conservative as the skew_fac multiplication
7459                          * will slightly underestimate the distance.
7460                          */
7461                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7462                         {
7463                             rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
7464                             for(i=dim0+1; i<DIM; i++)
7465                             {
7466                                 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7467                             }
7468                             r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7469                             if (bDistMB_pulse)
7470                             {
7471                                 rb[dim0] = rn[dim0];
7472                                 rb2 = r2;
7473                             }
7474                             /* Take care that the cell planes along dim0 might not
7475                              * be orthogonal to those along dim1 and dim2.
7476                              */
7477                             for(i=1; i<=dim_ind; i++)
7478                             {
7479                                 dimd = dd->dim[i];
7480                                 if (normal[dim0][dimd] > 0)
7481                                 {
7482                                     rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7483                                     if (bDistMB_pulse)
7484                                     {
7485                                         rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7486                                     }
7487                                 }
7488                             }
7489                         }
7490                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7491                         {
7492                             rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
7493                             tric_sh = 0;
7494                             for(i=dim1+1; i<DIM; i++)
7495                             {
7496                                 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7497                             }
7498                             rn[dim1] += tric_sh;
7499                             if (rn[dim1] > 0)
7500                             {
7501                                 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7502                                 /* Take care of coupling of the distances
7503                                  * to the planes along dim0 and dim1 through dim2.
7504                                  */
7505                                 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7506                                 /* Take care that the cell planes along dim1
7507                                  * might not be orthogonal to that along dim2.
7508                                  */
7509                                 if (normal[dim1][dim2] > 0)
7510                                 {
7511                                     rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7512                                 }
7513                             }
7514                             if (bDistMB_pulse)
7515                             {
7516                                 rb[dim1] +=
7517                                     cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
7518                                 if (rb[dim1] > 0)
7519                                 {
7520                                     rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7521                                     /* Take care of coupling of the distances
7522                                      * to the planes along dim0 and dim1 through dim2.
7523                                      */
7524                                     rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7525                                     /* Take care that the cell planes along dim1
7526                                      * might not be orthogonal to that along dim2.
7527                                      */
7528                                     if (normal[dim1][dim2] > 0)
7529                                     {
7530                                         rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7531                                     }
7532                                 }
7533                             }
7534                         }
7535                         /* The distance along the communication direction */
7536                         rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
7537                         tric_sh = 0;
7538                         for(i=dim+1; i<DIM; i++)
7539                         {
7540                             tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7541                         }
7542                         rn[dim] += tric_sh;
7543                         if (rn[dim] > 0)
7544                         {
7545                             r2 += rn[dim]*rn[dim]*skew_fac2_d;
7546                             /* Take care of coupling of the distances
7547                              * to the planes along dim0 and dim1 through dim2.
7548                              */
7549                             if (dim_ind == 1 && zonei == 1)
7550                             {
7551                                 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7552                             }
7553                         }
7554                         if (bDistMB_pulse)
7555                         {
7556                             clear_rvec(rb);
7557                             rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
7558                             if (rb[dim] > 0)
7559                             {
7560                                 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7561                                 /* Take care of coupling of the distances
7562                                  * to the planes along dim0 and dim1 through dim2.
7563                                  */
7564                                 if (dim_ind == 1 && zonei == 1)
7565                                 {
7566                                     rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7567                                 }
7568                             }
7569                         }
7570                     }
7571                     
7572                     if (r2 < r_comm2 ||
7573                         (bDistBonded &&
7574                          ((bDistMB && rb2 < r_bcomm2) ||
7575                           (bDist2B && r2  < r_bcomm2)) &&
7576                          (!bBondComm ||
7577                           (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
7578                            missing_link(comm->cglink,index_gl[cg],
7579                                         comm->bLocalCG)))))
7580                     {
7581                         /* Make an index to the local charge groups */
7582                         if (nsend+1 > ind->nalloc)
7583                         {
7584                             ind->nalloc = over_alloc_large(nsend+1);
7585                             srenew(ind->index,ind->nalloc);
7586                         }
7587                         if (nsend+1 > comm->nalloc_int)
7588                         {
7589                             comm->nalloc_int = over_alloc_large(nsend+1);
7590                             srenew(comm->buf_int,comm->nalloc_int);
7591                         }
7592                         ind->index[nsend] = cg;
7593                         comm->buf_int[nsend] = index_gl[cg];
7594                         ind->nsend[zone]++;
7595                         vec_rvec_check_alloc(&comm->vbuf,nsend+1);
7596
7597                         if (dd->ci[dim] == 0)
7598                         {
7599                             /* Correct cg_cm for pbc */
7600                             rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]);
7601                             if (bScrew)
7602                             {
7603                                 comm->vbuf.v[nsend][YY] =
7604                                     box[YY][YY]-comm->vbuf.v[nsend][YY];
7605                                 comm->vbuf.v[nsend][ZZ] =
7606                                     box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
7607                             }
7608                         }
7609                         else
7610                         {
7611                             copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]);
7612                         }
7613                         nsend++;
7614                         nat += cgindex[cg+1] - cgindex[cg];
7615                     }
7616                 }
7617             }
7618             /* Clear the counts in case we do not have pbc */
7619             for(zone=nzone_send; zone<nzone; zone++)
7620             {
7621                 ind->nsend[zone] = 0;
7622             }
7623             ind->nsend[nzone]   = nsend;
7624             ind->nsend[nzone+1] = nat;
7625             /* Communicate the number of cg's and atoms to receive */
7626             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7627                             ind->nsend, nzone+2,
7628                             ind->nrecv, nzone+2);
7629             
7630             /* The rvec buffer is also required for atom buffers of size nsend
7631              * in dd_move_x and dd_move_f.
7632              */
7633             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
7634
7635             if (p > 0)
7636             {
7637                 /* We can receive in place only if all received cg's go into the last zone */
7638                 for(zone=0; zone<nzone-1; zone++)
7639                 {
7640                     if (ind->nrecv[zone] > 0)
7641                     {
7642                         cd->bInPlace = FALSE;
7643                     }
7644                 }
7645                 if (!cd->bInPlace)
7646                 {
7647                     /* The int buffer is only required here for the cg indices */
7648                     if (ind->nrecv[nzone] > comm->nalloc_int2)
7649                     {
7650                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
7651                         srenew(comm->buf_int2,comm->nalloc_int2);
7652                     }
7653                     /* The rvec buffer is also required for atom buffers
7654                      * of size nrecv in dd_move_x and dd_move_f.
7655                      */
7656                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
7657                     vec_rvec_check_alloc(&comm->vbuf2,i);
7658                 }
7659             }
7660             
7661             /* Make space for the global cg indices */
7662             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
7663                 || dd->cg_nalloc == 0)
7664             {
7665                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
7666                 srenew(index_gl,dd->cg_nalloc);
7667                 srenew(cgindex,dd->cg_nalloc+1);
7668             }
7669             /* Communicate the global cg indices */
7670             if (cd->bInPlace)
7671             {
7672                 recv_i = index_gl + pos_cg;
7673             }
7674             else
7675             {
7676                 recv_i = comm->buf_int2;
7677             }
7678             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7679                             comm->buf_int, nsend,
7680                             recv_i,        ind->nrecv[nzone]);
7681
7682             /* Make space for cg_cm */
7683             if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc)
7684             {
7685                 dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]);
7686                 cg_cm = fr->cg_cm;
7687             }
7688             /* Communicate cg_cm */
7689             if (cd->bInPlace)
7690             {
7691                 recv_vr = cg_cm + pos_cg;
7692             }
7693             else
7694             {
7695                 recv_vr = comm->vbuf2.v;
7696             }
7697             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
7698                              comm->vbuf.v, nsend,
7699                              recv_vr,      ind->nrecv[nzone]);
7700             
7701             /* Make the charge group index */
7702             if (cd->bInPlace)
7703             {
7704                 zone = (p == 0 ? 0 : nzone - 1);
7705                 while (zone < nzone)
7706                 {
7707                     for(cg=0; cg<ind->nrecv[zone]; cg++)
7708                     {
7709                         cg_gl = index_gl[pos_cg];
7710                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
7711                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
7712                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
7713                         if (bBondComm)
7714                         {
7715                             /* Update the charge group presence,
7716                              * so we can use it in the next pass of the loop.
7717                              */
7718                             comm->bLocalCG[cg_gl] = TRUE;
7719                         }
7720                         pos_cg++;
7721                     }
7722                     if (p == 0)
7723                     {
7724                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
7725                     }
7726                     zone++;
7727                     zone_cg_range[nzone+zone] = pos_cg;
7728                 }
7729             }
7730             else
7731             {
7732                 /* This part of the code is never executed with bBondComm. */
7733                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
7734                                  index_gl,recv_i,cg_cm,recv_vr,
7735                                  cgindex,fr->cginfo_mb,fr->cginfo);
7736                 pos_cg += ind->nrecv[nzone];
7737             }
7738             nat_tot += ind->nrecv[nzone+1];
7739         }
7740         if (!cd->bInPlace)
7741         {
7742             /* Store the atom block for easy copying of communication buffers */
7743             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
7744         }
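             /* The number of zones doubles with each DD dimension: 1, 2, 4, 8 */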
7745         nzone += nzone;
7746     }
7747     dd->index_gl = index_gl;
7748     dd->cgindex  = cgindex;
7749     
7750     dd->ncg_tot = zone_cg_range[zones->n];
7751     dd->nat_tot = nat_tot;
7752     comm->nat[ddnatHOME] = dd->nat_home;
7753     for(i=ddnatZONE; i<ddnatNR; i++)
7754     {
7755         comm->nat[i] = dd->nat_tot;
7756     }
7757
7758     if (!bBondComm)
7759     {
7760         /* We don't need to update cginfo, since that was already done above.
7761          * So we pass NULL for the forcerec.
7762          */
7763         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
7764                       NULL,comm->bLocalCG);
7765     }
7766
7767     if (debug)
7768     {
7769         fprintf(debug,"Finished setting up DD communication, zones:");
7770         for(c=0; c<zones->n; c++)
7771         {
7772             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
7773         }
7774         fprintf(debug,"\n");
7775     }
7776 }
7777
7778 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
7779 {
7780     int c;
7781     
7782     for(c=0; c<zones->nizone; c++)
7783     {
7784         zones->izone[c].cg1  = zones->cg_range[c+1];
7785         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
7786         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
7787     }
7788 }
7789
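     /* qsort comparison for charge groups: primary key is the ns grid cell,
      * ties are broken on the global index for a reproducible order.
      */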
7790 static int comp_cgsort(const void *a,const void *b)
7791 {
7792     int comp;
7793     
7794     gmx_cgsort_t *cga,*cgb;
7795     cga = (gmx_cgsort_t *)a;
7796     cgb = (gmx_cgsort_t *)b;
7797     
7798     comp = cga->nsc - cgb->nsc;
7799     if (comp == 0)
7800     {
7801         comp = cga->ind_gl - cgb->ind_gl;
7802     }
7803     
7804     return comp;
7805 }
7806
7807 static void order_int_cg(int n,gmx_cgsort_t *sort,
7808                          int *a,int *buf)
7809 {
7810     int i;
7811     
7812     /* Order the data */
7813     for(i=0; i<n; i++)
7814     {
7815         buf[i] = a[sort[i].ind];
7816     }
7817     
7818     /* Copy back to the original array */
7819     for(i=0; i<n; i++)
7820     {
7821         a[i] = buf[i];
7822     }
7823 }
7824
7825 static void order_vec_cg(int n,gmx_cgsort_t *sort,
7826                          rvec *v,rvec *buf)
7827 {
7828     int i;
7829     
7830     /* Order the data */
7831     for(i=0; i<n; i++)
7832     {
7833         copy_rvec(v[sort[i].ind],buf[i]);
7834     }
7835     
7836     /* Copy back to the original array */
7837     for(i=0; i<n; i++)
7838     {
7839         copy_rvec(buf[i],v[i]);
7840     }
7841 }
7842
7843 static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort,
7844                            rvec *v,rvec *buf)
7845 {
7846     int a,atot,cg,cg0,cg1,i;
7847     
7848     /* Order the data */
7849     a = 0;
7850     for(cg=0; cg<ncg; cg++)
7851     {
7852         cg0 = cgindex[sort[cg].ind];
7853         cg1 = cgindex[sort[cg].ind+1];
7854         for(i=cg0; i<cg1; i++)
7855         {
7856             copy_rvec(v[i],buf[a]);
7857             a++;
7858         }
7859     }
7860     atot = a;
7861     
7862     /* Copy back to the original array */
7863     for(a=0; a<atot; a++)
7864     {
7865         copy_rvec(buf[a],v[a]);
7866     }
7867 }
7868
7869 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
7870                          int nsort_new,gmx_cgsort_t *sort_new,
7871                          gmx_cgsort_t *sort1)
7872 {
7873     int i1,i2,i_new;
7874     
7875     /* The new indices are not very ordered, so we qsort them */
7876     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
7877     
7878     /* sort2 is already ordered, so now we can merge the two arrays */
7879     i1 = 0;
7880     i2 = 0;
7881     i_new = 0;
7882     while(i2 < nsort2 || i_new < nsort_new)
7883     {
7884         if (i2 == nsort2)
7885         {
7886             sort1[i1++] = sort_new[i_new++];
7887         }
7888         else if (i_new == nsort_new)
7889         {
7890             sort1[i1++] = sort2[i2++];
7891         }
7892         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
7893                  (sort2[i2].nsc == sort_new[i_new].nsc &&
7894                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
7895         {
7896             sort1[i1++] = sort2[i2++];
7897         }
7898         else
7899         {
7900             sort1[i1++] = sort_new[i_new++];
7901         }
7902     }
7903 }
7904
7905 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
7906                           rvec *cgcm,t_forcerec *fr,t_state *state,
7907                           int ncg_home_old)
7908 {
7909     gmx_domdec_sort_t *sort;
7910     gmx_cgsort_t *cgsort,*sort_i;
7911     int  ncg_new,nsort2,nsort_new,i,cell_index,*ibuf,cgsize;
7912     rvec *vbuf;
7913     
7914     sort = dd->comm->sort;
7915     
7916     if (dd->ncg_home > sort->sort_nalloc)
7917     {
7918         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
7919         srenew(sort->sort1,sort->sort_nalloc);
7920         srenew(sort->sort2,sort->sort_nalloc);
7921     }
7922     
7923     if (ncg_home_old >= 0)
7924     {
7925         /* The charge groups that remained in the same ns grid cell
7926          * are completely ordered. So we can sort efficiently by only
7927          * sorting the charge groups that moved and then merging them
7928          * into the ordered stationary list. */
7929         ncg_new = 0;
7930         nsort2 = 0;
7931         nsort_new = 0;
7932         for(i=0; i<dd->ncg_home; i++)
7933         {
7934             /* Check if this cg did not move to another node */
7935             cell_index = fr->ns.grid->cell_index[i];
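                 /* A cell_index equal to 4*ncells flags a charge group that
                  * moved to another node; those are skipped here.
                  */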
7936             if (cell_index !=  4*fr->ns.grid->ncells)
7937             {
7938                 if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
7939                 {
7940                     /* This cg is new on this node or moved to another ns grid cell */
7941                     if (nsort_new >= sort->sort_new_nalloc)
7942                     {
7943                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
7944                         srenew(sort->sort_new,sort->sort_new_nalloc);
7945                     }
7946                     sort_i = &(sort->sort_new[nsort_new++]);
7947                 }
7948                 else
7949                 {
7950                     /* This cg did not move */
7951                     sort_i = &(sort->sort2[nsort2++]);
7952                 }
7953                 /* Sort on the ns grid cell indices
7954                  * and the global topology index
7955                  */
7956                 sort_i->nsc    = cell_index;
7957                 sort_i->ind_gl = dd->index_gl[i];
7958                 sort_i->ind    = i;
7959                 ncg_new++;
7960             }
7961         }
7962         if (debug)
7963         {
7964             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
7965                     nsort2,nsort_new);
7966         }
7967         /* Sort efficiently */
7968         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1);
7969     }
7970     else
7971     {
7972         cgsort = sort->sort1;
7973         ncg_new = 0;
7974         for(i=0; i<dd->ncg_home; i++)
7975         {
7976             /* Sort on the ns grid cell indices
7977              * and the global topology index
7978              */
7979             cgsort[i].nsc    = fr->ns.grid->cell_index[i];
7980             cgsort[i].ind_gl = dd->index_gl[i];
7981             cgsort[i].ind    = i;
7982             if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
7983             {
7984                 ncg_new++;
7985             }
7986         }
7987         if (debug)
7988         {
7989             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
7990         }
7991         /* Determine the order of the charge groups using qsort */
7992         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
7993     }
7994     cgsort = sort->sort1;
7995     
7996     /* We alloc with the old size, since cgindex is still old */
7997     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
7998     vbuf = dd->comm->vbuf.v;
7999     
8000     /* Remove the charge groups which are no longer at home here */
8001     dd->ncg_home = ncg_new;
8002     
8003     /* Reorder the state */
8004     for(i=0; i<estNR; i++)
8005     {
8006         if (EST_DISTR(i) && (state->flags & (1<<i)))
8007         {
8008             switch (i)
8009             {
8010             case estX:
8011                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->x,vbuf);
8012                 break;
8013             case estV:
8014                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf);
8015                 break;
8016             case estSDX:
8017                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf);
8018                 break;
8019             case estCGP:
8020                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf);
8021                 break;
8022             case estLD_RNG:
8023             case estLD_RNGI:
8024             case estDISRE_INITF:
8025             case estDISRE_RM3TAV:
8026             case estORIRE_INITF:
8027             case estORIRE_DTAV:
8028                 /* No ordering required */
8029                 break;
8030             default:
8031                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8032                 break;
8033             }
8034         }
8035     }
8036     /* Reorder cgcm */
8037     order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8038     
8039     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8040     {
8041         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8042         srenew(sort->ibuf,sort->ibuf_nalloc);
8043     }
8044     ibuf = sort->ibuf;
8045     /* Reorder the global cg index */
8046     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8047     /* Reorder the cginfo */
8048     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8049     /* Rebuild the local cg index */
8050     ibuf[0] = 0;
8051     for(i=0; i<dd->ncg_home; i++)
8052     {
8053         cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8054         ibuf[i+1] = ibuf[i] + cgsize;
8055     }
8056     for(i=0; i<dd->ncg_home+1; i++)
8057     {
8058         dd->cgindex[i] = ibuf[i];
8059     }
8060     /* Set the home atom number */
8061     dd->nat_home = dd->cgindex[dd->ncg_home];
8062     
8063     /* Copy the sorted ns cell indices back to the ns grid struct */
8064     for(i=0; i<dd->ncg_home; i++)
8065     {
8066         fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8067     }
8068     fr->ns.grid->nr = dd->ncg_home;
8069 }
8070
8071 static void add_dd_statistics(gmx_domdec_t *dd)
8072 {
8073     gmx_domdec_comm_t *comm;
8074     int ddnat;
8075     
8076     comm = dd->comm;
8077     
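         /* comm->nat[] holds cumulative atom counts for the communication
          * classes, so the difference of consecutive entries is the number
          * of atoms communicated for a single class.
          */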
8078     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8079     {
8080         comm->sum_nat[ddnat-ddnatZONE] +=
8081             comm->nat[ddnat] - comm->nat[ddnat-1];
8082     }
8083     comm->ndecomp++;
8084 }
8085
8086 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8087 {
8088     gmx_domdec_comm_t *comm;
8089     int ddnat;
8090     
8091     comm = dd->comm;
8092
8093     /* Reset all the statistics and counters for total run counting */
8094     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8095     {
8096         comm->sum_nat[ddnat-ddnatZONE] = 0;
8097     }
8098     comm->ndecomp = 0;
8099     comm->nload = 0;
8100     comm->load_step = 0;
8101     comm->load_sum = 0;
8102     comm->load_max = 0;
8103     clear_ivec(comm->load_lim);
8104     comm->load_mdf = 0;
8105     comm->load_pme = 0;
8106 }
8107
8108 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8109 {
8110     gmx_domdec_comm_t *comm;
8111     int ddnat;
8112     double av;
8113    
8114     comm = cr->dd->comm;
8115     
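         /* Accumulate the statistics over all DD ranks */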
8116     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8117     
8118     if (fplog == NULL)
8119     {
8120         return;
8121     }
8122     
8123     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8124             
8125     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8126     {
8127         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8128         switch(ddnat)
8129         {
8130         case ddnatZONE:
8131             fprintf(fplog,
8132                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8133                     2,av);
8134             break;
8135         case ddnatVSITE:
8136             if (cr->dd->vsite_comm)
8137             {
8138                 fprintf(fplog,
8139                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8140                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8141                         av);
8142             }
8143             break;
8144         case ddnatCON:
8145             if (cr->dd->constraint_comm)
8146             {
8147                 fprintf(fplog,
8148                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
8149                         1 + ir->nLincsIter,av);
8150             }
8151             break;
8152         default:
8153             gmx_incons(" Unknown type for DD statistics");
8154         }
8155     }
8156     fprintf(fplog,"\n");
8157     
8158     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8159     {
8160         print_dd_load_av(fplog,cr->dd);
8161     }
8162 }
8163
8164 void dd_partition_system(FILE            *fplog,
8165                          gmx_large_int_t      step,
8166                          t_commrec       *cr,
8167                          gmx_bool            bMasterState,
8168                          int             nstglobalcomm,
8169                          t_state         *state_global,
8170                          gmx_mtop_t      *top_global,
8171                          t_inputrec      *ir,
8172                          t_state         *state_local,
8173                          rvec            **f,
8174                          t_mdatoms       *mdatoms,
8175                          gmx_localtop_t  *top_local,
8176                          t_forcerec      *fr,
8177                          gmx_vsite_t     *vsite,
8178                          gmx_shellfc_t   shellfc,
8179                          gmx_constr_t    constr,
8180                          t_nrnb          *nrnb,
8181                          gmx_wallcycle_t wcycle,
8182                          gmx_bool            bVerbose)
8183 {
8184     gmx_domdec_t *dd;
8185     gmx_domdec_comm_t *comm;
8186     gmx_ddbox_t ddbox={0};
8187     t_block *cgs_gl;
8188     gmx_large_int_t step_pcoupl;
8189     rvec cell_ns_x0,cell_ns_x1;
8190     int  i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
8191     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8192     gmx_bool bRedist,bSortCG,bResortAll;
8193     ivec ncells_old,np;
8194     real grid_density;
8195     char sbuf[22];
8196         
8197     dd = cr->dd;
8198     comm = dd->comm;
8199
8200     bBoxChanged = (bMasterState || DEFORM(*ir));
8201     if (ir->epc != epcNO)
8202     {
8203         /* With nstpcouple > 1 pressure coupling happens
8204          * one step after calculating the pressure.
8205          * Box scaling happens at the end of the MD step,
8206          * after the DD partitioning.
8207          * We therefore have to do DLB in the first partitioning
8208          * after an MD step where P-coupling occurred.
8209          * We need to determine the last step in which p-coupling occurred.
8210          * MRS -- need to validate this for vv?
8211          */
8212         n = ir->nstpcouple;
8213         if (n == 1)
8214         {
8215             step_pcoupl = step - 1;
8216         }
8217         else
8218         {
8219             step_pcoupl = ((step - 1)/n)*n + 1;
8220         }
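             /* For n > 1, step_pcoupl is the most recent step, not later than
              * the current one, of the form k*n + 1. Illustrative example:
              * n = 5 and step = 12 give ((12-1)/5)*5 + 1 = 11.
              */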
8221         if (step_pcoupl >= comm->globalcomm_step)
8222         {
8223             bBoxChanged = TRUE;
8224         }
8225     }
8226
8227     bNStGlobalComm = (step >= comm->globalcomm_step + nstglobalcomm);
8228
8229     if (!comm->bDynLoadBal)
8230     {
8231         bDoDLB = FALSE;
8232     }
8233     else
8234     {
8235         /* Should we do dynamic load balancing this step?
8236          * Since it requires (possibly expensive) global communication,
8237          * we might want to do DLB less frequently.
8238          */
8239         if (bBoxChanged || ir->epc != epcNO)
8240         {
8241             bDoDLB = bBoxChanged;
8242         }
8243         else
8244         {
8245             bDoDLB = bNStGlobalComm;
8246         }
8247     }
8248
8249     /* Check if we have recorded loads on the nodes */
8250     if (comm->bRecordLoad && dd_load_count(comm))
8251     {
8252         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
8253         {
8254             /* Check if we should use DLB at the second partitioning
8255              * and every 100 partitionings,
8256              * so the extra communication cost is negligible.
8257              */
8258             n = max(100,nstglobalcomm);
8259             bCheckDLB = (comm->n_load_collect == 0 ||
8260                          comm->n_load_have % n == n-1);
8261         }
8262         else
8263         {
8264             bCheckDLB = FALSE;
8265         }
8266         
8267         /* Print the load to the log file every nstlog steps, and at the first and last step */
8268         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
8269                     comm->n_load_collect == 0 ||
8270                     (ir->nsteps >= 0 &&
8271                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
8272
8273         /* Avoid extra communication due to verbose screen output
8274          * when nstglobalcomm is set.
8275          */
8276         if (bDoDLB || bLogLoad || bCheckDLB ||
8277             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
8278         {
8279             get_load_distribution(dd,wcycle);
8280             if (DDMASTER(dd))
8281             {
8282                 if (bLogLoad)
8283                 {
8284                     dd_print_load(fplog,dd,step-1);
8285                 }
8286                 if (bVerbose)
8287                 {
8288                     dd_print_load_verbose(dd);
8289                 }
8290             }
8291             comm->n_load_collect++;
8292
8293             if (bCheckDLB) {
8294                 /* Since the timings are node dependent, the master decides */
8295                 if (DDMASTER(dd))
8296                 {
8297                     bTurnOnDLB =
8298                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
8299                     if (debug)
8300                     {
8301                         fprintf(debug,"step %s, imb loss %f\n",
8302                                 gmx_step_str(step,sbuf),
8303                                 dd_force_imb_perf_loss(dd));
8304                     }
8305                 }
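                     /* Broadcast the master's decision so that all DD ranks
                      * take the same branch below.
                      */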
8306                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
8307                 if (bTurnOnDLB)
8308                 {
8309                     turn_on_dlb(fplog,cr,step);
8310                     bDoDLB = TRUE;
8311                 }
8312             }
8313         }
8314         comm->n_load_have++;
8315     }
8316
8317     cgs_gl = &comm->cgs_gl;
8318
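         /* Three cases follow:
          * 1) bMasterState: the complete state is redistributed from the master,
          * 2) the local state has a different DD partitioning count, e.g. after
          *    reading a checkpoint: the local indices are rebuilt from the state,
          * 3) otherwise only the charge groups are redistributed.
          */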
8319     bRedist = FALSE;
8320     if (bMasterState)
8321     {
8322         /* Clear the old state */
8323         clear_dd_indices(dd,0,0);
8324
8325         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
8326                   TRUE,cgs_gl,state_global->x,&ddbox);
8327     
8328         get_cg_distribution(fplog,step,dd,cgs_gl,
8329                             state_global->box,&ddbox,state_global->x);
8330         
8331         dd_distribute_state(dd,cgs_gl,
8332                             state_global,state_local,f);
8333         
8334         dd_make_local_cgs(dd,&top_local->cgs);
8335         
8336         if (dd->ncg_home > fr->cg_nalloc)
8337         {
8338             dd_realloc_fr_cg(fr,dd->ncg_home);
8339         }
8340         calc_cgcm(fplog,0,dd->ncg_home,
8341                   &top_local->cgs,state_local->x,fr->cg_cm);
8342         
8343         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8344         
8345         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8346
8347         cg0 = 0;
8348     }
8349     else if (state_local->ddp_count != dd->ddp_count)
8350     {
8351         if (state_local->ddp_count > dd->ddp_count)
8352         {
8353             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
8354         }
8355         
8356         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
8357         {
8358             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
8359         }
8360         
8361         /* Clear the old state */
8362         clear_dd_indices(dd,0,0);
8363         
8364         /* Build the new indices */
8365         rebuild_cgindex(dd,cgs_gl->index,state_local);
8366         make_dd_indices(dd,cgs_gl->index,0);
8367         
8368         /* Redetermine the cg COMs */
8369         calc_cgcm(fplog,0,dd->ncg_home,
8370                   &top_local->cgs,state_local->x,fr->cg_cm);
8371         
8372         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8373
8374         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8375
8376         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8377                   TRUE,&top_local->cgs,state_local->x,&ddbox);
8378
8379         bRedist = comm->bDynLoadBal;
8380     }
8381     else
8382     {
8383         /* We have the full state, only redistribute the cgs */
8384
8385         /* Clear the non-home indices */
8386         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
8387
8388         /* Avoid global communication for dims without pbc when using -gcom */
8389         if (!bNStGlobalComm)
8390         {
8391             copy_rvec(comm->box0    ,ddbox.box0    );
8392             copy_rvec(comm->box_size,ddbox.box_size);
8393         }
8394         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8395                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
8396
8397         bBoxChanged = TRUE;
8398         bRedist = TRUE;
8399     }
8400     /* Store these for dims without pbc, for reuse with -gcom */
8401     copy_rvec(ddbox.box0    ,comm->box0    );
8402     copy_rvec(ddbox.box_size,comm->box_size);
8403     
8404     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
8405                       step,wcycle);
8406     
8407     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
8408     {
8409         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
8410     }
8411     
8412     /* Check if we should sort the charge groups */
8413     if (comm->nstSortCG > 0)
8414     {
8415         bSortCG = (bMasterState ||
8416                    (bRedist && (step % comm->nstSortCG == 0)));
8417     }
8418     else
8419     {
8420         bSortCG = FALSE;
8421     }
8422
8423     ncg_home_old = dd->ncg_home;
8424
8425     if (bRedist)
8426     {
8427         cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
8428                                  state_local,f,fr,mdatoms,
8429                                  !bSortCG,nrnb);
8430     }
8431     
8432     get_nsgrid_boundaries(fr->ns.grid,dd,
8433                           state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
8434                           dd->ncg_home,fr->cg_cm,
8435                           cell_ns_x0,cell_ns_x1,&grid_density);
8436
8437     if (bBoxChanged)
8438     {
8439         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
8440     }
8441
8442     copy_ivec(fr->ns.grid->n,ncells_old);
8443     grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
8444                state_local->box,cell_ns_x0,cell_ns_x1,
8445                fr->rlistlong,grid_density);
8446     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
8447     copy_ivec(ddbox.tric_dir,comm->tric_dir);
8448
8449     if (bSortCG)
8450     {
8451         /* Sort the state on charge group position.
8452          * This enables exact restarts from this step.
8453          * It also improves performance by about 15% with larger numbers
8454          * of atoms per node.
8455          */
8456         
8457         /* Fill the ns grid with the home cell,
8458          * so we can sort with the indices.
8459          */
8460         set_zones_ncg_home(dd);
8461         fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
8462                   0,dd->ncg_home,fr->cg_cm);
8463         
8464         /* Check if we can use the old order and ns grid cell indices
8465          * of the charge groups to sort the charge groups efficiently.
8466          */
8467         bResortAll = (bMasterState ||
8468                       fr->ns.grid->n[XX] != ncells_old[XX] ||
8469                       fr->ns.grid->n[YY] != ncells_old[YY] ||
8470                       fr->ns.grid->n[ZZ] != ncells_old[ZZ]);
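             /* A change in the ns grid dimensions invalidates the stored cell
              * indices, so in that case all charge groups are resorted.
              */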
8471
8472         if (debug)
8473         {
8474             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
8475                     gmx_step_str(step,sbuf),dd->ncg_home);
8476         }
8477         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
8478                       bResortAll ? -1 : ncg_home_old);
8479         /* Rebuild all the indices */
8480         cg0 = 0;
8481         ga2la_clear(dd->ga2la);
8482     }
8483     
8484     /* Set up the communication and communicate the coordinates */
8485     setup_dd_communication(dd,state_local->box,&ddbox,fr);
8486     
8487     /* Set the indices */
8488     make_dd_indices(dd,cgs_gl->index,cg0);
8489
8490     /* Set the charge group boundaries for neighbor searching */
8491     set_cg_boundaries(&comm->zones);
8492     
8493     /*
8494     write_dd_pdb("dd_home",step,"dump",top_global,cr,
8495                  -1,state_local->x,state_local->box);
8496     */
8497     
8498     /* Extract a local topology from the global topology */
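         /* np[dim] is the number of communication pulses along each
          * decomposition dimension; it is passed to dd_make_local_top
          * together with the minimum cell sizes.
          */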
8499     for(i=0; i<dd->ndim; i++)
8500     {
8501         np[dd->dim[i]] = comm->cd[i].np;
8502     }
8503     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
8504                       comm->cellsize_min,np,
8505                       fr,vsite,top_global,top_local);
8506     
8507     /* Set up the special atom communication */
8508     n = comm->nat[ddnatZONE];
8509     for(i=ddnatZONE+1; i<ddnatNR; i++)
8510     {
8511         switch(i)
8512         {
8513         case ddnatVSITE:
8514             if (vsite && vsite->n_intercg_vsite)
8515             {
8516                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
8517             }
8518             break;
8519         case ddnatCON:
8520             if (dd->bInterCGcons)
8521             {
8522                 /* Only for inter-cg constraints do we need special code */
8523                 n = dd_make_local_constraints(dd,n,top_global,
8524                                               constr,ir->nProjOrder,
8525                                               &top_local->idef.il[F_CONSTR]);
8526             }
8527             break;
8528         default:
8529             gmx_incons("Unknown special atom type setup");
8530         }
8531         comm->nat[i] = n;
8532     }
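         /* comm->nat[i] now holds the cumulative number of atoms up to and
          * including communication class i.
          */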
8533     
8534     /* Make space for the extra coordinates for virtual site
8535      * or constraint communication.
8536      */
8537     state_local->natoms = comm->nat[ddnatNR-1];
8538     if (state_local->natoms > state_local->nalloc)
8539     {
8540         dd_realloc_state(state_local,f,state_local->natoms);
8541     }
8542
8543     if (fr->bF_NoVirSum)
8544     {
8545         if (vsite && vsite->n_intercg_vsite)
8546         {
8547             nat_f_novirsum = comm->nat[ddnatVSITE];
8548         }
8549         else
8550         {
8551             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
8552             {
8553                 nat_f_novirsum = dd->nat_tot;
8554             }
8555             else
8556             {
8557                 nat_f_novirsum = dd->nat_home;
8558             }
8559         }
8560     }
8561     else
8562     {
8563         nat_f_novirsum = 0;
8564     }
8565
8566     /* Set the number of atoms required for the force calculation.
8567      * Forces need to be constrained when using a twin-range setup
8568      * or with energy minimization. For simple simulations we could
8569      * avoid some allocation, zeroing and copying, but this is
8570      * probably not worth the complications and checking.
8571      */
8572     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
8573                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
8574
8575     /* We make all the mdatoms up to nat_tot_con.
8576      * We could save some work by only setting invmass
8577      * between nat_tot and nat_tot_con.
8578      */
8579     /* This call also sets the new number of home particles to dd->nat_home */
8580     atoms2md(top_global,ir,
8581              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
8582
8583     /* Now we have the charges we can sort the FE interactions */
8584     dd_sort_local_top(dd,mdatoms,top_local);
8585
8586     if (shellfc)
8587     {
8588         /* Make the local shell stuff, currently no communication is done */
8589         make_local_shells(cr,mdatoms,shellfc);
8590     }
8591     
8592     if (ir->implicit_solvent)
8593     {
8594         make_local_gb(cr,fr->born,ir->gb_algorithm);
8595     }
8596         
8597     if (!(cr->duty & DUTY_PME))
8598     {
8599         /* Send the charges to our PME-only node */
8600         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
8601                        mdatoms->chargeA,mdatoms->chargeB,
8602                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
8603     }
8604     
8605     if (constr)
8606     {
8607         set_constraints(constr,top_local,ir,mdatoms,cr);
8608     }
8609     
8610     if (ir->ePull != epullNO)
8611     {
8612         /* Update the local pull groups */
8613         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
8614     }
8615     
8616     if (ir->bRot)
8617     {
8618         /* Update the local rotation groups */
8619         dd_make_local_rotation_groups(dd,ir->rot);
8620     }
8621
8622
8623     add_dd_statistics(dd);
8624     
8625     /* Make sure we only count the cycles for this DD partitioning */
8626     clear_dd_cycle_counts(dd);
8627     
8628     /* Because the order of the atoms might have changed since
8629      * the last vsite construction, we need to communicate the constructing
8630      * atom coordinates again (for spreading the forces this MD step).
8631      */
8632     dd_move_x_vsites(dd,state_local->box,state_local->x);
8633     
8634     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
8635     {
8636         dd_move_x(dd,state_local->box,state_local->x);
8637         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
8638                      -1,state_local->x,state_local->box);
8639     }
8640
8641     if (bNStGlobalComm)
8642     {
8643         /* Store the global communication step */
8644         comm->globalcomm_step = step;
8645     }
8646     
8647     /* Increase the DD partitioning counter */
8648     dd->ddp_count++;
8649     /* The state currently matches this DD partitioning count, store it */
8650     state_local->ddp_count = dd->ddp_count;
8651     if (bMasterState)
8652     {
8653         /* The DD master node knows the complete cg distribution,
8654          * store the count so we can possibly skip the cg info communication.
8655          */
8656         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
8657     }
8658
8659     if (comm->DD_debug > 0)
8660     {
8661         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
8662         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
8663                                 "after partitioning");
8664     }
8665 }