src/mdlib/domdec.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  * This file is part of Gromacs        Copyright (c) 1991-2008
   5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation; either version 2
  10  * of the License, or (at your option) any later version.
  11  *
  12  * To help us fund GROMACS development, we humbly ask that you cite
  13  * the research papers on the package. Check out http://www.gromacs.org
  14  *
  15  * And Hey:
  16  * Gnomes, ROck Monsters And Chili Sauce
  17  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 #include <config.h>
  21 #endif
  22
  23 #include <stdio.h>
  24 #include <time.h>
  25 #include <math.h>
  26 #include <string.h>
  27 #include <stdlib.h>
  28 #include "typedefs.h"
  29 #include "smalloc.h"
  30 #include "vec.h"
  31 #include "domdec.h"
  32 #include "domdec_network.h"
  33 #include "nrnb.h"
  34 #include "pbc.h"
  35 #include "chargegroup.h"
  36 #include "constr.h"
  37 #include "mdatoms.h"
  38 #include "names.h"
  39 #include "pdbio.h"
  40 #include "futil.h"
  41 #include "force.h"
  42 #include "pme.h"
  43 #include "pull.h"
  44 #include "gmx_wallcycle.h"
  45 #include "mdrun.h"
  46 #include "nsgrid.h"
  47 #include "shellfc.h"
  48 #include "mtop_util.h"
  49 #include "gmxfio.h"
  50 #include "gmx_ga2la.h"
  51 #include "gmx_sort.h"
  52
  53 #ifdef GMX_LIB_MPI
  54 #include <mpi.h>
  55 #endif
  56 #ifdef GMX_THREADS
  57 #include "tmpi.h"
  58 #endif
  59
  60 #define DDRANK(dd,rank)    (rank)
  61 #define DDMASTERRANK(dd)   (dd->masterrank)
  62
  63 typedef struct gmx_domdec_master
  64 {
  65     /* The cell boundaries */
  66     real **cell_x;
  67     /* The global charge group division */
  68     int  *ncg;     /* Number of home charge groups for each node */
  69     int  *index;   /* Index of nnodes+1 into cg */
  70     int  *cg;      /* Global charge group index */
  71     int  *nat;     /* Number of home atoms for each node. */
  72     int  *ibuf;    /* Buffer for communication */
  73     rvec *vbuf;    /* Buffer for state scattering and gathering */
  74 } gmx_domdec_master_t;
  75
  76 typedef struct
  77 {
  78     /* The numbers of charge groups to send and receive for each cell
  79      * that requires communication, the last entry contains the total
  80      * number of atoms that needs to be communicated.
  81      */
  82     int nsend[DD_MAXIZONE+2];
  83     int nrecv[DD_MAXIZONE+2];
  84     /* The charge groups to send */
  85     int *index;
  86     int nalloc;
  87     /* The atom range for non-in-place communication */
  88     int cell2at0[DD_MAXIZONE];
  89     int cell2at1[DD_MAXIZONE];
  90 } gmx_domdec_ind_t;
  91
  92 typedef struct
  93 {
  94     int  np;                   /* Number of grid pulses in this dimension */
  95     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
  96     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
  97     int  np_nalloc;
  98     gmx_bool bInPlace;             /* Can we communicate in place?            */
  99 } gmx_domdec_comm_dim_t;
 100
 101 typedef struct
 102 {
 103     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 104     real *cell_f;      /* State var.: cell boundaries, box relative      */
 105     real *old_cell_f;  /* Temp. var.: old cell size                      */
 106     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 107     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 108     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 109     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 110     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
 111     real *buf_ncd;     /* Temp. var.                                     */
 112 } gmx_domdec_root_t;
 113
 114 #define DD_NLOAD_MAX 9
 115
 116 /* Here floats are accurate enough, since these variables
 117  * only influence the load balancing, not the actual MD results.
 118  */
 119 typedef struct
 120 {
 121     int  nload;
 122     float *load;
 123     float sum;
 124     float max;
 125     float sum_m;
 126     float cvol_min;
 127     float mdf;
 128     float pme;
 129     int   flags;
 130 } gmx_domdec_load_t;
 131
 132 typedef struct
 133 {
 134     int  nsc;
 135     int  ind_gl;
 136     int  ind;
 137 } gmx_cgsort_t;
 138
 139 typedef struct
 140 {
 141     gmx_cgsort_t *sort1,*sort2;
 142     int  sort_nalloc;
 143     gmx_cgsort_t *sort_new;
 144     int  sort_new_nalloc;
 145     int  *ibuf;
 146     int  ibuf_nalloc;
 147 } gmx_domdec_sort_t;
 148
 149 typedef struct
 150 {
 151     rvec *v;
 152     int  nalloc;
 153 } vec_rvec_t;
 154
 155 /* This enum determines the order of the coordinates.
 156  * ddnatHOME and ddnatZONE should be first and second,
 157  * the others can be ordered as wanted.
 158  */
 159 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
 160
 161 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
 162 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 163
 164 typedef struct
 165 {
 166     int  dim;      /* The dimension                                          */
 167     gmx_bool dim_match;/* Tells if DD and PME dims match                         */
 168     int  nslab;    /* The number of PME slabs in this dimension              */
 169     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 170     int  *pp_min;  /* The minimum pp node location, size nslab               */
 171     int  *pp_max;  /* The maximum pp node location,size nslab                */
 172     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
 173 } gmx_ddpme_t;
 174
 175 typedef struct
 176 {
 177     real min0;    /* The minimum bottom of this zone                        */
 178     real max1;    /* The maximum top of this zone                           */
 179     real mch0;    /* The maximum bottom communicaton height for this zone   */
 180     real mch1;    /* The maximum top communicaton height for this zone      */
 181     real p1_0;    /* The bottom value of the first cell in this zone        */
 182     real p1_1;    /* The top value of the first cell in this zone           */
 183 } gmx_ddzone_t;
 184
 185 typedef struct gmx_domdec_comm
 186 {
 187     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 188      * unless stated otherwise.
 189      */
 190
 191     /* The number of decomposition dimensions for PME, 0: no PME */
 192     int  npmedecompdim;
 193     /* The number of nodes doing PME (PP/PME or only PME) */
 194     int  npmenodes;
 195     int  npmenodes_x;
 196     int  npmenodes_y;
 197     /* The communication setup including the PME only nodes */
 198     gmx_bool bCartesianPP_PME;
 199     ivec ntot;
 200     int  cartpmedim;
 201     int  *pmenodes;          /* size npmenodes                         */
 202     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 203                               * but with bCartesianPP_PME              */
 204     gmx_ddpme_t ddpme[2];
 205
 206     /* The DD particle-particle nodes only */
 207     gmx_bool bCartesianPP;
 208     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 209
 210     /* The global charge groups */
 211     t_block cgs_gl;
 212
 213     /* Should we sort the cgs */
 214     int  nstSortCG;
 215     gmx_domdec_sort_t *sort;
 216
 217     /* Are there bonded and multi-body interactions between charge groups? */
 218     gmx_bool bInterCGBondeds;
 219     gmx_bool bInterCGMultiBody;
 220
 221     /* Data for the optional bonded interaction atom communication range */
 222     gmx_bool bBondComm;
 223     t_blocka *cglink;
 224     char *bLocalCG;
 225
 226     /* The DLB option */
 227     int  eDLB;
 228     /* Are we actually using DLB? */
 229     gmx_bool bDynLoadBal;
 230
 231     /* Cell sizes for static load balancing, first index cartesian */
 232     real **slb_frac;
 233
 234     /* The width of the communicated boundaries */
 235     real cutoff_mbody;
 236     real cutoff;
 237     /* The minimum cell size (including triclinic correction) */
 238     rvec cellsize_min;
 239     /* For dlb, for use with edlbAUTO */
 240     rvec cellsize_min_dlb;
 241     /* The lower limit for the DD cell size with DLB */
 242     real cellsize_limit;
 243     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 244     gmx_bool bVacDLBNoLimit;
 245
 246     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 247     ivec tric_dir;
 248     /* box0 and box_size are required with dim's without pbc and -gcom */
 249     rvec box0;
 250     rvec box_size;
 251
 252     /* The cell boundaries */
 253     rvec cell_x0;
 254     rvec cell_x1;
 255
 256     /* The old location of the cell boundaries, to check cg displacements */
 257     rvec old_cell_x0;
 258     rvec old_cell_x1;
 259
 260     /* The communication setup and charge group boundaries for the zones */
 261     gmx_domdec_zones_t zones;
 262
 263     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 264      * cell boundaries of neighboring cells for dynamic load balancing.
 265      */
 266     gmx_ddzone_t zone_d1[2];
 267     gmx_ddzone_t zone_d2[2][2];
 268
 269     /* The coordinate/force communication setup and indices */
 270     gmx_domdec_comm_dim_t cd[DIM];
 271     /* The maximum number of cells to communicate with in one dimension */
 272     int  maxpulse;
 273
 274     /* Which cg distribution is stored on the master node */
 275     int master_cg_ddp_count;
 276
 277     /* The number of cg's received from the direct neighbors */
 278     int  zone_ncg1[DD_MAXZONE];
 279
 280     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 281     int  nat[ddnatNR];
 282
 283     /* Communication buffer for general use */
 284     int  *buf_int;
 285     int  nalloc_int;
 286
 287      /* Communication buffer for general use */
 288     vec_rvec_t vbuf;
 289
 290     /* Communication buffers only used with multiple grid pulses */
 291     int  *buf_int2;
 292     int  nalloc_int2;
 293     vec_rvec_t vbuf2;
 294
 295     /* Communication buffers for local redistribution */
 296     int  **cggl_flag;
 297     int  cggl_flag_nalloc[DIM*2];
 298     rvec **cgcm_state;
 299     int  cgcm_state_nalloc[DIM*2];
 300
 301     /* Cell sizes for dynamic load balancing */
 302     gmx_domdec_root_t **root;
 303     real *cell_f_row;
 304     real cell_f0[DIM];
 305     real cell_f1[DIM];
 306     real cell_f_max0[DIM];
 307     real cell_f_min1[DIM];
 308
 309     /* Stuff for load communication */
 310     gmx_bool bRecordLoad;
 311     gmx_domdec_load_t *load;
 312 #ifdef GMX_MPI
 313     MPI_Comm *mpi_comm_load;
 314 #endif
 315
 316     /* Maximum DLB scaling per load balancing step in percent */
 317     int dlb_scale_lim;
 318
 319     /* Cycle counters */
 320     float cycl[ddCyclNr];
 321     int   cycl_n[ddCyclNr];
 322     float cycl_max[ddCyclNr];
 323     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 324     int eFlop;
 325     double flop;
 326     int    flop_n;
 327     /* Have often have did we have load measurements */
 328     int    n_load_have;
 329     /* Have often have we collected the load measurements */
 330     int    n_load_collect;
 331
 332     /* Statistics */
 333     double sum_nat[ddnatNR-ddnatZONE];
 334     int    ndecomp;
 335     int    nload;
 336     double load_step;
 337     double load_sum;
 338     double load_max;
 339     ivec   load_lim;
 340     double load_mdf;
 341     double load_pme;
 342
 343     /* The last partition step */
 344     gmx_large_int_t globalcomm_step;
 345
 346     /* Debugging */
 347     int  nstDDDump;
 348     int  nstDDDumpGrid;
 349     int  DD_debug;
 350 } gmx_domdec_comm_t;
 351
 352 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 353 #define DD_CGIBS 2
 354
 355 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 356 #define DD_FLAG_NRCG  65535
 357 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 358 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 359
 360 /* Zone permutation required to obtain consecutive charge groups
 361  * for neighbor searching.
 362  */
 363 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
 364
 365 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 366  * components see only j zones with that component 0.
 367  */
 368
 369 /* The DD zone order */
 370 static const ivec dd_zo[DD_MAXZONE] =
 371   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
 372
 373 /* The 3D setup */
 374 #define dd_z3n  8
 375 #define dd_zp3n 4
 376 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
 377
 378 /* The 2D setup */
 379 #define dd_z2n  4
 380 #define dd_zp2n 2
 381 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
 382
 383 /* The 1D setup */
 384 #define dd_z1n  2
 385 #define dd_zp1n 1
 386 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
 387
 388 /* Factors used to avoid problems due to rounding issues */
 389 #define DD_CELL_MARGIN       1.0001
 390 #define DD_CELL_MARGIN2      1.00005
 391 /* Factor to account for pressure scaling during nstlist steps */
 392 #define DD_PRES_SCALE_MARGIN 1.02
 393
 394 /* Allowed performance loss before we DLB or warn */
 395 #define DD_PERF_LOSS 0.05
 396
 397 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 398
 399 /* Use separate MPI send and receive commands
 400  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 401  * This saves memory (and some copying for small nnodes).
 402  * For high parallelization scatter and gather calls are used.
 403  */
 404 #define GMX_DD_NNODES_SENDRECV 4
 405
 406
 407 /*
 408 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 409
 410 static void index2xyz(ivec nc,int ind,ivec xyz)
 411 {
 412   xyz[XX] = ind % nc[XX];
 413   xyz[YY] = (ind / nc[XX]) % nc[YY];
 414   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 415 }
 416 */
 417
 418 /* This order is required to minimize the coordinate communication in PME
 419  * which uses decomposition in the x direction.
 420  */
 421 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 422
 423 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
 424 {
 425     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 426     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 427     xyz[ZZ] = ind % nc[ZZ];
 428 }
 429
 430 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
 431 {
 432     int ddindex;
 433     int ddnodeid=-1;
 434
 435     ddindex = dd_index(dd->nc,c);
 436     if (dd->comm->bCartesianPP_PME)
 437     {
 438         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 439     }
 440     else if (dd->comm->bCartesianPP)
 441     {
 442 #ifdef GMX_MPI
 443         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
 444 #endif
 445     }
 446     else
 447     {
 448         ddnodeid = ddindex;
 449     }
 450
 451     return ddnodeid;
 452 }
 453
 454 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
 455 {
 456     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 457 }
 458
 459 int ddglatnr(gmx_domdec_t *dd,int i)
 460 {
 461     int atnr;
 462
 463     if (dd == NULL)
 464     {
 465         atnr = i + 1;
 466     }
 467     else
 468     {
 469         if (i >= dd->comm->nat[ddnatNR-1])
 470         {
 471             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
 472         }
 473         atnr = dd->gatindex[i] + 1;
 474     }
 475
 476     return atnr;
 477 }
 478
 479 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 480 {
 481     return &dd->comm->cgs_gl;
 482 }
 483
 484 static void vec_rvec_init(vec_rvec_t *v)
 485 {
 486     v->nalloc = 0;
 487     v->v      = NULL;
 488 }
 489
 490 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
 491 {
 492     if (n > v->nalloc)
 493     {
 494         v->nalloc = over_alloc_dd(n);
 495         srenew(v->v,v->nalloc);
 496     }
 497 }
 498
 499 void dd_store_state(gmx_domdec_t *dd,t_state *state)
 500 {
 501     int i;
 502
 503     if (state->ddp_count != dd->ddp_count)
 504     {
 505         gmx_incons("The state does not the domain decomposition state");
 506     }
 507
 508     state->ncg_gl = dd->ncg_home;
 509     if (state->ncg_gl > state->cg_gl_nalloc)
 510     {
 511         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 512         srenew(state->cg_gl,state->cg_gl_nalloc);
 513     }
 514     for(i=0; i<state->ncg_gl; i++)
 515     {
 516         state->cg_gl[i] = dd->index_gl[i];
 517     }
 518
 519     state->ddp_count_cg_gl = dd->ddp_count;
 520 }
 521
 522 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 523 {
 524     return &dd->comm->zones;
 525 }
 526
 527 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
 528                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
 529 {
 530     gmx_domdec_zones_t *zones;
 531     int izone,d,dim;
 532
 533     zones = &dd->comm->zones;
 534
 535     izone = 0;
 536     while (icg >= zones->izone[izone].cg1)
 537     {
 538         izone++;
 539     }
 540
 541     if (izone == 0)
 542     {
 543         *jcg0 = icg;
 544     }
 545     else if (izone < zones->nizone)
 546     {
 547         *jcg0 = zones->izone[izone].jcg0;
 548     }
 549     else
 550     {
 551         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
 552                   icg,izone,zones->nizone);
 553     }
 554
 555     *jcg1 = zones->izone[izone].jcg1;
 556
 557     for(d=0; d<dd->ndim; d++)
 558     {
 559         dim = dd->dim[d];
 560         shift0[dim] = zones->izone[izone].shift0[dim];
 561         shift1[dim] = zones->izone[izone].shift1[dim];
 562         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 563         {
 564             /* A conservative approach, this can be optimized */
 565             shift0[dim] -= 1;
 566             shift1[dim] += 1;
 567         }
 568     }
 569 }
 570
 571 int dd_natoms_vsite(gmx_domdec_t *dd)
 572 {
 573     return dd->comm->nat[ddnatVSITE];
 574 }
 575
 576 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
 577 {
 578     *at_start = dd->comm->nat[ddnatCON-1];
 579     *at_end   = dd->comm->nat[ddnatCON];
 580 }
 581
 582 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
 583 {
 584     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 585     int  *index,*cgindex;
 586     gmx_domdec_comm_t *comm;
 587     gmx_domdec_comm_dim_t *cd;
 588     gmx_domdec_ind_t *ind;
 589     rvec shift={0,0,0},*buf,*rbuf;
 590     gmx_bool bPBC,bScrew;
 591
 592     comm = dd->comm;
 593
 594     cgindex = dd->cgindex;
 595
 596     buf = comm->vbuf.v;
 597
 598     nzone = 1;
 599     nat_tot = dd->nat_home;
 600     for(d=0; d<dd->ndim; d++)
 601     {
 602         bPBC   = (dd->ci[dd->dim[d]] == 0);
 603         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 604         if (bPBC)
 605         {
 606             copy_rvec(box[dd->dim[d]],shift);
 607         }
 608         cd = &comm->cd[d];
 609         for(p=0; p<cd->np; p++)
 610         {
 611             ind = &cd->ind[p];
 612             index = ind->index;
 613             n = 0;
 614             if (!bPBC)
 615             {
 616                 for(i=0; i<ind->nsend[nzone]; i++)
 617                 {
 618                     at0 = cgindex[index[i]];
 619                     at1 = cgindex[index[i]+1];
 620                     for(j=at0; j<at1; j++)
 621                     {
 622                         copy_rvec(x[j],buf[n]);
 623                         n++;
 624                     }
 625                 }
 626             }
 627             else if (!bScrew)
 628             {
 629                 for(i=0; i<ind->nsend[nzone]; i++)
 630                 {
 631                     at0 = cgindex[index[i]];
 632                     at1 = cgindex[index[i]+1];
 633                     for(j=at0; j<at1; j++)
 634                     {
 635                         /* We need to shift the coordinates */
 636                         rvec_add(x[j],shift,buf[n]);
 637                         n++;
 638                     }
 639                 }
 640             }
 641             else
 642             {
 643                 for(i=0; i<ind->nsend[nzone]; i++)
 644                 {
 645                     at0 = cgindex[index[i]];
 646                     at1 = cgindex[index[i]+1];
 647                     for(j=at0; j<at1; j++)
 648                     {
 649                         /* Shift x */
 650                         buf[n][XX] = x[j][XX] + shift[XX];
 651                         /* Rotate y and z.
 652                          * This operation requires a special shift force
 653                          * treatment, which is performed in calc_vir.
 654                          */
 655                         buf[n][YY] = box[YY][YY] - x[j][YY];
 656                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 657                         n++;
 658                     }
 659                 }
 660             }
 661
 662             if (cd->bInPlace)
 663             {
 664                 rbuf = x + nat_tot;
 665             }
 666             else
 667             {
 668                 rbuf = comm->vbuf2.v;
 669             }
 670             /* Send and receive the coordinates */
 671             dd_sendrecv_rvec(dd, d, dddirBackward,
 672                              buf,  ind->nsend[nzone+1],
 673                              rbuf, ind->nrecv[nzone+1]);
 674             if (!cd->bInPlace)
 675             {
 676                 j = 0;
 677                 for(zone=0; zone<nzone; zone++)
 678                 {
 679                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 680                     {
 681                         copy_rvec(rbuf[j],x[i]);
 682                         j++;
 683                     }
 684                 }
 685             }
 686             nat_tot += ind->nrecv[nzone+1];
 687         }
 688         nzone += nzone;
 689     }
 690 }
 691
 692 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
 693 {
 694     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 695     int  *index,*cgindex;
 696     gmx_domdec_comm_t *comm;
 697     gmx_domdec_comm_dim_t *cd;
 698     gmx_domdec_ind_t *ind;
 699     rvec *buf,*sbuf;
 700     ivec vis;
 701     int  is;
 702     gmx_bool bPBC,bScrew;
 703
 704     comm = dd->comm;
 705
 706     cgindex = dd->cgindex;
 707
 708     buf = comm->vbuf.v;
 709
 710     n = 0;
 711     nzone = comm->zones.n/2;
 712     nat_tot = dd->nat_tot;
 713     for(d=dd->ndim-1; d>=0; d--)
 714     {
 715         bPBC   = (dd->ci[dd->dim[d]] == 0);
 716         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 717         if (fshift == NULL && !bScrew)
 718         {
 719             bPBC = FALSE;
 720         }
 721         /* Determine which shift vector we need */
 722         clear_ivec(vis);
 723         vis[dd->dim[d]] = 1;
 724         is = IVEC2IS(vis);
 725
 726         cd = &comm->cd[d];
 727         for(p=cd->np-1; p>=0; p--) {
 728             ind = &cd->ind[p];
 729             nat_tot -= ind->nrecv[nzone+1];
 730             if (cd->bInPlace)
 731             {
 732                 sbuf = f + nat_tot;
 733             }
 734             else
 735             {
 736                 sbuf = comm->vbuf2.v;
 737                 j = 0;
 738                 for(zone=0; zone<nzone; zone++)
 739                 {
 740                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 741                     {
 742                         copy_rvec(f[i],sbuf[j]);
 743                         j++;
 744                     }
 745                 }
 746             }
 747             /* Communicate the forces */
 748             dd_sendrecv_rvec(dd, d, dddirForward,
 749                              sbuf, ind->nrecv[nzone+1],
 750                              buf,  ind->nsend[nzone+1]);
 751             index = ind->index;
 752             /* Add the received forces */
 753             n = 0;
 754             if (!bPBC)
 755             {
 756                 for(i=0; i<ind->nsend[nzone]; i++)
 757                 {
 758                     at0 = cgindex[index[i]];
 759                     at1 = cgindex[index[i]+1];
 760                     for(j=at0; j<at1; j++)
 761                     {
 762                         rvec_inc(f[j],buf[n]);
 763                         n++;
 764                     }
 765                 }
 766             }
 767             else if (!bScrew)
 768             {
 769                 for(i=0; i<ind->nsend[nzone]; i++)
 770                 {
 771                     at0 = cgindex[index[i]];
 772                     at1 = cgindex[index[i]+1];
 773                     for(j=at0; j<at1; j++)
 774                     {
 775                         rvec_inc(f[j],buf[n]);
 776                         /* Add this force to the shift force */
 777                         rvec_inc(fshift[is],buf[n]);
 778                         n++;
 779                     }
 780                 }
 781             }
 782             else
 783             {
 784                 for(i=0; i<ind->nsend[nzone]; i++)
 785                 {
 786                     at0 = cgindex[index[i]];
 787                     at1 = cgindex[index[i]+1];
 788                     for(j=at0; j<at1; j++)
 789                     {
 790                         /* Rotate the force */
 791                         f[j][XX] += buf[n][XX];
 792                         f[j][YY] -= buf[n][YY];
 793                         f[j][ZZ] -= buf[n][ZZ];
 794                         if (fshift)
 795                         {
 796                             /* Add this force to the shift force */
 797                             rvec_inc(fshift[is],buf[n]);
 798                         }
 799                         n++;
 800                     }
 801                 }
 802             }
 803         }
 804         nzone /= 2;
 805     }
 806 }
 807
 808 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
 809 {
 810     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 811     int  *index,*cgindex;
 812     gmx_domdec_comm_t *comm;
 813     gmx_domdec_comm_dim_t *cd;
 814     gmx_domdec_ind_t *ind;
 815     real *buf,*rbuf;
 816
 817     comm = dd->comm;
 818
 819     cgindex = dd->cgindex;
 820
 821     buf = &comm->vbuf.v[0][0];
 822
 823     nzone = 1;
 824     nat_tot = dd->nat_home;
 825     for(d=0; d<dd->ndim; d++)
 826     {
 827         cd = &comm->cd[d];
 828         for(p=0; p<cd->np; p++)
 829         {
 830             ind = &cd->ind[p];
 831             index = ind->index;
 832             n = 0;
 833             for(i=0; i<ind->nsend[nzone]; i++)
 834             {
 835                 at0 = cgindex[index[i]];
 836                 at1 = cgindex[index[i]+1];
 837                 for(j=at0; j<at1; j++)
 838                 {
 839                     buf[n] = v[j];
 840                     n++;
 841                 }
 842             }
 843
 844             if (cd->bInPlace)
 845             {
 846                 rbuf = v + nat_tot;
 847             }
 848             else
 849             {
 850                 rbuf = &comm->vbuf2.v[0][0];
 851             }
 852             /* Send and receive the coordinates */
 853             dd_sendrecv_real(dd, d, dddirBackward,
 854                              buf,  ind->nsend[nzone+1],
 855                              rbuf, ind->nrecv[nzone+1]);
 856             if (!cd->bInPlace)
 857             {
 858                 j = 0;
 859                 for(zone=0; zone<nzone; zone++)
 860                 {
 861                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 862                     {
 863                         v[i] = rbuf[j];
 864                         j++;
 865                     }
 866                 }
 867             }
 868             nat_tot += ind->nrecv[nzone+1];
 869         }
 870         nzone += nzone;
 871     }
 872 }
 873
 874 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
 875 {
 876     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
 877     int  *index,*cgindex;
 878     gmx_domdec_comm_t *comm;
 879     gmx_domdec_comm_dim_t *cd;
 880     gmx_domdec_ind_t *ind;
 881     real *buf,*sbuf;
 882
 883     comm = dd->comm;
 884
 885     cgindex = dd->cgindex;
 886
 887     buf = &comm->vbuf.v[0][0];
 888
 889     n = 0;
 890     nzone = comm->zones.n/2;
 891     nat_tot = dd->nat_tot;
 892     for(d=dd->ndim-1; d>=0; d--)
 893     {
 894         cd = &comm->cd[d];
 895         for(p=cd->np-1; p>=0; p--) {
 896             ind = &cd->ind[p];
 897             nat_tot -= ind->nrecv[nzone+1];
 898             if (cd->bInPlace)
 899             {
 900                 sbuf = v + nat_tot;
 901             }
 902             else
 903             {
 904                 sbuf = &comm->vbuf2.v[0][0];
 905                 j = 0;
 906                 for(zone=0; zone<nzone; zone++)
 907                 {
 908                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
 909                     {
 910                         sbuf[j] = v[i];
 911                         j++;
 912                     }
 913                 }
 914             }
 915             /* Communicate the forces */
 916             dd_sendrecv_real(dd, d, dddirForward,
 917                              sbuf, ind->nrecv[nzone+1],
 918                              buf,  ind->nsend[nzone+1]);
 919             index = ind->index;
 920             /* Add the received forces */
 921             n = 0;
 922             for(i=0; i<ind->nsend[nzone]; i++)
 923             {
 924                 at0 = cgindex[index[i]];
 925                 at1 = cgindex[index[i]+1];
 926                 for(j=at0; j<at1; j++)
 927                 {
 928                     v[j] += buf[n];
 929                     n++;
 930                 }
 931             }
 932         }
 933         nzone /= 2;
 934     }
 935 }
 936
 937 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
 938 {
 939     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 940             d,i,j,
 941             zone->min0,zone->max1,
 942             zone->mch0,zone->mch0,
 943             zone->p1_0,zone->p1_1);
 944 }
 945
 946 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 947                                int ddimind,int direction,
 948                                gmx_ddzone_t *buf_s,int n_s,
 949                                gmx_ddzone_t *buf_r,int n_r)
 950 {
 951     rvec vbuf_s[5*2],vbuf_r[5*2];
 952     int i;
 953
 954     for(i=0; i<n_s; i++)
 955     {
 956         vbuf_s[i*2  ][0] = buf_s[i].min0;
 957         vbuf_s[i*2  ][1] = buf_s[i].max1;
 958         vbuf_s[i*2  ][2] = buf_s[i].mch0;
 959         vbuf_s[i*2+1][0] = buf_s[i].mch1;
 960         vbuf_s[i*2+1][1] = buf_s[i].p1_0;
 961         vbuf_s[i*2+1][2] = buf_s[i].p1_1;
 962     }
 963
 964     dd_sendrecv_rvec(dd, ddimind, direction,
 965                      vbuf_s, n_s*2,
 966                      vbuf_r, n_r*2);
 967
 968     for(i=0; i<n_r; i++)
 969     {
 970         buf_r[i].min0 = vbuf_r[i*2  ][0];
 971         buf_r[i].max1 = vbuf_r[i*2  ][1];
 972         buf_r[i].mch0 = vbuf_r[i*2  ][2];
 973         buf_r[i].mch1 = vbuf_r[i*2+1][0];
 974         buf_r[i].p1_0 = vbuf_r[i*2+1][1];
 975         buf_r[i].p1_1 = vbuf_r[i*2+1][2];
 976     }
 977 }
 978
 979 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
 980                           rvec cell_ns_x0,rvec cell_ns_x1)
 981 {
 982     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
 983     gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
 984     rvec extr_s[2],extr_r[2];
 985     rvec dh;
 986     real dist_d,c=0,det;
 987     gmx_domdec_comm_t *comm;
 988     gmx_bool bPBC,bUse;
 989
 990     comm = dd->comm;
 991
 992     for(d=1; d<dd->ndim; d++)
 993     {
 994         dim = dd->dim[d];
 995         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
 996         zp->min0 = cell_ns_x0[dim];
 997         zp->max1 = cell_ns_x1[dim];
 998         zp->mch0 = cell_ns_x0[dim];
 999         zp->mch1 = cell_ns_x1[dim];
1000         zp->p1_0 = cell_ns_x0[dim];
1001         zp->p1_1 = cell_ns_x1[dim];
1002     }
1003
1004     for(d=dd->ndim-2; d>=0; d--)
1005     {
1006         dim  = dd->dim[d];
1007         bPBC = (dim < ddbox->npbcdim);
1008
1009         /* Use an rvec to store two reals */
1010         extr_s[d][0] = comm->cell_f0[d+1];
1011         extr_s[d][1] = comm->cell_f1[d+1];
1012         extr_s[d][2] = 0;
1013
1014         pos = 0;
1015         /* Store the extremes in the backward sending buffer,
1016          * so the get updated separately from the forward communication.
1017          */
1018         for(d1=d; d1<dd->ndim-1; d1++)
1019         {
1020             /* We invert the order to be able to use the same loop for buf_e */
1021             buf_s[pos].min0 = extr_s[d1][1];
1022             buf_s[pos].max1 = extr_s[d1][0];
1023             buf_s[pos].mch0 = 0;
1024             buf_s[pos].mch1 = 0;
1025             /* Store the cell corner of the dimension we communicate along */
1026             buf_s[pos].p1_0 = comm->cell_x0[dim];
1027             buf_s[pos].p1_1 = 0;
1028             pos++;
1029         }
1030
1031         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1032         pos++;
1033
1034         if (dd->ndim == 3 && d == 0)
1035         {
1036             buf_s[pos] = comm->zone_d2[0][1];
1037             pos++;
1038             buf_s[pos] = comm->zone_d1[0];
1039             pos++;
1040         }
1041
1042         /* We only need to communicate the extremes
1043          * in the forward direction
1044          */
1045         npulse = comm->cd[d].np;
1046         if (bPBC)
1047         {
1048             /* Take the minimum to avoid double communication */
1049             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1050         }
1051         else
1052         {
1053             /* Without PBC we should really not communicate over
1054              * the boundaries, but implementing that complicates
1055              * the communication setup and therefore we simply
1056              * do all communication, but ignore some data.
1057              */
1058             npulse_min = npulse;
1059         }
1060         for(p=0; p<npulse_min; p++)
1061         {
1062             /* Communicate the extremes forward */
1063             bUse = (bPBC || dd->ci[dim] > 0);
1064
1065             dd_sendrecv_rvec(dd, d, dddirForward,
1066                              extr_s+d, dd->ndim-d-1,
1067                              extr_r+d, dd->ndim-d-1);
1068
1069             if (bUse)
1070             {
1071                 for(d1=d; d1<dd->ndim-1; d1++)
1072                 {
1073                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1074                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1075                 }
1076             }
1077         }
1078
1079         buf_size = pos;
1080         for(p=0; p<npulse; p++)
1081         {
1082             /* Communicate all the zone information backward */
1083             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1084
1085             dd_sendrecv_ddzone(dd, d, dddirBackward,
1086                                buf_s, buf_size,
1087                                buf_r, buf_size);
1088
1089             clear_rvec(dh);
1090             if (p > 0)
1091             {
1092                 for(d1=d+1; d1<dd->ndim; d1++)
1093                 {
1094                     /* Determine the decrease of maximum required
1095                      * communication height along d1 due to the distance along d,
1096                      * this avoids a lot of useless atom communication.
1097                      */
1098                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1099
1100                     if (ddbox->tric_dir[dim])
1101                     {
1102                         /* c is the off-diagonal coupling between the cell planes
1103                          * along directions d and d1.
1104                          */
1105                         c = ddbox->v[dim][dd->dim[d1]][dim];
1106                     }
1107                     else
1108                     {
1109                         c = 0;
1110                     }
1111                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1112                     if (det > 0)
1113                     {
1114                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1115                     }
1116                     else
1117                     {
1118                         /* A negative value signals out of range */
1119                         dh[d1] = -1;
1120                     }
1121                 }
1122             }
1123
1124             /* Accumulate the extremes over all pulses */
1125             for(i=0; i<buf_size; i++)
1126             {
1127                 if (p == 0)
1128                 {
1129                     buf_e[i] = buf_r[i];
1130                 }
1131                 else
1132                 {
1133                     if (bUse)
1134                     {
1135                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1136                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1137                     }
1138
1139                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1140                     {
1141                         d1 = 1;
1142                     }
1143                     else
1144                     {
1145                         d1 = d + 1;
1146                     }
1147                     if (bUse && dh[d1] >= 0)
1148                     {
1149                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1150                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1151                     }
1152                 }
1153                 /* Copy the received buffer to the send buffer,
1154                  * to pass the data through with the next pulse.
1155                  */
1156                 buf_s[i] = buf_r[i];
1157             }
1158             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1159                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1160             {
1161                 /* Store the extremes */
1162                 pos = 0;
1163
1164                 for(d1=d; d1<dd->ndim-1; d1++)
1165                 {
1166                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1167                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1168                     pos++;
1169                 }
1170
1171                 if (d == 1 || (d == 0 && dd->ndim == 3))
1172                 {
1173                     for(i=d; i<2; i++)
1174                     {
1175                         comm->zone_d2[1-d][i] = buf_e[pos];
1176                         pos++;
1177                     }
1178                 }
1179                 if (d == 0)
1180                 {
1181                     comm->zone_d1[1] = buf_e[pos];
1182                     pos++;
1183                 }
1184             }
1185         }
1186     }
1187
1188     if (dd->ndim >= 2)
1189     {
1190         dim = dd->dim[1];
1191         for(i=0; i<2; i++)
1192         {
1193             if (debug)
1194             {
1195                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1196             }
1197             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1198             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1199         }
1200     }
1201     if (dd->ndim >= 3)
1202     {
1203         dim = dd->dim[2];
1204         for(i=0; i<2; i++)
1205         {
1206             for(j=0; j<2; j++)
1207             {
1208                 if (debug)
1209                 {
1210                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1211                 }
1212                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1213                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1214             }
1215         }
1216     }
1217     for(d=1; d<dd->ndim; d++)
1218     {
1219         comm->cell_f_max0[d] = extr_s[d-1][0];
1220         comm->cell_f_min1[d] = extr_s[d-1][1];
1221         if (debug)
1222         {
1223             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1224                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1225         }
1226     }
1227 }
1228
1229 static void dd_collect_cg(gmx_domdec_t *dd,
1230                           t_state *state_local)
1231 {
1232     gmx_domdec_master_t *ma=NULL;
1233     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1234     t_block *cgs_gl;
1235
1236     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1237     {
1238         /* The master has the correct distribution */
1239         return;
1240     }
1241
1242     if (state_local->ddp_count == dd->ddp_count)
1243     {
1244         ncg_home = dd->ncg_home;
1245         cg       = dd->index_gl;
1246         nat_home = dd->nat_home;
1247     }
1248     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1249     {
1250         cgs_gl = &dd->comm->cgs_gl;
1251
1252         ncg_home = state_local->ncg_gl;
1253         cg       = state_local->cg_gl;
1254         nat_home = 0;
1255         for(i=0; i<ncg_home; i++)
1256         {
1257             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1258         }
1259     }
1260     else
1261     {
1262         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1263     }
1264
1265     buf2[0] = dd->ncg_home;
1266     buf2[1] = dd->nat_home;
1267     if (DDMASTER(dd))
1268     {
1269         ma = dd->ma;
1270         ibuf = ma->ibuf;
1271     }
1272     else
1273     {
1274         ibuf = NULL;
1275     }
1276     /* Collect the charge group and atom counts on the master */
1277     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1278
1279     if (DDMASTER(dd))
1280     {
1281         ma->index[0] = 0;
1282         for(i=0; i<dd->nnodes; i++)
1283         {
1284             ma->ncg[i] = ma->ibuf[2*i];
1285             ma->nat[i] = ma->ibuf[2*i+1];
1286             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1287
1288         }
1289         /* Make byte counts and indices */
1290         for(i=0; i<dd->nnodes; i++)
1291         {
1292             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1293             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1294         }
1295         if (debug)
1296         {
1297             fprintf(debug,"Initial charge group distribution: ");
1298             for(i=0; i<dd->nnodes; i++)
1299                 fprintf(debug," %d",ma->ncg[i]);
1300             fprintf(debug,"\n");
1301         }
1302     }
1303
1304     /* Collect the charge group indices on the master */
1305     dd_gatherv(dd,
1306                dd->ncg_home*sizeof(int),dd->index_gl,
1307                DDMASTER(dd) ? ma->ibuf : NULL,
1308                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1309                DDMASTER(dd) ? ma->cg : NULL);
1310
1311     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1312 }
1313
1314 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1315                                     rvec *lv,rvec *v)
1316 {
1317     gmx_domdec_master_t *ma;
1318     int  n,i,c,a,nalloc=0;
1319     rvec *buf=NULL;
1320     t_block *cgs_gl;
1321
1322     ma = dd->ma;
1323
1324     if (!DDMASTER(dd))
1325     {
1326 #ifdef GMX_MPI
1327         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1328                  dd->rank,dd->mpi_comm_all);
1329 #endif
1330     } else {
1331         /* Copy the master coordinates to the global array */
1332         cgs_gl = &dd->comm->cgs_gl;
1333
1334         n = DDMASTERRANK(dd);
1335         a = 0;
1336         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1337         {
1338             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1339             {
1340                 copy_rvec(lv[a++],v[c]);
1341             }
1342         }
1343
1344         for(n=0; n<dd->nnodes; n++)
1345         {
1346             if (n != dd->rank)
1347             {
1348                 if (ma->nat[n] > nalloc)
1349                 {
1350                     nalloc = over_alloc_dd(ma->nat[n]);
1351                     srenew(buf,nalloc);
1352                 }
1353 #ifdef GMX_MPI
1354                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1355                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1356 #endif
1357                 a = 0;
1358                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1359                 {
1360                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1361                     {
1362                         copy_rvec(buf[a++],v[c]);
1363                     }
1364                 }
1365             }
1366         }
1367         sfree(buf);
1368     }
1369 }
1370
1371 static void get_commbuffer_counts(gmx_domdec_t *dd,
1372                                   int **counts,int **disps)
1373 {
1374     gmx_domdec_master_t *ma;
1375     int n;
1376
1377     ma = dd->ma;
1378
1379     /* Make the rvec count and displacment arrays */
1380     *counts  = ma->ibuf;
1381     *disps   = ma->ibuf + dd->nnodes;
1382     for(n=0; n<dd->nnodes; n++)
1383     {
1384         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1385         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1386     }
1387 }
1388
1389 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1390                                    rvec *lv,rvec *v)
1391 {
1392     gmx_domdec_master_t *ma;
1393     int  *rcounts=NULL,*disps=NULL;
1394     int  n,i,c,a;
1395     rvec *buf=NULL;
1396     t_block *cgs_gl;
1397
1398     ma = dd->ma;
1399
1400     if (DDMASTER(dd))
1401     {
1402         get_commbuffer_counts(dd,&rcounts,&disps);
1403
1404         buf = ma->vbuf;
1405     }
1406
1407     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1408
1409     if (DDMASTER(dd))
1410     {
1411         cgs_gl = &dd->comm->cgs_gl;
1412
1413         a = 0;
1414         for(n=0; n<dd->nnodes; n++)
1415         {
1416             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1417             {
1418                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1419                 {
1420                     copy_rvec(buf[a++],v[c]);
1421                 }
1422             }
1423         }
1424     }
1425 }
1426
1427 void dd_collect_vec(gmx_domdec_t *dd,
1428                     t_state *state_local,rvec *lv,rvec *v)
1429 {
1430     gmx_domdec_master_t *ma;
1431     int  n,i,c,a,nalloc=0;
1432     rvec *buf=NULL;
1433
1434     dd_collect_cg(dd,state_local);
1435
1436     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1437     {
1438         dd_collect_vec_sendrecv(dd,lv,v);
1439     }
1440     else
1441     {
1442         dd_collect_vec_gatherv(dd,lv,v);
1443     }
1444 }
1445
1446
1447 void dd_collect_state(gmx_domdec_t *dd,
1448                       t_state *state_local,t_state *state)
1449 {
1450     int est,i,j,nh;
1451
1452     nh = state->nhchainlength;
1453
1454     if (DDMASTER(dd))
1455     {
1456         state->lambda = state_local->lambda;
1457         state->veta = state_local->veta;
1458         state->vol0 = state_local->vol0;
1459         copy_mat(state_local->box,state->box);
1460         copy_mat(state_local->boxv,state->boxv);
1461         copy_mat(state_local->svir_prev,state->svir_prev);
1462         copy_mat(state_local->fvir_prev,state->fvir_prev);
1463         copy_mat(state_local->pres_prev,state->pres_prev);
1464
1465
1466         for(i=0; i<state_local->ngtc; i++)
1467         {
1468             for(j=0; j<nh; j++) {
1469                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1470                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1471             }
1472             state->therm_integral[i] = state_local->therm_integral[i];
1473         }
1474         for(i=0; i<state_local->nnhpres; i++)
1475         {
1476             for(j=0; j<nh; j++) {
1477                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1478                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1479             }
1480         }
1481     }
1482     for(est=0; est<estNR; est++)
1483     {
1484         if (EST_DISTR(est) && state_local->flags & (1<<est))
1485         {
1486             switch (est) {
1487             case estX:
1488                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1489                 break;
1490             case estV:
1491                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1492                 break;
1493             case estSDX:
1494                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1495                 break;
1496             case estCGP:
1497                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1498                 break;
1499             case estLD_RNG:
1500                 if (state->nrngi == 1)
1501                 {
1502                     if (DDMASTER(dd))
1503                     {
1504                         for(i=0; i<state_local->nrng; i++)
1505                         {
1506                             state->ld_rng[i] = state_local->ld_rng[i];
1507                         }
1508                     }
1509                 }
1510                 else
1511                 {
1512                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1513                               state_local->ld_rng,state->ld_rng);
1514                 }
1515                 break;
1516             case estLD_RNGI:
1517                 if (state->nrngi == 1)
1518                 {
1519                    if (DDMASTER(dd))
1520                     {
1521                         state->ld_rngi[0] = state_local->ld_rngi[0];
1522                     }
1523                 }
1524                 else
1525                 {
1526                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1527                               state_local->ld_rngi,state->ld_rngi);
1528                 }
1529                 break;
1530             case estDISRE_INITF:
1531             case estDISRE_RM3TAV:
1532             case estORIRE_INITF:
1533             case estORIRE_DTAV:
1534                 break;
1535             default:
1536                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1537             }
1538         }
1539     }
1540 }
1541
1542 static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
1543 {
1544     if (debug)
1545     {
1546         fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1547     }
1548     fr->cg_nalloc = over_alloc_dd(nalloc);
1549     srenew(fr->cg_cm,fr->cg_nalloc);
1550     srenew(fr->cginfo,fr->cg_nalloc);
1551 }
1552
1553 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1554 {
1555     int est;
1556
1557     if (debug)
1558     {
1559         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1560     }
1561
1562     state->nalloc = over_alloc_dd(nalloc);
1563
1564     for(est=0; est<estNR; est++)
1565     {
1566         if (EST_DISTR(est) && state->flags & (1<<est))
1567         {
1568             switch(est) {
1569             case estX:
1570                 srenew(state->x,state->nalloc);
1571                 break;
1572             case estV:
1573                 srenew(state->v,state->nalloc);
1574                 break;
1575             case estSDX:
1576                 srenew(state->sd_X,state->nalloc);
1577                 break;
1578             case estCGP:
1579                 srenew(state->cg_p,state->nalloc);
1580                 break;
1581             case estLD_RNG:
1582             case estLD_RNGI:
1583             case estDISRE_INITF:
1584             case estDISRE_RM3TAV:
1585             case estORIRE_INITF:
1586             case estORIRE_DTAV:
1587                 /* No reallocation required */
1588                 break;
1589             default:
1590                 gmx_incons("Unknown state entry encountered in dd_realloc_state");
1591             }
1592         }
1593     }
1594
1595     if (f != NULL)
1596     {
1597         srenew(*f,state->nalloc);
1598     }
1599 }
1600
1601 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1602                                        rvec *v,rvec *lv)
1603 {
1604     gmx_domdec_master_t *ma;
1605     int  n,i,c,a,nalloc=0;
1606     rvec *buf=NULL;
1607
1608     if (DDMASTER(dd))
1609     {
1610         ma  = dd->ma;
1611
1612         for(n=0; n<dd->nnodes; n++)
1613         {
1614             if (n != dd->rank)
1615             {
1616                 if (ma->nat[n] > nalloc)
1617                 {
1618                     nalloc = over_alloc_dd(ma->nat[n]);
1619                     srenew(buf,nalloc);
1620                 }
1621                 /* Use lv as a temporary buffer */
1622                 a = 0;
1623                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1624                 {
1625                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1626                     {
1627                         copy_rvec(v[c],buf[a++]);
1628                     }
1629                 }
1630                 if (a != ma->nat[n])
1631                 {
1632                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1633                               a,ma->nat[n]);
1634                 }
1635
1636 #ifdef GMX_MPI
1637                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1638                          DDRANK(dd,n),n,dd->mpi_comm_all);
1639 #endif
1640             }
1641         }
1642         sfree(buf);
1643         n = DDMASTERRANK(dd);
1644         a = 0;
1645         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1646         {
1647             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1648             {
1649                 copy_rvec(v[c],lv[a++]);
1650             }
1651         }
1652     }
1653     else
1654     {
1655 #ifdef GMX_MPI
1656         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1657                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1658 #endif
1659     }
1660 }
1661
1662 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1663                                        rvec *v,rvec *lv)
1664 {
1665     gmx_domdec_master_t *ma;
1666     int  *scounts=NULL,*disps=NULL;
1667     int  n,i,c,a,nalloc=0;
1668     rvec *buf=NULL;
1669
1670     if (DDMASTER(dd))
1671     {
1672         ma  = dd->ma;
1673
1674         get_commbuffer_counts(dd,&scounts,&disps);
1675
1676         buf = ma->vbuf;
1677         a = 0;
1678         for(n=0; n<dd->nnodes; n++)
1679         {
1680             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1681             {
1682                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1683                 {
1684                     copy_rvec(v[c],buf[a++]);
1685                 }
1686             }
1687         }
1688     }
1689
1690     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1691 }
1692
1693 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1694 {
1695     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1696     {
1697         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1698     }
1699     else
1700     {
1701         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1702     }
1703 }
1704
1705 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1706                                 t_state *state,t_state *state_local,
1707                                 rvec **f)
1708 {
1709     int  i,j,ngtch,ngtcp,nh;
1710
1711     nh = state->nhchainlength;
1712
1713     if (DDMASTER(dd))
1714     {
1715         state_local->lambda = state->lambda;
1716         state_local->veta   = state->veta;
1717         state_local->vol0   = state->vol0;
1718         copy_mat(state->box,state_local->box);
1719         copy_mat(state->box_rel,state_local->box_rel);
1720         copy_mat(state->boxv,state_local->boxv);
1721         copy_mat(state->svir_prev,state_local->svir_prev);
1722         copy_mat(state->fvir_prev,state_local->fvir_prev);
1723         for(i=0; i<state_local->ngtc; i++)
1724         {
1725             for(j=0; j<nh; j++) {
1726                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1727                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1728             }
1729             state_local->therm_integral[i] = state->therm_integral[i];
1730         }
1731         for(i=0; i<state_local->nnhpres; i++)
1732         {
1733             for(j=0; j<nh; j++) {
1734                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1735                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1736             }
1737         }
1738     }
1739     dd_bcast(dd,sizeof(real),&state_local->lambda);
1740     dd_bcast(dd,sizeof(real),&state_local->veta);
1741     dd_bcast(dd,sizeof(real),&state_local->vol0);
1742     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1743     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1744     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1745     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1746     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1747     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1748     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1749     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1750     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1751     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1752
1753     if (dd->nat_home > state_local->nalloc)
1754     {
1755         dd_realloc_state(state_local,f,dd->nat_home);
1756     }
1757     for(i=0; i<estNR; i++)
1758     {
1759         if (EST_DISTR(i) && state_local->flags & (1<<i))
1760         {
1761             switch (i) {
1762             case estX:
1763                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1764                 break;
1765             case estV:
1766                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1767                 break;
1768             case estSDX:
1769                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1770                 break;
1771             case estCGP:
1772                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1773                 break;
1774             case estLD_RNG:
1775                 if (state->nrngi == 1)
1776                 {
1777                     dd_bcastc(dd,
1778                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1779                               state->ld_rng,state_local->ld_rng);
1780                 }
1781                 else
1782                 {
1783                     dd_scatter(dd,
1784                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1785                                state->ld_rng,state_local->ld_rng);
1786                 }
1787                 break;
1788             case estLD_RNGI:
1789                 if (state->nrngi == 1)
1790                 {
1791                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1792                               state->ld_rngi,state_local->ld_rngi);
1793                 }
1794                 else
1795                 {
1796                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1797                                state->ld_rngi,state_local->ld_rngi);
1798                 }
1799                 break;
1800             case estDISRE_INITF:
1801             case estDISRE_RM3TAV:
1802             case estORIRE_INITF:
1803             case estORIRE_DTAV:
1804                 /* Not implemented yet */
1805                 break;
1806             default:
1807                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1808             }
1809         }
1810     }
1811 }
1812
1813 static char dim2char(int dim)
1814 {
1815     char c='?';
1816
1817     switch (dim)
1818     {
1819     case XX: c = 'X'; break;
1820     case YY: c = 'Y'; break;
1821     case ZZ: c = 'Z'; break;
1822     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1823     }
1824
1825     return c;
1826 }
1827
1828 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1829                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1830 {
1831     rvec grid_s[2],*grid_r=NULL,cx,r;
1832     char fname[STRLEN],format[STRLEN],buf[22];
1833     FILE *out;
1834     int  a,i,d,z,y,x;
1835     matrix tric;
1836     real vol;
1837
1838     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1839     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1840
1841     if (DDMASTER(dd))
1842     {
1843         snew(grid_r,2*dd->nnodes);
1844     }
1845
1846     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1847
1848     if (DDMASTER(dd))
1849     {
1850         for(d=0; d<DIM; d++)
1851         {
1852             for(i=0; i<DIM; i++)
1853             {
1854                 if (d == i)
1855                 {
1856                     tric[d][i] = 1;
1857                 }
1858                 else
1859                 {
1860                     if (dd->nc[d] > 1 && d < ddbox->npbcdim)
1861                     {
1862                         tric[d][i] = box[i][d]/box[i][i];
1863                     }
1864                     else
1865                     {
1866                         tric[d][i] = 0;
1867                     }
1868                 }
1869             }
1870         }
1871         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1872         sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1873         out = gmx_fio_fopen(fname,"w");
1874         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1875         a = 1;
1876         for(i=0; i<dd->nnodes; i++)
1877         {
1878             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1879             for(d=0; d<DIM; d++)
1880             {
1881                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1882             }
1883             for(z=0; z<2; z++)
1884             {
1885                 for(y=0; y<2; y++)
1886                 {
1887                     for(x=0; x<2; x++)
1888                     {
1889                         cx[XX] = grid_r[i*2+x][XX];
1890                         cx[YY] = grid_r[i*2+y][YY];
1891                         cx[ZZ] = grid_r[i*2+z][ZZ];
1892                         mvmul(tric,cx,r);
1893                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1894                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1895                     }
1896                 }
1897             }
1898             for(d=0; d<DIM; d++)
1899             {
1900                 for(x=0; x<4; x++)
1901                 {
1902                     switch(d)
1903                     {
1904                     case 0: y = 1 + i*8 + 2*x; break;
1905                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1906                     case 2: y = 1 + i*8 + x; break;
1907                     }
1908                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1909                 }
1910             }
1911         }
1912         gmx_fio_fclose(out);
1913         sfree(grid_r);
1914     }
1915 }
1916
1917 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1918                   gmx_mtop_t *mtop,t_commrec *cr,
1919                   int natoms,rvec x[],matrix box)
1920 {
1921     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1922     FILE *out;
1923     int  i,ii,resnr,c;
1924     char *atomname,*resname;
1925     real b;
1926     gmx_domdec_t *dd;
1927
1928     dd = cr->dd;
1929     if (natoms == -1)
1930     {
1931         natoms = dd->comm->nat[ddnatVSITE];
1932     }
1933
1934     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
1935
1936     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1937     sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");
1938
1939     out = gmx_fio_fopen(fname,"w");
1940
1941     fprintf(out,"TITLE     %s\n",title);
1942     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1943     for(i=0; i<natoms; i++)
1944     {
1945         ii = dd->gatindex[i];
1946         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
1947         if (i < dd->comm->nat[ddnatZONE])
1948         {
1949             c = 0;
1950             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1951             {
1952                 c++;
1953             }
1954             b = c;
1955         }
1956         else if (i < dd->comm->nat[ddnatVSITE])
1957         {
1958             b = dd->comm->zones.n;
1959         }
1960         else
1961         {
1962             b = dd->comm->zones.n + 1;
1963         }
1964         fprintf(out,strlen(atomname)<4 ? format : format4,
1965                 "ATOM",(ii+1)%100000,
1966                 atomname,resname,' ',resnr%10000,' ',
1967                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
1968     }
1969     fprintf(out,"TER\n");
1970
1971     gmx_fio_fclose(out);
1972 }
1973
1974 real dd_cutoff_mbody(gmx_domdec_t *dd)
1975 {
1976     gmx_domdec_comm_t *comm;
1977     int  di;
1978     real r;
1979
1980     comm = dd->comm;
1981
1982     r = -1;
1983     if (comm->bInterCGBondeds)
1984     {
1985         if (comm->cutoff_mbody > 0)
1986         {
1987             r = comm->cutoff_mbody;
1988         }
1989         else
1990         {
1991             /* cutoff_mbody=0 means we do not have DLB */
1992             r = comm->cellsize_min[dd->dim[0]];
1993             for(di=1; di<dd->ndim; di++)
1994             {
1995                 r = min(r,comm->cellsize_min[dd->dim[di]]);
1996             }
1997             if (comm->bBondComm)
1998             {
1999                 r = max(r,comm->cutoff_mbody);
2000             }
2001             else
2002             {
2003                 r = min(r,comm->cutoff);
2004             }
2005         }
2006     }
2007
2008     return r;
2009 }
2010
2011 real dd_cutoff_twobody(gmx_domdec_t *dd)
2012 {
2013     real r_mb;
2014
2015     r_mb = dd_cutoff_mbody(dd);
2016
2017     return max(dd->comm->cutoff,r_mb);
2018 }
2019
2020
2021 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2022 {
2023     int nc,ntot;
2024
2025     nc   = dd->nc[dd->comm->cartpmedim];
2026     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2027     copy_ivec(coord,coord_pme);
2028     coord_pme[dd->comm->cartpmedim] =
2029         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2030 }
2031
2032 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2033 {
2034     /* Here we assign a PME node to communicate with this DD node
2035      * by assuming that the major index of both is x.
2036      * We add cr->npmenodes/2 to obtain an even distribution.
2037      */
2038     return (ddindex*npme + npme/2)/ndd;
2039 }
2040
2041 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2042 {
2043     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2044 }
2045
2046 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2047 {
2048     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2049 }
2050
2051 static int *dd_pmenodes(t_commrec *cr)
2052 {
2053     int *pmenodes;
2054     int n,i,p0,p1;
2055
2056     snew(pmenodes,cr->npmenodes);
2057     n = 0;
2058     for(i=0; i<cr->dd->nnodes; i++) {
2059         p0 = cr_ddindex2pmeindex(cr,i);
2060         p1 = cr_ddindex2pmeindex(cr,i+1);
2061         if (i+1 == cr->dd->nnodes || p1 > p0) {
2062             if (debug)
2063                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2064             pmenodes[n] = i + 1 + n;
2065             n++;
2066         }
2067     }
2068
2069     return pmenodes;
2070 }
2071
2072 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2073 {
2074     gmx_domdec_t *dd;
2075     ivec coords,coords_pme,nc;
2076     int  slab;
2077
2078     dd = cr->dd;
2079     /*
2080       if (dd->comm->bCartesian) {
2081       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2082       dd_coords2pmecoords(dd,coords,coords_pme);
2083       copy_ivec(dd->ntot,nc);
2084       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2085       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2086
2087       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2088       } else {
2089       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2090       }
2091     */
2092     coords[XX] = x;
2093     coords[YY] = y;
2094     coords[ZZ] = z;
2095     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2096
2097     return slab;
2098 }
2099
2100 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2101 {
2102     gmx_domdec_comm_t *comm;
2103     ivec coords;
2104     int  ddindex,nodeid=-1;
2105
2106     comm = cr->dd->comm;
2107
2108     coords[XX] = x;
2109     coords[YY] = y;
2110     coords[ZZ] = z;
2111     if (comm->bCartesianPP_PME)
2112     {
2113 #ifdef GMX_MPI
2114         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2115 #endif
2116     }
2117     else
2118     {
2119         ddindex = dd_index(cr->dd->nc,coords);
2120         if (comm->bCartesianPP)
2121         {
2122             nodeid = comm->ddindex2simnodeid[ddindex];
2123         }
2124         else
2125         {
2126             if (comm->pmenodes)
2127             {
2128                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2129             }
2130             else
2131             {
2132                 nodeid = ddindex;
2133             }
2134         }
2135     }
2136
2137     return nodeid;
2138 }
2139
2140 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2141 {
2142     gmx_domdec_t *dd;
2143     gmx_domdec_comm_t *comm;
2144     ivec coord,coord_pme;
2145     int  i;
2146     int  pmenode=-1;
2147
2148     dd = cr->dd;
2149     comm = dd->comm;
2150
2151     /* This assumes a uniform x domain decomposition grid cell size */
2152     if (comm->bCartesianPP_PME)
2153     {
2154 #ifdef GMX_MPI
2155         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2156         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2157         {
2158             /* This is a PP node */
2159             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2160             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2161         }
2162 #endif
2163     }
2164     else if (comm->bCartesianPP)
2165     {
2166         if (sim_nodeid < dd->nnodes)
2167         {
2168             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2169         }
2170     }
2171     else
2172     {
2173         /* This assumes DD cells with identical x coordinates
2174          * are numbered sequentially.
2175          */
2176         if (dd->comm->pmenodes == NULL)
2177         {
2178             if (sim_nodeid < dd->nnodes)
2179             {
2180                 /* The DD index equals the nodeid */
2181                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2182             }
2183         }
2184         else
2185         {
2186             i = 0;
2187             while (sim_nodeid > dd->comm->pmenodes[i])
2188             {
2189                 i++;
2190             }
2191             if (sim_nodeid < dd->comm->pmenodes[i])
2192             {
2193                 pmenode = dd->comm->pmenodes[i];
2194             }
2195         }
2196     }
2197
2198     return pmenode;
2199 }
2200
2201 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2202 {
2203     gmx_bool bPMEOnlyNode;
2204
2205     if (DOMAINDECOMP(cr))
2206     {
2207         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2208     }
2209     else
2210     {
2211         bPMEOnlyNode = FALSE;
2212     }
2213
2214     return bPMEOnlyNode;
2215 }
2216
2217 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2218                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2219 {
2220     gmx_domdec_t *dd;
2221     int x,y,z;
2222     ivec coord,coord_pme;
2223
2224     dd = cr->dd;
2225
2226     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2227
2228     *nmy_ddnodes = 0;
2229     for(x=0; x<dd->nc[XX]; x++)
2230     {
2231         for(y=0; y<dd->nc[YY]; y++)
2232         {
2233             for(z=0; z<dd->nc[ZZ]; z++)
2234             {
2235                 if (dd->comm->bCartesianPP_PME)
2236                 {
2237                     coord[XX] = x;
2238                     coord[YY] = y;
2239                     coord[ZZ] = z;
2240                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2241                     if (dd->ci[XX] == coord_pme[XX] &&
2242                         dd->ci[YY] == coord_pme[YY] &&
2243                         dd->ci[ZZ] == coord_pme[ZZ])
2244                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2245                 }
2246                 else
2247                 {
2248                     /* The slab corresponds to the nodeid in the PME group */
2249                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2250                     {
2251                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2252                     }
2253                 }
2254             }
2255         }
2256     }
2257
2258     /* The last PP-only node is the peer node */
2259     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2260
2261     if (debug)
2262     {
2263         fprintf(debug,"Receive coordinates from PP nodes:");
2264         for(x=0; x<*nmy_ddnodes; x++)
2265         {
2266             fprintf(debug," %d",(*my_ddnodes)[x]);
2267         }
2268         fprintf(debug,"\n");
2269     }
2270 }
2271
2272 static gmx_bool receive_vir_ener(t_commrec *cr)
2273 {
2274     gmx_domdec_comm_t *comm;
2275     int  pmenode,coords[DIM],rank;
2276     gmx_bool bReceive;
2277
2278     bReceive = TRUE;
2279     if (cr->npmenodes < cr->dd->nnodes)
2280     {
2281         comm = cr->dd->comm;
2282         if (comm->bCartesianPP_PME)
2283         {
2284             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2285 #ifdef GMX_MPI
2286             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2287             coords[comm->cartpmedim]++;
2288             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2289             {
2290                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2291                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2292                 {
2293                     /* This is not the last PP node for pmenode */
2294                     bReceive = FALSE;
2295                 }
2296             }
2297 #endif
2298         }
2299         else
2300         {
2301             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2302             if (cr->sim_nodeid+1 < cr->nnodes &&
2303                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2304             {
2305                 /* This is not the last PP node for pmenode */
2306                 bReceive = FALSE;
2307             }
2308         }
2309     }
2310
2311     return bReceive;
2312 }
2313
2314 static void set_zones_ncg_home(gmx_domdec_t *dd)
2315 {
2316     gmx_domdec_zones_t *zones;
2317     int i;
2318
2319     zones = &dd->comm->zones;
2320
2321     zones->cg_range[0] = 0;
2322     for(i=1; i<zones->n+1; i++)
2323     {
2324         zones->cg_range[i] = dd->ncg_home;
2325     }
2326 }
2327
2328 static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
2329 {
2330     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2331
2332     ind = state->cg_gl;
2333     dd_cg_gl = dd->index_gl;
2334     cgindex  = dd->cgindex;
2335     nat = 0;
2336     cgindex[0] = nat;
2337     for(i=0; i<state->ncg_gl; i++)
2338     {
2339         cgindex[i] = nat;
2340         cg_gl = ind[i];
2341         dd_cg_gl[i] = cg_gl;
2342         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2343     }
2344     cgindex[i] = nat;
2345
2346     dd->ncg_home = state->ncg_gl;
2347     dd->nat_home = nat;
2348
2349     set_zones_ncg_home(dd);
2350 }
2351
2352 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2353 {
2354     while (cg >= cginfo_mb->cg_end)
2355     {
2356         cginfo_mb++;
2357     }
2358
2359     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2360 }
2361
2362 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2363                           t_forcerec *fr,char *bLocalCG)
2364 {
2365     cginfo_mb_t *cginfo_mb;
2366     int *cginfo;
2367     int cg;
2368
2369     if (fr != NULL)
2370     {
2371         cginfo_mb = fr->cginfo_mb;
2372         cginfo    = fr->cginfo;
2373
2374         for(cg=cg0; cg<cg1; cg++)
2375         {
2376             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2377         }
2378     }
2379
2380     if (bLocalCG != NULL)
2381     {
2382         for(cg=cg0; cg<cg1; cg++)
2383         {
2384             bLocalCG[index_gl[cg]] = TRUE;
2385         }
2386     }
2387 }
2388
2389 static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
2390 {
2391     int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
2392     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2393     gmx_ga2la_t *ga2la;
2394     char *bLocalCG;
2395
2396     bLocalCG = dd->comm->bLocalCG;
2397
2398     if (dd->nat_tot > dd->gatindex_nalloc)
2399     {
2400         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2401         srenew(dd->gatindex,dd->gatindex_nalloc);
2402     }
2403
2404     nzone      = dd->comm->zones.n;
2405     zone2cg    = dd->comm->zones.cg_range;
2406     zone_ncg1  = dd->comm->zone_ncg1;
2407     index_gl   = dd->index_gl;
2408     gatindex   = dd->gatindex;
2409
2410     if (zone2cg[1] != dd->ncg_home)
2411     {
2412         gmx_incons("dd->ncg_zone is not up to date");
2413     }
2414
2415     /* Make the local to global and global to local atom index */
2416     a = dd->cgindex[cg_start];
2417     for(zone=0; zone<nzone; zone++)
2418     {
2419         if (zone == 0)
2420         {
2421             cg0 = cg_start;
2422         }
2423         else
2424         {
2425             cg0 = zone2cg[zone];
2426         }
2427         for(cg=cg0; cg<zone2cg[zone+1]; cg++)
2428         {
2429             zone1 = zone;
2430             if (cg - cg0 >= zone_ncg1[zone])
2431             {
2432                 /* Signal that this cg is from more than one zone away */
2433                 zone1 += nzone;
2434             }
2435             cg_gl = index_gl[cg];
2436             for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2437             {
2438                 gatindex[a] = a_gl;
2439                 ga2la_set(dd->ga2la,a_gl,a,zone1);
2440                 a++;
2441             }
2442         }
2443     }
2444 }
2445
2446 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2447                           const char *where)
2448 {
2449     int ncg,i,ngl,nerr;
2450
2451     nerr = 0;
2452     if (bLocalCG == NULL)
2453     {
2454         return nerr;
2455     }
2456     for(i=0; i<dd->ncg_tot; i++)
2457     {
2458         if (!bLocalCG[dd->index_gl[i]])
2459         {
2460             fprintf(stderr,
2461                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2462             nerr++;
2463         }
2464     }
2465     ngl = 0;
2466     for(i=0; i<ncg_sys; i++)
2467     {
2468         if (bLocalCG[i])
2469         {
2470             ngl++;
2471         }
2472     }
2473     if (ngl != dd->ncg_tot)
2474     {
2475         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2476         nerr++;
2477     }
2478
2479     return nerr;
2480 }
2481
2482 static void check_index_consistency(gmx_domdec_t *dd,
2483                                     int natoms_sys,int ncg_sys,
2484                                     const char *where)
2485 {
2486     int  nerr,ngl,i,a,cell;
2487     int  *have;
2488
2489     nerr = 0;
2490
2491     if (dd->comm->DD_debug > 1)
2492     {
2493         snew(have,natoms_sys);
2494         for(a=0; a<dd->nat_tot; a++)
2495         {
2496             if (have[dd->gatindex[a]] > 0)
2497             {
2498                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2499             }
2500             else
2501             {
2502                 have[dd->gatindex[a]] = a + 1;
2503             }
2504         }
2505         sfree(have);
2506     }
2507
2508     snew(have,dd->nat_tot);
2509
2510     ngl  = 0;
2511     for(i=0; i<natoms_sys; i++)
2512     {
2513         if (ga2la_get(dd->ga2la,i,&a,&cell))
2514         {
2515             if (a >= dd->nat_tot)
2516             {
2517                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2518                 nerr++;
2519             }
2520             else
2521             {
2522                 have[a] = 1;
2523                 if (dd->gatindex[a] != i)
2524                 {
2525                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2526                     nerr++;
2527                 }
2528             }
2529             ngl++;
2530         }
2531     }
2532     if (ngl != dd->nat_tot)
2533     {
2534         fprintf(stderr,
2535                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2536                 dd->rank,where,ngl,dd->nat_tot);
2537     }
2538     for(a=0; a<dd->nat_tot; a++)
2539     {
2540         if (have[a] == 0)
2541         {
2542             fprintf(stderr,
2543                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2544                     dd->rank,where,a+1,dd->gatindex[a]+1);
2545         }
2546     }
2547     sfree(have);
2548
2549     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2550
2551     if (nerr > 0) {
2552         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2553                   dd->rank,where,nerr);
2554     }
2555 }
2556
2557 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2558 {
2559     int  i;
2560     char *bLocalCG;
2561
2562     if (a_start == 0)
2563     {
2564         /* Clear the whole list without searching */
2565         ga2la_clear(dd->ga2la);
2566     }
2567     else
2568     {
2569         for(i=a_start; i<dd->nat_tot; i++)
2570         {
2571             ga2la_del(dd->ga2la,dd->gatindex[i]);
2572         }
2573     }
2574
2575     bLocalCG = dd->comm->bLocalCG;
2576     if (bLocalCG)
2577     {
2578         for(i=cg_start; i<dd->ncg_tot; i++)
2579         {
2580             bLocalCG[dd->index_gl[i]] = FALSE;
2581         }
2582     }
2583
2584     dd_clear_local_vsite_indices(dd);
2585
2586     if (dd->constraints)
2587     {
2588         dd_clear_local_constraint_indices(dd);
2589     }
2590 }
2591
2592 static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
2593 {
2594     real grid_jump_limit;
2595
2596     /* The distance between the boundaries of cells at distance
2597      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2598      * and by the fact that cells should not be shifted by more than
2599      * half their size, such that cg's only shift by one cell
2600      * at redecomposition.
2601      */
2602     grid_jump_limit = comm->cellsize_limit;
2603     if (!comm->bVacDLBNoLimit)
2604     {
2605         grid_jump_limit = max(grid_jump_limit,
2606                               comm->cutoff/comm->cd[dim_ind].np);
2607     }
2608
2609     return grid_jump_limit;
2610 }
2611
2612 static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2613 {
2614     gmx_domdec_comm_t *comm;
2615     int  d,dim;
2616     real limit,bfac;
2617
2618     comm = dd->comm;
2619
2620     for(d=1; d<dd->ndim; d++)
2621     {
2622         dim = dd->dim[d];
2623         limit = grid_jump_limit(comm,d);
2624         bfac = ddbox->box_size[dim];
2625         if (ddbox->tric_dir[dim])
2626         {
2627             bfac *= ddbox->skew_fac[dim];
2628         }
2629         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2630             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2631         {
2632             char buf[22];
2633             gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
2634                       gmx_step_str(step,buf),
2635                       dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2636         }
2637     }
2638 }
2639
2640 static int dd_load_count(gmx_domdec_comm_t *comm)
2641 {
2642     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2643 }
2644
2645 static float dd_force_load(gmx_domdec_comm_t *comm)
2646 {
2647     float load;
2648
2649     if (comm->eFlop)
2650     {
2651         load = comm->flop;
2652         if (comm->eFlop > 1)
2653         {
2654             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2655         }
2656     }
2657     else
2658     {
2659         load = comm->cycl[ddCyclF];
2660         if (comm->cycl_n[ddCyclF] > 1)
2661         {
2662             /* Subtract the maximum of the last n cycle counts
2663              * to get rid of possible high counts due to other soures,
2664              * for instance system activity, that would otherwise
2665              * affect the dynamic load balancing.
2666              */
2667             load -= comm->cycl_max[ddCyclF];
2668         }
2669     }
2670
2671     return load;
2672 }
2673
2674 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2675 {
2676     gmx_domdec_comm_t *comm;
2677     int i;
2678
2679     comm = dd->comm;
2680
2681     snew(*dim_f,dd->nc[dim]+1);
2682     (*dim_f)[0] = 0;
2683     for(i=1; i<dd->nc[dim]; i++)
2684     {
2685         if (comm->slb_frac[dim])
2686         {
2687             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2688         }
2689         else
2690         {
2691             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2692         }
2693     }
2694     (*dim_f)[dd->nc[dim]] = 1;
2695 }
2696
2697 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2698 {
2699     int  pmeindex,slab,nso,i;
2700     ivec xyz;
2701
2702     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2703     {
2704         ddpme->dim = YY;
2705     }
2706     else
2707     {
2708         ddpme->dim = dimind;
2709     }
2710     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2711
2712     ddpme->nslab = (ddpme->dim == 0 ?
2713                     dd->comm->npmenodes_x :
2714                     dd->comm->npmenodes_y);
2715
2716     if (ddpme->nslab <= 1)
2717     {
2718         return;
2719     }
2720
2721     nso = dd->comm->npmenodes/ddpme->nslab;
2722     /* Determine for each PME slab the PP location range for dimension dim */
2723     snew(ddpme->pp_min,ddpme->nslab);
2724     snew(ddpme->pp_max,ddpme->nslab);
2725     for(slab=0; slab<ddpme->nslab; slab++) {
2726         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2727         ddpme->pp_max[slab] = 0;
2728     }
2729     for(i=0; i<dd->nnodes; i++) {
2730         ddindex2xyz(dd->nc,i,xyz);
2731         /* For y only use our y/z slab.
2732          * This assumes that the PME x grid size matches the DD grid size.
2733          */
2734         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2735             pmeindex = ddindex2pmeindex(dd,i);
2736             if (dimind == 0) {
2737                 slab = pmeindex/nso;
2738             } else {
2739                 slab = pmeindex % ddpme->nslab;
2740             }
2741             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2742             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2743         }
2744     }
2745
2746     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2747 }
2748
2749 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2750 {
2751     if (dd->comm->ddpme[0].dim == XX)
2752     {
2753         return dd->comm->ddpme[0].maxshift;
2754     }
2755     else
2756     {
2757         return 0;
2758     }
2759 }
2760
2761 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2762 {
2763     if (dd->comm->ddpme[0].dim == YY)
2764     {
2765         return dd->comm->ddpme[0].maxshift;
2766     }
2767     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2768     {
2769         return dd->comm->ddpme[1].maxshift;
2770     }
2771     else
2772     {
2773         return 0;
2774     }
2775 }
2776
2777 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2778                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2779 {
2780     gmx_domdec_comm_t *comm;
2781     int  nc,ns,s;
2782     int  *xmin,*xmax;
2783     real range,pme_boundary;
2784     int  sh;
2785
2786     comm = dd->comm;
2787     nc  = dd->nc[ddpme->dim];
2788     ns  = ddpme->nslab;
2789
2790     if (!ddpme->dim_match)
2791     {
2792         /* PP decomposition is not along dim: the worst situation */
2793         sh = ns/2;
2794     }
2795     else if (ns <= 3 || (bUniform && ns == nc))
2796     {
2797         /* The optimal situation */
2798         sh = 1;
2799     }
2800     else
2801     {
2802         /* We need to check for all pme nodes which nodes they
2803          * could possibly need to communicate with.
2804          */
2805         xmin = ddpme->pp_min;
2806         xmax = ddpme->pp_max;
2807         /* Allow for atoms to be maximally 2/3 times the cut-off
2808          * out of their DD cell. This is a reasonable balance between
2809          * between performance and support for most charge-group/cut-off
2810          * combinations.
2811          */
2812         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2813         /* Avoid extra communication when we are exactly at a boundary */
2814         range *= 0.999;
2815
2816         sh = 1;
2817         for(s=0; s<ns; s++)
2818         {
2819             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2820             pme_boundary = (real)s/ns;
2821             while (sh+1 < ns &&
2822                    ((s-(sh+1) >= 0 &&
2823                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2824                     (s-(sh+1) <  0 &&
2825                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2826             {
2827                 sh++;
2828             }
2829             pme_boundary = (real)(s+1)/ns;
2830             while (sh+1 < ns &&
2831                    ((s+(sh+1) <  ns &&
2832                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2833                     (s+(sh+1) >= ns &&
2834                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2835             {
2836                 sh++;
2837             }
2838         }
2839     }
2840
2841     ddpme->maxshift = sh;
2842
2843     if (debug)
2844     {
2845         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2846                 ddpme->dim,ddpme->maxshift);
2847     }
2848 }
2849
2850 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2851 {
2852     int d,dim;
2853
2854     for(d=0; d<dd->ndim; d++)
2855     {
2856         dim = dd->dim[d];
2857         if (dim < ddbox->nboundeddim &&
2858             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2859             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2860         {
2861             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2862                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2863                       dd->nc[dim],dd->comm->cellsize_limit);
2864         }
2865     }
2866 }
2867
2868 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2869                                   gmx_bool bMaster,ivec npulse)
2870 {
2871     gmx_domdec_comm_t *comm;
2872     int  d,j;
2873     rvec cellsize_min;
2874     real *cell_x,cell_dx,cellsize;
2875
2876     comm = dd->comm;
2877
2878     for(d=0; d<DIM; d++)
2879     {
2880         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2881         npulse[d] = 1;
2882         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2883         {
2884             /* Uniform grid */
2885             cell_dx = ddbox->box_size[d]/dd->nc[d];
2886             if (bMaster)
2887             {
2888                 for(j=0; j<dd->nc[d]+1; j++)
2889                 {
2890                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2891                 }
2892             }
2893             else
2894             {
2895                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
2896                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2897             }
2898             cellsize = cell_dx*ddbox->skew_fac[d];
2899             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
2900             {
2901                 npulse[d]++;
2902             }
2903             cellsize_min[d] = cellsize;
2904         }
2905         else
2906         {
2907             /* Statically load balanced grid */
2908             /* Also when we are not doing a master distribution we determine
2909              * all cell borders in a loop to obtain identical values
2910              * to the master distribution case and to determine npulse.
2911              */
2912             if (bMaster)
2913             {
2914                 cell_x = dd->ma->cell_x[d];
2915             }
2916             else
2917             {
2918                 snew(cell_x,dd->nc[d]+1);
2919             }
2920             cell_x[0] = ddbox->box0[d];
2921             for(j=0; j<dd->nc[d]; j++)
2922             {
2923                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2924                 cell_x[j+1] = cell_x[j] + cell_dx;
2925                 cellsize = cell_dx*ddbox->skew_fac[d];
2926                 while (cellsize*npulse[d] < comm->cutoff &&
2927                        npulse[d] < dd->nc[d]-1)
2928                 {
2929                     npulse[d]++;
2930                 }
2931                 cellsize_min[d] = min(cellsize_min[d],cellsize);
2932             }
2933             if (!bMaster)
2934             {
2935                 comm->cell_x0[d] = cell_x[dd->ci[d]];
2936                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2937                 sfree(cell_x);
2938             }
2939         }
2940         /* The following limitation is to avoid that a cell would receive
2941          * some of its own home charge groups back over the periodic boundary.
2942          * Double charge groups cause trouble with the global indices.
2943          */
2944         if (d < ddbox->npbcdim &&
2945             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2946         {
2947             gmx_fatal_collective(FARGS,NULL,dd,
2948                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2949                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
2950                                  comm->cutoff,
2951                                  dd->nc[d],dd->nc[d],
2952                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
2953         }
2954     }
2955
2956     if (!comm->bDynLoadBal)
2957     {
2958         copy_rvec(cellsize_min,comm->cellsize_min);
2959     }
2960
2961     for(d=0; d<comm->npmedecompdim; d++)
2962     {
2963         set_pme_maxshift(dd,&comm->ddpme[d],
2964                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
2965                          comm->ddpme[d].slb_dim_f);
2966     }
2967 }
2968
2969
2970 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2971                                        int d,int dim,gmx_domdec_root_t *root,
2972                                        gmx_ddbox_t *ddbox,
2973                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
2974 {
2975     gmx_domdec_comm_t *comm;
2976     int  ncd,i,j,nmin,nmin_old;
2977     gmx_bool bLimLo,bLimHi;
2978     real *cell_size;
2979     real fac,halfway,cellsize_limit_f_i,region_size;
2980     gmx_bool bPBC,bLastHi=FALSE;
2981     int nrange[]={range[0],range[1]};
2982
2983     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];
2984
2985     comm = dd->comm;
2986
2987     ncd = dd->nc[dim];
2988
2989     bPBC = (dim < ddbox->npbcdim);
2990
2991     cell_size = root->buf_ncd;
2992
2993     if (debug)
2994     {
2995         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
2996     }
2997
2998     /* First we need to check if the scaling does not make cells
2999      * smaller than the smallest allowed size.
3000      * We need to do this iteratively, since if a cell is too small,
3001      * it needs to be enlarged, which makes all the other cells smaller,
3002      * which could in turn make another cell smaller than allowed.
3003      */
3004     for(i=range[0]; i<range[1]; i++)
3005     {
3006         root->bCellMin[i] = FALSE;
3007     }
3008     nmin = 0;
3009     do
3010     {
3011         nmin_old = nmin;
3012         /* We need the total for normalization */
3013         fac = 0;
3014         for(i=range[0]; i<range[1]; i++)
3015         {
3016             if (root->bCellMin[i] == FALSE)
3017             {
3018                 fac += cell_size[i];
3019             }
3020         }
3021         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3022         /* Determine the cell boundaries */
3023         for(i=range[0]; i<range[1]; i++)
3024         {
3025             if (root->bCellMin[i] == FALSE)
3026             {
3027                 cell_size[i] *= fac;
3028                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3029                 {
3030                     cellsize_limit_f_i = 0;
3031                 }
3032                 else
3033                 {
3034                     cellsize_limit_f_i = cellsize_limit_f;
3035                 }
3036                 if (cell_size[i] < cellsize_limit_f_i)
3037                 {
3038                     root->bCellMin[i] = TRUE;
3039                     cell_size[i] = cellsize_limit_f_i;
3040                     nmin++;
3041                 }
3042             }
3043             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3044         }
3045     }
3046     while (nmin > nmin_old);
3047
3048     i=range[1]-1;
3049     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3050     /* For this check we should not use DD_CELL_MARGIN,
3051      * but a slightly smaller factor,
3052      * since rounding could get use below the limit.
3053      */
3054     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3055     {
3056         char buf[22];
3057         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3058                   gmx_step_str(step,buf),
3059                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3060                   ncd,comm->cellsize_min[dim]);
3061     }
3062
3063     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3064
3065     if (!bUniform)
3066     {
3067         /* Check if the boundary did not displace more than halfway
3068          * each of the cells it bounds, as this could cause problems,
3069          * especially when the differences between cell sizes are large.
3070          * If changes are applied, they will not make cells smaller
3071          * than the cut-off, as we check all the boundaries which
3072          * might be affected by a change and if the old state was ok,
3073          * the cells will at most be shrunk back to their old size.
3074          */
3075         for(i=range[0]+1; i<range[1]; i++)
3076         {
3077             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3078             if (root->cell_f[i] < halfway)
3079             {
3080                 root->cell_f[i] = halfway;
3081                 /* Check if the change also causes shifts of the next boundaries */
3082                 for(j=i+1; j<range[1]; j++)
3083                 {
3084                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3085                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3086                 }
3087             }
3088             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3089             if (root->cell_f[i] > halfway)
3090             {
3091                 root->cell_f[i] = halfway;
3092                 /* Check if the change also causes shifts of the next boundaries */
3093                 for(j=i-1; j>=range[0]+1; j--)
3094                 {
3095                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3096                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3097                 }
3098             }
3099         }
3100     }
3101
3102     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3103     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3104      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3105      * for a and b nrange is used */
3106     if (d > 0)
3107     {
3108         /* Take care of the staggering of the cell boundaries */
3109         if (bUniform)
3110         {
3111             for(i=range[0]; i<range[1]; i++)
3112             {
3113                 root->cell_f_max0[i] = root->cell_f[i];
3114                 root->cell_f_min1[i] = root->cell_f[i+1];
3115             }
3116         }
3117         else
3118         {
3119             for(i=range[0]+1; i<range[1]; i++)
3120             {
3121                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3122                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3123                 if (bLimLo && bLimHi)
3124                 {
3125                     /* Both limits violated, try the best we can */
3126                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3127                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3128                     nrange[0]=range[0];
3129                     nrange[1]=i;
3130                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3131
3132                     nrange[0]=i;
3133                     nrange[1]=range[1];
3134                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3135
3136                     return;
3137                 }
3138                 else if (bLimLo)
3139                 {
3140                     /* root->cell_f[i] = root->bound_min[i]; */
3141                     nrange[1]=i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3142                     bLastHi=FALSE;
3143                 }
3144                 else if (bLimHi && !bLastHi)
3145                 {
3146                     bLastHi=TRUE;
3147                     if (nrange[1] < range[1])   /* found a LimLo before */
3148                     {
3149                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3150                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3151                         nrange[0]=nrange[1];
3152                     }
3153                     root->cell_f[i] = root->bound_max[i];
3154                     nrange[1]=i;
3155                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3156                     nrange[0]=i;
3157                     nrange[1]=range[1];
3158                 }
3159             }
3160             if (nrange[1] < range[1])   /* found last a LimLo */
3161             {
3162                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3163                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3164                 nrange[0]=nrange[1];
3165                 nrange[1]=range[1];
3166                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3167             }
3168             else if (nrange[0] > range[0]) /* found at least one LimHi */
3169             {
3170                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3171             }
3172         }
3173     }
3174 }
3175
3176
3177 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3178                                        int d,int dim,gmx_domdec_root_t *root,
3179                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3180                                        gmx_bool bUniform,gmx_large_int_t step)
3181 {
3182     gmx_domdec_comm_t *comm;
3183     int  ncd,d1,i,j,pos;
3184     real *cell_size;
3185     real load_aver,load_i,imbalance,change,change_max,sc;
3186     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3187     real change_limit;
3188     real relax = 0.5;
3189     gmx_bool bPBC;
3190     int range[] = { 0, 0 };
3191
3192     comm = dd->comm;
3193
3194     /* Convert the maximum change from the input percentage to a fraction */
3195     change_limit = comm->dlb_scale_lim*0.01;
3196
3197     ncd = dd->nc[dim];
3198
3199     bPBC = (dim < ddbox->npbcdim);
3200
3201     cell_size = root->buf_ncd;
3202
3203     /* Store the original boundaries */
3204     for(i=0; i<ncd+1; i++)
3205     {
3206         root->old_cell_f[i] = root->cell_f[i];
3207     }
3208     if (bUniform) {
3209         for(i=0; i<ncd; i++)
3210         {
3211             cell_size[i] = 1.0/ncd;
3212         }
3213     }
3214     else if (dd_load_count(comm))
3215     {
3216         load_aver = comm->load[d].sum_m/ncd;
3217         change_max = 0;
3218         for(i=0; i<ncd; i++)
3219         {
3220             /* Determine the relative imbalance of cell i */
3221             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3222             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3223             /* Determine the change of the cell size using underrelaxation */
3224             change = -relax*imbalance;
3225             change_max = max(change_max,max(change,-change));
3226         }
3227         /* Limit the amount of scaling.
3228          * We need to use the same rescaling for all cells in one row,
3229          * otherwise the load balancing might not converge.
3230          */
3231         sc = relax;
3232         if (change_max > change_limit)
3233         {
3234             sc *= change_limit/change_max;
3235         }
3236         for(i=0; i<ncd; i++)
3237         {
3238             /* Determine the relative imbalance of cell i */
3239             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3240             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3241             /* Determine the change of the cell size using underrelaxation */
3242             change = -sc*imbalance;
3243             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3244         }
3245     }
3246
3247     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3248     cellsize_limit_f *= DD_CELL_MARGIN;
3249     dist_min_f_hard        = grid_jump_limit(comm,d)/ddbox->box_size[dim];
3250     dist_min_f       = dist_min_f_hard * DD_CELL_MARGIN;
3251     if (ddbox->tric_dir[dim])
3252     {
3253         cellsize_limit_f /= ddbox->skew_fac[dim];
3254         dist_min_f       /= ddbox->skew_fac[dim];
3255     }
3256     if (bDynamicBox && d > 0)
3257     {
3258         dist_min_f *= DD_PRES_SCALE_MARGIN;
3259     }
3260     if (d > 0 && !bUniform)
3261     {
3262         /* Make sure that the grid is not shifted too much */
3263         for(i=1; i<ncd; i++) {
3264             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3265             {
3266                 gmx_incons("Inconsistent DD boundary staggering limits!");
3267             }
3268             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3269             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3270             if (space > 0) {
3271                 root->bound_min[i] += 0.5*space;
3272             }
3273             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3274             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3275             if (space < 0) {
3276                 root->bound_max[i] += 0.5*space;
3277             }
3278             if (debug)
3279             {
3280                 fprintf(debug,
3281                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3282                         d,i,
3283                         root->cell_f_max0[i-1] + dist_min_f,
3284                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3285                         root->cell_f_min1[i] - dist_min_f);
3286             }
3287         }
3288     }
3289     range[1]=ncd;
3290     root->cell_f[0] = 0;
3291     root->cell_f[ncd] = 1;
3292     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3293
3294
3295     /* After the checks above, the cells should obey the cut-off
3296      * restrictions, but it does not hurt to check.
3297      */
3298     for(i=0; i<ncd; i++)
3299     {
3300         if (debug)
3301         {
3302             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3303                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3304         }
3305
3306         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3307             root->cell_f[i+1] - root->cell_f[i] <
3308             cellsize_limit_f/DD_CELL_MARGIN)
3309         {
3310             char buf[22];
3311             fprintf(stderr,
3312                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3313                     gmx_step_str(step,buf),dim2char(dim),i,
3314                     (root->cell_f[i+1] - root->cell_f[i])
3315                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3316         }
3317     }
3318
3319     pos = ncd + 1;
3320     /* Store the cell boundaries of the lower dimensions at the end */
3321     for(d1=0; d1<d; d1++)
3322     {
3323         root->cell_f[pos++] = comm->cell_f0[d1];
3324         root->cell_f[pos++] = comm->cell_f1[d1];
3325     }
3326
3327     if (d < comm->npmedecompdim)
3328     {
3329         /* The master determines the maximum shift for
3330          * the coordinate communication between separate PME nodes.
3331          */
3332         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3333     }
3334     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3335     if (d >= 1)
3336     {
3337         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3338     }
3339 }
3340
3341 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3342                                              gmx_ddbox_t *ddbox,int dimind)
3343 {
3344     gmx_domdec_comm_t *comm;
3345     int dim;
3346
3347     comm = dd->comm;
3348
3349     /* Set the cell dimensions */
3350     dim = dd->dim[dimind];
3351     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3352     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3353     if (dim >= ddbox->nboundeddim)
3354     {
3355         comm->cell_x0[dim] += ddbox->box0[dim];
3356         comm->cell_x1[dim] += ddbox->box0[dim];
3357     }
3358 }
3359
3360 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3361                                          int d,int dim,real *cell_f_row,
3362                                          gmx_ddbox_t *ddbox)
3363 {
3364     gmx_domdec_comm_t *comm;
3365     int d1,dim1,pos;
3366
3367     comm = dd->comm;
3368
3369 #ifdef GMX_MPI
3370     /* Each node would only need to know two fractions,
3371      * but it is probably cheaper to broadcast the whole array.
3372      */
3373     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3374               0,comm->mpi_comm_load[d]);
3375 #endif
3376     /* Copy the fractions for this dimension from the buffer */
3377     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3378     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3379     /* The whole array was communicated, so set the buffer position */
3380     pos = dd->nc[dim] + 1;
3381     for(d1=0; d1<=d; d1++)
3382     {
3383         if (d1 < d)
3384         {
3385             /* Copy the cell fractions of the lower dimensions */
3386             comm->cell_f0[d1] = cell_f_row[pos++];
3387             comm->cell_f1[d1] = cell_f_row[pos++];
3388         }
3389         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3390     }
3391     /* Convert the communicated shift from float to int */
3392     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3393     if (d >= 1)
3394     {
3395         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3396     }
3397 }
3398
3399 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3400                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3401                                          gmx_bool bUniform,gmx_large_int_t step)
3402 {
3403     gmx_domdec_comm_t *comm;
3404     int d,dim,d1;
3405     gmx_bool bRowMember,bRowRoot;
3406     real *cell_f_row;
3407
3408     comm = dd->comm;
3409
3410     for(d=0; d<dd->ndim; d++)
3411     {
3412         dim = dd->dim[d];
3413         bRowMember = TRUE;
3414         bRowRoot = TRUE;
3415         for(d1=d; d1<dd->ndim; d1++)
3416         {
3417             if (dd->ci[dd->dim[d1]] > 0)
3418             {
3419                 if (d1 > d)
3420                 {
3421                     bRowMember = FALSE;
3422                 }
3423                 bRowRoot = FALSE;
3424             }
3425         }
3426         if (bRowMember)
3427         {
3428             if (bRowRoot)
3429             {
3430                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3431                                            ddbox,bDynamicBox,bUniform,step);
3432                 cell_f_row = comm->root[d]->cell_f;
3433             }
3434             else
3435             {
3436                 cell_f_row = comm->cell_f_row;
3437             }
3438             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3439         }
3440     }
3441 }
3442
3443 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3444 {
3445     int d;
3446
3447     /* This function assumes the box is static and should therefore
3448      * not be called when the box has changed since the last
3449      * call to dd_partition_system.
3450      */
3451     for(d=0; d<dd->ndim; d++)
3452     {
3453         relative_to_absolute_cell_bounds(dd,ddbox,d);
3454     }
3455 }
3456
3457
3458
3459 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3460                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3461                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3462                                   gmx_wallcycle_t wcycle)
3463 {
3464     gmx_domdec_comm_t *comm;
3465     int dim;
3466
3467     comm = dd->comm;
3468
3469     if (bDoDLB)
3470     {
3471         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3472         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3473         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3474     }
3475     else if (bDynamicBox)
3476     {
3477         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3478     }
3479
3480     /* Set the dimensions for which no DD is used */
3481     for(dim=0; dim<DIM; dim++) {
3482         if (dd->nc[dim] == 1) {
3483             comm->cell_x0[dim] = 0;
3484             comm->cell_x1[dim] = ddbox->box_size[dim];
3485             if (dim >= ddbox->nboundeddim)
3486             {
3487                 comm->cell_x0[dim] += ddbox->box0[dim];
3488                 comm->cell_x1[dim] += ddbox->box0[dim];
3489             }
3490         }
3491     }
3492 }
3493
3494 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3495 {
3496     int d,np,i;
3497     gmx_domdec_comm_dim_t *cd;
3498
3499     for(d=0; d<dd->ndim; d++)
3500     {
3501         cd = &dd->comm->cd[d];
3502         np = npulse[dd->dim[d]];
3503         if (np > cd->np_nalloc)
3504         {
3505             if (debug)
3506             {
3507                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3508                         dim2char(dd->dim[d]),np);
3509             }
3510             if (DDMASTER(dd) && cd->np_nalloc > 0)
3511             {
3512                 fprintf(stderr,"\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3513             }
3514             srenew(cd->ind,np);
3515             for(i=cd->np_nalloc; i<np; i++)
3516             {
3517                 cd->ind[i].index  = NULL;
3518                 cd->ind[i].nalloc = 0;
3519             }
3520             cd->np_nalloc = np;
3521         }
3522         cd->np = np;
3523     }
3524 }
3525
3526
3527 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3528                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3529                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3530                               gmx_wallcycle_t wcycle)
3531 {
3532     gmx_domdec_comm_t *comm;
3533     int  d;
3534     ivec npulse;
3535
3536     comm = dd->comm;
3537
3538     /* Copy the old cell boundaries for the cg displacement check */
3539     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3540     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3541
3542     if (comm->bDynLoadBal)
3543     {
3544         if (DDMASTER(dd))
3545         {
3546             check_box_size(dd,ddbox);
3547         }
3548         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3549     }
3550     else
3551     {
3552         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3553         realloc_comm_ind(dd,npulse);
3554     }
3555
3556     if (debug)
3557     {
3558         for(d=0; d<DIM; d++)
3559         {
3560             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3561                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3562         }
3563     }
3564 }
3565
3566 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3567                                   gmx_ddbox_t *ddbox,
3568                                   rvec cell_ns_x0,rvec cell_ns_x1,
3569                                   gmx_large_int_t step)
3570 {
3571     gmx_domdec_comm_t *comm;
3572     int dim_ind,dim;
3573
3574     comm = dd->comm;
3575
3576     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3577     {
3578         dim = dd->dim[dim_ind];
3579
3580         /* Without PBC we don't have restrictions on the outer cells */
3581         if (!(dim >= ddbox->npbcdim &&
3582               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3583             comm->bDynLoadBal &&
3584             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3585             comm->cellsize_min[dim])
3586         {
3587             char buf[22];
3588             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3589                       gmx_step_str(step,buf),dim2char(dim),
3590                       comm->cell_x1[dim] - comm->cell_x0[dim],
3591                       ddbox->skew_fac[dim],
3592                       dd->comm->cellsize_min[dim],
3593                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3594         }
3595     }
3596
3597     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3598     {
3599         /* Communicate the boundaries and update cell_ns_x0/1 */
3600         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3601         if (dd->bGridJump && dd->ndim > 1)
3602         {
3603             check_grid_jump(step,dd,ddbox);
3604         }
3605     }
3606 }
3607
3608 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3609 {
3610     if (YY < npbcdim)
3611     {
3612         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3613     }
3614     else
3615     {
3616         tcm[YY][XX] = 0;
3617     }
3618     if (ZZ < npbcdim)
3619     {
3620         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3621         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3622     }
3623     else
3624     {
3625         tcm[ZZ][XX] = 0;
3626         tcm[ZZ][YY] = 0;
3627     }
3628 }
3629
3630 static void check_screw_box(matrix box)
3631 {
3632     /* Mathematical limitation */
3633     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3634     {
3635         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3636     }
3637
3638     /* Limitation due to the asymmetry of the eighth shell method */
3639     if (box[ZZ][YY] != 0)
3640     {
3641         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3642     }
3643 }
3644
3645 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3646                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3647                           gmx_domdec_t *dd)
3648 {
3649     gmx_domdec_master_t *ma;
3650     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3651     int  i,icg,j,k,k0,k1,d,npbcdim;
3652     matrix tcm;
3653     rvec box_size,cg_cm;
3654     ivec ind;
3655     real nrcg,inv_ncg,pos_d;
3656     atom_id *cgindex;
3657     gmx_bool bUnbounded,bScrew;
3658
3659     ma = dd->ma;
3660
3661     if (tmp_ind == NULL)
3662     {
3663         snew(tmp_nalloc,dd->nnodes);
3664         snew(tmp_ind,dd->nnodes);
3665         for(i=0; i<dd->nnodes; i++)
3666         {
3667             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3668             snew(tmp_ind[i],tmp_nalloc[i]);
3669         }
3670     }
3671
3672     /* Clear the count */
3673     for(i=0; i<dd->nnodes; i++)
3674     {
3675         ma->ncg[i] = 0;
3676         ma->nat[i] = 0;
3677     }
3678
3679     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3680
3681     cgindex = cgs->index;
3682
3683     /* Compute the center of geometry for all charge groups */
3684     for(icg=0; icg<cgs->nr; icg++)
3685     {
3686         k0      = cgindex[icg];
3687         k1      = cgindex[icg+1];
3688         nrcg    = k1 - k0;
3689         if (nrcg == 1)
3690         {
3691             copy_rvec(pos[k0],cg_cm);
3692         }
3693         else
3694         {
3695             inv_ncg = 1.0/nrcg;
3696
3697             clear_rvec(cg_cm);
3698             for(k=k0; (k<k1); k++)
3699             {
3700                 rvec_inc(cg_cm,pos[k]);
3701             }
3702             for(d=0; (d<DIM); d++)
3703             {
3704                 cg_cm[d] *= inv_ncg;
3705             }
3706         }
3707         /* Put the charge group in the box and determine the cell index */
3708         for(d=DIM-1; d>=0; d--) {
3709             pos_d = cg_cm[d];
3710             if (d < dd->npbcdim)
3711             {
3712                 bScrew = (dd->bScrewPBC && d == XX);
3713                 if (tric_dir[d] && dd->nc[d] > 1)
3714                 {
3715                     /* Use triclinic coordintates for this dimension */
3716                     for(j=d+1; j<DIM; j++)
3717                     {
3718                         pos_d += cg_cm[j]*tcm[j][d];
3719                     }
3720                 }
3721                 while(pos_d >= box[d][d])
3722                 {
3723                     pos_d -= box[d][d];
3724                     rvec_dec(cg_cm,box[d]);
3725                     if (bScrew)
3726                     {
3727                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3728                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3729                     }
3730                     for(k=k0; (k<k1); k++)
3731                     {
3732                         rvec_dec(pos[k],box[d]);
3733                         if (bScrew)
3734                         {
3735                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3736                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3737                         }
3738                     }
3739                 }
3740                 while(pos_d < 0)
3741                 {
3742                     pos_d += box[d][d];
3743                     rvec_inc(cg_cm,box[d]);
3744                     if (bScrew)
3745                     {
3746                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3747                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3748                     }
3749                     for(k=k0; (k<k1); k++)
3750                     {
3751                         rvec_inc(pos[k],box[d]);
3752                         if (bScrew) {
3753                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3754                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3755                         }
3756                     }
3757                 }
3758             }
3759             /* This could be done more efficiently */
3760             ind[d] = 0;
3761             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3762             {
3763                 ind[d]++;
3764             }
3765         }
3766         i = dd_index(dd->nc,ind);
3767         if (ma->ncg[i] == tmp_nalloc[i])
3768         {
3769             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3770             srenew(tmp_ind[i],tmp_nalloc[i]);
3771         }
3772         tmp_ind[i][ma->ncg[i]] = icg;
3773         ma->ncg[i]++;
3774         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3775     }
3776
3777     k1 = 0;
3778     for(i=0; i<dd->nnodes; i++)
3779     {
3780         ma->index[i] = k1;
3781         for(k=0; k<ma->ncg[i]; k++)
3782         {
3783             ma->cg[k1++] = tmp_ind[i][k];
3784         }
3785     }
3786     ma->index[dd->nnodes] = k1;
3787
3788     for(i=0; i<dd->nnodes; i++)
3789     {
3790         sfree(tmp_ind[i]);
3791     }
3792     sfree(tmp_ind);
3793     sfree(tmp_nalloc);
3794
3795     if (fplog)
3796     {
3797         char buf[22];
3798         fprintf(fplog,"Charge group distribution at step %s:",
3799                 gmx_step_str(step,buf));
3800         for(i=0; i<dd->nnodes; i++)
3801         {
3802             fprintf(fplog," %d",ma->ncg[i]);
3803         }
3804         fprintf(fplog,"\n");
3805     }
3806 }
3807
3808 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3809                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3810                                 rvec pos[])
3811 {
3812     gmx_domdec_master_t *ma=NULL;
3813     ivec npulse;
3814     int  i,cg_gl;
3815     int  *ibuf,buf2[2] = { 0, 0 };
3816
3817     if (DDMASTER(dd))
3818     {
3819         ma = dd->ma;
3820
3821         if (dd->bScrewPBC)
3822         {
3823             check_screw_box(box);
3824         }
3825
3826         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3827
3828         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3829         for(i=0; i<dd->nnodes; i++)
3830         {
3831             ma->ibuf[2*i]   = ma->ncg[i];
3832             ma->ibuf[2*i+1] = ma->nat[i];
3833         }
3834         ibuf = ma->ibuf;
3835     }
3836     else
3837     {
3838         ibuf = NULL;
3839     }
3840     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3841
3842     dd->ncg_home = buf2[0];
3843     dd->nat_home = buf2[1];
3844     dd->ncg_tot  = dd->ncg_home;
3845     dd->nat_tot  = dd->nat_home;
3846     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3847     {
3848         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3849         srenew(dd->index_gl,dd->cg_nalloc);
3850         srenew(dd->cgindex,dd->cg_nalloc+1);
3851     }
3852     if (DDMASTER(dd))
3853     {
3854         for(i=0; i<dd->nnodes; i++)
3855         {
3856             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3857             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3858         }
3859     }
3860
3861     dd_scatterv(dd,
3862                 DDMASTER(dd) ? ma->ibuf : NULL,
3863                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3864                 DDMASTER(dd) ? ma->cg : NULL,
3865                 dd->ncg_home*sizeof(int),dd->index_gl);
3866
3867     /* Determine the home charge group sizes */
3868     dd->cgindex[0] = 0;
3869     for(i=0; i<dd->ncg_home; i++)
3870     {
3871         cg_gl = dd->index_gl[i];
3872         dd->cgindex[i+1] =
3873             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3874     }
3875
3876     if (debug)
3877     {
3878         fprintf(debug,"Home charge groups:\n");
3879         for(i=0; i<dd->ncg_home; i++)
3880         {
3881             fprintf(debug," %d",dd->index_gl[i]);
3882             if (i % 10 == 9)
3883                 fprintf(debug,"\n");
3884         }
3885         fprintf(debug,"\n");
3886     }
3887 }
3888
3889 static int compact_and_copy_vec_at(int ncg,int *move,
3890                                    int *cgindex,
3891                                    int nvec,int vec,
3892                                    rvec *src,gmx_domdec_comm_t *comm,
3893                                    gmx_bool bCompact)
3894 {
3895     int m,icg,i,i0,i1,nrcg;
3896     int home_pos;
3897     int pos_vec[DIM*2];
3898
3899     home_pos = 0;
3900
3901     for(m=0; m<DIM*2; m++)
3902     {
3903         pos_vec[m] = 0;
3904     }
3905
3906     i0 = 0;
3907     for(icg=0; icg<ncg; icg++)
3908     {
3909         i1 = cgindex[icg+1];
3910         m = move[icg];
3911         if (m == -1)
3912         {
3913             if (bCompact)
3914             {
3915                 /* Compact the home array in place */
3916                 for(i=i0; i<i1; i++)
3917                 {
3918                     copy_rvec(src[i],src[home_pos++]);
3919                 }
3920             }
3921         }
3922         else
3923         {
3924             /* Copy to the communication buffer */
3925             nrcg = i1 - i0;
3926             pos_vec[m] += 1 + vec*nrcg;
3927             for(i=i0; i<i1; i++)
3928             {
3929                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
3930             }
3931             pos_vec[m] += (nvec - vec - 1)*nrcg;
3932         }
3933         if (!bCompact)
3934         {
3935             home_pos += i1 - i0;
3936         }
3937         i0 = i1;
3938     }
3939
3940     return home_pos;
3941 }
3942
3943 static int compact_and_copy_vec_cg(int ncg,int *move,
3944                                    int *cgindex,
3945                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
3946                                    gmx_bool bCompact)
3947 {
3948     int m,icg,i0,i1,nrcg;
3949     int home_pos;
3950     int pos_vec[DIM*2];
3951
3952     home_pos = 0;
3953
3954     for(m=0; m<DIM*2; m++)
3955     {
3956         pos_vec[m] = 0;
3957     }
3958
3959     i0 = 0;
3960     for(icg=0; icg<ncg; icg++)
3961     {
3962         i1 = cgindex[icg+1];
3963         m = move[icg];
3964         if (m == -1)
3965         {
3966             if (bCompact)
3967             {
3968                 /* Compact the home array in place */
3969                 copy_rvec(src[icg],src[home_pos++]);
3970             }
3971         }
3972         else
3973         {
3974             nrcg = i1 - i0;
3975             /* Copy to the communication buffer */
3976             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
3977             pos_vec[m] += 1 + nrcg*nvec;
3978         }
3979         i0 = i1;
3980     }
3981     if (!bCompact)
3982     {
3983         home_pos = ncg;
3984     }
3985
3986     return home_pos;
3987 }
3988
3989 static int compact_ind(int ncg,int *move,
3990                        int *index_gl,int *cgindex,
3991                        int *gatindex,
3992                        gmx_ga2la_t ga2la,char *bLocalCG,
3993                        int *cginfo)
3994 {
3995     int cg,nat,a0,a1,a,a_gl;
3996     int home_pos;
3997
3998     home_pos = 0;
3999     nat = 0;
4000     for(cg=0; cg<ncg; cg++)
4001     {
4002         a0 = cgindex[cg];
4003         a1 = cgindex[cg+1];
4004         if (move[cg] == -1)
4005         {
4006             /* Compact the home arrays in place.
4007              * Anything that can be done here avoids access to global arrays.
4008              */
4009             cgindex[home_pos] = nat;
4010             for(a=a0; a<a1; a++)
4011             {
4012                 a_gl = gatindex[a];
4013                 gatindex[nat] = a_gl;
4014                 /* The cell number stays 0, so we don't need to set it */
4015                 ga2la_change_la(ga2la,a_gl,nat);
4016                 nat++;
4017             }
4018             index_gl[home_pos] = index_gl[cg];
4019             cginfo[home_pos]   = cginfo[cg];
4020             /* The charge group remains local, so bLocalCG does not change */
4021             home_pos++;
4022         }
4023         else
4024         {
4025             /* Clear the global indices */
4026             for(a=a0; a<a1; a++)
4027             {
4028                 ga2la_del(ga2la,gatindex[a]);
4029             }
4030             if (bLocalCG)
4031             {
4032                 bLocalCG[index_gl[cg]] = FALSE;
4033             }
4034         }
4035     }
4036     cgindex[home_pos] = nat;
4037
4038     return home_pos;
4039 }
4040
4041 static void clear_and_mark_ind(int ncg,int *move,
4042                                int *index_gl,int *cgindex,int *gatindex,
4043                                gmx_ga2la_t ga2la,char *bLocalCG,
4044                                int *cell_index)
4045 {
4046     int cg,a0,a1,a;
4047
4048     for(cg=0; cg<ncg; cg++)
4049     {
4050         if (move[cg] >= 0)
4051         {
4052             a0 = cgindex[cg];
4053             a1 = cgindex[cg+1];
4054             /* Clear the global indices */
4055             for(a=a0; a<a1; a++)
4056             {
4057                 ga2la_del(ga2la,gatindex[a]);
4058             }
4059             if (bLocalCG)
4060             {
4061                 bLocalCG[index_gl[cg]] = FALSE;
4062             }
4063             /* Signal that this cg has moved using the ns cell index.
4064              * Here we set it to -1.
4065              * fill_grid will change it from -1 to 4*grid->ncells.
4066              */
4067             cell_index[cg] = -1;
4068         }
4069     }
4070 }
4071
4072 static void print_cg_move(FILE *fplog,
4073                           gmx_domdec_t *dd,
4074                           gmx_large_int_t step,int cg,int dim,int dir,
4075                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4076                           rvec cm_old,rvec cm_new,real pos_d)
4077 {
4078     gmx_domdec_comm_t *comm;
4079     char buf[22];
4080
4081     comm = dd->comm;
4082
4083     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4084     if (bHaveLimitdAndCMOld)
4085     {
4086         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition (%f) in direction %c\n",
4087                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4088     }
4089     else
4090     {
4091         fprintf(fplog,"The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4092                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4093     }
4094     fprintf(fplog,"distance out of cell %f\n",
4095             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4096     if (bHaveLimitdAndCMOld)
4097     {
4098         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4099                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4100     }
4101     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4102             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4103     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4104             dim2char(dim),
4105             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4106     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4107             dim2char(dim),
4108             comm->cell_x0[dim],comm->cell_x1[dim]);
4109 }
4110
4111 static void cg_move_error(FILE *fplog,
4112                           gmx_domdec_t *dd,
4113                           gmx_large_int_t step,int cg,int dim,int dir,
4114                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4115                           rvec cm_old,rvec cm_new,real pos_d)
4116 {
4117     if (fplog)
4118     {
4119         print_cg_move(fplog, dd,step,cg,dim,dir,
4120                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4121     }
4122     print_cg_move(stderr,dd,step,cg,dim,dir,
4123                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4124     gmx_fatal(FARGS,
4125               "A charge group moved too far between two domain decomposition steps\n"
4126               "This usually means that your system is not well equilibrated");
4127 }
4128
4129 static void rotate_state_atom(t_state *state,int a)
4130 {
4131     int est;
4132
4133     for(est=0; est<estNR; est++)
4134     {
4135         if (EST_DISTR(est) && state->flags & (1<<est)) {
4136             switch (est) {
4137             case estX:
4138                 /* Rotate the complete state; for a rectangular box only */
4139                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4140                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4141                 break;
4142             case estV:
4143                 state->v[a][YY] = -state->v[a][YY];
4144                 state->v[a][ZZ] = -state->v[a][ZZ];
4145                 break;
4146             case estSDX:
4147                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4148                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4149                 break;
4150             case estCGP:
4151                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4152                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4153                 break;
4154             case estDISRE_INITF:
4155             case estDISRE_RM3TAV:
4156             case estORIRE_INITF:
4157             case estORIRE_DTAV:
4158                 /* These are distances, so not affected by rotation */
4159                 break;
4160             default:
4161                 gmx_incons("Unknown state entry encountered in rotate_state_atom");
4162             }
4163         }
4164     }
4165 }
4166
4167 static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4168                               gmx_domdec_t *dd,ivec tric_dir,
4169                               t_state *state,rvec **f,
4170                               t_forcerec *fr,t_mdatoms *md,
4171                               gmx_bool bCompact,
4172                               t_nrnb *nrnb)
4173 {
4174     int  *move;
4175     int  npbcdim;
4176     int  ncg[DIM*2],nat[DIM*2];
4177     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4178     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4179     int  sbuf[2],rbuf[2];
4180     int  home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
4181     int  flag;
4182     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4183     gmx_bool bScrew;
4184     ivec dev;
4185     real inv_ncg,pos_d;
4186     matrix tcm;
4187     rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4188     atom_id *cgindex;
4189     cginfo_mb_t *cginfo_mb;
4190     gmx_domdec_comm_t *comm;
4191
4192     if (dd->bScrewPBC)
4193     {
4194         check_screw_box(state->box);
4195     }
4196
4197     comm  = dd->comm;
4198     cg_cm = fr->cg_cm;
4199
4200     for(i=0; i<estNR; i++)
4201     {
4202         if (EST_DISTR(i))
4203         {
4204             switch (i)
4205             {
4206             case estX:   /* Always present */            break;
4207             case estV:   bV   = (state->flags & (1<<i)); break;
4208             case estSDX: bSDX = (state->flags & (1<<i)); break;
4209             case estCGP: bCGP = (state->flags & (1<<i)); break;
4210             case estLD_RNG:
4211             case estLD_RNGI:
4212             case estDISRE_INITF:
4213             case estDISRE_RM3TAV:
4214             case estORIRE_INITF:
4215             case estORIRE_DTAV:
4216                 /* No processing required */
4217                 break;
4218             default:
4219             gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4220             }
4221         }
4222     }
4223
4224     if (dd->ncg_tot > comm->nalloc_int)
4225     {
4226         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4227         srenew(comm->buf_int,comm->nalloc_int);
4228     }
4229     move = comm->buf_int;
4230
4231     /* Clear the count */
4232     for(c=0; c<dd->ndim*2; c++)
4233     {
4234         ncg[c] = 0;
4235         nat[c] = 0;
4236     }
4237
4238     npbcdim = dd->npbcdim;
4239
4240     for(d=0; (d<DIM); d++)
4241     {
4242         limitd[d] = dd->comm->cellsize_min[d];
4243         if (d >= npbcdim && dd->ci[d] == 0)
4244         {
4245             cell_x0[d] = -GMX_FLOAT_MAX;
4246         }
4247         else
4248         {
4249             cell_x0[d] = comm->cell_x0[d];
4250         }
4251         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4252         {
4253             cell_x1[d] = GMX_FLOAT_MAX;
4254         }
4255         else
4256         {
4257             cell_x1[d] = comm->cell_x1[d];
4258         }
4259         if (d < npbcdim)
4260         {
4261             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4262             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4263         }
4264         else
4265         {
4266             /* We check after communication if a charge group moved
4267              * more than one cell. Set the pre-comm check limit to float_max.
4268              */
4269             limit0[d] = -GMX_FLOAT_MAX;
4270             limit1[d] =  GMX_FLOAT_MAX;
4271         }
4272     }
4273
4274     make_tric_corr_matrix(npbcdim,state->box,tcm);
4275
4276     cgindex = dd->cgindex;
4277
4278     /* Compute the center of geometry for all home charge groups
4279      * and put them in the box and determine where they should go.
4280      */
4281     for(cg=0; cg<dd->ncg_home; cg++)
4282     {
4283         k0   = cgindex[cg];
4284         k1   = cgindex[cg+1];
4285         nrcg = k1 - k0;
4286         if (nrcg == 1)
4287         {
4288             copy_rvec(state->x[k0],cm_new);
4289         }
4290         else
4291         {
4292             inv_ncg = 1.0/nrcg;
4293
4294             clear_rvec(cm_new);
4295             for(k=k0; (k<k1); k++)
4296             {
4297                 rvec_inc(cm_new,state->x[k]);
4298             }
4299             for(d=0; (d<DIM); d++)
4300             {
4301                 cm_new[d] = inv_ncg*cm_new[d];
4302             }
4303         }
4304
4305         clear_ivec(dev);
4306         /* Do pbc and check DD cell boundary crossings */
4307         for(d=DIM-1; d>=0; d--)
4308         {
4309             if (dd->nc[d] > 1)
4310             {
4311                 bScrew = (dd->bScrewPBC && d == XX);
4312                 /* Determine the location of this cg in lattice coordinates */
4313                 pos_d = cm_new[d];
4314                 if (tric_dir[d])
4315                 {
4316                     for(d2=d+1; d2<DIM; d2++)
4317                     {
4318                         pos_d += cm_new[d2]*tcm[d2][d];
4319                     }
4320                 }
4321                 /* Put the charge group in the triclinic unit-cell */
4322                 if (pos_d >= cell_x1[d])
4323                 {
4324                     if (pos_d >= limit1[d])
4325                     {
4326                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4327                                       cg_cm[cg],cm_new,pos_d);
4328                     }
4329                     dev[d] = 1;
4330                     if (dd->ci[d] == dd->nc[d] - 1)
4331                     {
4332                         rvec_dec(cm_new,state->box[d]);
4333                         if (bScrew)
4334                         {
4335                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4336                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4337                         }
4338                         for(k=k0; (k<k1); k++)
4339                         {
4340                             rvec_dec(state->x[k],state->box[d]);
4341                             if (bScrew)
4342                             {
4343                                 rotate_state_atom(state,k);
4344                             }
4345                         }
4346                     }
4347                 }
4348                 else if (pos_d < cell_x0[d])
4349                 {
4350                     if (pos_d < limit0[d])
4351                     {
4352                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4353                                       cg_cm[cg],cm_new,pos_d);
4354                     }
4355                     dev[d] = -1;
4356                     if (dd->ci[d] == 0)
4357                     {
4358                         rvec_inc(cm_new,state->box[d]);
4359                         if (bScrew)
4360                         {
4361                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4362                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4363                         }
4364                         for(k=k0; (k<k1); k++)
4365                         {
4366                             rvec_inc(state->x[k],state->box[d]);
4367                             if (bScrew)
4368                             {
4369                                 rotate_state_atom(state,k);
4370                             }
4371                         }
4372                     }
4373                 }
4374             }
4375             else if (d < npbcdim)
4376             {
4377                 /* Put the charge group in the rectangular unit-cell */
4378                 while (cm_new[d] >= state->box[d][d])
4379                 {
4380                     rvec_dec(cm_new,state->box[d]);
4381                     for(k=k0; (k<k1); k++)
4382                     {
4383                         rvec_dec(state->x[k],state->box[d]);
4384                     }
4385                 }
4386                 while (cm_new[d] < 0)
4387                 {
4388                     rvec_inc(cm_new,state->box[d]);
4389                     for(k=k0; (k<k1); k++)
4390                     {
4391                         rvec_inc(state->x[k],state->box[d]);
4392                     }
4393                 }
4394             }
4395         }
4396
4397         copy_rvec(cm_new,cg_cm[cg]);
4398
4399         /* Determine where this cg should go */
4400         flag = 0;
4401         mc = -1;
4402         for(d=0; d<dd->ndim; d++)
4403         {
4404             dim = dd->dim[d];
4405             if (dev[dim] == 1)
4406             {
4407                 flag |= DD_FLAG_FW(d);
4408                 if (mc == -1)
4409                 {
4410                     mc = d*2;
4411                 }
4412             }
4413             else if (dev[dim] == -1)
4414             {
4415                 flag |= DD_FLAG_BW(d);
4416                 if (mc == -1) {
4417                     if (dd->nc[dim] > 2)
4418                     {
4419                         mc = d*2 + 1;
4420                     }
4421                     else
4422                     {
4423                         mc = d*2;
4424                     }
4425                 }
4426             }
4427         }
4428         move[cg] = mc;
4429         if (mc >= 0)
4430         {
4431             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4432             {
4433                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4434                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4435             }
4436             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4437             /* We store the cg size in the lower 16 bits
4438              * and the place where the charge group should go
4439              * in the next 6 bits. This saves some communication volume.
4440              */
4441             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4442             ncg[mc] += 1;
4443             nat[mc] += nrcg;
4444         }
4445     }
4446
4447     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4448     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4449
4450     nvec = 1;
4451     if (bV)
4452     {
4453         nvec++;
4454     }
4455     if (bSDX)
4456     {
4457         nvec++;
4458     }
4459     if (bCGP)
4460     {
4461         nvec++;
4462     }
4463
4464     /* Make sure the communication buffers are large enough */
4465     for(mc=0; mc<dd->ndim*2; mc++)
4466     {
4467         nvr = ncg[mc] + nat[mc]*nvec;
4468         if (nvr > comm->cgcm_state_nalloc[mc])
4469         {
4470             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4471             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4472         }
4473     }
4474
4475     /* Recalculating cg_cm might be cheaper than communicating,
4476      * but that could give rise to rounding issues.
4477      */
4478     home_pos_cg =
4479         compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4480                                 nvec,cg_cm,comm,bCompact);
4481
4482     vec = 0;
4483     home_pos_at =
4484         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4485                                 nvec,vec++,state->x,comm,bCompact);
4486     if (bV)
4487     {
4488         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4489                                 nvec,vec++,state->v,comm,bCompact);
4490     }
4491     if (bSDX)
4492     {
4493         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4494                                 nvec,vec++,state->sd_X,comm,bCompact);
4495     }
4496     if (bCGP)
4497     {
4498         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4499                                 nvec,vec++,state->cg_p,comm,bCompact);
4500     }
4501
4502     if (bCompact)
4503     {
4504         compact_ind(dd->ncg_home,move,
4505                     dd->index_gl,dd->cgindex,dd->gatindex,
4506                     dd->ga2la,comm->bLocalCG,
4507                     fr->cginfo);
4508     }
4509     else
4510     {
4511         clear_and_mark_ind(dd->ncg_home,move,
4512                            dd->index_gl,dd->cgindex,dd->gatindex,
4513                            dd->ga2la,comm->bLocalCG,
4514                            fr->ns.grid->cell_index);
4515     }
4516
4517     cginfo_mb = fr->cginfo_mb;
4518
4519     ncg_stay_home = home_pos_cg;
4520     for(d=0; d<dd->ndim; d++)
4521     {
4522         dim = dd->dim[d];
4523         ncg_recv = 0;
4524         nat_recv = 0;
4525         nvr      = 0;
4526         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4527         {
4528             cdd = d*2 + dir;
4529             /* Communicate the cg and atom counts */
4530             sbuf[0] = ncg[cdd];
4531             sbuf[1] = nat[cdd];
4532             if (debug)
4533             {
4534                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4535                         d,dir,sbuf[0],sbuf[1]);
4536             }
4537             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4538
4539             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4540             {
4541                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4542                 srenew(comm->buf_int,comm->nalloc_int);
4543             }
4544
4545             /* Communicate the charge group indices, sizes and flags */
4546             dd_sendrecv_int(dd, d, dir,
4547                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4548                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4549
4550             nvs = ncg[cdd] + nat[cdd]*nvec;
4551             i   = rbuf[0]  + rbuf[1] *nvec;
4552             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4553
4554             /* Communicate cgcm and state */
4555             dd_sendrecv_rvec(dd, d, dir,
4556                              comm->cgcm_state[cdd], nvs,
4557                              comm->vbuf.v+nvr, i);
4558             ncg_recv += rbuf[0];
4559             nat_recv += rbuf[1];
4560             nvr      += i;
4561         }
4562
4563         /* Process the received charge groups */
4564         buf_pos = 0;
4565         for(cg=0; cg<ncg_recv; cg++)
4566         {
4567             flag = comm->buf_int[cg*DD_CGIBS+1];
4568
4569             if (dim >= npbcdim && dd->nc[dim] > 2)
4570             {
4571                 /* No pbc in this dim and more than one domain boundary.
4572                  * We to a separate check if a charge did not move too far.
4573                  */
4574                 if (((flag & DD_FLAG_FW(d)) &&
4575                      comm->vbuf.v[buf_pos][d] > cell_x1[dim]) ||
4576                     ((flag & DD_FLAG_BW(d)) &&
4577                      comm->vbuf.v[buf_pos][d] < cell_x0[dim]))
4578                 {
4579                     cg_move_error(fplog,dd,step,cg,d,
4580                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4581                                    FALSE,0,
4582                                    comm->vbuf.v[buf_pos],
4583                                    comm->vbuf.v[buf_pos],
4584                                    comm->vbuf.v[buf_pos][d]);
4585                 }
4586             }
4587
4588             mc = -1;
4589             if (d < dd->ndim-1)
4590             {
4591                 /* Check which direction this cg should go */
4592                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4593                 {
4594                     if (dd->bGridJump)
4595                     {
4596                         /* The cell boundaries for dimension d2 are not equal
4597                          * for each cell row of the lower dimension(s),
4598                          * therefore we might need to redetermine where
4599                          * this cg should go.
4600                          */
4601                         dim2 = dd->dim[d2];
4602                         /* If this cg crosses the box boundary in dimension d2
4603                          * we can use the communicated flag, so we do not
4604                          * have to worry about pbc.
4605                          */
4606                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4607                                (flag & DD_FLAG_FW(d2))) ||
4608                               (dd->ci[dim2] == 0 &&
4609                                (flag & DD_FLAG_BW(d2)))))
4610                         {
4611                             /* Clear the two flags for this dimension */
4612                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4613                             /* Determine the location of this cg
4614                              * in lattice coordinates
4615                              */
4616                             pos_d = comm->vbuf.v[buf_pos][dim2];
4617                             if (tric_dir[dim2])
4618                             {
4619                                 for(d3=dim2+1; d3<DIM; d3++)
4620                                 {
4621                                     pos_d +=
4622                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4623                                 }
4624                             }
4625                             /* Check of we are not at the box edge.
4626                              * pbc is only handled in the first step above,
4627                              * but this check could move over pbc while
4628                              * the first step did not due to different rounding.
4629                              */
4630                             if (pos_d >= cell_x1[dim2] &&
4631                                 dd->ci[dim2] != dd->nc[dim2]-1)
4632                             {
4633                                 flag |= DD_FLAG_FW(d2);
4634                             }
4635                             else if (pos_d < cell_x0[dim2] &&
4636                                      dd->ci[dim2] != 0)
4637                             {
4638                                 flag |= DD_FLAG_BW(d2);
4639                             }
4640                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4641                         }
4642                     }
4643                     /* Set to which neighboring cell this cg should go */
4644                     if (flag & DD_FLAG_FW(d2))
4645                     {
4646                         mc = d2*2;
4647                     }
4648                     else if (flag & DD_FLAG_BW(d2))
4649                     {
4650                         if (dd->nc[dd->dim[d2]] > 2)
4651                         {
4652                             mc = d2*2+1;
4653                         }
4654                         else
4655                         {
4656                             mc = d2*2;
4657                         }
4658                     }
4659                 }
4660             }
4661
4662             nrcg = flag & DD_FLAG_NRCG;
4663             if (mc == -1)
4664             {
4665                 if (home_pos_cg+1 > dd->cg_nalloc)
4666                 {
4667                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4668                     srenew(dd->index_gl,dd->cg_nalloc);
4669                     srenew(dd->cgindex,dd->cg_nalloc+1);
4670                 }
4671                 /* Set the global charge group index and size */
4672                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4673                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4674                 /* Copy the state from the buffer */
4675                 if (home_pos_cg >= fr->cg_nalloc)
4676                 {
4677                     dd_realloc_fr_cg(fr,home_pos_cg+1);
4678                     cg_cm = fr->cg_cm;
4679                 }
4680                 copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
4681                 /* Set the cginfo */
4682                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4683                                                    dd->index_gl[home_pos_cg]);
4684                 if (comm->bLocalCG)
4685                 {
4686                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4687                 }
4688
4689                 if (home_pos_at+nrcg > state->nalloc)
4690                 {
4691                     dd_realloc_state(state,f,home_pos_at+nrcg);
4692                 }
4693                 for(i=0; i<nrcg; i++)
4694                 {
4695                     copy_rvec(comm->vbuf.v[buf_pos++],
4696                               state->x[home_pos_at+i]);
4697                 }
4698                 if (bV)
4699                 {
4700                     for(i=0; i<nrcg; i++)
4701                     {
4702                         copy_rvec(comm->vbuf.v[buf_pos++],
4703                                   state->v[home_pos_at+i]);
4704                     }
4705                 }
4706                 if (bSDX)
4707                 {
4708                     for(i=0; i<nrcg; i++)
4709                     {
4710                         copy_rvec(comm->vbuf.v[buf_pos++],
4711                                   state->sd_X[home_pos_at+i]);
4712                     }
4713                 }
4714                 if (bCGP)
4715                 {
4716                     for(i=0; i<nrcg; i++)
4717                     {
4718                         copy_rvec(comm->vbuf.v[buf_pos++],
4719                                   state->cg_p[home_pos_at+i]);
4720                     }
4721                 }
4722                 home_pos_cg += 1;
4723                 home_pos_at += nrcg;
4724             }
4725             else
4726             {
4727                 /* Reallocate the buffers if necessary  */
4728                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4729                 {
4730                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4731                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4732                 }
4733                 nvr = ncg[mc] + nat[mc]*nvec;
4734                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4735                 {
4736                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4737                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4738                 }
4739                 /* Copy from the receive to the send buffers */
4740                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4741                        comm->buf_int + cg*DD_CGIBS,
4742                        DD_CGIBS*sizeof(int));
4743                 memcpy(comm->cgcm_state[mc][nvr],
4744                        comm->vbuf.v[buf_pos],
4745                        (1+nrcg*nvec)*sizeof(rvec));
4746                 buf_pos += 1 + nrcg*nvec;
4747                 ncg[mc] += 1;
4748                 nat[mc] += nrcg;
4749             }
4750         }
4751     }
4752
4753     /* With sorting (!bCompact) the indices are now only partially up to date
4754      * and ncg_home and nat_home are not the real count, since there are
4755      * "holes" in the arrays for the charge groups that moved to neighbors.
4756      */
4757     dd->ncg_home = home_pos_cg;
4758     dd->nat_home = home_pos_at;
4759
4760     if (debug)
4761     {
4762         fprintf(debug,"Finished repartitioning\n");
4763     }
4764
4765     return ncg_stay_home;
4766 }
4767
4768 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4769 {
4770     dd->comm->cycl[ddCycl] += cycles;
4771     dd->comm->cycl_n[ddCycl]++;
4772     if (cycles > dd->comm->cycl_max[ddCycl])
4773     {
4774         dd->comm->cycl_max[ddCycl] = cycles;
4775     }
4776 }
4777
4778 static double force_flop_count(t_nrnb *nrnb)
4779 {
4780     int i;
4781     double sum;
4782     const char *name;
4783
4784     sum = 0;
4785     for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
4786     {
4787         /* To get closer to the real timings, we half the count
4788          * for the normal loops and again half it for water loops.
4789          */
4790         name = nrnb_str(i);
4791         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4792         {
4793             sum += nrnb->n[i]*0.25*cost_nrnb(i);
4794         }
4795         else
4796         {
4797             sum += nrnb->n[i]*0.50*cost_nrnb(i);
4798         }
4799     }
4800     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
4801     {
4802         name = nrnb_str(i);
4803         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4804         sum += nrnb->n[i]*cost_nrnb(i);
4805     }
4806     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
4807     {
4808         sum += nrnb->n[i]*cost_nrnb(i);
4809     }
4810
4811     return sum;
4812 }
4813
4814 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
4815 {
4816     if (dd->comm->eFlop)
4817     {
4818         dd->comm->flop -= force_flop_count(nrnb);
4819     }
4820 }
4821 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
4822 {
4823     if (dd->comm->eFlop)
4824     {
4825         dd->comm->flop += force_flop_count(nrnb);
4826         dd->comm->flop_n++;
4827     }
4828 }
4829
4830 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4831 {
4832     int i;
4833
4834     for(i=0; i<ddCyclNr; i++)
4835     {
4836         dd->comm->cycl[i] = 0;
4837         dd->comm->cycl_n[i] = 0;
4838         dd->comm->cycl_max[i] = 0;
4839     }
4840     dd->comm->flop = 0;
4841     dd->comm->flop_n = 0;
4842 }
4843
4844 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
4845 {
4846     gmx_domdec_comm_t *comm;
4847     gmx_domdec_load_t *load;
4848     gmx_domdec_root_t *root=NULL;
4849     int  d,dim,cid,i,pos;
4850     float cell_frac=0,sbuf[DD_NLOAD_MAX];
4851     gmx_bool bSepPME;
4852
4853     if (debug)
4854     {
4855         fprintf(debug,"get_load_distribution start\n");
4856     }
4857
4858     wallcycle_start(wcycle,ewcDDCOMMLOAD);
4859
4860     comm = dd->comm;
4861
4862     bSepPME = (dd->pme_nodeid >= 0);
4863
4864     for(d=dd->ndim-1; d>=0; d--)
4865     {
4866         dim = dd->dim[d];
4867         /* Check if we participate in the communication in this dimension */
4868         if (d == dd->ndim-1 ||
4869             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
4870         {
4871             load = &comm->load[d];
4872             if (dd->bGridJump)
4873             {
4874                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4875             }
4876             pos = 0;
4877             if (d == dd->ndim-1)
4878             {
4879                 sbuf[pos++] = dd_force_load(comm);
4880                 sbuf[pos++] = sbuf[0];
4881                 if (dd->bGridJump)
4882                 {
4883                     sbuf[pos++] = sbuf[0];
4884                     sbuf[pos++] = cell_frac;
4885                     if (d > 0)
4886                     {
4887                         sbuf[pos++] = comm->cell_f_max0[d];
4888                         sbuf[pos++] = comm->cell_f_min1[d];
4889                     }
4890                 }
4891                 if (bSepPME)
4892                 {
4893                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4894                     sbuf[pos++] = comm->cycl[ddCyclPME];
4895                 }
4896             }
4897             else
4898             {
4899                 sbuf[pos++] = comm->load[d+1].sum;
4900                 sbuf[pos++] = comm->load[d+1].max;
4901                 if (dd->bGridJump)
4902                 {
4903                     sbuf[pos++] = comm->load[d+1].sum_m;
4904                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4905                     sbuf[pos++] = comm->load[d+1].flags;
4906                     if (d > 0)
4907                     {
4908                         sbuf[pos++] = comm->cell_f_max0[d];
4909                         sbuf[pos++] = comm->cell_f_min1[d];
4910                     }
4911                 }
4912                 if (bSepPME)
4913                 {
4914                     sbuf[pos++] = comm->load[d+1].mdf;
4915                     sbuf[pos++] = comm->load[d+1].pme;
4916                 }
4917             }
4918             load->nload = pos;
4919             /* Communicate a row in DD direction d.
4920              * The communicators are setup such that the root always has rank 0.
4921              */
4922 #ifdef GMX_MPI
4923             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
4924                        load->load,load->nload*sizeof(float),MPI_BYTE,
4925                        0,comm->mpi_comm_load[d]);
4926 #endif
4927             if (dd->ci[dim] == dd->master_ci[dim])
4928             {
4929                 /* We are the root, process this row */
4930                 if (comm->bDynLoadBal)
4931                 {
4932                     root = comm->root[d];
4933                 }
4934                 load->sum = 0;
4935                 load->max = 0;
4936                 load->sum_m = 0;
4937                 load->cvol_min = 1;
4938                 load->flags = 0;
4939                 load->mdf = 0;
4940                 load->pme = 0;
4941                 pos = 0;
4942                 for(i=0; i<dd->nc[dim]; i++)
4943                 {
4944                     load->sum += load->load[pos++];
4945                     load->max = max(load->max,load->load[pos]);
4946                     pos++;
4947                     if (dd->bGridJump)
4948                     {
4949                         if (root->bLimited)
4950                         {
4951                             /* This direction could not be load balanced properly,
4952                              * therefore we need to use the maximum iso the average load.
4953                              */
4954                             load->sum_m = max(load->sum_m,load->load[pos]);
4955                         }
4956                         else
4957                         {
4958                             load->sum_m += load->load[pos];
4959                         }
4960                         pos++;
4961                         load->cvol_min = min(load->cvol_min,load->load[pos]);
4962                         pos++;
4963                         if (d < dd->ndim-1)
4964                         {
4965                             load->flags = (int)(load->load[pos++] + 0.5);
4966                         }
4967                         if (d > 0)
4968                         {
4969                             root->cell_f_max0[i] = load->load[pos++];
4970                             root->cell_f_min1[i] = load->load[pos++];
4971                         }
4972                     }
4973                     if (bSepPME)
4974                     {
4975                         load->mdf = max(load->mdf,load->load[pos]);
4976                         pos++;
4977                         load->pme = max(load->pme,load->load[pos]);
4978                         pos++;
4979                     }
4980                 }
4981                 if (comm->bDynLoadBal && root->bLimited)
4982                 {
4983                     load->sum_m *= dd->nc[dim];
4984                     load->flags |= (1<<d);
4985                 }
4986             }
4987         }
4988     }
4989
4990     if (DDMASTER(dd))
4991     {
4992         comm->nload      += dd_load_count(comm);
4993         comm->load_step  += comm->cycl[ddCyclStep];
4994         comm->load_sum   += comm->load[0].sum;
4995         comm->load_max   += comm->load[0].max;
4996         if (comm->bDynLoadBal)
4997         {
4998             for(d=0; d<dd->ndim; d++)
4999             {
5000                 if (comm->load[0].flags & (1<<d))
5001                 {
5002                     comm->load_lim[d]++;
5003                 }
5004             }
5005         }
5006         if (bSepPME)
5007         {
5008             comm->load_mdf += comm->load[0].mdf;
5009             comm->load_pme += comm->load[0].pme;
5010         }
5011     }
5012
5013     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5014
5015     if (debug)
5016     {
5017         fprintf(debug,"get_load_distribution finished\n");
5018     }
5019 }
5020
5021 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5022 {
5023     /* Return the relative performance loss on the total run time
5024      * due to the force calculation load imbalance.
5025      */
5026     if (dd->comm->nload > 0)
5027     {
5028         return
5029             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5030             (dd->comm->load_step*dd->nnodes);
5031     }
5032     else
5033     {
5034         return 0;
5035     }
5036 }
5037
5038 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5039 {
5040     char  buf[STRLEN];
5041     int   npp,npme,nnodes,d,limp;
5042     float imbal,pme_f_ratio,lossf,lossp=0;
5043     gmx_bool  bLim;
5044     gmx_domdec_comm_t *comm;
5045
5046     comm = dd->comm;
5047     if (DDMASTER(dd) && comm->nload > 0)
5048     {
5049         npp    = dd->nnodes;
5050         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5051         nnodes = npp + npme;
5052         imbal = comm->load_max*npp/comm->load_sum - 1;
5053         lossf = dd_force_imb_perf_loss(dd);
5054         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5055         fprintf(fplog,"%s",buf);
5056         fprintf(stderr,"\n");
5057         fprintf(stderr,"%s",buf);
5058         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5059         fprintf(fplog,"%s",buf);
5060         fprintf(stderr,"%s",buf);
5061         bLim = FALSE;
5062         if (comm->bDynLoadBal)
5063         {
5064             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5065             for(d=0; d<dd->ndim; d++)
5066             {
5067                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5068                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5069                 if (limp >= 50)
5070                 {
5071                     bLim = TRUE;
5072                 }
5073             }
5074             sprintf(buf+strlen(buf),"\n");
5075             fprintf(fplog,"%s",buf);
5076             fprintf(stderr,"%s",buf);
5077         }
5078         if (npme > 0)
5079         {
5080             pme_f_ratio = comm->load_pme/comm->load_mdf;
5081             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
5082             if (lossp <= 0)
5083             {
5084                 lossp *= (float)npme/(float)nnodes;
5085             }
5086             else
5087             {
5088                 lossp *= (float)npp/(float)nnodes;
5089             }
5090             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5091             fprintf(fplog,"%s",buf);
5092             fprintf(stderr,"%s",buf);
5093             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5094             fprintf(fplog,"%s",buf);
5095             fprintf(stderr,"%s",buf);
5096         }
5097         fprintf(fplog,"\n");
5098         fprintf(stderr,"\n");
5099
5100         if (lossf >= DD_PERF_LOSS)
5101         {
5102             sprintf(buf,
5103                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
5104                     "      in the domain decomposition.\n",lossf*100);
5105             if (!comm->bDynLoadBal)
5106             {
5107                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb.)\n");
5108             }
5109             else if (bLim)
5110             {
5111                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5112             }
5113             fprintf(fplog,"%s\n",buf);
5114             fprintf(stderr,"%s\n",buf);
5115         }
5116         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5117         {
5118             sprintf(buf,
5119                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5120                     "      had %s work to do than the PP nodes.\n"
5121                     "      You might want to %s the number of PME nodes\n"
5122                     "      or %s the cut-off and the grid spacing.\n",
5123                     fabs(lossp*100),
5124                     (lossp < 0) ? "less"     : "more",
5125                     (lossp < 0) ? "decrease" : "increase",
5126                     (lossp < 0) ? "decrease" : "increase");
5127             fprintf(fplog,"%s\n",buf);
5128             fprintf(stderr,"%s\n",buf);
5129         }
5130     }
5131 }
5132
5133 static float dd_vol_min(gmx_domdec_t *dd)
5134 {
5135     return dd->comm->load[0].cvol_min*dd->nnodes;
5136 }
5137
5138 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5139 {
5140     return dd->comm->load[0].flags;
5141 }
5142
5143 static float dd_f_imbal(gmx_domdec_t *dd)
5144 {
5145     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5146 }
5147
5148 static float dd_pme_f_ratio(gmx_domdec_t *dd)
5149 {
5150     return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5151 }
5152
5153 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5154 {
5155     int flags,d;
5156     char buf[22];
5157
5158     flags = dd_load_flags(dd);
5159     if (flags)
5160     {
5161         fprintf(fplog,
5162                 "DD  load balancing is limited by minimum cell size in dimension");
5163         for(d=0; d<dd->ndim; d++)
5164         {
5165             if (flags & (1<<d))
5166             {
5167                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5168             }
5169         }
5170         fprintf(fplog,"\n");
5171     }
5172     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5173     if (dd->comm->bDynLoadBal)
5174     {
5175         fprintf(fplog,"  vol min/aver %5.3f%c",
5176                 dd_vol_min(dd),flags ? '!' : ' ');
5177     }
5178     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5179     if (dd->comm->cycl_n[ddCyclPME])
5180     {
5181         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5182     }
5183     fprintf(fplog,"\n\n");
5184 }
5185
5186 static void dd_print_load_verbose(gmx_domdec_t *dd)
5187 {
5188     if (dd->comm->bDynLoadBal)
5189     {
5190         fprintf(stderr,"vol %4.2f%c ",
5191                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5192     }
5193     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5194     if (dd->comm->cycl_n[ddCyclPME])
5195     {
5196         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5197     }
5198 }
5199
5200 #ifdef GMX_MPI
5201 static void make_load_communicator(gmx_domdec_t *dd,MPI_Group g_all,
5202                                    int dim_ind,ivec loc)
5203 {
5204     MPI_Group g_row;
5205     MPI_Comm  c_row;
5206     int  dim,i,*rank;
5207     ivec loc_c;
5208     gmx_domdec_root_t *root;
5209
5210     dim = dd->dim[dim_ind];
5211     copy_ivec(loc,loc_c);
5212     snew(rank,dd->nc[dim]);
5213     for(i=0; i<dd->nc[dim]; i++)
5214     {
5215         loc_c[dim] = i;
5216         rank[i] = dd_index(dd->nc,loc_c);
5217     }
5218     /* Here we create a new group, that does not necessarily
5219      * include our process. But MPI_Comm_create needs to be
5220      * called by all the processes in the original communicator.
5221      * Calling MPI_Group_free afterwards gives errors, so I assume
5222      * also the group is needed by all processes. (B. Hess)
5223      */
5224     MPI_Group_incl(g_all,dd->nc[dim],rank,&g_row);
5225     MPI_Comm_create(dd->mpi_comm_all,g_row,&c_row);
5226     if (c_row != MPI_COMM_NULL)
5227     {
5228         /* This process is part of the group */
5229         dd->comm->mpi_comm_load[dim_ind] = c_row;
5230         if (dd->comm->eDLB != edlbNO)
5231         {
5232             if (dd->ci[dim] == dd->master_ci[dim])
5233             {
5234                 /* This is the root process of this row */
5235                 snew(dd->comm->root[dim_ind],1);
5236                 root = dd->comm->root[dim_ind];
5237                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5238                 snew(root->old_cell_f,dd->nc[dim]+1);
5239                 snew(root->bCellMin,dd->nc[dim]);
5240                 if (dim_ind > 0)
5241                 {
5242                     snew(root->cell_f_max0,dd->nc[dim]);
5243                     snew(root->cell_f_min1,dd->nc[dim]);
5244                     snew(root->bound_min,dd->nc[dim]);
5245                     snew(root->bound_max,dd->nc[dim]);
5246                 }
5247                 snew(root->buf_ncd,dd->nc[dim]);
5248             }
5249             else
5250             {
5251                 /* This is not a root process, we only need to receive cell_f */
5252                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5253             }
5254         }
5255         if (dd->ci[dim] == dd->master_ci[dim])
5256         {
5257             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5258         }
5259     }
5260     sfree(rank);
5261 }
5262 #endif
5263
5264 static void make_load_communicators(gmx_domdec_t *dd)
5265 {
5266 #ifdef GMX_MPI
5267   MPI_Group g_all;
5268   int  dim0,dim1,i,j;
5269   ivec loc;
5270
5271   if (debug)
5272     fprintf(debug,"Making load communicators\n");
5273
5274   MPI_Comm_group(dd->mpi_comm_all,&g_all);
5275
5276   snew(dd->comm->load,dd->ndim);
5277   snew(dd->comm->mpi_comm_load,dd->ndim);
5278
5279   clear_ivec(loc);
5280   make_load_communicator(dd,g_all,0,loc);
5281   if (dd->ndim > 1) {
5282     dim0 = dd->dim[0];
5283     for(i=0; i<dd->nc[dim0]; i++) {
5284       loc[dim0] = i;
5285       make_load_communicator(dd,g_all,1,loc);
5286     }
5287   }
5288   if (dd->ndim > 2) {
5289     dim0 = dd->dim[0];
5290     for(i=0; i<dd->nc[dim0]; i++) {
5291       loc[dim0] = i;
5292       dim1 = dd->dim[1];
5293       for(j=0; j<dd->nc[dim1]; j++) {
5294           loc[dim1] = j;
5295           make_load_communicator(dd,g_all,2,loc);
5296       }
5297     }
5298   }
5299
5300   MPI_Group_free(&g_all);
5301
5302   if (debug)
5303     fprintf(debug,"Finished making load communicators\n");
5304 #endif
5305 }
5306
5307 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5308 {
5309     gmx_bool bZYX;
5310     int  d,dim,i,j,m;
5311     ivec tmp,s;
5312     int  nzone,nzonep;
5313     ivec dd_zp[DD_MAXIZONE];
5314     gmx_domdec_zones_t *zones;
5315     gmx_domdec_ns_ranges_t *izone;
5316
5317     for(d=0; d<dd->ndim; d++)
5318     {
5319         dim = dd->dim[d];
5320         copy_ivec(dd->ci,tmp);
5321         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5322         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5323         copy_ivec(dd->ci,tmp);
5324         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5325         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5326         if (debug)
5327         {
5328             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5329                     dd->rank,dim,
5330                     dd->neighbor[d][0],
5331                     dd->neighbor[d][1]);
5332         }
5333     }
5334
5335     if (DDMASTER(dd))
5336     {
5337         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5338             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5339     }
5340     if (fplog)
5341     {
5342         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5343                 dd->ndim,
5344                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5345                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5346     }
5347     switch (dd->ndim)
5348     {
5349     case 3:
5350         nzone  = dd_z3n;
5351         nzonep = dd_zp3n;
5352         for(i=0; i<nzonep; i++)
5353         {
5354             copy_ivec(dd_zp3[i],dd_zp[i]);
5355         }
5356         break;
5357     case 2:
5358         nzone  = dd_z2n;
5359         nzonep = dd_zp2n;
5360         for(i=0; i<nzonep; i++)
5361         {
5362             copy_ivec(dd_zp2[i],dd_zp[i]);
5363         }
5364         break;
5365     case 1:
5366         nzone  = dd_z1n;
5367         nzonep = dd_zp1n;
5368         for(i=0; i<nzonep; i++)
5369         {
5370             copy_ivec(dd_zp1[i],dd_zp[i]);
5371         }
5372         break;
5373     default:
5374         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5375         nzone = 0;
5376         nzonep = 0;
5377     }
5378
5379     zones = &dd->comm->zones;
5380
5381     for(i=0; i<nzone; i++)
5382     {
5383         m = 0;
5384         clear_ivec(zones->shift[i]);
5385         for(d=0; d<dd->ndim; d++)
5386         {
5387             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5388         }
5389     }
5390
5391     zones->n = nzone;
5392     for(i=0; i<nzone; i++)
5393     {
5394         for(d=0; d<DIM; d++)
5395         {
5396             s[d] = dd->ci[d] - zones->shift[i][d];
5397             if (s[d] < 0)
5398             {
5399                 s[d] += dd->nc[d];
5400             }
5401             else if (s[d] >= dd->nc[d])
5402             {
5403                 s[d] -= dd->nc[d];
5404             }
5405         }
5406     }
5407     zones->nizone = nzonep;
5408     for(i=0; i<zones->nizone; i++)
5409     {
5410         if (dd_zp[i][0] != i)
5411         {
5412             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5413         }
5414         izone = &zones->izone[i];
5415         izone->j0 = dd_zp[i][1];
5416         izone->j1 = dd_zp[i][2];
5417         for(dim=0; dim<DIM; dim++)
5418         {
5419             if (dd->nc[dim] == 1)
5420             {
5421                 /* All shifts should be allowed */
5422                 izone->shift0[dim] = -1;
5423                 izone->shift1[dim] = 1;
5424             }
5425             else
5426             {
5427                 /*
5428                   izone->shift0[d] = 0;
5429                   izone->shift1[d] = 0;
5430                   for(j=izone->j0; j<izone->j1; j++) {
5431                   if (dd->shift[j][d] > dd->shift[i][d])
5432                   izone->shift0[d] = -1;
5433                   if (dd->shift[j][d] < dd->shift[i][d])
5434                   izone->shift1[d] = 1;
5435                   }
5436                 */
5437
5438                 int shift_diff;
5439
5440                 /* Assume the shift are not more than 1 cell */
5441                 izone->shift0[dim] = 1;
5442                 izone->shift1[dim] = -1;
5443                 for(j=izone->j0; j<izone->j1; j++)
5444                 {
5445                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5446                     if (shift_diff < izone->shift0[dim])
5447                     {
5448                         izone->shift0[dim] = shift_diff;
5449                     }
5450                     if (shift_diff > izone->shift1[dim])
5451                     {
5452                         izone->shift1[dim] = shift_diff;
5453                     }
5454                 }
5455             }
5456         }
5457     }
5458
5459     if (dd->comm->eDLB != edlbNO)
5460     {
5461         snew(dd->comm->root,dd->ndim);
5462     }
5463
5464     if (dd->comm->bRecordLoad)
5465     {
5466         make_load_communicators(dd);
5467     }
5468 }
5469
5470 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5471 {
5472     gmx_domdec_t *dd;
5473     gmx_domdec_comm_t *comm;
5474     int  i,rank,*buf;
5475     ivec periods;
5476 #ifdef GMX_MPI
5477     MPI_Comm comm_cart;
5478 #endif
5479
5480     dd = cr->dd;
5481     comm = dd->comm;
5482
5483 #ifdef GMX_MPI
5484     if (comm->bCartesianPP)
5485     {
5486         /* Set up cartesian communication for the particle-particle part */
5487         if (fplog)
5488         {
5489             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5490                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5491         }
5492
5493         for(i=0; i<DIM; i++)
5494         {
5495             periods[i] = TRUE;
5496         }
5497         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5498                         &comm_cart);
5499         /* We overwrite the old communicator with the new cartesian one */
5500         cr->mpi_comm_mygroup = comm_cart;
5501     }
5502
5503     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5504     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5505
5506     if (comm->bCartesianPP_PME)
5507     {
5508         /* Since we want to use the original cartesian setup for sim,
5509          * and not the one after split, we need to make an index.
5510          */
5511         snew(comm->ddindex2ddnodeid,dd->nnodes);
5512         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5513         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5514         /* Get the rank of the DD master,
5515          * above we made sure that the master node is a PP node.
5516          */
5517         if (MASTER(cr))
5518         {
5519             rank = dd->rank;
5520         }
5521         else
5522         {
5523             rank = 0;
5524         }
5525         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5526     }
5527     else if (comm->bCartesianPP)
5528     {
5529         if (cr->npmenodes == 0)
5530         {
5531             /* The PP communicator is also
5532              * the communicator for this simulation
5533              */
5534             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5535         }
5536         cr->nodeid = dd->rank;
5537
5538         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5539
5540         /* We need to make an index to go from the coordinates
5541          * to the nodeid of this simulation.
5542          */
5543         snew(comm->ddindex2simnodeid,dd->nnodes);
5544         snew(buf,dd->nnodes);
5545         if (cr->duty & DUTY_PP)
5546         {
5547             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5548         }
5549         /* Communicate the ddindex to simulation nodeid index */
5550         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5551                       cr->mpi_comm_mysim);
5552         sfree(buf);
5553
5554         /* Determine the master coordinates and rank.
5555          * The DD master should be the same node as the master of this sim.
5556          */
5557         for(i=0; i<dd->nnodes; i++)
5558         {
5559             if (comm->ddindex2simnodeid[i] == 0)
5560             {
5561                 ddindex2xyz(dd->nc,i,dd->master_ci);
5562                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5563             }
5564         }
5565         if (debug)
5566         {
5567             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5568         }
5569     }
5570     else
5571     {
5572         /* No Cartesian communicators */
5573         /* We use the rank in dd->comm->all as DD index */
5574         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5575         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5576         dd->masterrank = 0;
5577         clear_ivec(dd->master_ci);
5578     }
5579 #endif
5580
5581     if (fplog)
5582     {
5583         fprintf(fplog,
5584                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5585                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5586     }
5587     if (debug)
5588     {
5589         fprintf(debug,
5590                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5591                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5592     }
5593 }
5594
5595 static void receive_ddindex2simnodeid(t_commrec *cr)
5596 {
5597     gmx_domdec_t *dd;
5598
5599     gmx_domdec_comm_t *comm;
5600     int  *buf;
5601
5602     dd = cr->dd;
5603     comm = dd->comm;
5604
5605 #ifdef GMX_MPI
5606     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5607     {
5608         snew(comm->ddindex2simnodeid,dd->nnodes);
5609         snew(buf,dd->nnodes);
5610         if (cr->duty & DUTY_PP)
5611         {
5612             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5613         }
5614 #ifdef GMX_MPI
5615         /* Communicate the ddindex to simulation nodeid index */
5616         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5617                       cr->mpi_comm_mysim);
5618 #endif
5619         sfree(buf);
5620     }
5621 #endif
5622 }
5623
5624 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5625                                                      int ncg,int natoms)
5626 {
5627     gmx_domdec_master_t *ma;
5628     int i;
5629
5630     snew(ma,1);
5631
5632     snew(ma->ncg,dd->nnodes);
5633     snew(ma->index,dd->nnodes+1);
5634     snew(ma->cg,ncg);
5635     snew(ma->nat,dd->nnodes);
5636     snew(ma->ibuf,dd->nnodes*2);
5637     snew(ma->cell_x,DIM);
5638     for(i=0; i<DIM; i++)
5639     {
5640         snew(ma->cell_x[i],dd->nc[i]+1);
5641     }
5642
5643     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5644     {
5645         ma->vbuf = NULL;
5646     }
5647     else
5648     {
5649         snew(ma->vbuf,natoms);
5650     }
5651
5652     return ma;
5653 }
5654
5655 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5656                                int reorder)
5657 {
5658     gmx_domdec_t *dd;
5659     gmx_domdec_comm_t *comm;
5660     int  i,rank;
5661     gmx_bool bDiv[DIM];
5662     ivec periods;
5663 #ifdef GMX_MPI
5664     MPI_Comm comm_cart;
5665 #endif
5666
5667     dd = cr->dd;
5668     comm = dd->comm;
5669
5670     if (comm->bCartesianPP)
5671     {
5672         for(i=1; i<DIM; i++)
5673         {
5674             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5675         }
5676         if (bDiv[YY] || bDiv[ZZ])
5677         {
5678             comm->bCartesianPP_PME = TRUE;
5679             /* If we have 2D PME decomposition, which is always in x+y,
5680              * we stack the PME only nodes in z.
5681              * Otherwise we choose the direction that provides the thinnest slab
5682              * of PME only nodes as this will have the least effect
5683              * on the PP communication.
5684              * But for the PME communication the opposite might be better.
5685              */
5686             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5687                              !bDiv[YY] ||
5688                              dd->nc[YY] > dd->nc[ZZ]))
5689             {
5690                 comm->cartpmedim = ZZ;
5691             }
5692             else
5693             {
5694                 comm->cartpmedim = YY;
5695             }
5696             comm->ntot[comm->cartpmedim]
5697                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5698         }
5699         else if (fplog)
5700         {
5701             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5702             fprintf(fplog,
5703                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5704         }
5705     }
5706
5707 #ifdef GMX_MPI
5708     if (comm->bCartesianPP_PME)
5709     {
5710         if (fplog)
5711         {
5712             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5713         }
5714
5715         for(i=0; i<DIM; i++)
5716         {
5717             periods[i] = TRUE;
5718         }
5719         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5720                         &comm_cart);
5721
5722         MPI_Comm_rank(comm_cart,&rank);
5723         if (MASTERNODE(cr) && rank != 0)
5724         {
5725             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5726         }
5727
5728         /* With this assigment we loose the link to the original communicator
5729          * which will usually be MPI_COMM_WORLD, unless have multisim.
5730          */
5731         cr->mpi_comm_mysim = comm_cart;
5732         cr->sim_nodeid = rank;
5733
5734         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5735
5736         if (fplog)
5737         {
5738             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5739                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5740         }
5741
5742         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5743         {
5744             cr->duty = DUTY_PP;
5745         }
5746         if (cr->npmenodes == 0 ||
5747             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5748         {
5749             cr->duty = DUTY_PME;
5750         }
5751
5752         /* Split the sim communicator into PP and PME only nodes */
5753         MPI_Comm_split(cr->mpi_comm_mysim,
5754                        cr->duty,
5755                        dd_index(comm->ntot,dd->ci),
5756                        &cr->mpi_comm_mygroup);
5757     }
5758     else
5759     {
5760         switch (dd_node_order)
5761         {
5762         case ddnoPP_PME:
5763             if (fplog)
5764             {
5765                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5766             }
5767             break;
5768         case ddnoINTERLEAVE:
5769             /* Interleave the PP-only and PME-only nodes,
5770              * as on clusters with dual-core machines this will double
5771              * the communication bandwidth of the PME processes
5772              * and thus speed up the PP <-> PME and inter PME communication.
5773              */
5774             if (fplog)
5775             {
5776                 fprintf(fplog,"Interleaving PP and PME nodes\n");
5777             }
5778             comm->pmenodes = dd_pmenodes(cr);
5779             break;
5780         case ddnoCARTESIAN:
5781             break;
5782         default:
5783             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
5784         }
5785
5786         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
5787         {
5788             cr->duty = DUTY_PME;
5789         }
5790         else
5791         {
5792             cr->duty = DUTY_PP;
5793         }
5794
5795         /* Split the sim communicator into PP and PME only nodes */
5796         MPI_Comm_split(cr->mpi_comm_mysim,
5797                        cr->duty,
5798                        cr->nodeid,
5799                        &cr->mpi_comm_mygroup);
5800         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
5801     }
5802 #endif
5803
5804     if (fplog)
5805     {
5806         fprintf(fplog,"This is a %s only node\n\n",
5807                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5808     }
5809 }
5810
5811 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
5812 {
5813     gmx_domdec_t *dd;
5814     gmx_domdec_comm_t *comm;
5815     int CartReorder;
5816
5817     dd = cr->dd;
5818     comm = dd->comm;
5819
5820     copy_ivec(dd->nc,comm->ntot);
5821
5822     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
5823     comm->bCartesianPP_PME = FALSE;
5824
5825     /* Reorder the nodes by default. This might change the MPI ranks.
5826      * Real reordering is only supported on very few architectures,
5827      * Blue Gene is one of them.
5828      */
5829     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
5830
5831     if (cr->npmenodes > 0)
5832     {
5833         /* Split the communicator into a PP and PME part */
5834         split_communicator(fplog,cr,dd_node_order,CartReorder);
5835         if (comm->bCartesianPP_PME)
5836         {
5837             /* We (possibly) reordered the nodes in split_communicator,
5838              * so it is no longer required in make_pp_communicator.
5839              */
5840             CartReorder = FALSE;
5841         }
5842     }
5843     else
5844     {
5845         /* All nodes do PP and PME */
5846 #ifdef GMX_MPI
5847         /* We do not require separate communicators */
5848         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5849 #endif
5850     }
5851
5852     if (cr->duty & DUTY_PP)
5853     {
5854         /* Copy or make a new PP communicator */
5855         make_pp_communicator(fplog,cr,CartReorder);
5856     }
5857     else
5858     {
5859         receive_ddindex2simnodeid(cr);
5860     }
5861
5862     if (!(cr->duty & DUTY_PME))
5863     {
5864         /* Set up the commnuication to our PME node */
5865         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
5866         dd->pme_receive_vir_ener = receive_vir_ener(cr);
5867         if (debug)
5868         {
5869             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
5870                     dd->pme_nodeid,dd->pme_receive_vir_ener);
5871         }
5872     }
5873     else
5874     {
5875         dd->pme_nodeid = -1;
5876     }
5877
5878     if (DDMASTER(dd))
5879     {
5880         dd->ma = init_gmx_domdec_master_t(dd,
5881                                           comm->cgs_gl.nr,
5882                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
5883     }
5884 }
5885
5886 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
5887 {
5888     real *slb_frac,tot;
5889     int  i,n;
5890     double dbl;
5891
5892     slb_frac = NULL;
5893     if (nc > 1 && size_string != NULL)
5894     {
5895         if (fplog)
5896         {
5897             fprintf(fplog,"Using static load balancing for the %s direction\n",
5898                     dir);
5899         }
5900         snew(slb_frac,nc);
5901         tot = 0;
5902         for (i=0; i<nc; i++)
5903         {
5904             dbl = 0;
5905             sscanf(size_string,"%lf%n",&dbl,&n);
5906             if (dbl == 0)
5907             {
5908                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
5909             }
5910             slb_frac[i] = dbl;
5911             size_string += n;
5912             tot += slb_frac[i];
5913         }
5914         /* Normalize */
5915         if (fplog)
5916         {
5917             fprintf(fplog,"Relative cell sizes:");
5918         }
5919         for (i=0; i<nc; i++)
5920         {
5921             slb_frac[i] /= tot;
5922             if (fplog)
5923             {
5924                 fprintf(fplog," %5.3f",slb_frac[i]);
5925             }
5926         }
5927         if (fplog)
5928         {
5929             fprintf(fplog,"\n");
5930         }
5931     }
5932
5933     return slb_frac;
5934 }
5935
5936 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
5937 {
5938     int n,nmol,ftype;
5939     gmx_mtop_ilistloop_t iloop;
5940     t_ilist *il;
5941
5942     n = 0;
5943     iloop = gmx_mtop_ilistloop_init(mtop);
5944     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
5945     {
5946         for(ftype=0; ftype<F_NRE; ftype++)
5947         {
5948             if ((interaction_function[ftype].flags & IF_BOND) &&
5949                 NRAL(ftype) >  2)
5950             {
5951                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
5952             }
5953         }
5954   }
5955
5956   return n;
5957 }
5958
5959 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
5960 {
5961     char *val;
5962     int  nst;
5963
5964     nst = def;
5965     val = getenv(env_var);
5966     if (val)
5967     {
5968         if (sscanf(val,"%d",&nst) <= 0)
5969         {
5970             nst = 1;
5971         }
5972         if (fplog)
5973         {
5974             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
5975                     env_var,val,nst);
5976         }
5977     }
5978
5979     return nst;
5980 }
5981
5982 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
5983 {
5984     if (MASTER(cr))
5985     {
5986         fprintf(stderr,"\n%s\n",warn_string);
5987     }
5988     if (fplog)
5989     {
5990         fprintf(fplog,"\n%s\n",warn_string);
5991     }
5992 }
5993
5994 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
5995                                   t_inputrec *ir,FILE *fplog)
5996 {
5997     if (ir->ePBC == epbcSCREW &&
5998         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
5999     {
6000         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6001     }
6002
6003     if (ir->ns_type == ensSIMPLE)
6004     {
6005         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6006     }
6007
6008     if (ir->nstlist == 0)
6009     {
6010         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6011     }
6012
6013     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6014     {
6015         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6016     }
6017 }
6018
6019 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6020 {
6021     int  di,d;
6022     real r;
6023
6024     r = ddbox->box_size[XX];
6025     for(di=0; di<dd->ndim; di++)
6026     {
6027         d = dd->dim[di];
6028         /* Check using the initial average cell size */
6029         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6030     }
6031
6032     return r;
6033 }
6034
6035 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6036                              const char *dlb_opt,gmx_bool bRecordLoad,
6037                              unsigned long Flags,t_inputrec *ir)
6038 {
6039     gmx_domdec_t *dd;
6040     int  eDLB=-1;
6041     char buf[STRLEN];
6042
6043     switch (dlb_opt[0])
6044     {
6045     case 'a': eDLB = edlbAUTO; break;
6046     case 'n': eDLB = edlbNO;   break;
6047     case 'y': eDLB = edlbYES;  break;
6048     default: gmx_incons("Unknown dlb_opt");
6049     }
6050
6051     if (Flags & MD_RERUN)
6052     {
6053         return edlbNO;
6054     }
6055
6056     if (!EI_DYNAMICS(ir->eI))
6057     {
6058         if (eDLB == edlbYES)
6059         {
6060             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6061             dd_warning(cr,fplog,buf);
6062         }
6063
6064         return edlbNO;
6065     }
6066
6067     if (!bRecordLoad)
6068     {
6069         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6070
6071         return edlbNO;
6072     }
6073
6074     if (Flags & MD_REPRODUCIBLE)
6075     {
6076         switch (eDLB)
6077         {
6078                         case edlbNO:
6079                                 break;
6080                         case edlbAUTO:
6081                                 dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6082                                 eDLB = edlbNO;
6083                                 break;
6084                         case edlbYES:
6085                                 dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6086                                 break;
6087                         default:
6088                                 gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6089                                 break;
6090         }
6091     }
6092
6093     return eDLB;
6094 }
6095
6096 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6097 {
6098     int dim;
6099
6100     dd->ndim = 0;
6101     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6102     {
6103         /* Decomposition order z,y,x */
6104         if (fplog)
6105         {
6106             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6107         }
6108         for(dim=DIM-1; dim>=0; dim--)
6109         {
6110             if (dd->nc[dim] > 1)
6111             {
6112                 dd->dim[dd->ndim++] = dim;
6113             }
6114         }
6115     }
6116     else
6117     {
6118         /* Decomposition order x,y,z */
6119         for(dim=0; dim<DIM; dim++)
6120         {
6121             if (dd->nc[dim] > 1)
6122             {
6123                 dd->dim[dd->ndim++] = dim;
6124             }
6125         }
6126     }
6127 }
6128
6129 static gmx_domdec_comm_t *init_dd_comm()
6130 {
6131     gmx_domdec_comm_t *comm;
6132     int  i;
6133
6134     snew(comm,1);
6135     snew(comm->cggl_flag,DIM*2);
6136     snew(comm->cgcm_state,DIM*2);
6137     for(i=0; i<DIM*2; i++)
6138     {
6139         comm->cggl_flag_nalloc[i]  = 0;
6140         comm->cgcm_state_nalloc[i] = 0;
6141     }
6142
6143     comm->nalloc_int = 0;
6144     comm->buf_int    = NULL;
6145
6146     vec_rvec_init(&comm->vbuf);
6147
6148     comm->n_load_have    = 0;
6149     comm->n_load_collect = 0;
6150
6151     for(i=0; i<ddnatNR-ddnatZONE; i++)
6152     {
6153         comm->sum_nat[i] = 0;
6154     }
6155     comm->ndecomp = 0;
6156     comm->nload   = 0;
6157     comm->load_step = 0;
6158     comm->load_sum  = 0;
6159     comm->load_max  = 0;
6160     clear_ivec(comm->load_lim);
6161     comm->load_mdf  = 0;
6162     comm->load_pme  = 0;
6163
6164     return comm;
6165 }
6166
6167 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6168                                         unsigned long Flags,
6169                                         ivec nc,
6170                                         real comm_distance_min,real rconstr,
6171                                         const char *dlb_opt,real dlb_scale,
6172                                         const char *sizex,const char *sizey,const char *sizez,
6173                                         gmx_mtop_t *mtop,t_inputrec *ir,
6174                                         matrix box,rvec *x,
6175                                         gmx_ddbox_t *ddbox,
6176                                         int *npme_x,int *npme_y)
6177 {
6178     gmx_domdec_t *dd;
6179     gmx_domdec_comm_t *comm;
6180     int  recload;
6181     int  d,i,j;
6182     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6183     gmx_bool bC;
6184     char buf[STRLEN];
6185
6186     if (fplog)
6187     {
6188         fprintf(fplog,
6189                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6190     }
6191
6192     snew(dd,1);
6193
6194     dd->comm = init_dd_comm();
6195     comm = dd->comm;
6196     snew(comm->cggl_flag,DIM*2);
6197     snew(comm->cgcm_state,DIM*2);
6198
6199     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6200     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6201
6202     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6203     comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6204     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6205     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6206     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6207     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6208     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6209     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6210
6211     dd->pme_recv_f_alloc = 0;
6212     dd->pme_recv_f_buf = NULL;
6213
6214     if (dd->bSendRecv2 && fplog)
6215     {
6216         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6217     }
6218     if (comm->eFlop)
6219     {
6220         if (fplog)
6221         {
6222             fprintf(fplog,"Will load balance based on FLOP count\n");
6223         }
6224         if (comm->eFlop > 1)
6225         {
6226             srand(1+cr->nodeid);
6227         }
6228         comm->bRecordLoad = TRUE;
6229     }
6230     else
6231     {
6232         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6233
6234     }
6235
6236     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6237
6238     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6239     if (fplog)
6240     {
6241         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6242     }
6243     dd->bGridJump = comm->bDynLoadBal;
6244
6245     if (comm->nstSortCG)
6246     {
6247         if (fplog)
6248         {
6249             if (comm->nstSortCG == 1)
6250             {
6251                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6252             }
6253             else
6254             {
6255                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6256                         comm->nstSortCG);
6257             }
6258         }
6259         snew(comm->sort,1);
6260     }
6261     else
6262     {
6263         if (fplog)
6264         {
6265             fprintf(fplog,"Will not sort the charge groups\n");
6266         }
6267     }
6268
6269     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6270     if (comm->bInterCGBondeds)
6271     {
6272         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6273     }
6274     else
6275     {
6276         comm->bInterCGMultiBody = FALSE;
6277     }
6278
6279     dd->bInterCGcons = inter_charge_group_constraints(mtop);
6280
6281     if (ir->rlistlong == 0)
6282     {
6283         /* Set the cut-off to some very large value,
6284          * so we don't need if statements everywhere in the code.
6285          * We use sqrt, since the cut-off is squared in some places.
6286          */
6287         comm->cutoff   = GMX_CUTOFF_INF;
6288     }
6289     else
6290     {
6291         comm->cutoff   = ir->rlistlong;
6292     }
6293     comm->cutoff_mbody = 0;
6294
6295     comm->cellsize_limit = 0;
6296     comm->bBondComm = FALSE;
6297
6298     if (comm->bInterCGBondeds)
6299     {
6300         if (comm_distance_min > 0)
6301         {
6302             comm->cutoff_mbody = comm_distance_min;
6303             if (Flags & MD_DDBONDCOMM)
6304             {
6305                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6306             }
6307             else
6308             {
6309                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6310             }
6311             r_bonded_limit = comm->cutoff_mbody;
6312         }
6313         else if (ir->bPeriodicMols)
6314         {
6315             /* Can not easily determine the required cut-off */
6316             dd_warning(cr,fplog,"NOTE: Periodic molecules: can not easily determine the required minimum bonded cut-off, using half the non-bonded cut-off\n");
6317             comm->cutoff_mbody = comm->cutoff/2;
6318             r_bonded_limit = comm->cutoff_mbody;
6319         }
6320         else
6321         {
6322             if (MASTER(cr))
6323             {
6324                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6325                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6326             }
6327             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6328             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6329
6330             /* We use an initial margin of 10% for the minimum cell size,
6331              * except when we are just below the non-bonded cut-off.
6332              */
6333             if (Flags & MD_DDBONDCOMM)
6334             {
6335                 if (max(r_2b,r_mb) > comm->cutoff)
6336                 {
6337                     r_bonded       = max(r_2b,r_mb);
6338                     r_bonded_limit = 1.1*r_bonded;
6339                     comm->bBondComm = TRUE;
6340                 }
6341                 else
6342                 {
6343                     r_bonded       = r_mb;
6344                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6345                 }
6346                 /* We determine cutoff_mbody later */
6347             }
6348             else
6349             {
6350                 /* No special bonded communication,
6351                  * simply increase the DD cut-off.
6352                  */
6353                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6354                 comm->cutoff_mbody = r_bonded_limit;
6355                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6356             }
6357         }
6358         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6359         if (fplog)
6360         {
6361             fprintf(fplog,
6362                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6363                     comm->cellsize_limit);
6364         }
6365     }
6366
6367     if (dd->bInterCGcons && rconstr <= 0)
6368     {
6369         /* There is a cell size limit due to the constraints (P-LINCS) */
6370         rconstr = constr_r_max(fplog,mtop,ir);
6371         if (fplog)
6372         {
6373             fprintf(fplog,
6374                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6375                     rconstr);
6376             if (rconstr > comm->cellsize_limit)
6377             {
6378                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6379             }
6380         }
6381     }
6382     else if (rconstr > 0 && fplog)
6383     {
6384         /* Here we do not check for dd->bInterCGcons,
6385          * because one can also set a cell size limit for virtual sites only
6386          * and at this point we don't know yet if there are intercg v-sites.
6387          */
6388         fprintf(fplog,
6389                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6390                 rconstr);
6391     }
6392     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6393
6394     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6395
6396     if (nc[XX] > 0)
6397     {
6398         copy_ivec(nc,dd->nc);
6399         set_dd_dim(fplog,dd);
6400         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6401
6402         if (cr->npmenodes == -1)
6403         {
6404             cr->npmenodes = 0;
6405         }
6406         acs = average_cellsize_min(dd,ddbox);
6407         if (acs < comm->cellsize_limit)
6408         {
6409             if (fplog)
6410             {
6411                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6412             }
6413             gmx_fatal_collective(FARGS,cr,NULL,
6414                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6415                                  acs,comm->cellsize_limit);
6416         }
6417     }
6418     else
6419     {
6420         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6421
6422         /* We need to choose the optimal DD grid and possibly PME nodes */
6423         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6424                                comm->eDLB!=edlbNO,dlb_scale,
6425                                comm->cellsize_limit,comm->cutoff,
6426                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6427
6428         if (dd->nc[XX] == 0)
6429         {
6430             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6431             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6432                     !bC ? "-rdd" : "-rcon",
6433                     comm->eDLB!=edlbNO ? " or -dds" : "",
6434                     bC ? " or your LINCS settings" : "");
6435
6436             gmx_fatal_collective(FARGS,cr,NULL,
6437                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6438                                  "%s\n"
6439                                  "Look in the log file for details on the domain decomposition",
6440                                  cr->nnodes-cr->npmenodes,limit,buf);
6441         }
6442         set_dd_dim(fplog,dd);
6443     }
6444
6445     if (fplog)
6446     {
6447         fprintf(fplog,
6448                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6449                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6450     }
6451
6452     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6453     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6454     {
6455         gmx_fatal_collective(FARGS,cr,NULL,
6456                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6457                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6458     }
6459     if (cr->npmenodes > dd->nnodes)
6460     {
6461         gmx_fatal_collective(FARGS,cr,NULL,
6462                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6463     }
6464     if (cr->npmenodes > 0)
6465     {
6466         comm->npmenodes = cr->npmenodes;
6467     }
6468     else
6469     {
6470         comm->npmenodes = dd->nnodes;
6471     }
6472
6473     if (EEL_PME(ir->coulombtype))
6474     {
6475         /* The following choices should match those
6476          * in comm_cost_est in domdec_setup.c.
6477          * Note that here the checks have to take into account
6478          * that the decomposition might occur in a different order than xyz
6479          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6480          * in which case they will not match those in comm_cost_est,
6481          * but since that is mainly for testing purposes that's fine.
6482          */
6483         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6484             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6485             getenv("GMX_PMEONEDD") == NULL)
6486         {
6487             comm->npmedecompdim = 2;
6488             comm->npmenodes_x   = dd->nc[XX];
6489             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6490         }
6491         else
6492         {
6493             /* In case nc is 1 in both x and y we could still choose to
6494              * decompose pme in y instead of x, but we use x for simplicity.
6495              */
6496             comm->npmedecompdim = 1;
6497             if (dd->dim[0] == YY)
6498             {
6499                 comm->npmenodes_x = 1;
6500                 comm->npmenodes_y = comm->npmenodes;
6501             }
6502             else
6503             {
6504                 comm->npmenodes_x = comm->npmenodes;
6505                 comm->npmenodes_y = 1;
6506             }
6507         }
6508         if (fplog)
6509         {
6510             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6511                     comm->npmenodes_x,comm->npmenodes_y,1);
6512         }
6513     }
6514     else
6515     {
6516         comm->npmedecompdim = 0;
6517         comm->npmenodes_x   = 0;
6518         comm->npmenodes_y   = 0;
6519     }
6520
6521     /* Technically we don't need both of these,
6522      * but it simplifies code not having to recalculate it.
6523      */
6524     *npme_x = comm->npmenodes_x;
6525     *npme_y = comm->npmenodes_y;
6526
6527     snew(comm->slb_frac,DIM);
6528     if (comm->eDLB == edlbNO)
6529     {
6530         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6531         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6532         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6533     }
6534
6535     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6536     {
6537         if (comm->bBondComm || comm->eDLB != edlbNO)
6538         {
6539             /* Set the bonded communication distance to halfway
6540              * the minimum and the maximum,
6541              * since the extra communication cost is nearly zero.
6542              */
6543             acs = average_cellsize_min(dd,ddbox);
6544             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6545             if (comm->eDLB != edlbNO)
6546             {
6547                 /* Check if this does not limit the scaling */
6548                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6549             }
6550             if (!comm->bBondComm)
6551             {
6552                 /* Without bBondComm do not go beyond the n.b. cut-off */
6553                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6554                 if (comm->cellsize_limit >= comm->cutoff)
6555                 {
6556                     /* We don't loose a lot of efficieny
6557                      * when increasing it to the n.b. cut-off.
6558                      * It can even be slightly faster, because we need
6559                      * less checks for the communication setup.
6560                      */
6561                     comm->cutoff_mbody = comm->cutoff;
6562                 }
6563             }
6564             /* Check if we did not end up below our original limit */
6565             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6566
6567             if (comm->cutoff_mbody > comm->cellsize_limit)
6568             {
6569                 comm->cellsize_limit = comm->cutoff_mbody;
6570             }
6571         }
6572         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6573     }
6574
6575     if (debug)
6576     {
6577         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6578                 "cellsize limit %f\n",
6579                 comm->bBondComm,comm->cellsize_limit);
6580     }
6581
6582     if (MASTER(cr))
6583     {
6584         check_dd_restrictions(cr,dd,ir,fplog);
6585     }
6586
6587     comm->globalcomm_step = INT_MIN;
6588     dd->ddp_count = 0;
6589
6590     clear_dd_cycle_counts(dd);
6591
6592     return dd;
6593 }
6594
6595 static void set_dlb_limits(gmx_domdec_t *dd)
6596
6597 {
6598     int d;
6599
6600     for(d=0; d<dd->ndim; d++)
6601     {
6602         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6603         dd->comm->cellsize_min[dd->dim[d]] =
6604             dd->comm->cellsize_min_dlb[dd->dim[d]];
6605     }
6606 }
6607
6608
6609 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6610 {
6611     gmx_domdec_t *dd;
6612     gmx_domdec_comm_t *comm;
6613     real cellsize_min;
6614     int  d,nc,i;
6615     char buf[STRLEN];
6616
6617     dd = cr->dd;
6618     comm = dd->comm;
6619
6620     if (fplog)
6621     {
6622         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6623     }
6624
6625     cellsize_min = comm->cellsize_min[dd->dim[0]];
6626     for(d=1; d<dd->ndim; d++)
6627     {
6628         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6629     }
6630
6631     if (cellsize_min < comm->cellsize_limit*1.05)
6632     {
6633         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6634
6635         /* Change DLB from "auto" to "no". */
6636         comm->eDLB = edlbNO;
6637
6638         return;
6639     }
6640
6641     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6642     comm->bDynLoadBal = TRUE;
6643     dd->bGridJump = TRUE;
6644
6645     set_dlb_limits(dd);
6646
6647     /* We can set the required cell size info here,
6648      * so we do not need to communicate this.
6649      * The grid is completely uniform.
6650      */
6651     for(d=0; d<dd->ndim; d++)
6652     {
6653         if (comm->root[d])
6654         {
6655             comm->load[d].sum_m = comm->load[d].sum;
6656
6657             nc = dd->nc[dd->dim[d]];
6658             for(i=0; i<nc; i++)
6659             {
6660                 comm->root[d]->cell_f[i]    = i/(real)nc;
6661                 if (d > 0)
6662                 {
6663                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6664                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6665                 }
6666             }
6667             comm->root[d]->cell_f[nc] = 1.0;
6668         }
6669     }
6670 }
6671
6672 static char *init_bLocalCG(gmx_mtop_t *mtop)
6673 {
6674     int  ncg,cg;
6675     char *bLocalCG;
6676
6677     ncg = ncg_mtop(mtop);
6678     snew(bLocalCG,ncg);
6679     for(cg=0; cg<ncg; cg++)
6680     {
6681         bLocalCG[cg] = FALSE;
6682     }
6683
6684     return bLocalCG;
6685 }
6686
6687 void dd_init_bondeds(FILE *fplog,
6688                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6689                      gmx_vsite_t *vsite,gmx_constr_t constr,
6690                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6691 {
6692     gmx_domdec_comm_t *comm;
6693     gmx_bool bBondComm;
6694     int  d;
6695
6696     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6697
6698     comm = dd->comm;
6699
6700     if (comm->bBondComm)
6701     {
6702         /* Communicate atoms beyond the cut-off for bonded interactions */
6703         comm = dd->comm;
6704
6705         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6706
6707         comm->bLocalCG = init_bLocalCG(mtop);
6708     }
6709     else
6710     {
6711         /* Only communicate atoms based on cut-off */
6712         comm->cglink   = NULL;
6713         comm->bLocalCG = NULL;
6714     }
6715 }
6716
6717 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6718                               t_inputrec *ir,
6719                               gmx_bool bDynLoadBal,real dlb_scale,
6720                               gmx_ddbox_t *ddbox)
6721 {
6722     gmx_domdec_comm_t *comm;
6723     int  d;
6724     ivec np;
6725     real limit,shrink;
6726     char buf[64];
6727
6728     if (fplog == NULL)
6729     {
6730         return;
6731     }
6732
6733     comm = dd->comm;
6734
6735     if (bDynLoadBal)
6736     {
6737         fprintf(fplog,"The maximum number of communication pulses is:");
6738         for(d=0; d<dd->ndim; d++)
6739         {
6740             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6741         }
6742         fprintf(fplog,"\n");
6743         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6744         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6745         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6746         for(d=0; d<DIM; d++)
6747         {
6748             if (dd->nc[d] > 1)
6749             {
6750                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6751                 {
6752                     shrink = 0;
6753                 }
6754                 else
6755                 {
6756                     shrink =
6757                         comm->cellsize_min_dlb[d]/
6758                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6759                 }
6760                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6761             }
6762         }
6763         fprintf(fplog,"\n");
6764     }
6765     else
6766     {
6767         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6768         fprintf(fplog,"The initial number of communication pulses is:");
6769         for(d=0; d<dd->ndim; d++)
6770         {
6771             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
6772         }
6773         fprintf(fplog,"\n");
6774         fprintf(fplog,"The initial domain decomposition cell size is:");
6775         for(d=0; d<DIM; d++) {
6776             if (dd->nc[d] > 1)
6777             {
6778                 fprintf(fplog," %c %.2f nm",
6779                         dim2char(d),dd->comm->cellsize_min[d]);
6780             }
6781         }
6782         fprintf(fplog,"\n\n");
6783     }
6784
6785     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
6786     {
6787         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
6788         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6789                 "non-bonded interactions","",comm->cutoff);
6790
6791         if (bDynLoadBal)
6792         {
6793             limit = dd->comm->cellsize_limit;
6794         }
6795         else
6796         {
6797             if (dynamic_dd_box(ddbox,ir))
6798             {
6799                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
6800             }
6801             limit = dd->comm->cellsize_min[XX];
6802             for(d=1; d<DIM; d++)
6803             {
6804                 limit = min(limit,dd->comm->cellsize_min[d]);
6805             }
6806         }
6807
6808         if (comm->bInterCGBondeds)
6809         {
6810             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6811                     "two-body bonded interactions","(-rdd)",
6812                     max(comm->cutoff,comm->cutoff_mbody));
6813             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6814                     "multi-body bonded interactions","(-rdd)",
6815                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
6816         }
6817         if (dd->vsite_comm)
6818         {
6819             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6820                     "virtual site constructions","(-rcon)",limit);
6821         }
6822         if (dd->constraint_comm)
6823         {
6824             sprintf(buf,"atoms separated by up to %d constraints",
6825                     1+ir->nProjOrder);
6826             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6827                     buf,"(-rcon)",limit);
6828         }
6829         fprintf(fplog,"\n");
6830     }
6831
6832     fflush(fplog);
6833 }
6834
6835 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
6836                        t_inputrec *ir,t_forcerec *fr,
6837                        gmx_ddbox_t *ddbox)
6838 {
6839     gmx_domdec_comm_t *comm;
6840     int  d,dim,npulse,npulse_d_max,npulse_d;
6841     gmx_bool bNoCutOff;
6842     int  natoms_tot;
6843     real vol_frac;
6844
6845     comm = dd->comm;
6846
6847     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6848
6849     if (EEL_PME(ir->coulombtype))
6850     {
6851         init_ddpme(dd,&comm->ddpme[0],0);
6852         if (comm->npmedecompdim >= 2)
6853         {
6854             init_ddpme(dd,&comm->ddpme[1],1);
6855         }
6856     }
6857     else
6858     {
6859         comm->npmenodes = 0;
6860         if (dd->pme_nodeid >= 0)
6861         {
6862             gmx_fatal_collective(FARGS,NULL,dd,
6863                                  "Can not have separate PME nodes without PME electrostatics");
6864         }
6865     }
6866
6867     /* If each molecule is a single charge group
6868      * or we use domain decomposition for each periodic dimension,
6869      * we do not need to take pbc into account for the bonded interactions.
6870      */
6871     if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
6872         (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY)))
6873     {
6874         fr->bMolPBC = FALSE;
6875     }
6876     else
6877     {
6878         fr->bMolPBC = TRUE;
6879     }
6880
6881     if (debug)
6882     {
6883         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
6884     }
6885     if (comm->eDLB != edlbNO)
6886     {
6887         /* Determine the maximum number of comm. pulses in one dimension */
6888
6889         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6890
6891         /* Determine the maximum required number of grid pulses */
6892         if (comm->cellsize_limit >= comm->cutoff)
6893         {
6894             /* Only a single pulse is required */
6895             npulse = 1;
6896         }
6897         else if (!bNoCutOff && comm->cellsize_limit > 0)
6898         {
6899             /* We round down slightly here to avoid overhead due to the latency
6900              * of extra communication calls when the cut-off
6901              * would be only slightly longer than the cell size.
6902              * Later cellsize_limit is redetermined,
6903              * so we can not miss interactions due to this rounding.
6904              */
6905             npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
6906         }
6907         else
6908         {
6909             /* There is no cell size limit */
6910             npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
6911         }
6912
6913         if (!bNoCutOff && npulse > 1)
6914         {
6915             /* See if we can do with less pulses, based on dlb_scale */
6916             npulse_d_max = 0;
6917             for(d=0; d<dd->ndim; d++)
6918             {
6919                 dim = dd->dim[d];
6920                 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6921                                  /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6922                 npulse_d_max = max(npulse_d_max,npulse_d);
6923             }
6924             npulse = min(npulse,npulse_d_max);
6925         }
6926
6927         /* This env var can override npulse */
6928         d = dd_nst_env(fplog,"GMX_DD_NPULSE",0);
6929         if (d > 0)
6930         {
6931             npulse = d;
6932         }
6933
6934         comm->maxpulse = 1;
6935         comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
6936         for(d=0; d<dd->ndim; d++)
6937         {
6938             comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
6939             comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6940             snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
6941             comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
6942             if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6943             {
6944                 comm->bVacDLBNoLimit = FALSE;
6945             }
6946         }
6947
6948         /* cellsize_limit is set for LINCS in init_domain_decomposition */
6949         if (!comm->bVacDLBNoLimit)
6950         {
6951             comm->cellsize_limit = max(comm->cellsize_limit,
6952                                        comm->cutoff/comm->maxpulse);
6953         }
6954         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6955         /* Set the minimum cell size for each DD dimension */
6956         for(d=0; d<dd->ndim; d++)
6957         {
6958             if (comm->bVacDLBNoLimit ||
6959                 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
6960             {
6961                 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
6962             }
6963             else
6964             {
6965                 comm->cellsize_min_dlb[dd->dim[d]] =
6966                     comm->cutoff/comm->cd[d].np_dlb;
6967             }
6968         }
6969         if (comm->cutoff_mbody <= 0)
6970         {
6971             comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
6972         }
6973         if (comm->bDynLoadBal)
6974         {
6975             set_dlb_limits(dd);
6976         }
6977     }
6978
6979     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
6980     if (comm->eDLB == edlbAUTO)
6981     {
6982         if (fplog)
6983         {
6984             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
6985         }
6986         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
6987     }
6988
6989     if (ir->ePBC == epbcNONE)
6990     {
6991         vol_frac = 1 - 1/(double)dd->nnodes;
6992     }
6993     else
6994     {
6995         vol_frac =
6996             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
6997     }
6998     if (debug)
6999     {
7000         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7001     }
7002     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7003
7004     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7005 }
7006
7007 static void merge_cg_buffers(int ncell,
7008                              gmx_domdec_comm_dim_t *cd, int pulse,
7009                              int  *ncg_cell,
7010                              int  *index_gl, int  *recv_i,
7011                              rvec *cg_cm,    rvec *recv_vr,
7012                              int *cgindex,
7013                              cginfo_mb_t *cginfo_mb,int *cginfo)
7014 {
7015     gmx_domdec_ind_t *ind,*ind_p;
7016     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7017     int shift,shift_at;
7018
7019     ind = &cd->ind[pulse];
7020
7021     /* First correct the already stored data */
7022     shift = ind->nrecv[ncell];
7023     for(cell=ncell-1; cell>=0; cell--)
7024     {
7025         shift -= ind->nrecv[cell];
7026         if (shift > 0)
7027         {
7028             /* Move the cg's present from previous grid pulses */
7029             cg0 = ncg_cell[ncell+cell];
7030             cg1 = ncg_cell[ncell+cell+1];
7031             cgindex[cg1+shift] = cgindex[cg1];
7032             for(cg=cg1-1; cg>=cg0; cg--)
7033             {
7034                 index_gl[cg+shift] = index_gl[cg];
7035                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7036                 cgindex[cg+shift] = cgindex[cg];
7037                 cginfo[cg+shift] = cginfo[cg];
7038             }
7039             /* Correct the already stored send indices for the shift */
7040             for(p=1; p<=pulse; p++)
7041             {
7042                 ind_p = &cd->ind[p];
7043                 cg0 = 0;
7044                 for(c=0; c<cell; c++)
7045                 {
7046                     cg0 += ind_p->nsend[c];
7047                 }
7048                 cg1 = cg0 + ind_p->nsend[cell];
7049                 for(cg=cg0; cg<cg1; cg++)
7050                 {
7051                     ind_p->index[cg] += shift;
7052                 }
7053             }
7054         }
7055     }
7056
7057     /* Merge in the communicated buffers */
7058     shift = 0;
7059     shift_at = 0;
7060     cg0 = 0;
7061     for(cell=0; cell<ncell; cell++)
7062     {
7063         cg1 = ncg_cell[ncell+cell+1] + shift;
7064         if (shift_at > 0)
7065         {
7066             /* Correct the old cg indices */
7067             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7068             {
7069                 cgindex[cg+1] += shift_at;
7070             }
7071         }
7072         for(cg=0; cg<ind->nrecv[cell]; cg++)
7073         {
7074             /* Copy this charge group from the buffer */
7075             index_gl[cg1] = recv_i[cg0];
7076             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7077             /* Add it to the cgindex */
7078             cg_gl = index_gl[cg1];
7079             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7080             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7081             cgindex[cg1+1] = cgindex[cg1] + nat;
7082             cg0++;
7083             cg1++;
7084             shift_at += nat;
7085         }
7086         shift += ind->nrecv[cell];
7087         ncg_cell[ncell+cell+1] = cg1;
7088     }
7089 }
7090
7091 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7092                                int nzone,int cg0,const int *cgindex)
7093 {
7094     int cg,zone,p;
7095
7096     /* Store the atom block boundaries for easy copying of communication buffers
7097      */
7098     cg = cg0;
7099     for(zone=0; zone<nzone; zone++)
7100     {
7101         for(p=0; p<cd->np; p++) {
7102             cd->ind[p].cell2at0[zone] = cgindex[cg];
7103             cg += cd->ind[p].nrecv[zone];
7104             cd->ind[p].cell2at1[zone] = cgindex[cg];
7105         }
7106     }
7107 }
7108
7109 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7110 {
7111     int  i;
7112     gmx_bool bMiss;
7113
7114     bMiss = FALSE;
7115     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7116     {
7117         if (!bLocalCG[link->a[i]])
7118         {
7119             bMiss = TRUE;
7120         }
7121     }
7122
7123     return bMiss;
7124 }
7125
7126 static void setup_dd_communication(gmx_domdec_t *dd,
7127                                    matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr)
7128 {
7129     int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot;
7130     int nzone,nzone_send,zone,zonei,cg0,cg1;
7131     int c,i,j,cg,cg_gl,nrcg;
7132     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7133     gmx_domdec_comm_t *comm;
7134     gmx_domdec_zones_t *zones;
7135     gmx_domdec_comm_dim_t *cd;
7136     gmx_domdec_ind_t *ind;
7137     cginfo_mb_t *cginfo_mb;
7138     gmx_bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew;
7139     real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh;
7140     rvec rb,rn;
7141     real corner[DIM][4],corner_round_0=0,corner_round_1[4];
7142     real bcorner[DIM],bcorner_round_1=0;
7143     ivec tric_dist;
7144     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7145     real skew_fac2_d,skew_fac_01;
7146     rvec sf2_round;
7147     int  nsend,nat;
7148
7149     if (debug)
7150     {
7151         fprintf(debug,"Setting up DD communication\n");
7152     }
7153
7154     comm  = dd->comm;
7155     cg_cm = fr->cg_cm;
7156
7157     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7158     {
7159         dim = dd->dim[dim_ind];
7160
7161         /* Check if we need to use triclinic distances */
7162         tric_dist[dim_ind] = 0;
7163         for(i=0; i<=dim_ind; i++)
7164         {
7165             if (ddbox->tric_dir[dd->dim[i]])
7166             {
7167                 tric_dist[dim_ind] = 1;
7168             }
7169         }
7170     }
7171
7172     bBondComm = comm->bBondComm;
7173
7174     /* Do we need to determine extra distances for multi-body bondeds? */
7175     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7176
7177     /* Do we need to determine extra distances for only two-body bondeds? */
7178     bDist2B = (bBondComm && !bDistMB);
7179
7180     r_comm2  = sqr(comm->cutoff);
7181     r_bcomm2 = sqr(comm->cutoff_mbody);
7182
7183     if (debug)
7184     {
7185         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7186     }
7187
7188     zones = &comm->zones;
7189
7190     dim0 = dd->dim[0];
7191     /* The first dimension is equal for all cells */
7192     corner[0][0] = comm->cell_x0[dim0];
7193     if (bDistMB)
7194     {
7195         bcorner[0] = corner[0][0];
7196     }
7197     if (dd->ndim >= 2)
7198     {
7199         dim1 = dd->dim[1];
7200         /* This cell row is only seen from the first row */
7201         corner[1][0] = comm->cell_x0[dim1];
7202         /* All rows can see this row */
7203         corner[1][1] = comm->cell_x0[dim1];
7204         if (dd->bGridJump)
7205         {
7206             corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7207             if (bDistMB)
7208             {
7209                 /* For the multi-body distance we need the maximum */
7210                 bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7211             }
7212         }
7213         /* Set the upper-right corner for rounding */
7214         corner_round_0 = comm->cell_x1[dim0];
7215
7216         if (dd->ndim >= 3)
7217         {
7218             dim2 = dd->dim[2];
7219             for(j=0; j<4; j++)
7220             {
7221                 corner[2][j] = comm->cell_x0[dim2];
7222             }
7223             if (dd->bGridJump)
7224             {
7225                 /* Use the maximum of the i-cells that see a j-cell */
7226                 for(i=0; i<zones->nizone; i++)
7227                 {
7228                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7229                     {
7230                         if (j >= 4)
7231                         {
7232                             corner[2][j-4] =
7233                                 max(corner[2][j-4],
7234                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7235                         }
7236                     }
7237                 }
7238                 if (bDistMB)
7239                 {
7240                     /* For the multi-body distance we need the maximum */
7241                     bcorner[2] = comm->cell_x0[dim2];
7242                     for(i=0; i<2; i++)
7243                     {
7244                         for(j=0; j<2; j++)
7245                         {
7246                             bcorner[2] = max(bcorner[2],
7247                                              comm->zone_d2[i][j].p1_0);
7248                         }
7249                     }
7250                 }
7251             }
7252
7253             /* Set the upper-right corner for rounding */
7254             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7255              * Only cell (0,0,0) can see cell 7 (1,1,1)
7256              */
7257             corner_round_1[0] = comm->cell_x1[dim1];
7258             corner_round_1[3] = comm->cell_x1[dim1];
7259             if (dd->bGridJump)
7260             {
7261                 corner_round_1[0] = max(comm->cell_x1[dim1],
7262                                         comm->zone_d1[1].mch1);
7263                 if (bDistMB)
7264                 {
7265                     /* For the multi-body distance we need the maximum */
7266                     bcorner_round_1 = max(comm->cell_x1[dim1],
7267                                           comm->zone_d1[1].p1_1);
7268                 }
7269             }
7270         }
7271     }
7272
7273     /* Triclinic stuff */
7274     normal = ddbox->normal;
7275     skew_fac_01 = 0;
7276     if (dd->ndim >= 2)
7277     {
7278         v_0 = ddbox->v[dim0];
7279         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7280         {
7281             /* Determine the coupling coefficient for the distances
7282              * to the cell planes along dim0 and dim1 through dim2.
7283              * This is required for correct rounding.
7284              */
7285             skew_fac_01 =
7286                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7287             if (debug)
7288             {
7289                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7290             }
7291         }
7292     }
7293     if (dd->ndim >= 3)
7294     {
7295         v_1 = ddbox->v[dim1];
7296     }
7297
7298     zone_cg_range = zones->cg_range;
7299     index_gl = dd->index_gl;
7300     cgindex  = dd->cgindex;
7301     cginfo_mb = fr->cginfo_mb;
7302
7303     zone_cg_range[0]   = 0;
7304     zone_cg_range[1]   = dd->ncg_home;
7305     comm->zone_ncg1[0] = dd->ncg_home;
7306     pos_cg             = dd->ncg_home;
7307
7308     nat_tot = dd->nat_home;
7309     nzone = 1;
7310     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7311     {
7312         dim = dd->dim[dim_ind];
7313         cd = &comm->cd[dim_ind];
7314
7315         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7316         {
7317             /* No pbc in this dimension, the first node should not comm. */
7318             nzone_send = 0;
7319         }
7320         else
7321         {
7322             nzone_send = nzone;
7323         }
7324
7325         bScrew = (dd->bScrewPBC && dim == XX);
7326
7327         v_d = ddbox->v[dim];
7328         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7329
7330         cd->bInPlace = TRUE;
7331         for(p=0; p<cd->np; p++)
7332         {
7333             /* Only atoms communicated in the first pulse are used
7334              * for multi-body bonded interactions or for bBondComm.
7335              */
7336             bDistBonded   = ((bDistMB || bDist2B) && p == 0);
7337             bDistMB_pulse = (bDistMB && bDistBonded);
7338
7339             ind = &cd->ind[p];
7340             nsend = 0;
7341             nat = 0;
7342             for(zone=0; zone<nzone_send; zone++)
7343             {
7344                 if (tric_dist[dim_ind] && dim_ind > 0)
7345                 {
7346                     /* Determine slightly more optimized skew_fac's
7347                      * for rounding.
7348                      * This reduces the number of communicated atoms
7349                      * by about 10% for 3D DD of rhombic dodecahedra.
7350                      */
7351                     for(dimd=0; dimd<dim; dimd++)
7352                     {
7353                         sf2_round[dimd] = 1;
7354                         if (ddbox->tric_dir[dimd])
7355                         {
7356                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7357                             {
7358                                 /* If we are shifted in dimension i
7359                                  * and the cell plane is tilted forward
7360                                  * in dimension i, skip this coupling.
7361                                  */
7362                                 if (!(zones->shift[nzone+zone][i] &&
7363                                       ddbox->v[dimd][i][dimd] >= 0))
7364                                 {
7365                                     sf2_round[dimd] +=
7366                                         sqr(ddbox->v[dimd][i][dimd]);
7367                                 }
7368                             }
7369                             sf2_round[dimd] = 1/sf2_round[dimd];
7370                         }
7371                     }
7372                 }
7373
7374                 zonei = zone_perm[dim_ind][zone];
7375                 if (p == 0)
7376                 {
7377                     /* Here we permutate the zones to obtain a convenient order
7378                      * for neighbor searching
7379                      */
7380                     cg0 = zone_cg_range[zonei];
7381                     cg1 = zone_cg_range[zonei+1];
7382                 }
7383                 else
7384                 {
7385                     /* Look only at the cg's received in the previous grid pulse
7386                      */
7387                     cg1 = zone_cg_range[nzone+zone+1];
7388                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
7389                 }
7390                 ind->nsend[zone] = 0;
7391                 for(cg=cg0; cg<cg1; cg++)
7392                 {
7393                     r2  = 0;
7394                     rb2 = 0;
7395                     if (tric_dist[dim_ind] == 0)
7396                     {
7397                         /* Rectangular direction, easy */
7398                         r = cg_cm[cg][dim] - corner[dim_ind][zone];
7399                         if (r > 0)
7400                         {
7401                             r2 += r*r;
7402                         }
7403                         if (bDistMB_pulse)
7404                         {
7405                             r = cg_cm[cg][dim] - bcorner[dim_ind];
7406                             if (r > 0)
7407                             {
7408                                 rb2 += r*r;
7409                             }
7410                         }
7411                         /* Rounding gives at most a 16% reduction
7412                          * in communicated atoms
7413                          */
7414                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7415                         {
7416                             r = cg_cm[cg][dim0] - corner_round_0;
7417                             /* This is the first dimension, so always r >= 0 */
7418                             r2 += r*r;
7419                             if (bDistMB_pulse)
7420                             {
7421                                 rb2 += r*r;
7422                             }
7423                         }
7424                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7425                         {
7426                             r = cg_cm[cg][dim1] - corner_round_1[zone];
7427                             if (r > 0)
7428                             {
7429                                 r2 += r*r;
7430                             }
7431                             if (bDistMB_pulse)
7432                             {
7433                                 r = cg_cm[cg][dim1] - bcorner_round_1;
7434                                 if (r > 0)
7435                                 {
7436                                     rb2 += r*r;
7437                                 }
7438                             }
7439                         }
7440                     }
7441                     else
7442                     {
7443                         /* Triclinic direction, more complicated */
7444                         clear_rvec(rn);
7445                         clear_rvec(rb);
7446                         /* Rounding, conservative as the skew_fac multiplication
7447                          * will slightly underestimate the distance.
7448                          */
7449                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7450                         {
7451                             rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
7452                             for(i=dim0+1; i<DIM; i++)
7453                             {
7454                                 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7455                             }
7456                             r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7457                             if (bDistMB_pulse)
7458                             {
7459                                 rb[dim0] = rn[dim0];
7460                                 rb2 = r2;
7461                             }
7462                             /* Take care that the cell planes along dim0 might not
7463                              * be orthogonal to those along dim1 and dim2.
7464                              */
7465                             for(i=1; i<=dim_ind; i++)
7466                             {
7467                                 dimd = dd->dim[i];
7468                                 if (normal[dim0][dimd] > 0)
7469                                 {
7470                                     rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7471                                     if (bDistMB_pulse)
7472                                     {
7473                                         rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7474                                     }
7475                                 }
7476                             }
7477                         }
7478                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7479                         {
7480                             rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
7481                             tric_sh = 0;
7482                             for(i=dim1+1; i<DIM; i++)
7483                             {
7484                                 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7485                             }
7486                             rn[dim1] += tric_sh;
7487                             if (rn[dim1] > 0)
7488                             {
7489                                 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7490                                 /* Take care of coupling of the distances
7491                                  * to the planes along dim0 and dim1 through dim2.
7492                                  */
7493                                 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7494                                 /* Take care that the cell planes along dim1
7495                                  * might not be orthogonal to that along dim2.
7496                                  */
7497                                 if (normal[dim1][dim2] > 0)
7498                                 {
7499                                     rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7500                                 }
7501                             }
7502                             if (bDistMB_pulse)
7503                             {
7504                                 rb[dim1] +=
7505                                     cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
7506                                 if (rb[dim1] > 0)
7507                                 {
7508                                     rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7509                                     /* Take care of coupling of the distances
7510                                      * to the planes along dim0 and dim1 through dim2.
7511                                      */
7512                                     rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7513                                     /* Take care that the cell planes along dim1
7514                                      * might not be orthogonal to that along dim2.
7515                                      */
7516                                     if (normal[dim1][dim2] > 0)
7517                                     {
7518                                         rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7519                                     }
7520                                 }
7521                             }
7522                         }
7523                         /* The distance along the communication direction */
7524                         rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
7525                         tric_sh = 0;
7526                         for(i=dim+1; i<DIM; i++)
7527                         {
7528                             tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7529                         }
7530                         rn[dim] += tric_sh;
7531                         if (rn[dim] > 0)
7532                         {
7533                             r2 += rn[dim]*rn[dim]*skew_fac2_d;
7534                             /* Take care of coupling of the distances
7535                              * to the planes along dim0 and dim1 through dim2.
7536                              */
7537                             if (dim_ind == 1 && zonei == 1)
7538                             {
7539                                 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7540                             }
7541                         }
7542                         if (bDistMB_pulse)
7543                         {
7544                             clear_rvec(rb);
7545                             rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
7546                             if (rb[dim] > 0)
7547                             {
7548                                 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7549                                 /* Take care of coupling of the distances
7550                                  * to the planes along dim0 and dim1 through dim2.
7551                                  */
7552                                 if (dim_ind == 1 && zonei == 1)
7553                                 {
7554                                     rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7555                                 }
7556                             }
7557                         }
7558                     }
7559
7560                     if (r2 < r_comm2 ||
7561                         (bDistBonded &&
7562                          ((bDistMB && rb2 < r_bcomm2) ||
7563                           (bDist2B && r2  < r_bcomm2)) &&
7564                          (!bBondComm ||
7565                           (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
7566                            missing_link(comm->cglink,index_gl[cg],
7567                                         comm->bLocalCG)))))
7568                     {
7569                         /* Make an index to the local charge groups */
7570                         if (nsend+1 > ind->nalloc)
7571                         {
7572                             ind->nalloc = over_alloc_large(nsend+1);
7573                             srenew(ind->index,ind->nalloc);
7574                         }
7575                         if (nsend+1 > comm->nalloc_int)
7576                         {
7577                             comm->nalloc_int = over_alloc_large(nsend+1);
7578                             srenew(comm->buf_int,comm->nalloc_int);
7579                         }
7580                         ind->index[nsend] = cg;
7581                         comm->buf_int[nsend] = index_gl[cg];
7582                         ind->nsend[zone]++;
7583                         vec_rvec_check_alloc(&comm->vbuf,nsend+1);
7584
7585                         if (dd->ci[dim] == 0)
7586                         {
7587                             /* Correct cg_cm for pbc */
7588                             rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]);
7589                             if (bScrew)
7590                             {
7591                                 comm->vbuf.v[nsend][YY] =
7592                                     box[YY][YY]-comm->vbuf.v[nsend][YY];
7593                                 comm->vbuf.v[nsend][ZZ] =
7594                                     box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
7595                             }
7596                         }
7597                         else
7598                         {
7599                             copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]);
7600                         }
7601                         nsend++;
7602                         nat += cgindex[cg+1] - cgindex[cg];
7603                     }
7604                 }
7605             }
7606             /* Clear the counts in case we do not have pbc */
7607             for(zone=nzone_send; zone<nzone; zone++)
7608             {
7609                 ind->nsend[zone] = 0;
7610             }
7611             ind->nsend[nzone]   = nsend;
7612             ind->nsend[nzone+1] = nat;
7613             /* Communicate the number of cg's and atoms to receive */
7614             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7615                             ind->nsend, nzone+2,
7616                             ind->nrecv, nzone+2);
7617
7618             /* The rvec buffer is also required for atom buffers of size nsend
7619              * in dd_move_x and dd_move_f.
7620              */
7621             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
7622
7623             if (p > 0)
7624             {
7625                 /* We can receive in place if only the last zone is not empty */
7626                 for(zone=0; zone<nzone-1; zone++)
7627                 {
7628                     if (ind->nrecv[zone] > 0)
7629                     {
7630                         cd->bInPlace = FALSE;
7631                     }
7632                 }
7633                 if (!cd->bInPlace)
7634                 {
7635                     /* The int buffer is only required here for the cg indices */
7636                     if (ind->nrecv[nzone] > comm->nalloc_int2)
7637                     {
7638                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
7639                         srenew(comm->buf_int2,comm->nalloc_int2);
7640                     }
7641                     /* The rvec buffer is also required for atom buffers
7642                      * of size nrecv in dd_move_x and dd_move_f.
7643                      */
7644                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
7645                     vec_rvec_check_alloc(&comm->vbuf2,i);
7646                 }
7647             }
7648
7649             /* Make space for the global cg indices */
7650             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
7651                 || dd->cg_nalloc == 0)
7652             {
7653                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
7654                 srenew(index_gl,dd->cg_nalloc);
7655                 srenew(cgindex,dd->cg_nalloc+1);
7656             }
7657             /* Communicate the global cg indices */
7658             if (cd->bInPlace)
7659             {
7660                 recv_i = index_gl + pos_cg;
7661             }
7662             else
7663             {
7664                 recv_i = comm->buf_int2;
7665             }
7666             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7667                             comm->buf_int, nsend,
7668                             recv_i,        ind->nrecv[nzone]);
7669
7670             /* Make space for cg_cm */
7671             if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc)
7672             {
7673                 dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]);
7674                 cg_cm = fr->cg_cm;
7675             }
7676             /* Communicate cg_cm */
7677             if (cd->bInPlace)
7678             {
7679                 recv_vr = cg_cm + pos_cg;
7680             }
7681             else
7682             {
7683                 recv_vr = comm->vbuf2.v;
7684             }
7685             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
7686                              comm->vbuf.v, nsend,
7687                              recv_vr,      ind->nrecv[nzone]);
7688
7689             /* Make the charge group index */
7690             if (cd->bInPlace)
7691             {
7692                 zone = (p == 0 ? 0 : nzone - 1);
7693                 while (zone < nzone)
7694                 {
7695                     for(cg=0; cg<ind->nrecv[zone]; cg++)
7696                     {
7697                         cg_gl = index_gl[pos_cg];
7698                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
7699                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
7700                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
7701                         if (bBondComm)
7702                         {
7703                             /* Update the charge group presence,
7704                              * so we can use it in the next pass of the loop.
7705                              */
7706                             comm->bLocalCG[cg_gl] = TRUE;
7707                         }
7708                         pos_cg++;
7709                     }
7710                     if (p == 0)
7711                     {
7712                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
7713                     }
7714                     zone++;
7715                     zone_cg_range[nzone+zone] = pos_cg;
7716                 }
7717             }
7718             else
7719             {
7720                 /* This part of the code is never executed with bBondComm. */
7721                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
7722                                  index_gl,recv_i,cg_cm,recv_vr,
7723                                  cgindex,fr->cginfo_mb,fr->cginfo);
7724                 pos_cg += ind->nrecv[nzone];
7725             }
7726             nat_tot += ind->nrecv[nzone+1];
7727         }
7728         if (!cd->bInPlace)
7729         {
7730             /* Store the atom block for easy copying of communication buffers */
7731             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
7732         }
7733         nzone += nzone;
7734     }
7735     dd->index_gl = index_gl;
7736     dd->cgindex  = cgindex;
7737
7738     dd->ncg_tot = zone_cg_range[zones->n];
7739     dd->nat_tot = nat_tot;
7740     comm->nat[ddnatHOME] = dd->nat_home;
7741     for(i=ddnatZONE; i<ddnatNR; i++)
7742     {
7743         comm->nat[i] = dd->nat_tot;
7744     }
7745
7746     if (!bBondComm)
7747     {
7748         /* We don't need to update cginfo, since that was alrady done above.
7749          * So we pass NULL for the forcerec.
7750          */
7751         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
7752                       NULL,comm->bLocalCG);
7753     }
7754
7755     if (debug)
7756     {
7757         fprintf(debug,"Finished setting up DD communication, zones:");
7758         for(c=0; c<zones->n; c++)
7759         {
7760             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
7761         }
7762         fprintf(debug,"\n");
7763     }
7764 }
7765
7766 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
7767 {
7768     int c;
7769
7770     for(c=0; c<zones->nizone; c++)
7771     {
7772         zones->izone[c].cg1  = zones->cg_range[c+1];
7773         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
7774         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
7775     }
7776 }
7777
7778 static int comp_cgsort(const void *a,const void *b)
7779 {
7780     int comp;
7781
7782     gmx_cgsort_t *cga,*cgb;
7783     cga = (gmx_cgsort_t *)a;
7784     cgb = (gmx_cgsort_t *)b;
7785
7786     comp = cga->nsc - cgb->nsc;
7787     if (comp == 0)
7788     {
7789         comp = cga->ind_gl - cgb->ind_gl;
7790     }
7791
7792     return comp;
7793 }
7794
7795 static void order_int_cg(int n,gmx_cgsort_t *sort,
7796                          int *a,int *buf)
7797 {
7798     int i;
7799
7800     /* Order the data */
7801     for(i=0; i<n; i++)
7802     {
7803         buf[i] = a[sort[i].ind];
7804     }
7805
7806     /* Copy back to the original array */
7807     for(i=0; i<n; i++)
7808     {
7809         a[i] = buf[i];
7810     }
7811 }
7812
7813 static void order_vec_cg(int n,gmx_cgsort_t *sort,
7814                          rvec *v,rvec *buf)
7815 {
7816     int i;
7817
7818     /* Order the data */
7819     for(i=0; i<n; i++)
7820     {
7821         copy_rvec(v[sort[i].ind],buf[i]);
7822     }
7823
7824     /* Copy back to the original array */
7825     for(i=0; i<n; i++)
7826     {
7827         copy_rvec(buf[i],v[i]);
7828     }
7829 }
7830
7831 static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort,
7832                            rvec *v,rvec *buf)
7833 {
7834     int a,atot,cg,cg0,cg1,i;
7835
7836     /* Order the data */
7837     a = 0;
7838     for(cg=0; cg<ncg; cg++)
7839     {
7840         cg0 = cgindex[sort[cg].ind];
7841         cg1 = cgindex[sort[cg].ind+1];
7842         for(i=cg0; i<cg1; i++)
7843         {
7844             copy_rvec(v[i],buf[a]);
7845             a++;
7846         }
7847     }
7848     atot = a;
7849
7850     /* Copy back to the original array */
7851     for(a=0; a<atot; a++)
7852     {
7853         copy_rvec(buf[a],v[a]);
7854     }
7855 }
7856
7857 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
7858                          int nsort_new,gmx_cgsort_t *sort_new,
7859                          gmx_cgsort_t *sort1)
7860 {
7861     int i1,i2,i_new;
7862
7863     /* The new indices are not very ordered, so we qsort them */
7864     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
7865
7866     /* sort2 is already ordered, so now we can merge the two arrays */
7867     i1 = 0;
7868     i2 = 0;
7869     i_new = 0;
7870     while(i2 < nsort2 || i_new < nsort_new)
7871     {
7872         if (i2 == nsort2)
7873         {
7874             sort1[i1++] = sort_new[i_new++];
7875         }
7876         else if (i_new == nsort_new)
7877         {
7878             sort1[i1++] = sort2[i2++];
7879         }
7880         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
7881                  (sort2[i2].nsc == sort_new[i_new].nsc &&
7882                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
7883         {
7884             sort1[i1++] = sort2[i2++];
7885         }
7886         else
7887         {
7888             sort1[i1++] = sort_new[i_new++];
7889         }
7890     }
7891 }
7892
7893 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
7894                           rvec *cgcm,t_forcerec *fr,t_state *state,
7895                           int ncg_home_old)
7896 {
7897     gmx_domdec_sort_t *sort;
7898     gmx_cgsort_t *cgsort,*sort_i;
7899     int  ncg_new,nsort2,nsort_new,i,cell_index,*ibuf,cgsize;
7900     rvec *vbuf;
7901
7902     sort = dd->comm->sort;
7903
7904     if (dd->ncg_home > sort->sort_nalloc)
7905     {
7906         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
7907         srenew(sort->sort1,sort->sort_nalloc);
7908         srenew(sort->sort2,sort->sort_nalloc);
7909     }
7910
7911     if (ncg_home_old >= 0)
7912     {
7913         /* The charge groups that remained in the same ns grid cell
7914          * are completely ordered. So we can sort efficiently by sorting
7915          * the charge groups that did move into the stationary list.
7916          */
7917         ncg_new = 0;
7918         nsort2 = 0;
7919         nsort_new = 0;
7920         for(i=0; i<dd->ncg_home; i++)
7921         {
7922             /* Check if this cg did not move to another node */
7923             cell_index = fr->ns.grid->cell_index[i];
7924             if (cell_index !=  4*fr->ns.grid->ncells)
7925             {
7926                 if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
7927                 {
7928                     /* This cg is new on this node or moved ns grid cell */
7929                     if (nsort_new >= sort->sort_new_nalloc)
7930                     {
7931                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
7932                         srenew(sort->sort_new,sort->sort_new_nalloc);
7933                     }
7934                     sort_i = &(sort->sort_new[nsort_new++]);
7935                 }
7936                 else
7937                 {
7938                     /* This cg did not move */
7939                     sort_i = &(sort->sort2[nsort2++]);
7940                 }
7941                 /* Sort on the ns grid cell indices
7942                  * and the global topology index
7943                  */
7944                 sort_i->nsc    = cell_index;
7945                 sort_i->ind_gl = dd->index_gl[i];
7946                 sort_i->ind    = i;
7947                 ncg_new++;
7948             }
7949         }
7950         if (debug)
7951         {
7952             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
7953                     nsort2,nsort_new);
7954         }
7955         /* Sort efficiently */
7956         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1);
7957     }
7958     else
7959     {
7960         cgsort = sort->sort1;
7961         ncg_new = 0;
7962         for(i=0; i<dd->ncg_home; i++)
7963         {
7964             /* Sort on the ns grid cell indices
7965              * and the global topology index
7966              */
7967             cgsort[i].nsc    = fr->ns.grid->cell_index[i];
7968             cgsort[i].ind_gl = dd->index_gl[i];
7969             cgsort[i].ind    = i;
7970             if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
7971             {
7972                 ncg_new++;
7973             }
7974         }
7975         if (debug)
7976         {
7977             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
7978         }
7979         /* Determine the order of the charge groups using qsort */
7980         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
7981     }
7982     cgsort = sort->sort1;
7983
7984     /* We alloc with the old size, since cgindex is still old */
7985     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
7986     vbuf = dd->comm->vbuf.v;
7987
7988     /* Remove the charge groups which are no longer at home here */
7989     dd->ncg_home = ncg_new;
7990
7991     /* Reorder the state */
7992     for(i=0; i<estNR; i++)
7993     {
7994         if (EST_DISTR(i) && state->flags & (1<<i))
7995         {
7996             switch (i)
7997             {
7998             case estX:
7999                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->x,vbuf);
8000                 break;
8001             case estV:
8002                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf);
8003                 break;
8004             case estSDX:
8005                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf);
8006                 break;
8007             case estCGP:
8008                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf);
8009                 break;
8010             case estLD_RNG:
8011             case estLD_RNGI:
8012             case estDISRE_INITF:
8013             case estDISRE_RM3TAV:
8014             case estORIRE_INITF:
8015             case estORIRE_DTAV:
8016                 /* No ordering required */
8017                 break;
8018             default:
8019                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8020                 break;
8021             }
8022         }
8023     }
8024     /* Reorder cgcm */
8025     order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8026
8027     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8028     {
8029         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8030         srenew(sort->ibuf,sort->ibuf_nalloc);
8031     }
8032     ibuf = sort->ibuf;
8033     /* Reorder the global cg index */
8034     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8035     /* Reorder the cginfo */
8036     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8037     /* Rebuild the local cg index */
8038     ibuf[0] = 0;
8039     for(i=0; i<dd->ncg_home; i++)
8040     {
8041         cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8042         ibuf[i+1] = ibuf[i] + cgsize;
8043     }
8044     for(i=0; i<dd->ncg_home+1; i++)
8045     {
8046         dd->cgindex[i] = ibuf[i];
8047     }
8048     /* Set the home atom number */
8049     dd->nat_home = dd->cgindex[dd->ncg_home];
8050
8051     /* Copy the sorted ns cell indices back to the ns grid struct */
8052     for(i=0; i<dd->ncg_home; i++)
8053     {
8054         fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8055     }
8056     fr->ns.grid->nr = dd->ncg_home;
8057 }
8058
8059 static void add_dd_statistics(gmx_domdec_t *dd)
8060 {
8061     gmx_domdec_comm_t *comm;
8062     int ddnat;
8063
8064     comm = dd->comm;
8065
8066     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8067     {
8068         comm->sum_nat[ddnat-ddnatZONE] +=
8069             comm->nat[ddnat] - comm->nat[ddnat-1];
8070     }
8071     comm->ndecomp++;
8072 }
8073
8074 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8075 {
8076     gmx_domdec_comm_t *comm;
8077     int ddnat;
8078
8079     comm = dd->comm;
8080
8081     /* Reset all the statistics and counters for total run counting */
8082     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8083     {
8084         comm->sum_nat[ddnat-ddnatZONE] = 0;
8085     }
8086     comm->ndecomp = 0;
8087     comm->nload = 0;
8088     comm->load_step = 0;
8089     comm->load_sum = 0;
8090     comm->load_max = 0;
8091     clear_ivec(comm->load_lim);
8092     comm->load_mdf = 0;
8093     comm->load_pme = 0;
8094 }
8095
8096 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8097 {
8098     gmx_domdec_comm_t *comm;
8099     int ddnat;
8100     double av;
8101
8102     comm = cr->dd->comm;
8103
8104     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8105
8106     if (fplog == NULL)
8107     {
8108         return;
8109     }
8110
8111     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8112
8113     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8114     {
8115         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8116         switch(ddnat)
8117         {
8118         case ddnatZONE:
8119             fprintf(fplog,
8120                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8121                     2,av);
8122             break;
8123         case ddnatVSITE:
8124             if (cr->dd->vsite_comm)
8125             {
8126                 fprintf(fplog,
8127                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8128                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8129                         av);
8130             }
8131             break;
8132         case ddnatCON:
8133             if (cr->dd->constraint_comm)
8134             {
8135                 fprintf(fplog,
8136                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
8137                         1 + ir->nLincsIter,av);
8138             }
8139             break;
8140         default:
8141             gmx_incons(" Unknown type for DD statistics");
8142         }
8143     }
8144     fprintf(fplog,"\n");
8145
8146     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8147     {
8148         print_dd_load_av(fplog,cr->dd);
8149     }
8150 }
8151
8152 void dd_partition_system(FILE            *fplog,
8153                          gmx_large_int_t      step,
8154                          t_commrec       *cr,
8155                          gmx_bool            bMasterState,
8156                          int             nstglobalcomm,
8157                          t_state         *state_global,
8158                          gmx_mtop_t      *top_global,
8159                          t_inputrec      *ir,
8160                          t_state         *state_local,
8161                          rvec            **f,
8162                          t_mdatoms       *mdatoms,
8163                          gmx_localtop_t  *top_local,
8164                          t_forcerec      *fr,
8165                          gmx_vsite_t     *vsite,
8166                          gmx_shellfc_t   shellfc,
8167                          gmx_constr_t    constr,
8168                          t_nrnb          *nrnb,
8169                          gmx_wallcycle_t wcycle,
8170                          gmx_bool            bVerbose)
8171 {
8172     gmx_domdec_t *dd;
8173     gmx_domdec_comm_t *comm;
8174     gmx_ddbox_t ddbox={0};
8175     t_block *cgs_gl;
8176     gmx_large_int_t step_pcoupl;
8177     rvec cell_ns_x0,cell_ns_x1;
8178     int  i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
8179     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8180     gmx_bool bRedist,bSortCG,bResortAll;
8181     ivec ncells_old,np;
8182     real grid_density;
8183     char sbuf[22];
8184
8185     dd = cr->dd;
8186     comm = dd->comm;
8187
8188     bBoxChanged = (bMasterState || DEFORM(*ir));
8189     if (ir->epc != epcNO)
8190     {
8191         /* With nstpcouple > 1 pressure coupling happens.
8192          * one step after calculating the pressure.
8193          * Box scaling happens at the end of the MD step,
8194          * after the DD partitioning.
8195          * We therefore have to do DLB in the first partitioning
8196          * after an MD step where P-coupling occured.
8197          * We need to determine the last step in which p-coupling occurred.
8198          * MRS -- need to validate this for vv?
8199          */
8200         n = ir->nstpcouple;
8201         if (n == 1)
8202         {
8203             step_pcoupl = step - 1;
8204         }
8205         else
8206         {
8207             step_pcoupl = ((step - 1)/n)*n + 1;
8208         }
8209         if (step_pcoupl >= comm->globalcomm_step)
8210         {
8211             bBoxChanged = TRUE;
8212         }
8213     }
8214
8215     bNStGlobalComm = (step >= comm->globalcomm_step + nstglobalcomm);
8216
8217     if (!comm->bDynLoadBal)
8218     {
8219         bDoDLB = FALSE;
8220     }
8221     else
8222     {
8223         /* Should we do dynamic load balacing this step?
8224          * Since it requires (possibly expensive) global communication,
8225          * we might want to do DLB less frequently.
8226          */
8227         if (bBoxChanged || ir->epc != epcNO)
8228         {
8229             bDoDLB = bBoxChanged;
8230         }
8231         else
8232         {
8233             bDoDLB = bNStGlobalComm;
8234         }
8235     }
8236
8237     /* Check if we have recorded loads on the nodes */
8238     if (comm->bRecordLoad && dd_load_count(comm))
8239     {
8240         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
8241         {
8242             /* Check if we should use DLB at the second partitioning
8243              * and every 100 partitionings,
8244              * so the extra communication cost is negligible.
8245              */
8246             n = max(100,nstglobalcomm);
8247             bCheckDLB = (comm->n_load_collect == 0 ||
8248                          comm->n_load_have % n == n-1);
8249         }
8250         else
8251         {
8252             bCheckDLB = FALSE;
8253         }
8254
8255         /* Print load every nstlog, first and last step to the log file */
8256         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
8257                     comm->n_load_collect == 0 ||
8258                     (step + ir->nstlist > ir->init_step + ir->nsteps));
8259
8260         /* Avoid extra communication due to verbose screen output
8261          * when nstglobalcomm is set.
8262          */
8263         if (bDoDLB || bLogLoad || bCheckDLB ||
8264             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
8265         {
8266             get_load_distribution(dd,wcycle);
8267             if (DDMASTER(dd))
8268             {
8269                 if (bLogLoad)
8270                 {
8271                     dd_print_load(fplog,dd,step-1);
8272                 }
8273                 if (bVerbose)
8274                 {
8275                     dd_print_load_verbose(dd);
8276                 }
8277             }
8278             comm->n_load_collect++;
8279
8280             if (bCheckDLB) {
8281                 /* Since the timings are node dependent, the master decides */
8282                 if (DDMASTER(dd))
8283                 {
8284                     bTurnOnDLB =
8285                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
8286                     if (debug)
8287                     {
8288                         fprintf(debug,"step %s, imb loss %f\n",
8289                                 gmx_step_str(step,sbuf),
8290                                 dd_force_imb_perf_loss(dd));
8291                     }
8292                 }
8293                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
8294                 if (bTurnOnDLB)
8295                 {
8296                     turn_on_dlb(fplog,cr,step);
8297                     bDoDLB = TRUE;
8298                 }
8299             }
8300         }
8301         comm->n_load_have++;
8302     }
8303
8304     cgs_gl = &comm->cgs_gl;
8305
8306     bRedist = FALSE;
8307     if (bMasterState)
8308     {
8309         /* Clear the old state */
8310         clear_dd_indices(dd,0,0);
8311
8312         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
8313                   TRUE,cgs_gl,state_global->x,&ddbox);
8314
8315         get_cg_distribution(fplog,step,dd,cgs_gl,
8316                             state_global->box,&ddbox,state_global->x);
8317
8318         dd_distribute_state(dd,cgs_gl,
8319                             state_global,state_local,f);
8320
8321         dd_make_local_cgs(dd,&top_local->cgs);
8322
8323         if (dd->ncg_home > fr->cg_nalloc)
8324         {
8325             dd_realloc_fr_cg(fr,dd->ncg_home);
8326         }
8327         calc_cgcm(fplog,0,dd->ncg_home,
8328                   &top_local->cgs,state_local->x,fr->cg_cm);
8329
8330         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8331
8332         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8333
8334         cg0 = 0;
8335     }
8336     else if (state_local->ddp_count != dd->ddp_count)
8337     {
8338         if (state_local->ddp_count > dd->ddp_count)
8339         {
8340             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
8341         }
8342
8343         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
8344         {
8345             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
8346         }
8347
8348         /* Clear the old state */
8349         clear_dd_indices(dd,0,0);
8350
8351         /* Build the new indices */
8352         rebuild_cgindex(dd,cgs_gl->index,state_local);
8353         make_dd_indices(dd,cgs_gl->index,0);
8354
8355         /* Redetermine the cg COMs */
8356         calc_cgcm(fplog,0,dd->ncg_home,
8357                   &top_local->cgs,state_local->x,fr->cg_cm);
8358
8359         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8360
8361         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8362
8363         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8364                   TRUE,&top_local->cgs,state_local->x,&ddbox);
8365
8366         bRedist = comm->bDynLoadBal;
8367     }
8368     else
8369     {
8370         /* We have the full state, only redistribute the cgs */
8371
8372         /* Clear the non-home indices */
8373         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
8374
8375         /* Avoid global communication for dim's without pbc and -gcom */
8376         if (!bNStGlobalComm)
8377         {
8378             copy_rvec(comm->box0    ,ddbox.box0    );
8379             copy_rvec(comm->box_size,ddbox.box_size);
8380         }
8381         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8382                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
8383
8384         bBoxChanged = TRUE;
8385         bRedist = TRUE;
8386     }
8387     /* For dim's without pbc and -gcom */
8388     copy_rvec(ddbox.box0    ,comm->box0    );
8389     copy_rvec(ddbox.box_size,comm->box_size);
8390
8391     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
8392                       step,wcycle);
8393
8394     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
8395     {
8396         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
8397     }
8398
8399     /* Check if we should sort the charge groups */
8400     if (comm->nstSortCG > 0)
8401     {
8402         bSortCG = (bMasterState ||
8403                    (bRedist && (step % comm->nstSortCG == 0)));
8404     }
8405     else
8406     {
8407         bSortCG = FALSE;
8408     }
8409
8410     ncg_home_old = dd->ncg_home;
8411
8412     if (bRedist)
8413     {
8414         cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
8415                                  state_local,f,fr,mdatoms,
8416                                  !bSortCG,nrnb);
8417     }
8418
8419     get_nsgrid_boundaries(fr->ns.grid,dd,
8420                           state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
8421                           dd->ncg_home,fr->cg_cm,
8422                           cell_ns_x0,cell_ns_x1,&grid_density);
8423
8424     if (bBoxChanged)
8425     {
8426         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
8427     }
8428
8429     copy_ivec(fr->ns.grid->n,ncells_old);
8430     grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
8431                state_local->box,cell_ns_x0,cell_ns_x1,
8432                fr->rlistlong,grid_density);
8433     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
8434     copy_ivec(ddbox.tric_dir,comm->tric_dir);
8435
8436     if (bSortCG)
8437     {
8438         /* Sort the state on charge group position.
8439          * This enables exact restarts from this step.
8440          * It also improves performance by about 15% with larger numbers
8441          * of atoms per node.
8442          */
8443
8444         /* Fill the ns grid with the home cell,
8445          * so we can sort with the indices.
8446          */
8447         set_zones_ncg_home(dd);
8448         fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
8449                   0,dd->ncg_home,fr->cg_cm);
8450
8451         /* Check if we can user the old order and ns grid cell indices
8452          * of the charge groups to sort the charge groups efficiently.
8453          */
8454         bResortAll = (bMasterState ||
8455                       fr->ns.grid->n[XX] != ncells_old[XX] ||
8456                       fr->ns.grid->n[YY] != ncells_old[YY] ||
8457                       fr->ns.grid->n[ZZ] != ncells_old[ZZ]);
8458
8459         if (debug)
8460         {
8461             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
8462                     gmx_step_str(step,sbuf),dd->ncg_home);
8463         }
8464         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
8465                       bResortAll ? -1 : ncg_home_old);
8466         /* Rebuild all the indices */
8467         cg0 = 0;
8468         ga2la_clear(dd->ga2la);
8469     }
8470
8471     /* Setup up the communication and communicate the coordinates */
8472     setup_dd_communication(dd,state_local->box,&ddbox,fr);
8473
8474     /* Set the indices */
8475     make_dd_indices(dd,cgs_gl->index,cg0);
8476
8477     /* Set the charge group boundaries for neighbor searching */
8478     set_cg_boundaries(&comm->zones);
8479
8480     /*
8481     write_dd_pdb("dd_home",step,"dump",top_global,cr,
8482                  -1,state_local->x,state_local->box);
8483     */
8484
8485     /* Extract a local topology from the global topology */
8486     for(i=0; i<dd->ndim; i++)
8487     {
8488         np[dd->dim[i]] = comm->cd[i].np;
8489     }
8490     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
8491                       comm->cellsize_min,np,
8492                       fr,vsite,top_global,top_local);
8493
8494     /* Set up the special atom communication */
8495     n = comm->nat[ddnatZONE];
8496     for(i=ddnatZONE+1; i<ddnatNR; i++)
8497     {
8498         switch(i)
8499         {
8500         case ddnatVSITE:
8501             if (vsite && vsite->n_intercg_vsite)
8502             {
8503                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
8504             }
8505             break;
8506         case ddnatCON:
8507             if (dd->bInterCGcons)
8508             {
8509                 /* Only for inter-cg constraints we need special code */
8510                 n = dd_make_local_constraints(dd,n,top_global,
8511                                               constr,ir->nProjOrder,
8512                                               &top_local->idef.il[F_CONSTR]);
8513             }
8514             break;
8515         default:
8516             gmx_incons("Unknown special atom type setup");
8517         }
8518         comm->nat[i] = n;
8519     }
8520
8521     /* Make space for the extra coordinates for virtual site
8522      * or constraint communication.
8523      */
8524     state_local->natoms = comm->nat[ddnatNR-1];
8525     if (state_local->natoms > state_local->nalloc)
8526     {
8527         dd_realloc_state(state_local,f,state_local->natoms);
8528     }
8529
8530     if (fr->bF_NoVirSum)
8531     {
8532         if (vsite && vsite->n_intercg_vsite)
8533         {
8534             nat_f_novirsum = comm->nat[ddnatVSITE];
8535         }
8536         else
8537         {
8538             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
8539             {
8540                 nat_f_novirsum = dd->nat_tot;
8541             }
8542             else
8543             {
8544                 nat_f_novirsum = dd->nat_home;
8545             }
8546         }
8547     }
8548     else
8549     {
8550         nat_f_novirsum = 0;
8551     }
8552
8553     /* Set the number of atoms required for the force calculation.
8554      * Forces need to be constrained when using a twin-range setup
8555      * or with energy minimization. For simple simulations we could
8556      * avoid some allocation, zeroing and copying, but this is
8557      * probably not worth the complications ande checking.
8558      */
8559     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
8560                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
8561
8562     /* We make the all mdatoms up to nat_tot_con.
8563      * We could save some work by only setting invmass
8564      * between nat_tot and nat_tot_con.
8565      */
8566     /* This call also sets the new number of home particles to dd->nat_home */
8567     atoms2md(top_global,ir,
8568              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
8569
8570     /* Now we have the charges we can sort the FE interactions */
8571     dd_sort_local_top(dd,mdatoms,top_local);
8572
8573     if (shellfc)
8574     {
8575         /* Make the local shell stuff, currently no communication is done */
8576         make_local_shells(cr,mdatoms,shellfc);
8577     }
8578
8579         if (ir->implicit_solvent)
8580     {
8581         make_local_gb(cr,fr->born,ir->gb_algorithm);
8582     }
8583
8584     if (!(cr->duty & DUTY_PME))
8585     {
8586         /* Send the charges to our PME only node */
8587         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
8588                        mdatoms->chargeA,mdatoms->chargeB,
8589                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
8590     }
8591
8592     if (constr)
8593     {
8594         set_constraints(constr,top_local,ir,mdatoms,cr);
8595     }
8596
8597     if (ir->ePull != epullNO)
8598     {
8599         /* Update the local pull groups */
8600         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
8601     }
8602
8603     add_dd_statistics(dd);
8604
8605     /* Make sure we only count the cycles for this DD partitioning */
8606     clear_dd_cycle_counts(dd);
8607
8608     /* Because the order of the atoms might have changed since
8609      * the last vsite construction, we need to communicate the constructing
8610      * atom coordinates again (for spreading the forces this MD step).
8611      */
8612     dd_move_x_vsites(dd,state_local->box,state_local->x);
8613
8614     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
8615     {
8616         dd_move_x(dd,state_local->box,state_local->x);
8617         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
8618                      -1,state_local->x,state_local->box);
8619     }
8620
8621     if (bNStGlobalComm)
8622     {
8623         /* Store the global communication step */
8624         comm->globalcomm_step = step;
8625     }
8626
8627     /* Increase the DD partitioning counter */
8628     dd->ddp_count++;
8629     /* The state currently matches this DD partitioning count, store it */
8630     state_local->ddp_count = dd->ddp_count;
8631     if (bMasterState)
8632     {
8633         /* The DD master node knows the complete cg distribution,
8634          * store the count so we can possibly skip the cg info communication.
8635          */
8636         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
8637     }
8638
8639     if (comm->DD_debug > 0)
8640     {
8641         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
8642         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
8643                                 "after partitioning");
8644     }
8645 }