Merge "Make sure that found Boost version is new enough"
[alexxy/gromacs.git] / src / gromacs / mdlib / domdec.c
1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
2  *
3  * 
4  * This file is part of Gromacs        Copyright (c) 1991-2008
5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
6  *
7  * This program is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU General Public License
9  * as published by the Free Software Foundation; either version 2
10  * of the License, or (at your option) any later version.
11  *
12  * To help us fund GROMACS development, we humbly ask that you cite
13  * the research papers on the package. Check out http://www.gromacs.org
14  * 
15  * And Hey:
16  * Gnomes, ROck Monsters And Chili Sauce
17  */
18
19 #ifdef HAVE_CONFIG_H
20 #include <config.h>
21 #endif
22
23 #include <stdio.h>
24 #include <time.h>
25 #include <math.h>
26 #include <string.h>
27 #include <stdlib.h>
28 #include "typedefs.h"
29 #include "smalloc.h"
30 #include "vec.h"
31 #include "domdec.h"
32 #include "domdec_network.h"
33 #include "nrnb.h"
34 #include "pbc.h"
35 #include "chargegroup.h"
36 #include "constr.h"
37 #include "mdatoms.h"
38 #include "names.h"
39 #include "pdbio.h"
40 #include "futil.h"
41 #include "force.h"
42 #include "pme.h"
43 #include "pull.h"
44 #include "pull_rotation.h"
45 #include "gmx_wallcycle.h"
46 #include "mdrun.h"
47 #include "nsgrid.h"
48 #include "shellfc.h"
49 #include "mtop_util.h"
50 #include "gmxfio.h"
51 #include "gmx_ga2la.h"
52 #include "gmx_sort.h"
53 #include "macros.h"
54
55 #ifdef GMX_LIB_MPI
56 #include <mpi.h>
57 #endif
58 #ifdef GMX_THREAD_MPI
59 #include "tmpi.h"
60 #endif
61
62 #define DDRANK(dd,rank)    (rank)
63 #define DDMASTERRANK(dd)   (dd->masterrank)
64
65 typedef struct gmx_domdec_master
66 {
67     /* The cell boundaries */
68     real **cell_x;
69     /* The global charge group division */
70     int  *ncg;     /* Number of home charge groups for each node */
71     int  *index;   /* Index (size nnodes+1) into cg */
72     int  *cg;      /* Global charge group index */
73     int  *nat;     /* Number of home atoms for each node. */
74     int  *ibuf;    /* Buffer for communication */
75     rvec *vbuf;    /* Buffer for state scattering and gathering */
76 } gmx_domdec_master_t;
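/* How the master arrays above fit together, as used by the gather and
 * scatter routines further down (dd_collect_vec_sendrecv etc.): for a
 * node n, its ma->ncg[n] home charge groups are stored at positions
 * ma->index[n] .. ma->index[n+1]-1 of ma->cg (so index has nnodes+1
 * entries), each entry being a global charge group number, and ma->nat[n]
 * is the matching number of home atoms.  A minimal sketch of walking the
 * home atoms of node n, with cgs_gl the global charge group t_block:
 *
 *     for (i = ma->index[n]; i < ma->index[n+1]; i++)
 *     {
 *         for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
 *         {
 *             // c is a global atom index belonging to node n
 *         }
 *     }
 */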
77
78 typedef struct
79 {
80     /* The numbers of charge groups to send and receive for each cell
81      * that requires communication; the last entry contains the total
82      * number of atoms that need to be communicated.
83      */
84     int nsend[DD_MAXIZONE+2];
85     int nrecv[DD_MAXIZONE+2];
86     /* The charge groups to send */
87     int *index;
88     int nalloc;
89     /* The atom range for non-in-place communication */
90     int cell2at0[DD_MAXIZONE];
91     int cell2at1[DD_MAXIZONE];
92 } gmx_domdec_ind_t;
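/* A reading of how nsend/nrecv are laid out, based on their use in
 * dd_move_x()/dd_move_f() below, with nzone the number of i-zones of the
 * current pulse: nsend[0..nzone-1] appear to hold per-zone charge group
 * counts, nsend[nzone] the total number of charge groups to send and
 * nsend[nzone+1] the total number of atoms to send (hence the
 * DD_MAXIZONE+2 entries); nrecv is laid out the same way.
 */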
93
94 typedef struct
95 {
96     int  np;                   /* Number of grid pulses in this dimension */
97     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
98     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
99     int  np_nalloc;
100     gmx_bool bInPlace;             /* Can we communicate in place?            */
101 } gmx_domdec_comm_dim_t;
102
103 typedef struct
104 {
105     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
106     real *cell_f;      /* State var.: cell boundaries, box relative      */
107     real *old_cell_f;  /* Temp. var.: old cell size                      */
108     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
109     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
110     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
111     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
112     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
113     real *buf_ncd;     /* Temp. var.                                     */
114 } gmx_domdec_root_t;
115
116 #define DD_NLOAD_MAX 9
117
118 /* Here floats are accurate enough, since these variables
119  * only influence the load balancing, not the actual MD results.
120  */
121 typedef struct
122 {
123     int  nload;
124     float *load;
125     float sum;
126     float max;
127     float sum_m;
128     float cvol_min;
129     float mdf;
130     float pme;
131     int   flags;
132 } gmx_domdec_load_t;
133
134 typedef struct
135 {
136     int  nsc;
137     int  ind_gl;
138     int  ind;
139 } gmx_cgsort_t;
140
141 typedef struct
142 {
143     gmx_cgsort_t *sort1,*sort2;
144     int  sort_nalloc;
145     gmx_cgsort_t *sort_new;
146     int  sort_new_nalloc;
147     int  *ibuf;
148     int  ibuf_nalloc;
149 } gmx_domdec_sort_t;
150
151 typedef struct
152 {
153     rvec *v;
154     int  nalloc;
155 } vec_rvec_t;
156
157 /* This enum determines the order of the coordinates.
158  * ddnatHOME and ddnatZONE should be first and second,
159  * the others can be ordered as wanted.
160  */
161 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
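/* Together with the nat[] counts in gmx_domdec_comm_t below, this order
 * partitions the local atom range as (see also dd_get_constraint_range):
 *
 *     home atoms:               0 .. nat[ddnatHOME]-1
 *     communicated zone atoms:  nat[ddnatHOME]  .. nat[ddnatZONE]-1
 *     vsite atoms:              nat[ddnatZONE]  .. nat[ddnatVSITE]-1
 *     constraint atoms:         nat[ddnatVSITE] .. nat[ddnatCON]-1
 */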
162
163 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
164 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
165
166 typedef struct
167 {
168     int  dim;      /* The dimension                                          */
169     gmx_bool dim_match;/* Tells if DD and PME dims match                         */
170     int  nslab;    /* The number of PME slabs in this dimension              */
171     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
172     int  *pp_min;  /* The minimum pp node location, size nslab               */
173     int  *pp_max;  /* The maximum pp node location, size nslab               */
174     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
175 } gmx_ddpme_t;
176
177 typedef struct
178 {
179     real min0;    /* The minimum bottom of this zone                        */
180     real max1;    /* The maximum top of this zone                           */
181     real mch0;    /* The maximum bottom communication height for this zone  */
182     real mch1;    /* The maximum top communication height for this zone     */
183     real p1_0;    /* The bottom value of the first cell in this zone        */
184     real p1_1;    /* The top value of the first cell in this zone           */
185 } gmx_ddzone_t;
186
187 typedef struct gmx_domdec_comm
188 {
189     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
190      * unless stated otherwise.
191      */
192
193     /* The number of decomposition dimensions for PME, 0: no PME */
194     int  npmedecompdim;
195     /* The number of nodes doing PME (PP/PME or only PME) */
196     int  npmenodes;
197     int  npmenodes_x;
198     int  npmenodes_y;
199     /* The communication setup including the PME only nodes */
200     gmx_bool bCartesianPP_PME;
201     ivec ntot;
202     int  cartpmedim;
203     int  *pmenodes;          /* size npmenodes                         */
204     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
205                               * but not with bCartesianPP_PME          */
206     gmx_ddpme_t ddpme[2];
207     
208     /* The DD particle-particle nodes only */
209     gmx_bool bCartesianPP;
210     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
211     
212     /* The global charge groups */
213     t_block cgs_gl;
214
215     /* Should we sort the cgs */
216     int  nstSortCG;
217     gmx_domdec_sort_t *sort;
218     
219     /* Are there bonded and multi-body interactions between charge groups? */
220     gmx_bool bInterCGBondeds;
221     gmx_bool bInterCGMultiBody;
222
223     /* Data for the optional bonded interaction atom communication range */
224     gmx_bool bBondComm;
225     t_blocka *cglink;
226     char *bLocalCG;
227
228     /* The DLB option */
229     int  eDLB;
230     /* Are we actually using DLB? */
231     gmx_bool bDynLoadBal;
232
233     /* Cell sizes for static load balancing, first index cartesian */
234     real **slb_frac;
235     
236     /* The width of the communicated boundaries */
237     real cutoff_mbody;
238     real cutoff;
239     /* The minimum cell size (including triclinic correction) */
240     rvec cellsize_min;
241     /* For dlb, for use with edlbAUTO */
242     rvec cellsize_min_dlb;
243     /* The lower limit for the DD cell size with DLB */
244     real cellsize_limit;
245     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
246     gmx_bool bVacDLBNoLimit;
247
248     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
249     ivec tric_dir;
250     /* box0 and box_size are required with dimensions without pbc and -gcom */
251     rvec box0;
252     rvec box_size;
253     
254     /* The cell boundaries */
255     rvec cell_x0;
256     rvec cell_x1;
257
258     /* The old location of the cell boundaries, to check cg displacements */
259     rvec old_cell_x0;
260     rvec old_cell_x1;
261
262     /* The communication setup and charge group boundaries for the zones */
263     gmx_domdec_zones_t zones;
264     
265     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
266      * cell boundaries of neighboring cells for dynamic load balancing.
267      */
268     gmx_ddzone_t zone_d1[2];
269     gmx_ddzone_t zone_d2[2][2];
270     
271     /* The coordinate/force communication setup and indices */
272     gmx_domdec_comm_dim_t cd[DIM];
273     /* The maximum number of cells to communicate with in one dimension */
274     int  maxpulse;
275     
276     /* Which cg distribution is stored on the master node */
277     int master_cg_ddp_count;
278     
279     /* The number of cg's received from the direct neighbors */
280     int  zone_ncg1[DD_MAXZONE];
281     
282     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
283     int  nat[ddnatNR];
284     
285     /* Communication buffer for general use */
286     int  *buf_int;
287     int  nalloc_int;
288
289     /* Communication rvec buffer for general use */
290     vec_rvec_t vbuf;
291     
292     /* Communication buffers only used with multiple grid pulses */
293     int  *buf_int2;
294     int  nalloc_int2;
295     vec_rvec_t vbuf2;
296     
297     /* Communication buffers for local redistribution */
298     int  **cggl_flag;
299     int  cggl_flag_nalloc[DIM*2];
300     rvec **cgcm_state;
301     int  cgcm_state_nalloc[DIM*2];
302     
303     /* Cell sizes for dynamic load balancing */
304     gmx_domdec_root_t **root;
305     real *cell_f_row;
306     real cell_f0[DIM];
307     real cell_f1[DIM];
308     real cell_f_max0[DIM];
309     real cell_f_min1[DIM];
310     
311     /* Stuff for load communication */
312     gmx_bool bRecordLoad;
313     gmx_domdec_load_t *load;
314 #ifdef GMX_MPI
315     MPI_Comm *mpi_comm_load;
316 #endif
317
318     /* Maximum DLB scaling per load balancing step in percent */
319     int dlb_scale_lim;
320
321     /* Cycle counters */
322     float cycl[ddCyclNr];
323     int   cycl_n[ddCyclNr];
324     float cycl_max[ddCyclNr];
325     /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
326     int eFlop;
327     double flop;
328     int    flop_n;
329     /* How often did we have load measurements */
330     int    n_load_have;
331     /* How often have we collected the load measurements */
332     int    n_load_collect;
333     
334     /* Statistics */
335     double sum_nat[ddnatNR-ddnatZONE];
336     int    ndecomp;
337     int    nload;
338     double load_step;
339     double load_sum;
340     double load_max;
341     ivec   load_lim;
342     double load_mdf;
343     double load_pme;
344
345     /* The last partition step */
346     gmx_large_int_t globalcomm_step;
347
348     /* Debugging */
349     int  nstDDDump;
350     int  nstDDDumpGrid;
351     int  DD_debug;
352 } gmx_domdec_comm_t;
353
354 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
355 #define DD_CGIBS 2
356
357 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
358 #define DD_FLAG_NRCG  65535
359 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
360 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
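/* A sketch of how a cggl_flag entry appears to be packed, judging from the
 * macros above: each charge group uses DD_CGIBS ints, the second of which
 * combines the atom count of the charge group in the low 16 bits with
 * per-dimension move bits above bit 16:
 *
 *     flag = nrcg | DD_FLAG_FW(d);        // nrcg atoms, moving forward in dim d
 *     nrcg = flag & DD_FLAG_NRCG;         // recover the atom count
 *     if (flag & DD_FLAG_BW(d)) { ... }   // test for a backward move in dim d
 */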
361
362 /* Zone permutation required to obtain consecutive charge groups
363  * for neighbor searching.
364  */
365 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
366
367 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
368  * components see only j zones with that component 0.
369  */
370
371 /* The DD zone order */
372 static const ivec dd_zo[DD_MAXZONE] =
373   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
374
375 /* The 3D setup */
376 #define dd_z3n  8
377 #define dd_zp3n 4
378 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
379
380 /* The 2D setup */
381 #define dd_z2n  4
382 #define dd_zp2n 2
383 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
384
385 /* The 1D setup */
386 #define dd_z1n  2
387 #define dd_zp1n 1
388 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
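/* Each dd_zp* entry appears to read {i-zone, first j-zone, last j-zone + 1}.
 * For example, in the 3D setup i-zone 0 sees j-zones 0..7, while i-zone 1
 * ({1,0,0}) sees j-zones 3..5 ({0,1,0},{0,1,1},{0,0,1}), which all have
 * x-component 0, consistent with the comment above.
 */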
389
390 /* Factors used to avoid problems due to rounding issues */
391 #define DD_CELL_MARGIN       1.0001
392 #define DD_CELL_MARGIN2      1.00005
393 /* Factor to account for pressure scaling during nstlist steps */
394 #define DD_PRES_SCALE_MARGIN 1.02
395
396 /* Allowed performance loss before we turn on DLB or warn */
397 #define DD_PERF_LOSS 0.05
398
399 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
400
401 /* Use separate MPI send and receive commands
402  * when nnodes <= GMX_DD_NNODES_SENDRECV.
403  * This saves memory (and some copying for small nnodes).
404  * For high parallelization scatter and gather calls are used.
405  */
406 #define GMX_DD_NNODES_SENDRECV 4
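/* The corresponding dispatch, as done in dd_collect_vec() and
 * dd_distribute_vec() further down:
 *
 *     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
 *     {
 *         dd_collect_vec_sendrecv(dd, lv, v);   // point-to-point send/recv
 *     }
 *     else
 *     {
 *         dd_collect_vec_gatherv(dd, lv, v);    // collective gather
 *     }
 */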
407
408
409 /*
410 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
411
412 static void index2xyz(ivec nc,int ind,ivec xyz)
413 {
414   xyz[XX] = ind % nc[XX];
415   xyz[YY] = (ind / nc[XX]) % nc[YY];
416   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
417 }
418 */
419
420 /* This order is required to minimize the coordinate communication in PME
421  * which uses decomposition in the x direction.
422  */
423 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
424
425 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
426 {
427     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
428     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
429     xyz[ZZ] = ind % nc[ZZ];
430 }
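/* Worked example of this ordering: with nc = {3,2,2}, the node at {1,1,0}
 * gets dd_index = ((1*2 + 1)*2) + 0 = 6, and ddindex2xyz(nc,6,xyz) recovers
 * {1,1,0} again: 6/(2*2) = 1, (6/2)%2 = 1, 6%2 = 0.
 */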
431
432 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
433 {
434     int ddindex;
435     int ddnodeid=-1;
436     
437     ddindex = dd_index(dd->nc,c);
438     if (dd->comm->bCartesianPP_PME)
439     {
440         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
441     }
442     else if (dd->comm->bCartesianPP)
443     {
444 #ifdef GMX_MPI
445         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
446 #endif
447     }
448     else
449     {
450         ddnodeid = ddindex;
451     }
452     
453     return ddnodeid;
454 }
455
456 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
457 {
458     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
459 }
460
461 int ddglatnr(gmx_domdec_t *dd,int i)
462 {
463     int atnr;
464     
465     if (dd == NULL)
466     {
467         atnr = i + 1;
468     }
469     else
470     {
471         if (i >= dd->comm->nat[ddnatNR-1])
472         {
473             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
474         }
475         atnr = dd->gatindex[i] + 1;
476     }
477     
478     return atnr;
479 }
480
481 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
482 {
483     return &dd->comm->cgs_gl;
484 }
485
486 static void vec_rvec_init(vec_rvec_t *v)
487 {
488     v->nalloc = 0;
489     v->v      = NULL;
490 }
491
492 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
493 {
494     if (n > v->nalloc)
495     {
496         v->nalloc = over_alloc_dd(n);
497         srenew(v->v,v->nalloc);
498     }
499 }
500
501 void dd_store_state(gmx_domdec_t *dd,t_state *state)
502 {
503     int i;
504     
505     if (state->ddp_count != dd->ddp_count)
506     {
507         gmx_incons("The state does not match the domain decomposition state");
508     }
509     
510     state->ncg_gl = dd->ncg_home;
511     if (state->ncg_gl > state->cg_gl_nalloc)
512     {
513         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
514         srenew(state->cg_gl,state->cg_gl_nalloc);
515     }
516     for(i=0; i<state->ncg_gl; i++)
517     {
518         state->cg_gl[i] = dd->index_gl[i];
519     }
520     
521     state->ddp_count_cg_gl = dd->ddp_count;
522 }
523
524 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
525 {
526     return &dd->comm->zones;
527 }
528
529 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
530                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
531 {
532     gmx_domdec_zones_t *zones;
533     int izone,d,dim;
534
535     zones = &dd->comm->zones;
536
537     izone = 0;
538     while (icg >= zones->izone[izone].cg1)
539     {
540         izone++;
541     }
542     
543     if (izone == 0)
544     {
545         *jcg0 = icg;
546     }
547     else if (izone < zones->nizone)
548     {
549         *jcg0 = zones->izone[izone].jcg0;
550     }
551     else
552     {
553         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
554                   icg,izone,zones->nizone);
555     }
556         
557     *jcg1 = zones->izone[izone].jcg1;
558     
559     for(d=0; d<dd->ndim; d++)
560     {
561         dim = dd->dim[d];
562         shift0[dim] = zones->izone[izone].shift0[dim];
563         shift1[dim] = zones->izone[izone].shift1[dim];
564         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
565         {
566             /* A conservative approach, this can be optimized */
567             shift0[dim] -= 1;
568             shift1[dim] += 1;
569         }
570     }
571 }
572
573 int dd_natoms_vsite(gmx_domdec_t *dd)
574 {
575     return dd->comm->nat[ddnatVSITE];
576 }
577
578 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
579 {
580     *at_start = dd->comm->nat[ddnatCON-1];
581     *at_end   = dd->comm->nat[ddnatCON];
582 }
583
584 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
585 {
586     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
587     int  *index,*cgindex;
588     gmx_domdec_comm_t *comm;
589     gmx_domdec_comm_dim_t *cd;
590     gmx_domdec_ind_t *ind;
591     rvec shift={0,0,0},*buf,*rbuf;
592     gmx_bool bPBC,bScrew;
593     
594     comm = dd->comm;
595     
596     cgindex = dd->cgindex;
597     
598     buf = comm->vbuf.v;
599
600     nzone = 1;
601     nat_tot = dd->nat_home;
602     for(d=0; d<dd->ndim; d++)
603     {
604         bPBC   = (dd->ci[dd->dim[d]] == 0);
605         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
606         if (bPBC)
607         {
608             copy_rvec(box[dd->dim[d]],shift);
609         }
610         cd = &comm->cd[d];
611         for(p=0; p<cd->np; p++)
612         {
613             ind = &cd->ind[p];
614             index = ind->index;
615             n = 0;
616             if (!bPBC)
617             {
618                 for(i=0; i<ind->nsend[nzone]; i++)
619                 {
620                     at0 = cgindex[index[i]];
621                     at1 = cgindex[index[i]+1];
622                     for(j=at0; j<at1; j++)
623                     {
624                         copy_rvec(x[j],buf[n]);
625                         n++;
626                     }
627                 }
628             }
629             else if (!bScrew)
630             {
631                 for(i=0; i<ind->nsend[nzone]; i++)
632                 {
633                     at0 = cgindex[index[i]];
634                     at1 = cgindex[index[i]+1];
635                     for(j=at0; j<at1; j++)
636                     {
637                         /* We need to shift the coordinates */
638                         rvec_add(x[j],shift,buf[n]);
639                         n++;
640                     }
641                 }
642             }
643             else
644             {
645                 for(i=0; i<ind->nsend[nzone]; i++)
646                 {
647                     at0 = cgindex[index[i]];
648                     at1 = cgindex[index[i]+1];
649                     for(j=at0; j<at1; j++)
650                     {
651                         /* Shift x */
652                         buf[n][XX] = x[j][XX] + shift[XX];
653                         /* Rotate y and z.
654                          * This operation requires a special shift force
655                          * treatment, which is performed in calc_vir.
656                          */
657                         buf[n][YY] = box[YY][YY] - x[j][YY];
658                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
659                         n++;
660                     }
661                 }
662             }
663             
664             if (cd->bInPlace)
665             {
666                 rbuf = x + nat_tot;
667             }
668             else
669             {
670                 rbuf = comm->vbuf2.v;
671             }
672             /* Send and receive the coordinates */
673             dd_sendrecv_rvec(dd, d, dddirBackward,
674                              buf,  ind->nsend[nzone+1],
675                              rbuf, ind->nrecv[nzone+1]);
676             if (!cd->bInPlace)
677             {
678                 j = 0;
679                 for(zone=0; zone<nzone; zone++)
680                 {
681                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
682                     {
683                         copy_rvec(rbuf[j],x[i]);
684                         j++;
685                     }
686                 }
687             }
688             nat_tot += ind->nrecv[nzone+1];
689         }
690         nzone += nzone;
691     }
692 }
693
694 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
695 {
696     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
697     int  *index,*cgindex;
698     gmx_domdec_comm_t *comm;
699     gmx_domdec_comm_dim_t *cd;
700     gmx_domdec_ind_t *ind;
701     rvec *buf,*sbuf;
702     ivec vis;
703     int  is;
704     gmx_bool bPBC,bScrew;
705     
706     comm = dd->comm;
707     
708     cgindex = dd->cgindex;
709
710     buf = comm->vbuf.v;
711
712     n = 0;
713     nzone = comm->zones.n/2;
714     nat_tot = dd->nat_tot;
715     for(d=dd->ndim-1; d>=0; d--)
716     {
717         bPBC   = (dd->ci[dd->dim[d]] == 0);
718         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
719         if (fshift == NULL && !bScrew)
720         {
721             bPBC = FALSE;
722         }
723         /* Determine which shift vector we need */
724         clear_ivec(vis);
725         vis[dd->dim[d]] = 1;
726         is = IVEC2IS(vis);
727         
728         cd = &comm->cd[d];
729         for(p=cd->np-1; p>=0; p--) {
730             ind = &cd->ind[p];
731             nat_tot -= ind->nrecv[nzone+1];
732             if (cd->bInPlace)
733             {
734                 sbuf = f + nat_tot;
735             }
736             else
737             {
738                 sbuf = comm->vbuf2.v;
739                 j = 0;
740                 for(zone=0; zone<nzone; zone++)
741                 {
742                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
743                     {
744                         copy_rvec(f[i],sbuf[j]);
745                         j++;
746                     }
747                 }
748             }
749             /* Communicate the forces */
750             dd_sendrecv_rvec(dd, d, dddirForward,
751                              sbuf, ind->nrecv[nzone+1],
752                              buf,  ind->nsend[nzone+1]);
753             index = ind->index;
754             /* Add the received forces */
755             n = 0;
756             if (!bPBC)
757             {
758                 for(i=0; i<ind->nsend[nzone]; i++)
759                 {
760                     at0 = cgindex[index[i]];
761                     at1 = cgindex[index[i]+1];
762                     for(j=at0; j<at1; j++)
763                     {
764                         rvec_inc(f[j],buf[n]);
765                         n++;
766                     }
767                 } 
768             }
769             else if (!bScrew)
770             {
771                 for(i=0; i<ind->nsend[nzone]; i++)
772                 {
773                     at0 = cgindex[index[i]];
774                     at1 = cgindex[index[i]+1];
775                     for(j=at0; j<at1; j++)
776                     {
777                         rvec_inc(f[j],buf[n]);
778                         /* Add this force to the shift force */
779                         rvec_inc(fshift[is],buf[n]);
780                         n++;
781                     }
782                 }
783             }
784             else
785             {
786                 for(i=0; i<ind->nsend[nzone]; i++)
787                 {
788                     at0 = cgindex[index[i]];
789                     at1 = cgindex[index[i]+1];
790                     for(j=at0; j<at1; j++)
791                     {
792                         /* Rotate the force */
793                         f[j][XX] += buf[n][XX];
794                         f[j][YY] -= buf[n][YY];
795                         f[j][ZZ] -= buf[n][ZZ];
796                         if (fshift)
797                         {
798                             /* Add this force to the shift force */
799                             rvec_inc(fshift[is],buf[n]);
800                         }
801                         n++;
802                     }
803                 }
804             }
805         }
806         nzone /= 2;
807     }
808 }
809
810 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
811 {
812     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
813     int  *index,*cgindex;
814     gmx_domdec_comm_t *comm;
815     gmx_domdec_comm_dim_t *cd;
816     gmx_domdec_ind_t *ind;
817     real *buf,*rbuf;
818     
819     comm = dd->comm;
820     
821     cgindex = dd->cgindex;
822     
823     buf = &comm->vbuf.v[0][0];
824
825     nzone = 1;
826     nat_tot = dd->nat_home;
827     for(d=0; d<dd->ndim; d++)
828     {
829         cd = &comm->cd[d];
830         for(p=0; p<cd->np; p++)
831         {
832             ind = &cd->ind[p];
833             index = ind->index;
834             n = 0;
835             for(i=0; i<ind->nsend[nzone]; i++)
836             {
837                 at0 = cgindex[index[i]];
838                 at1 = cgindex[index[i]+1];
839                 for(j=at0; j<at1; j++)
840                 {
841                     buf[n] = v[j];
842                     n++;
843                 }
844             }
845             
846             if (cd->bInPlace)
847             {
848                 rbuf = v + nat_tot;
849             }
850             else
851             {
852                 rbuf = &comm->vbuf2.v[0][0];
853             }
854             /* Send and receive the data */
855             dd_sendrecv_real(dd, d, dddirBackward,
856                              buf,  ind->nsend[nzone+1],
857                              rbuf, ind->nrecv[nzone+1]);
858             if (!cd->bInPlace)
859             {
860                 j = 0;
861                 for(zone=0; zone<nzone; zone++)
862                 {
863                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
864                     {
865                         v[i] = rbuf[j];
866                         j++;
867                     }
868                 }
869             }
870             nat_tot += ind->nrecv[nzone+1];
871         }
872         nzone += nzone;
873     }
874 }
875
876 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
877 {
878     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
879     int  *index,*cgindex;
880     gmx_domdec_comm_t *comm;
881     gmx_domdec_comm_dim_t *cd;
882     gmx_domdec_ind_t *ind;
883     real *buf,*sbuf;
884     
885     comm = dd->comm;
886     
887     cgindex = dd->cgindex;
888
889     buf = &comm->vbuf.v[0][0];
890
891     n = 0;
892     nzone = comm->zones.n/2;
893     nat_tot = dd->nat_tot;
894     for(d=dd->ndim-1; d>=0; d--)
895     {
896         cd = &comm->cd[d];
897         for(p=cd->np-1; p>=0; p--) {
898             ind = &cd->ind[p];
899             nat_tot -= ind->nrecv[nzone+1];
900             if (cd->bInPlace)
901             {
902                 sbuf = v + nat_tot;
903             }
904             else
905             {
906                 sbuf = &comm->vbuf2.v[0][0];
907                 j = 0;
908                 for(zone=0; zone<nzone; zone++)
909                 {
910                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
911                     {
912                         sbuf[j] = v[i];
913                         j++;
914                     }
915                 }
916             }
917             /* Communicate the data */
918             dd_sendrecv_real(dd, d, dddirForward,
919                              sbuf, ind->nrecv[nzone+1],
920                              buf,  ind->nsend[nzone+1]);
921             index = ind->index;
922             /* Add the received values */
923             n = 0;
924             for(i=0; i<ind->nsend[nzone]; i++)
925             {
926                 at0 = cgindex[index[i]];
927                 at1 = cgindex[index[i]+1];
928                 for(j=at0; j<at1; j++)
929                 {
930                     v[j] += buf[n];
931                     n++;
932                 }
933             } 
934         }
935         nzone /= 2;
936     }
937 }
938
939 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
940 {
941     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
942             d,i,j,
943             zone->min0,zone->max1,
944             zone->mch0,zone->mch1,
945             zone->p1_0,zone->p1_1);
946 }
947
948 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
949                                int ddimind,int direction,
950                                gmx_ddzone_t *buf_s,int n_s,
951                                gmx_ddzone_t *buf_r,int n_r)
952 {
953     rvec vbuf_s[5*2],vbuf_r[5*2];
954     int i;
955
956     for(i=0; i<n_s; i++)
957     {
958         vbuf_s[i*2  ][0] = buf_s[i].min0;
959         vbuf_s[i*2  ][1] = buf_s[i].max1;
960         vbuf_s[i*2  ][2] = buf_s[i].mch0;
961         vbuf_s[i*2+1][0] = buf_s[i].mch1;
962         vbuf_s[i*2+1][1] = buf_s[i].p1_0;
963         vbuf_s[i*2+1][2] = buf_s[i].p1_1;
964     }
965
966     dd_sendrecv_rvec(dd, ddimind, direction,
967                      vbuf_s, n_s*2,
968                      vbuf_r, n_r*2);
969
970     for(i=0; i<n_r; i++)
971     {
972         buf_r[i].min0 = vbuf_r[i*2  ][0];
973         buf_r[i].max1 = vbuf_r[i*2  ][1];
974         buf_r[i].mch0 = vbuf_r[i*2  ][2];
975         buf_r[i].mch1 = vbuf_r[i*2+1][0];
976         buf_r[i].p1_0 = vbuf_r[i*2+1][1];
977         buf_r[i].p1_1 = vbuf_r[i*2+1][2];
978     }
979 }
980
981 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
982                           rvec cell_ns_x0,rvec cell_ns_x1)
983 {
984     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
985     gmx_ddzone_t *zp,buf_s[5],buf_r[5],buf_e[5];
986     rvec extr_s[2],extr_r[2];
987     rvec dh;
988     real dist_d,c=0,det;
989     gmx_domdec_comm_t *comm;
990     gmx_bool bPBC,bUse;
991
992     comm = dd->comm;
993
994     for(d=1; d<dd->ndim; d++)
995     {
996         dim = dd->dim[d];
997         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
998         zp->min0 = cell_ns_x0[dim];
999         zp->max1 = cell_ns_x1[dim];
1000         zp->mch0 = cell_ns_x0[dim];
1001         zp->mch1 = cell_ns_x1[dim];
1002         zp->p1_0 = cell_ns_x0[dim];
1003         zp->p1_1 = cell_ns_x1[dim];
1004     }
1005     
1006     for(d=dd->ndim-2; d>=0; d--)
1007     {
1008         dim  = dd->dim[d];
1009         bPBC = (dim < ddbox->npbcdim);
1010
1011         /* Use an rvec to store two reals */
1012         extr_s[d][0] = comm->cell_f0[d+1];
1013         extr_s[d][1] = comm->cell_f1[d+1];
1014         extr_s[d][2] = 0;
1015
1016         pos = 0;
1017         /* Store the extremes in the backward sending buffer,
1018          * so they get updated separately from the forward communication.
1019          */
1020         for(d1=d; d1<dd->ndim-1; d1++)
1021         {
1022             /* We invert the order to be able to use the same loop for buf_e */
1023             buf_s[pos].min0 = extr_s[d1][1];
1024             buf_s[pos].max1 = extr_s[d1][0];
1025             buf_s[pos].mch0 = 0;
1026             buf_s[pos].mch1 = 0;
1027             /* Store the cell corner of the dimension we communicate along */
1028             buf_s[pos].p1_0 = comm->cell_x0[dim];
1029             buf_s[pos].p1_1 = 0;
1030             pos++;
1031         }
1032
1033         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1034         pos++;
1035
1036         if (dd->ndim == 3 && d == 0)
1037         {
1038             buf_s[pos] = comm->zone_d2[0][1];
1039             pos++;
1040             buf_s[pos] = comm->zone_d1[0];
1041             pos++;
1042         }
1043
1044         /* We only need to communicate the extremes
1045          * in the forward direction
1046          */
1047         npulse = comm->cd[d].np;
1048         if (bPBC)
1049         {
1050             /* Take the minimum to avoid double communication */
1051             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1052         }
1053         else
1054         {
1055             /* Without PBC we should really not communicate over
1056              * the boundaries, but implementing that complicates
1057              * the communication setup and therefore we simply
1058              * do all communication, but ignore some data.
1059              */
1060             npulse_min = npulse;
1061         }
1062         for(p=0; p<npulse_min; p++)
1063         {
1064             /* Communicate the extremes forward */
1065             bUse = (bPBC || dd->ci[dim] > 0);
1066
1067             dd_sendrecv_rvec(dd, d, dddirForward,
1068                              extr_s+d, dd->ndim-d-1,
1069                              extr_r+d, dd->ndim-d-1);
1070
1071             if (bUse)
1072             {
1073                 for(d1=d; d1<dd->ndim-1; d1++)
1074                 {
1075                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1076                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1077                 }
1078             }
1079         }
1080
1081         buf_size = pos;
1082         for(p=0; p<npulse; p++)
1083         {
1084             /* Communicate all the zone information backward */
1085             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1086
1087             dd_sendrecv_ddzone(dd, d, dddirBackward,
1088                                buf_s, buf_size,
1089                                buf_r, buf_size);
1090
1091             clear_rvec(dh);
1092             if (p > 0)
1093             {
1094                 for(d1=d+1; d1<dd->ndim; d1++)
1095                 {
1096                     /* Determine the decrease of maximum required
1097                      * communication height along d1 due to the distance along d;
1098                      * this avoids a lot of useless atom communication.
1099                      */
1100                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1101
1102                     if (ddbox->tric_dir[dim])
1103                     {
1104                         /* c is the off-diagonal coupling between the cell planes
1105                          * along directions d and d1.
1106                          */
1107                         c = ddbox->v[dim][dd->dim[d1]][dim];
1108                     }
1109                     else
1110                     {
1111                         c = 0;
1112                     }
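                    /* Sketch of the geometry behind the formula below,
                     * assuming the convention that a displacement h along
                     * the tilted d1 axis also shifts the position by c*h
                     * along d: an atom at height h along d1 and offset
                     * dist_d along d is within the cut-off when
                     *   (dist_d - c*h)^2 + h^2 <= cutoff^2.
                     * The largest such h is
                     *   h = (c*dist_d + sqrt((1+c^2)*cutoff^2 - dist_d^2))/(1+c^2),
                     * and dh = cutoff - h is the amount by which the
                     * required communication height can be reduced.
                     */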
1113                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1114                     if (det > 0)
1115                     {
1116                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1117                     }
1118                     else
1119                     {
1120                         /* A negative value signals out of range */
1121                         dh[d1] = -1;
1122                     }
1123                 }
1124             }
1125
1126             /* Accumulate the extremes over all pulses */
1127             for(i=0; i<buf_size; i++)
1128             {
1129                 if (p == 0)
1130                 {
1131                     buf_e[i] = buf_r[i];
1132                 }
1133                 else
1134                 {
1135                     if (bUse)
1136                     {
1137                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1138                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1139                     }
1140
1141                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1142                     {
1143                         d1 = 1;
1144                     }
1145                     else
1146                     {
1147                         d1 = d + 1;
1148                     }
1149                     if (bUse && dh[d1] >= 0)
1150                     {
1151                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1152                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1153                     }
1154                 }
1155                 /* Copy the received buffer to the send buffer,
1156                  * to pass the data through with the next pulse.
1157                  */
1158                 buf_s[i] = buf_r[i];
1159             }
1160             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1161                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1162             {
1163                 /* Store the extremes */ 
1164                 pos = 0;
1165
1166                 for(d1=d; d1<dd->ndim-1; d1++)
1167                 {
1168                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1169                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1170                     pos++;
1171                 }
1172
1173                 if (d == 1 || (d == 0 && dd->ndim == 3))
1174                 {
1175                     for(i=d; i<2; i++)
1176                     {
1177                         comm->zone_d2[1-d][i] = buf_e[pos];
1178                         pos++;
1179                     }
1180                 }
1181                 if (d == 0)
1182                 {
1183                     comm->zone_d1[1] = buf_e[pos];
1184                     pos++;
1185                 }
1186             }
1187         }
1188     }
1189     
1190     if (dd->ndim >= 2)
1191     {
1192         dim = dd->dim[1];
1193         for(i=0; i<2; i++)
1194         {
1195             if (debug)
1196             {
1197                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1198             }
1199             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1200             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1201         }
1202     }
1203     if (dd->ndim >= 3)
1204     {
1205         dim = dd->dim[2];
1206         for(i=0; i<2; i++)
1207         {
1208             for(j=0; j<2; j++)
1209             {
1210                 if (debug)
1211                 {
1212                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1213                 }
1214                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1215                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1216             }
1217         }
1218     }
1219     for(d=1; d<dd->ndim; d++)
1220     {
1221         comm->cell_f_max0[d] = extr_s[d-1][0];
1222         comm->cell_f_min1[d] = extr_s[d-1][1];
1223         if (debug)
1224         {
1225             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1226                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1227         }
1228     }
1229 }
1230
1231 static void dd_collect_cg(gmx_domdec_t *dd,
1232                           t_state *state_local)
1233 {
1234     gmx_domdec_master_t *ma=NULL;
1235     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1236     t_block *cgs_gl;
1237
1238     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1239     {
1240         /* The master has the correct distribution */
1241         return;
1242     }
1243     
1244     if (state_local->ddp_count == dd->ddp_count)
1245     {
1246         ncg_home = dd->ncg_home;
1247         cg       = dd->index_gl;
1248         nat_home = dd->nat_home;
1249     } 
1250     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1251     {
1252         cgs_gl = &dd->comm->cgs_gl;
1253
1254         ncg_home = state_local->ncg_gl;
1255         cg       = state_local->cg_gl;
1256         nat_home = 0;
1257         for(i=0; i<ncg_home; i++)
1258         {
1259             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1260         }
1261     }
1262     else
1263     {
1264         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1265     }
1266     
1267     buf2[0] = dd->ncg_home;
1268     buf2[1] = dd->nat_home;
1269     if (DDMASTER(dd))
1270     {
1271         ma = dd->ma;
1272         ibuf = ma->ibuf;
1273     }
1274     else
1275     {
1276         ibuf = NULL;
1277     }
1278     /* Collect the charge group and atom counts on the master */
1279     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1280     
1281     if (DDMASTER(dd))
1282     {
1283         ma->index[0] = 0;
1284         for(i=0; i<dd->nnodes; i++)
1285         {
1286             ma->ncg[i] = ma->ibuf[2*i];
1287             ma->nat[i] = ma->ibuf[2*i+1];
1288             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1289             
1290         }
1291         /* Make byte counts and indices */
1292         for(i=0; i<dd->nnodes; i++)
1293         {
1294             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1295             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1296         }
1297         if (debug)
1298         {
1299             fprintf(debug,"Initial charge group distribution: ");
1300             for(i=0; i<dd->nnodes; i++)
1301                 fprintf(debug," %d",ma->ncg[i]);
1302             fprintf(debug,"\n");
1303         }
1304     }
1305     
1306     /* Collect the charge group indices on the master */
1307     dd_gatherv(dd,
1308                dd->ncg_home*sizeof(int),dd->index_gl,
1309                DDMASTER(dd) ? ma->ibuf : NULL,
1310                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1311                DDMASTER(dd) ? ma->cg : NULL);
1312     
1313     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1314 }
1315
1316 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1317                                     rvec *lv,rvec *v)
1318 {
1319     gmx_domdec_master_t *ma;
1320     int  n,i,c,a,nalloc=0;
1321     rvec *buf=NULL;
1322     t_block *cgs_gl;
1323
1324     ma = dd->ma;
1325     
1326     if (!DDMASTER(dd))
1327     {
1328 #ifdef GMX_MPI
1329         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1330                  dd->rank,dd->mpi_comm_all);
1331 #endif
1332     } else {
1333         /* Copy the master coordinates to the global array */
1334         cgs_gl = &dd->comm->cgs_gl;
1335
1336         n = DDMASTERRANK(dd);
1337         a = 0;
1338         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1339         {
1340             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1341             {
1342                 copy_rvec(lv[a++],v[c]);
1343             }
1344         }
1345         
1346         for(n=0; n<dd->nnodes; n++)
1347         {
1348             if (n != dd->rank)
1349             {
1350                 if (ma->nat[n] > nalloc)
1351                 {
1352                     nalloc = over_alloc_dd(ma->nat[n]);
1353                     srenew(buf,nalloc);
1354                 }
1355 #ifdef GMX_MPI
1356                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1357                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1358 #endif
1359                 a = 0;
1360                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1361                 {
1362                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1363                     {
1364                         copy_rvec(buf[a++],v[c]);
1365                     }
1366                 }
1367             }
1368         }
1369         sfree(buf);
1370     }
1371 }
1372
1373 static void get_commbuffer_counts(gmx_domdec_t *dd,
1374                                   int **counts,int **disps)
1375 {
1376     gmx_domdec_master_t *ma;
1377     int n;
1378
1379     ma = dd->ma;
1380     
1381     /* Make the rvec count and displacement arrays */
1382     *counts  = ma->ibuf;
1383     *disps   = ma->ibuf + dd->nnodes;
1384     for(n=0; n<dd->nnodes; n++)
1385     {
1386         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1387         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1388     }
1389 }
1390
1391 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1392                                    rvec *lv,rvec *v)
1393 {
1394     gmx_domdec_master_t *ma;
1395     int  *rcounts=NULL,*disps=NULL;
1396     int  n,i,c,a;
1397     rvec *buf=NULL;
1398     t_block *cgs_gl;
1399     
1400     ma = dd->ma;
1401     
1402     if (DDMASTER(dd))
1403     {
1404         get_commbuffer_counts(dd,&rcounts,&disps);
1405
1406         buf = ma->vbuf;
1407     }
1408     
1409     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1410
1411     if (DDMASTER(dd))
1412     {
1413         cgs_gl = &dd->comm->cgs_gl;
1414
1415         a = 0;
1416         for(n=0; n<dd->nnodes; n++)
1417         {
1418             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1419             {
1420                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1421                 {
1422                     copy_rvec(buf[a++],v[c]);
1423                 }
1424             }
1425         }
1426     }
1427 }
1428
1429 void dd_collect_vec(gmx_domdec_t *dd,
1430                     t_state *state_local,rvec *lv,rvec *v)
1431 {
1432     gmx_domdec_master_t *ma;
1433     int  n,i,c,a,nalloc=0;
1434     rvec *buf=NULL;
1435     
1436     dd_collect_cg(dd,state_local);
1437
1438     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1439     {
1440         dd_collect_vec_sendrecv(dd,lv,v);
1441     }
1442     else
1443     {
1444         dd_collect_vec_gatherv(dd,lv,v);
1445     }
1446 }
1447
1448
1449 void dd_collect_state(gmx_domdec_t *dd,
1450                       t_state *state_local,t_state *state)
1451 {
1452     int est,i,j,nh;
1453
1454     nh = state->nhchainlength;
1455
1456     if (DDMASTER(dd))
1457     {
1458         for (i=0;i<efptNR;i++) {
1459             state->lambda[i] = state_local->lambda[i];
1460         }
1461         state->fep_state = state_local->fep_state;
1462         state->veta = state_local->veta;
1463         state->vol0 = state_local->vol0;
1464         copy_mat(state_local->box,state->box);
1465         copy_mat(state_local->boxv,state->boxv);
1466         copy_mat(state_local->svir_prev,state->svir_prev);
1467         copy_mat(state_local->fvir_prev,state->fvir_prev);
1468         copy_mat(state_local->pres_prev,state->pres_prev);
1469
1470
1471         for(i=0; i<state_local->ngtc; i++)
1472         {
1473             for(j=0; j<nh; j++) {
1474                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1475                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1476             }
1477             state->therm_integral[i] = state_local->therm_integral[i];            
1478         }
1479         for(i=0; i<state_local->nnhpres; i++) 
1480         {
1481             for(j=0; j<nh; j++) {
1482                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1483                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1484             }
1485         }
1486     }
1487     for(est=0; est<estNR; est++)
1488     {
1489         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1490         {
1491             switch (est) {
1492             case estX:
1493                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1494                 break;
1495             case estV:
1496                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1497                 break;
1498             case estSDX:
1499                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1500                 break;
1501             case estCGP:
1502                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1503                 break;
1504             case estLD_RNG:
1505                 if (state->nrngi == 1)
1506                 {
1507                     if (DDMASTER(dd))
1508                     {
1509                         for(i=0; i<state_local->nrng; i++)
1510                         {
1511                             state->ld_rng[i] = state_local->ld_rng[i];
1512                         }
1513                     }
1514                 }
1515                 else
1516                 {
1517                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1518                               state_local->ld_rng,state->ld_rng);
1519                 }
1520                 break;
1521             case estLD_RNGI:
1522                 if (state->nrngi == 1)
1523                 {
1524                    if (DDMASTER(dd))
1525                     {
1526                         state->ld_rngi[0] = state_local->ld_rngi[0];
1527                     } 
1528                 }
1529                 else
1530                 {
1531                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1532                               state_local->ld_rngi,state->ld_rngi);
1533                 }
1534                 break;
1535             case estDISRE_INITF:
1536             case estDISRE_RM3TAV:
1537             case estORIRE_INITF:
1538             case estORIRE_DTAV:
1539                 break;
1540             default:
1541                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1542             }
1543         }
1544     }
1545 }
1546
1547 static void dd_realloc_fr_cg(t_forcerec *fr,int nalloc)
1548 {
1549     if (debug)
1550     {
1551         fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1552     }
1553     fr->cg_nalloc = over_alloc_dd(nalloc);
1554     srenew(fr->cg_cm,fr->cg_nalloc);
1555     srenew(fr->cginfo,fr->cg_nalloc);
1556 }
1557
1558 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1559 {
1560     int est;
1561
1562     if (debug)
1563     {
1564         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1565     }
1566
1567     state->nalloc = over_alloc_dd(nalloc);
1568     
1569     for(est=0; est<estNR; est++)
1570     {
1571         if (EST_DISTR(est) && (state->flags & (1<<est)))
1572         {
1573             switch(est) {
1574             case estX:
1575                 srenew(state->x,state->nalloc);
1576                 break;
1577             case estV:
1578                 srenew(state->v,state->nalloc);
1579                 break;
1580             case estSDX:
1581                 srenew(state->sd_X,state->nalloc);
1582                 break;
1583             case estCGP:
1584                 srenew(state->cg_p,state->nalloc);
1585                 break;
1586             case estLD_RNG:
1587             case estLD_RNGI:
1588             case estDISRE_INITF:
1589             case estDISRE_RM3TAV:
1590             case estORIRE_INITF:
1591             case estORIRE_DTAV:
1592                 /* No reallocation required */
1593                 break;
1594             default:
1595                 gmx_incons("Unknown state entry encountered in dd_realloc_state");            
1596             }
1597         }
1598     }
1599     
1600     if (f != NULL)
1601     {
1602         srenew(*f,state->nalloc);
1603     }
1604 }
1605
1606 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1607                                        rvec *v,rvec *lv)
1608 {
1609     gmx_domdec_master_t *ma;
1610     int  n,i,c,a,nalloc=0;
1611     rvec *buf=NULL;
1612     
1613     if (DDMASTER(dd))
1614     {
1615         ma  = dd->ma;
1616         
1617         for(n=0; n<dd->nnodes; n++)
1618         {
1619             if (n != dd->rank)
1620             {
1621                 if (ma->nat[n] > nalloc)
1622                 {
1623                     nalloc = over_alloc_dd(ma->nat[n]);
1624                     srenew(buf,nalloc);
1625                 }
1626                 /* Fill the send buffer for node n */
1627                 a = 0;
1628                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1629                 {
1630                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1631                     {
1632                         copy_rvec(v[c],buf[a++]);
1633                     }
1634                 }
1635                 if (a != ma->nat[n])
1636                 {
1637                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1638                               a,ma->nat[n]);
1639                 }
1640                 
1641 #ifdef GMX_MPI
1642                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1643                          DDRANK(dd,n),n,dd->mpi_comm_all);
1644 #endif
1645             }
1646         }
1647         sfree(buf);
1648         n = DDMASTERRANK(dd);
1649         a = 0;
1650         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1651         {
1652             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1653             {
1654                 copy_rvec(v[c],lv[a++]);
1655             }
1656         }
1657     }
1658     else
1659     {
1660 #ifdef GMX_MPI
1661         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1662                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1663 #endif
1664     }
1665 }
1666
1667 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1668                                        rvec *v,rvec *lv)
1669 {
1670     gmx_domdec_master_t *ma;
1671     int  *scounts=NULL,*disps=NULL;
1672     int  n,i,c,a,nalloc=0;
1673     rvec *buf=NULL;
1674     
1675     if (DDMASTER(dd))
1676     {
1677         ma  = dd->ma;
1678      
1679         get_commbuffer_counts(dd,&scounts,&disps);
1680
1681         buf = ma->vbuf;
1682         a = 0;
1683         for(n=0; n<dd->nnodes; n++)
1684         {
1685             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1686             {
1687                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1688                 {
1689                     copy_rvec(v[c],buf[a++]);
1690                 }
1691             }
1692         }
1693     }
1694
1695     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1696 }
1697
1698 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1699 {
1700     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1701     {
1702         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1703     }
1704     else
1705     {
1706         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1707     }
1708 }
1709
1710 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1711                                 t_state *state,t_state *state_local,
1712                                 rvec **f)
1713 {
1714     int  i,j,nh;
1715
1716     nh = state->nhchainlength;
1717
1718     if (DDMASTER(dd))
1719     {
1720         for(i=0;i<efptNR;i++)
1721         {
1722             state_local->lambda[i] = state->lambda[i];
1723         }
1724         state_local->fep_state = state->fep_state;
1725         state_local->veta   = state->veta;
1726         state_local->vol0   = state->vol0;
1727         copy_mat(state->box,state_local->box);
1728         copy_mat(state->box_rel,state_local->box_rel);
1729         copy_mat(state->boxv,state_local->boxv);
1730         copy_mat(state->svir_prev,state_local->svir_prev);
1731         copy_mat(state->fvir_prev,state_local->fvir_prev);
1732         for(i=0; i<state_local->ngtc; i++)
1733         {
1734             for(j=0; j<nh; j++) {
1735                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1736                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1737             }
1738             state_local->therm_integral[i] = state->therm_integral[i];
1739         }
1740         for(i=0; i<state_local->nnhpres; i++)
1741         {
1742             for(j=0; j<nh; j++) {
1743                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1744                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1745             }
1746         }
1747     }
1748     dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
1749     dd_bcast(dd,sizeof(int),&state_local->fep_state);
1750     dd_bcast(dd,sizeof(real),&state_local->veta);
1751     dd_bcast(dd,sizeof(real),&state_local->vol0);
1752     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1753     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1754     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1755     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1756     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1757     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1758     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1759     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1760     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1761     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1762
1763     if (dd->nat_home > state_local->nalloc)
1764     {
1765         dd_realloc_state(state_local,f,dd->nat_home);
1766     }
1767     for(i=0; i<estNR; i++)
1768     {
1769         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1770         {
1771             switch (i) {
1772             case estX:
1773                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1774                 break;
1775             case estV:
1776                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1777                 break;
1778             case estSDX:
1779                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1780                 break;
1781             case estCGP:
1782                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1783                 break;
1784             case estLD_RNG:
1785                 if (state->nrngi == 1)
1786                 {
1787                     dd_bcastc(dd,
1788                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1789                               state->ld_rng,state_local->ld_rng);
1790                 }
1791                 else
1792                 {
1793                     dd_scatter(dd,
1794                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1795                                state->ld_rng,state_local->ld_rng);
1796                 }
1797                 break;
1798             case estLD_RNGI:
1799                 if (state->nrngi == 1)
1800                 {
1801                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1802                               state->ld_rngi,state_local->ld_rngi);
1803                 }
1804                 else
1805                 {
1806                      dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1807                                state->ld_rngi,state_local->ld_rngi);
1808                 }   
1809                 break;
1810             case estDISRE_INITF:
1811             case estDISRE_RM3TAV:
1812             case estORIRE_INITF:
1813             case estORIRE_DTAV:
1814                 /* Not implemented yet */
1815                 break;
1816             default:
1817                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1818             }
1819         }
1820     }
1821 }
1822
1823 static char dim2char(int dim)
1824 {
1825     char c='?';
1826     
1827     switch (dim)
1828     {
1829     case XX: c = 'X'; break;
1830     case YY: c = 'Y'; break;
1831     case ZZ: c = 'Z'; break;
1832     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1833     }
1834     
1835     return c;
1836 }
1837
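/* Write the domain decomposition grid to a PDB file: each cell is
 * represented by its 8 corners connected by CONECT records, with the
 * cell volume relative to the average cell volume in the B-factor field.
 */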
1838 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1839                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1840 {
1841     rvec grid_s[2],*grid_r=NULL,cx,r;
1842     char fname[STRLEN],format[STRLEN],buf[22];
1843     FILE *out;
1844     int  a,i,d,z,y,x;
1845     matrix tric;
1846     real vol;
1847
1848     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1849     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1850     
1851     if (DDMASTER(dd))
1852     {
1853         snew(grid_r,2*dd->nnodes);
1854     }
1855     
1856     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1857     
1858     if (DDMASTER(dd))
1859     {
1860         for(d=0; d<DIM; d++)
1861         {
1862             for(i=0; i<DIM; i++)
1863             {
1864                 if (d == i)
1865                 {
1866                     tric[d][i] = 1;
1867                 }
1868                 else
1869                 {
1870                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1871                     {
1872                         tric[d][i] = box[i][d]/box[i][i];
1873                     }
1874                     else
1875                     {
1876                         tric[d][i] = 0;
1877                     }
1878                 }
1879             }
1880         }
1881         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1882         sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
1883         out = gmx_fio_fopen(fname,"w");
1884         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1885         a = 1;
1886         for(i=0; i<dd->nnodes; i++)
1887         {
1888             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1889             for(d=0; d<DIM; d++)
1890             {
1891                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1892             }
1893             for(z=0; z<2; z++)
1894             {
1895                 for(y=0; y<2; y++)
1896                 {
1897                     for(x=0; x<2; x++)
1898                     {
1899                         cx[XX] = grid_r[i*2+x][XX];
1900                         cx[YY] = grid_r[i*2+y][YY];
1901                         cx[ZZ] = grid_r[i*2+z][ZZ];
1902                         mvmul(tric,cx,r);
1903                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1904                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1905                     }
1906                 }
1907             }
1908             for(d=0; d<DIM; d++)
1909             {
1910                 for(x=0; x<4; x++)
1911                 {
1912                     switch(d)
1913                     {
1914                     case 0: y = 1 + i*8 + 2*x; break;
1915                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1916                     case 2: y = 1 + i*8 + x; break;
1917                     }
1918                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
1919                 }
1920             }
1921         }
1922         gmx_fio_fclose(out);
1923         sfree(grid_r);
1924     }
1925 }
1926
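/* Write the atoms known on this node to a PDB file. The B-factor field
 * encodes the zone of each atom; atoms communicated for virtual sites
 * and for constraints get the two highest values.
 */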
1927 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
1928                   gmx_mtop_t *mtop,t_commrec *cr,
1929                   int natoms,rvec x[],matrix box)
1930 {
1931     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
1932     FILE *out;
1933     int  i,ii,resnr,c;
1934     char *atomname,*resname;
1935     real b;
1936     gmx_domdec_t *dd;
1937     
1938     dd = cr->dd;
1939     if (natoms == -1)
1940     {
1941         natoms = dd->comm->nat[ddnatVSITE];
1942     }
1943     
1944     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
1945     
1946     sprintf(format,"%s%s\n",get_pdbformat(),"%6.2f%6.2f");
1947     sprintf(format4,"%s%s\n",get_pdbformat4(),"%6.2f%6.2f");
1948     
1949     out = gmx_fio_fopen(fname,"w");
1950     
1951     fprintf(out,"TITLE     %s\n",title);
1952     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1953     for(i=0; i<natoms; i++)
1954     {
1955         ii = dd->gatindex[i];
1956         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
1957         if (i < dd->comm->nat[ddnatZONE])
1958         {
1959             c = 0;
1960             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
1961             {
1962                 c++;
1963             }
1964             b = c;
1965         }
1966         else if (i < dd->comm->nat[ddnatVSITE])
1967         {
1968             b = dd->comm->zones.n;
1969         }
1970         else
1971         {
1972             b = dd->comm->zones.n + 1;
1973         }
1974         fprintf(out,strlen(atomname)<4 ? format : format4,
1975                 "ATOM",(ii+1)%100000,
1976                 atomname,resname,' ',resnr%10000,' ',
1977                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
1978     }
1979     fprintf(out,"TER\n");
1980     
1981     gmx_fio_fclose(out);
1982 }
1983
1984 real dd_cutoff_mbody(gmx_domdec_t *dd)
1985 {
1986     gmx_domdec_comm_t *comm;
1987     int  di;
1988     real r;
1989
1990     comm = dd->comm;
1991
1992     r = -1;
1993     if (comm->bInterCGBondeds)
1994     {
1995         if (comm->cutoff_mbody > 0)
1996         {
1997             r = comm->cutoff_mbody;
1998         }
1999         else
2000         {
2001             /* cutoff_mbody=0 means we do not have DLB */
2002             r = comm->cellsize_min[dd->dim[0]];
2003             for(di=1; di<dd->ndim; di++)
2004             {
2005                 r = min(r,comm->cellsize_min[dd->dim[di]]);
2006             }
2007             if (comm->bBondComm)
2008             {
2009                 r = max(r,comm->cutoff_mbody);
2010             }
2011             else
2012             {
2013                 r = min(r,comm->cutoff);
2014             }
2015         }
2016     }
2017
2018     return r;
2019 }
2020
2021 real dd_cutoff_twobody(gmx_domdec_t *dd)
2022 {
2023     real r_mb;
2024
2025     r_mb = dd_cutoff_mbody(dd);
2026
2027     return max(dd->comm->cutoff,r_mb);
2028 }
2029
2030
2031 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2032 {
2033     int nc,ntot;
2034     
2035     nc   = dd->nc[dd->comm->cartpmedim];
2036     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2037     copy_ivec(coord,coord_pme);
2038     coord_pme[dd->comm->cartpmedim] =
2039         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2040 }
2041
2042 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2043 {
2044     /* Here we assign a PME node to communicate with this DD node
2045      * by assuming that the major index of both is x.
2046      * We add cr->npmenodes/2 to obtain an even distribution.
2047      */
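    /* Example (illustrative): with ndd=6 and npme=2 this maps
     * DD indices 0-2 to PME index 0 and DD indices 3-5 to PME index 1.
     */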
2048     return (ddindex*npme + npme/2)/ndd;
2049 }
2050
2051 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2052 {
2053     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2054 }
2055
2056 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2057 {
2058     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2059 }
2060
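/* Return a newly allocated list with the rank of each PME-only node,
 * assuming interleaved ordering in which each PME node directly follows
 * the group of PP nodes it serves.
 */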
2061 static int *dd_pmenodes(t_commrec *cr)
2062 {
2063     int *pmenodes;
2064     int n,i,p0,p1;
2065     
2066     snew(pmenodes,cr->npmenodes);
2067     n = 0;
2068     for(i=0; i<cr->dd->nnodes; i++) {
2069         p0 = cr_ddindex2pmeindex(cr,i);
2070         p1 = cr_ddindex2pmeindex(cr,i+1);
2071         if (i+1 == cr->dd->nnodes || p1 > p0) {
2072             if (debug)
2073                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2074             pmenodes[n] = i + 1 + n;
2075             n++;
2076         }
2077     }
2078
2079     return pmenodes;
2080 }
2081
2082 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2083 {
2084     gmx_domdec_t *dd;
2085     ivec coords,coords_pme,nc;
2086     int  slab;
2087     
2088     dd = cr->dd;
2089     /*
2090       if (dd->comm->bCartesian) {
2091       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2092       dd_coords2pmecoords(dd,coords,coords_pme);
2093       copy_ivec(dd->ntot,nc);
2094       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2095       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2096       
2097       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2098       } else {
2099       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2100       }
2101     */
2102     coords[XX] = x;
2103     coords[YY] = y;
2104     coords[ZZ] = z;
2105     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2106     
2107     return slab;
2108 }
2109
2110 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2111 {
2112     gmx_domdec_comm_t *comm;
2113     ivec coords;
2114     int  ddindex,nodeid=-1;
2115     
2116     comm = cr->dd->comm;
2117     
2118     coords[XX] = x;
2119     coords[YY] = y;
2120     coords[ZZ] = z;
2121     if (comm->bCartesianPP_PME)
2122     {
2123 #ifdef GMX_MPI
2124         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2125 #endif
2126     }
2127     else
2128     {
2129         ddindex = dd_index(cr->dd->nc,coords);
2130         if (comm->bCartesianPP)
2131         {
2132             nodeid = comm->ddindex2simnodeid[ddindex];
2133         }
2134         else
2135         {
2136             if (comm->pmenodes)
2137             {
2138                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2139             }
2140             else
2141             {
2142                 nodeid = ddindex;
2143             }
2144         }
2145     }
2146   
2147     return nodeid;
2148 }
2149
2150 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2151 {
2152     gmx_domdec_t *dd;
2153     gmx_domdec_comm_t *comm;
2154     ivec coord,coord_pme;
2155     int  i;
2156     int  pmenode=-1;
2157     
2158     dd = cr->dd;
2159     comm = dd->comm;
2160     
2161     /* This assumes a uniform x domain decomposition grid cell size */
2162     if (comm->bCartesianPP_PME)
2163     {
2164 #ifdef GMX_MPI
2165         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2166         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2167         {
2168             /* This is a PP node */
2169             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2170             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2171         }
2172 #endif
2173     }
2174     else if (comm->bCartesianPP)
2175     {
2176         if (sim_nodeid < dd->nnodes)
2177         {
2178             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2179         }
2180     }
2181     else
2182     {
2183         /* This assumes DD cells with identical x coordinates
2184          * are numbered sequentially.
2185          */
2186         if (dd->comm->pmenodes == NULL)
2187         {
2188             if (sim_nodeid < dd->nnodes)
2189             {
2190                 /* The DD index equals the nodeid */
2191                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2192             }
2193         }
2194         else
2195         {
2196             i = 0;
2197             while (sim_nodeid > dd->comm->pmenodes[i])
2198             {
2199                 i++;
2200             }
2201             if (sim_nodeid < dd->comm->pmenodes[i])
2202             {
2203                 pmenode = dd->comm->pmenodes[i];
2204             }
2205         }
2206     }
2207     
2208     return pmenode;
2209 }
2210
2211 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2212 {
2213     gmx_bool bPMEOnlyNode;
2214     
2215     if (DOMAINDECOMP(cr))
2216     {
2217         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2218     }
2219     else
2220     {
2221         bPMEOnlyNode = FALSE;
2222     }
2223     
2224     return bPMEOnlyNode;
2225 }
2226
2227 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2228                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2229 {
2230     gmx_domdec_t *dd;
2231     int x,y,z;
2232     ivec coord,coord_pme;
2233     
2234     dd = cr->dd;
2235     
2236     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2237     
2238     *nmy_ddnodes = 0;
2239     for(x=0; x<dd->nc[XX]; x++)
2240     {
2241         for(y=0; y<dd->nc[YY]; y++)
2242         {
2243             for(z=0; z<dd->nc[ZZ]; z++)
2244             {
2245                 if (dd->comm->bCartesianPP_PME)
2246                 {
2247                     coord[XX] = x;
2248                     coord[YY] = y;
2249                     coord[ZZ] = z;
2250                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2251                     if (dd->ci[XX] == coord_pme[XX] &&
2252                         dd->ci[YY] == coord_pme[YY] &&
2253                         dd->ci[ZZ] == coord_pme[ZZ])
2254                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2255                 }
2256                 else
2257                 {
2258                     /* The slab corresponds to the nodeid in the PME group */
2259                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2260                     {
2261                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2262                     }
2263                 }
2264             }
2265         }
2266     }
2267     
2268     /* The last PP-only node is the peer node */
2269     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2270     
2271     if (debug)
2272     {
2273         fprintf(debug,"Receive coordinates from PP nodes:");
2274         for(x=0; x<*nmy_ddnodes; x++)
2275         {
2276             fprintf(debug," %d",(*my_ddnodes)[x]);
2277         }
2278         fprintf(debug,"\n");
2279     }
2280 }
2281
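/* Determine whether this PP node should receive the virial and energy
 * from its PME node. Several PP nodes can share one PME node, so only
 * the last PP node in each group receives the data.
 */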
2282 static gmx_bool receive_vir_ener(t_commrec *cr)
2283 {
2284     gmx_domdec_comm_t *comm;
2285     int  pmenode,coords[DIM],rank;
2286     gmx_bool bReceive;
2287     
2288     bReceive = TRUE;
2289     if (cr->npmenodes < cr->dd->nnodes)
2290     {
2291         comm = cr->dd->comm;
2292         if (comm->bCartesianPP_PME)
2293         {
2294             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2295 #ifdef GMX_MPI
2296             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2297             coords[comm->cartpmedim]++;
2298             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2299             {
2300                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2301                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2302                 {
2303                     /* This is not the last PP node for pmenode */
2304                     bReceive = FALSE;
2305                 }
2306             }
2307 #endif  
2308         }
2309         else
2310         {
2311             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2312             if (cr->sim_nodeid+1 < cr->nnodes &&
2313                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2314             {
2315                 /* This is not the last PP node for pmenode */
2316                 bReceive = FALSE;
2317             }
2318         }
2319     }
2320     
2321     return bReceive;
2322 }
2323
2324 static void set_zones_ncg_home(gmx_domdec_t *dd)
2325 {
2326     gmx_domdec_zones_t *zones;
2327     int i;
2328
2329     zones = &dd->comm->zones;
2330
2331     zones->cg_range[0] = 0;
2332     for(i=1; i<zones->n+1; i++)
2333     {
2334         zones->cg_range[i] = dd->ncg_home;
2335     }
2336 }
2337
2338 static void rebuild_cgindex(gmx_domdec_t *dd,int *gcgs_index,t_state *state)
2339 {
2340     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2341     
2342     ind = state->cg_gl;
2343     dd_cg_gl = dd->index_gl;
2344     cgindex  = dd->cgindex;
2345     nat = 0;
2346     cgindex[0] = nat;
2347     for(i=0; i<state->ncg_gl; i++)
2348     {
2349         cgindex[i] = nat;
2350         cg_gl = ind[i];
2351         dd_cg_gl[i] = cg_gl;
2352         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2353     }
2354     cgindex[i] = nat;
2355     
2356     dd->ncg_home = state->ncg_gl;
2357     dd->nat_home = nat;
2358
2359     set_zones_ncg_home(dd);
2360 }
2361
2362 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2363 {
2364     while (cg >= cginfo_mb->cg_end)
2365     {
2366         cginfo_mb++;
2367     }
2368
2369     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2370 }
2371
2372 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2373                           t_forcerec *fr,char *bLocalCG)
2374 {
2375     cginfo_mb_t *cginfo_mb;
2376     int *cginfo;
2377     int cg;
2378
2379     if (fr != NULL)
2380     {
2381         cginfo_mb = fr->cginfo_mb;
2382         cginfo    = fr->cginfo;
2383
2384         for(cg=cg0; cg<cg1; cg++)
2385         {
2386             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2387         }
2388     }
2389
2390     if (bLocalCG != NULL)
2391     {
2392         for(cg=cg0; cg<cg1; cg++)
2393         {
2394             bLocalCG[index_gl[cg]] = TRUE;
2395         }
2396     }
2397 }
2398
2399 static void make_dd_indices(gmx_domdec_t *dd,int *gcgs_index,int cg_start)
2400 {
2401     int nzone,zone,zone1,cg0,cg,cg_gl,a,a_gl;
2402     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2403     gmx_ga2la_t *ga2la;
2404     char *bLocalCG;
2405
2406     bLocalCG = dd->comm->bLocalCG;
2407
2408     if (dd->nat_tot > dd->gatindex_nalloc)
2409     {
2410         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2411         srenew(dd->gatindex,dd->gatindex_nalloc);
2412     }
2413
2414     nzone      = dd->comm->zones.n;
2415     zone2cg    = dd->comm->zones.cg_range;
2416     zone_ncg1  = dd->comm->zone_ncg1;
2417     index_gl   = dd->index_gl;
2418     gatindex   = dd->gatindex;
2419
2420     if (zone2cg[1] != dd->ncg_home)
2421     {
2422         gmx_incons("dd->ncg_zone is not up to date");
2423     }
2424     
2425     /* Make the local to global and global to local atom index */
2426     a = dd->cgindex[cg_start];
2427     for(zone=0; zone<nzone; zone++)
2428     {
2429         if (zone == 0)
2430         {
2431             cg0 = cg_start;
2432         }
2433         else
2434         {
2435             cg0 = zone2cg[zone];
2436         }
2437         for(cg=cg0; cg<zone2cg[zone+1]; cg++)
2438         {
2439             zone1 = zone;
2440             if (cg - cg0 >= zone_ncg1[zone])
2441             {
2442                 /* Signal that this cg is from more than one zone away */
2443                 zone1 += nzone;
2444             }
2445             cg_gl = index_gl[cg];
2446             for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2447             {
2448                 gatindex[a] = a_gl;
2449                 ga2la_set(dd->ga2la,a_gl,a,zone1);
2450                 a++;
2451             }
2452         }
2453     }
2454 }
2455
2456 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2457                           const char *where)
2458 {
2459     int ncg,i,ngl,nerr;
2460
2461     nerr = 0;
2462     if (bLocalCG == NULL)
2463     {
2464         return nerr;
2465     }
2466     for(i=0; i<dd->ncg_tot; i++)
2467     {
2468         if (!bLocalCG[dd->index_gl[i]])
2469         {
2470             fprintf(stderr,
2471                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2472             nerr++;
2473         }
2474     }
2475     ngl = 0;
2476     for(i=0; i<ncg_sys; i++)
2477     {
2478         if (bLocalCG[i])
2479         {
2480             ngl++;
2481         }
2482     }
2483     if (ngl != dd->ncg_tot)
2484     {
2485         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2486         nerr++;
2487     }
2488
2489     return nerr;
2490 }
2491
2492 static void check_index_consistency(gmx_domdec_t *dd,
2493                                     int natoms_sys,int ncg_sys,
2494                                     const char *where)
2495 {
2496     int  nerr,ngl,i,a,cell;
2497     int  *have;
2498
2499     nerr = 0;
2500
2501     if (dd->comm->DD_debug > 1)
2502     {
2503         snew(have,natoms_sys);
2504         for(a=0; a<dd->nat_tot; a++)
2505         {
2506             if (have[dd->gatindex[a]] > 0)
2507             {
2508                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2509             }
2510             else
2511             {
2512                 have[dd->gatindex[a]] = a + 1;
2513             }
2514         }
2515         sfree(have);
2516     }
2517
2518     snew(have,dd->nat_tot);
2519
2520     ngl  = 0;
2521     for(i=0; i<natoms_sys; i++)
2522     {
2523         if (ga2la_get(dd->ga2la,i,&a,&cell))
2524         {
2525             if (a >= dd->nat_tot)
2526             {
2527                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2528                 nerr++;
2529             }
2530             else
2531             {
2532                 have[a] = 1;
2533                 if (dd->gatindex[a] != i)
2534                 {
2535                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2536                     nerr++;
2537                 }
2538             }
2539             ngl++;
2540         }
2541     }
2542     if (ngl != dd->nat_tot)
2543     {
2544         fprintf(stderr,
2545                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2546                 dd->rank,where,ngl,dd->nat_tot);
2547     }
2548     for(a=0; a<dd->nat_tot; a++)
2549     {
2550         if (have[a] == 0)
2551         {
2552             fprintf(stderr,
2553                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2554                     dd->rank,where,a+1,dd->gatindex[a]+1);
2555         }
2556     }
2557     sfree(have);
2558
2559     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2560
2561     if (nerr > 0) {
2562         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2563                   dd->rank,where,nerr);
2564     }
2565 }
2566
2567 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2568 {
2569     int  i;
2570     char *bLocalCG;
2571
2572     if (a_start == 0)
2573     {
2574         /* Clear the whole list without searching */
2575         ga2la_clear(dd->ga2la);
2576     }
2577     else
2578     {
2579         for(i=a_start; i<dd->nat_tot; i++)
2580         {
2581             ga2la_del(dd->ga2la,dd->gatindex[i]);
2582         }
2583     }
2584
2585     bLocalCG = dd->comm->bLocalCG;
2586     if (bLocalCG)
2587     {
2588         for(i=cg_start; i<dd->ncg_tot; i++)
2589         {
2590             bLocalCG[dd->index_gl[i]] = FALSE;
2591         }
2592     }
2593
2594     dd_clear_local_vsite_indices(dd);
2595     
2596     if (dd->constraints)
2597     {
2598         dd_clear_local_constraint_indices(dd);
2599     }
2600 }
2601
2602 static real grid_jump_limit(gmx_domdec_comm_t *comm,int dim_ind)
2603 {
2604     real grid_jump_limit;
2605
2606     /* The distance between the boundaries of cells at distance
2607      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2608      * and by the fact that cells should not be shifted by more than
2609      * half their size, such that cg's only shift by one cell
2610      * at redecomposition.
2611      */
2612     grid_jump_limit = comm->cellsize_limit;
2613     if (!comm->bVacDLBNoLimit)
2614     {
2615         grid_jump_limit = max(grid_jump_limit,
2616                               comm->cutoff/comm->cd[dim_ind].np);
2617     }
2618
2619     return grid_jump_limit;
2620 }
2621
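/* Check that with dynamic load balancing the staggered cell boundaries
 * of neighboring rows have not shifted apart by more than the allowed
 * jump limit; if they have, the decomposition is no longer consistent
 * and we exit with a fatal error.
 */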
2622 static void check_grid_jump(gmx_large_int_t step,gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2623 {
2624     gmx_domdec_comm_t *comm;
2625     int  d,dim;
2626     real limit,bfac;
2627     
2628     comm = dd->comm;
2629     
2630     for(d=1; d<dd->ndim; d++)
2631     {
2632         dim = dd->dim[d];
2633         limit = grid_jump_limit(comm,d);
2634         bfac = ddbox->box_size[dim];
2635         if (ddbox->tric_dir[dim])
2636         {
2637             bfac *= ddbox->skew_fac[dim];
2638         }
2639         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2640             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2641         {
2642             char buf[22];
2643             gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d\n",
2644                       gmx_step_str(step,buf),
2645                       dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2646         }
2647     }
2648 }
2649
2650 static int dd_load_count(gmx_domdec_comm_t *comm)
2651 {
2652     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2653 }
2654
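/* Return the load measure used for dynamic load balancing: either the
 * flop count or the force cycle count; for cycle counts the maximum of
 * the recent counts is subtracted to filter out occasional spikes
 * caused by other activity on the node.
 */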
2655 static float dd_force_load(gmx_domdec_comm_t *comm)
2656 {
2657     float load;
2658     
2659     if (comm->eFlop)
2660     {
2661         load = comm->flop;
2662         if (comm->eFlop > 1)
2663         {
2664             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2665         }
2666     } 
2667     else
2668     {
2669         load = comm->cycl[ddCyclF];
2670         if (comm->cycl_n[ddCyclF] > 1)
2671         {
2672             /* Subtract the maximum of the last n cycle counts
2673              * to get rid of possible high counts due to other sources,
2674              * for instance system activity, that would otherwise
2675              * affect the dynamic load balancing.
2676              */
2677             load -= comm->cycl_max[ddCyclF];
2678         }
2679     }
2680     
2681     return load;
2682 }
2683
2684 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2685 {
2686     gmx_domdec_comm_t *comm;
2687     int i;
2688     
2689     comm = dd->comm;
2690     
2691     snew(*dim_f,dd->nc[dim]+1);
2692     (*dim_f)[0] = 0;
2693     for(i=1; i<dd->nc[dim]; i++)
2694     {
2695         if (comm->slb_frac[dim])
2696         {
2697             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2698         }
2699         else
2700         {
2701             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2702         }
2703     }
2704     (*dim_f)[dd->nc[dim]] = 1;
2705 }
2706
2707 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2708 {
2709     int  pmeindex,slab,nso,i;
2710     ivec xyz;
2711     
2712     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2713     {
2714         ddpme->dim = YY;
2715     }
2716     else
2717     {
2718         ddpme->dim = dimind;
2719     }
2720     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2721     
2722     ddpme->nslab = (ddpme->dim == 0 ?
2723                     dd->comm->npmenodes_x :
2724                     dd->comm->npmenodes_y);
2725
2726     if (ddpme->nslab <= 1)
2727     {
2728         return;
2729     }
2730
2731     nso = dd->comm->npmenodes/ddpme->nslab;
2732     /* Determine for each PME slab the PP location range for dimension dim */
2733     snew(ddpme->pp_min,ddpme->nslab);
2734     snew(ddpme->pp_max,ddpme->nslab);
2735     for(slab=0; slab<ddpme->nslab; slab++) {
2736         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2737         ddpme->pp_max[slab] = 0;
2738     }
2739     for(i=0; i<dd->nnodes; i++) {
2740         ddindex2xyz(dd->nc,i,xyz);
2741         /* For y only use our y/z slab.
2742          * This assumes that the PME x grid size matches the DD grid size.
2743          */
2744         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2745             pmeindex = ddindex2pmeindex(dd,i);
2746             if (dimind == 0) {
2747                 slab = pmeindex/nso;
2748             } else {
2749                 slab = pmeindex % ddpme->nslab;
2750             }
2751             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2752             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2753         }
2754     }
2755
2756     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2757 }
2758
2759 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2760 {
2761     if (dd->comm->ddpme[0].dim == XX)
2762     {
2763         return dd->comm->ddpme[0].maxshift;
2764     }
2765     else
2766     {
2767         return 0;
2768     }
2769 }
2770
2771 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2772 {
2773     if (dd->comm->ddpme[0].dim == YY)
2774     {
2775         return dd->comm->ddpme[0].maxshift;
2776     }
2777     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2778     {
2779         return dd->comm->ddpme[1].maxshift;
2780     }
2781     else
2782     {
2783         return 0;
2784     }
2785 }
2786
2787 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2788                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2789 {
2790     gmx_domdec_comm_t *comm;
2791     int  nc,ns,s;
2792     int  *xmin,*xmax;
2793     real range,pme_boundary;
2794     int  sh;
2795     
2796     comm = dd->comm;
2797     nc  = dd->nc[ddpme->dim];
2798     ns  = ddpme->nslab;
2799     
2800     if (!ddpme->dim_match)
2801     {
2802         /* PP decomposition is not along dim: the worst situation */
2803         sh = ns/2;
2804     }
2805     else if (ns <= 3 || (bUniform && ns == nc))
2806     {
2807         /* The optimal situation */
2808         sh = 1;
2809     }
2810     else
2811     {
2812         /* We need to check for all pme nodes which nodes they
2813          * could possibly need to communicate with.
2814          */
2815         xmin = ddpme->pp_min;
2816         xmax = ddpme->pp_max;
2817         /* Allow for atoms to be maximally 2/3 times the cut-off
2818          * out of their DD cell. This is a reasonable balance
2819          * between performance and support for most charge-group/cut-off
2820          * combinations.
2821          */
2822         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2823         /* Avoid extra communication when we are exactly at a boundary */
2824         range *= 0.999;
2825         
2826         sh = 1;
2827         for(s=0; s<ns; s++)
2828         {
2829             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2830             pme_boundary = (real)s/ns;
2831             while (sh+1 < ns &&
2832                    ((s-(sh+1) >= 0 &&
2833                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2834                     (s-(sh+1) <  0 &&
2835                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2836             {
2837                 sh++;
2838             }
2839             pme_boundary = (real)(s+1)/ns;
2840             while (sh+1 < ns &&
2841                    ((s+(sh+1) <  ns &&
2842                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2843                     (s+(sh+1) >= ns &&
2844                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2845             {
2846                 sh++;
2847             }
2848         }
2849     }
2850     
2851     ddpme->maxshift = sh;
2852     
2853     if (debug)
2854     {
2855         fprintf(debug,"PME slab communication range for dim %d is %d\n",
2856                 ddpme->dim,ddpme->maxshift);
2857     }
2858 }
2859
2860 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
2861 {
2862     int d,dim;
2863     
2864     for(d=0; d<dd->ndim; d++)
2865     {
2866         dim = dd->dim[d];
2867         if (dim < ddbox->nboundeddim &&
2868             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
2869             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
2870         {
2871             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
2872                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
2873                       dd->nc[dim],dd->comm->cellsize_limit);
2874         }
2875     }
2876 }
2877
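/* Set the cell boundaries for static load balancing (no DLB), either
 * uniform or using the relative cell sizes in comm->slb_frac. Also
 * determine npulse, the number of communication pulses needed in each
 * dimension such that npulse cells together cover the cut-off distance.
 */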
2878 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
2879                                   gmx_bool bMaster,ivec npulse)
2880 {
2881     gmx_domdec_comm_t *comm;
2882     int  d,j;
2883     rvec cellsize_min;
2884     real *cell_x,cell_dx,cellsize;
2885     
2886     comm = dd->comm;
2887     
2888     for(d=0; d<DIM; d++)
2889     {
2890         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
2891         npulse[d] = 1;
2892         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
2893         {
2894             /* Uniform grid */
2895             cell_dx = ddbox->box_size[d]/dd->nc[d];
2896             if (bMaster)
2897             {
2898                 for(j=0; j<dd->nc[d]+1; j++)
2899                 {
2900                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
2901                 }
2902             }
2903             else
2904             {
2905                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
2906                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
2907             }
2908             cellsize = cell_dx*ddbox->skew_fac[d];
2909             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
2910             {
2911                 npulse[d]++;
2912             }
2913             cellsize_min[d] = cellsize;
2914         }
2915         else
2916         {
2917             /* Statically load balanced grid */
2918             /* Also when we are not doing a master distribution we determine
2919              * all cell borders in a loop to obtain identical values
2920              * to the master distribution case and to determine npulse.
2921              */
2922             if (bMaster)
2923             {
2924                 cell_x = dd->ma->cell_x[d];
2925             }
2926             else
2927             {
2928                 snew(cell_x,dd->nc[d]+1);
2929             }
2930             cell_x[0] = ddbox->box0[d];
2931             for(j=0; j<dd->nc[d]; j++)
2932             {
2933                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
2934                 cell_x[j+1] = cell_x[j] + cell_dx;
2935                 cellsize = cell_dx*ddbox->skew_fac[d];
2936                 while (cellsize*npulse[d] < comm->cutoff &&
2937                        npulse[d] < dd->nc[d]-1)
2938                 {
2939                     npulse[d]++;
2940                 }
2941                 cellsize_min[d] = min(cellsize_min[d],cellsize);
2942             }
2943             if (!bMaster)
2944             {
2945                 comm->cell_x0[d] = cell_x[dd->ci[d]];
2946                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
2947                 sfree(cell_x);
2948             }
2949         }
2950         /* The following limitation prevents a cell from receiving
2951          * some of its own home charge groups back over the periodic boundary.
2952          * Double charge groups cause trouble with the global indices.
2953          */
2954         if (d < ddbox->npbcdim &&
2955             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
2956         {
2957             gmx_fatal_collective(FARGS,NULL,dd,
2958                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
2959                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
2960                                  comm->cutoff,
2961                                  dd->nc[d],dd->nc[d],
2962                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
2963         }
2964     }
2965     
2966     if (!comm->bDynLoadBal)
2967     {
2968         copy_rvec(cellsize_min,comm->cellsize_min);
2969     }
2970    
2971     for(d=0; d<comm->npmedecompdim; d++)
2972     {
2973         set_pme_maxshift(dd,&comm->ddpme[d],
2974                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
2975                          comm->ddpme[d].slb_dim_f);
2976     }
2977 }
2978
2979
2980 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
2981                                        int d,int dim,gmx_domdec_root_t *root,
2982                                        gmx_ddbox_t *ddbox,
2983                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
2984 {
2985     gmx_domdec_comm_t *comm;
2986     int  ncd,i,j,nmin,nmin_old;
2987     gmx_bool bLimLo,bLimHi;
2988     real *cell_size;
2989     real fac,halfway,cellsize_limit_f_i,region_size;
2990     gmx_bool bPBC,bLastHi=FALSE;
2991     int nrange[]={range[0],range[1]};
2992
2993     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
2994
2995     comm = dd->comm;
2996
2997     ncd = dd->nc[dim];
2998
2999     bPBC = (dim < ddbox->npbcdim);
3000
3001     cell_size = root->buf_ncd;
3002
3003     if (debug) 
3004     {
3005         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
3006     }
3007
3008     /* First we need to check if the scaling does not make cells
3009      * smaller than the smallest allowed size.
3010      * We need to do this iteratively, since if a cell is too small,
3011      * it needs to be enlarged, which makes all the other cells smaller,
3012      * which could in turn make another cell smaller than allowed.
3013      */
3014     for(i=range[0]; i<range[1]; i++)
3015     {
3016         root->bCellMin[i] = FALSE;
3017     }
3018     nmin = 0;
3019     do
3020     {
3021         nmin_old = nmin;
3022         /* We need the total for normalization */
3023         fac = 0;
3024         for(i=range[0]; i<range[1]; i++)
3025         {
3026             if (root->bCellMin[i] == FALSE)
3027             {
3028                 fac += cell_size[i];
3029             }
3030         }
3031         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
3032         /* Determine the cell boundaries */
3033         for(i=range[0]; i<range[1]; i++)
3034         {
3035             if (root->bCellMin[i] == FALSE)
3036             {
3037                 cell_size[i] *= fac;
3038                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3039                 {
3040                     cellsize_limit_f_i = 0;
3041                 }
3042                 else
3043                 {
3044                     cellsize_limit_f_i = cellsize_limit_f;
3045                 }
3046                 if (cell_size[i] < cellsize_limit_f_i)
3047                 {
3048                     root->bCellMin[i] = TRUE;
3049                     cell_size[i] = cellsize_limit_f_i;
3050                     nmin++;
3051                 }
3052             }
3053             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3054         }
3055     }
3056     while (nmin > nmin_old);
3057     
3058     i=range[1]-1;
3059     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3060     /* For this check we should not use DD_CELL_MARGIN,
3061      * but a slightly smaller factor,
3062      * since rounding could get us below the limit.
3063      */
3064     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3065     {
3066         char buf[22];
3067         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3068                   gmx_step_str(step,buf),
3069                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3070                   ncd,comm->cellsize_min[dim]);
3071     }
3072     
3073     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3074     
3075     if (!bUniform)
3076     {
3077         /* Check that each boundary has not moved more than halfway across
3078          * either of the cells it bounds, as this could cause problems,
3079          * especially when the differences between cell sizes are large.
3080          * If changes are applied, they will not make cells smaller
3081          * than the cut-off, as we check all the boundaries which
3082          * might be affected by a change and if the old state was ok,
3083          * the cells will at most be shrunk back to their old size.
3084          */
3085         for(i=range[0]+1; i<range[1]; i++)
3086         {
3087             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3088             if (root->cell_f[i] < halfway)
3089             {
3090                 root->cell_f[i] = halfway;
3091                 /* Check if the change also causes shifts of the next boundaries */
3092                 for(j=i+1; j<range[1]; j++)
3093                 {
3094                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3095                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3096                 }
3097             }
3098             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3099             if (root->cell_f[i] > halfway)
3100             {
3101                 root->cell_f[i] = halfway;
3102                 /* Check if the change also causes shifts of the next boundaries */
3103                 for(j=i-1; j>=range[0]+1; j--)
3104                 {
3105                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3106                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3107                 }
3108             }
3109         }
3110     }
3111     
3112     /* nrange holds the [lower, upper) range for the next call to enforce_limits */
3113     /* Find the highest violation of LimLo (a) and the lowest following violation of LimHi (b),
3114      * then call enforce_limits for (oldb,a) and (a,b). In the next step: (b,nexta).
3115      * oldb and nexta can be the range boundaries; nrange is used to pass a and b. */
3116     if (d > 0)
3117     {
3118         /* Take care of the staggering of the cell boundaries */
3119         if (bUniform)
3120         {
3121             for(i=range[0]; i<range[1]; i++)
3122             {
3123                 root->cell_f_max0[i] = root->cell_f[i];
3124                 root->cell_f_min1[i] = root->cell_f[i+1];
3125             }
3126         }
3127         else
3128         {
3129             for(i=range[0]+1; i<range[1]; i++)
3130             {
3131                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3132                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3133                 if (bLimLo && bLimHi)
3134                 {
3135                     /* Both limits violated, try the best we can */
3136                     /* In this case we split the original range in two parts and deal with the other limitations in the next iteration. */
3137                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3138                     nrange[0]=range[0];
3139                     nrange[1]=i;
3140                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3141
3142                     nrange[0]=i;
3143                     nrange[1]=range[1];
3144                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3145
3146                     return;
3147                 }
3148                 else if (bLimLo)
3149                 {
3150                     /* root->cell_f[i] = root->bound_min[i]; */
3151                     nrange[1]=i;  /* Only store the violation location; a LimLo violation with a higher index could still follow */
3152                     bLastHi=FALSE;
3153                 }
3154                 else if (bLimHi && !bLastHi)
3155                 {
3156                     bLastHi=TRUE;
3157                     if (nrange[1] < range[1])   /* found a LimLo before */
3158                     {
3159                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3160                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3161                         nrange[0]=nrange[1];
3162                     }
3163                     root->cell_f[i] = root->bound_max[i];
3164                     nrange[1]=i; 
3165                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3166                     nrange[0]=i;
3167                     nrange[1]=range[1];
3168                 }
3169             }
3170             if (nrange[1] < range[1])   /* the last violation found was a LimLo */
3171             {
3172                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3173                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3174                 nrange[0]=nrange[1];
3175                 nrange[1]=range[1];
3176                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3177             } 
3178             else if (nrange[0] > range[0]) /* found at least one LimHi */
3179             {
3180                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3181             }
3182         }
3183     }
3184 }
3185
3186
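/* Compute new relative cell boundaries on the root node of a cell row
 * for dynamic load balancing. The measured load imbalance is converted
 * into a cell size change with under-relaxation, the change per step is
 * limited by dlb_scale_lim, and the boundaries are then corrected by
 * dd_cell_sizes_dlb_root_enforce_limits to respect the minimum cell
 * size and the staggering limits.
 */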
3187 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3188                                        int d,int dim,gmx_domdec_root_t *root,
3189                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3190                                        gmx_bool bUniform,gmx_large_int_t step)
3191 {
3192     gmx_domdec_comm_t *comm;
3193     int  ncd,d1,i,j,pos;
3194     real *cell_size;
3195     real load_aver,load_i,imbalance,change,change_max,sc;
3196     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3197     real change_limit;
3198     real relax = 0.5;
3199     gmx_bool bPBC;
3200     int range[] = { 0, 0 };
3201
3202     comm = dd->comm;
3203
3204     /* Convert the maximum change from the input percentage to a fraction */
3205     change_limit = comm->dlb_scale_lim*0.01;
3206
3207     ncd = dd->nc[dim];
3208
3209     bPBC = (dim < ddbox->npbcdim);
3210
3211     cell_size = root->buf_ncd;
3212
3213     /* Store the original boundaries */
3214     for(i=0; i<ncd+1; i++)
3215     {
3216         root->old_cell_f[i] = root->cell_f[i];
3217     }
3218     if (bUniform) {
3219         for(i=0; i<ncd; i++)
3220         {
3221             cell_size[i] = 1.0/ncd;
3222         }
3223     }
3224     else if (dd_load_count(comm))
3225     {
3226         load_aver = comm->load[d].sum_m/ncd;
3227         change_max = 0;
3228         for(i=0; i<ncd; i++)
3229         {
3230             /* Determine the relative imbalance of cell i */
3231             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3232             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3233             /* Determine the change of the cell size using underrelaxation */
3234             change = -relax*imbalance;
3235             change_max = max(change_max,max(change,-change));
3236         }
3237         /* Limit the amount of scaling.
3238          * We need to use the same rescaling for all cells in one row,
3239          * otherwise the load balancing might not converge.
3240          */
3241         sc = relax;
3242         if (change_max > change_limit)
3243         {
3244             sc *= change_limit/change_max;
3245         }
3246         for(i=0; i<ncd; i++)
3247         {
3248             /* Determine the relative imbalance of cell i */
3249             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3250             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3251             /* Determine the change of the cell size using underrelaxation */
3252             change = -sc*imbalance;
3253             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3254         }
3255     }
3256     
3257     cellsize_limit_f  = comm->cellsize_min[dim]/ddbox->box_size[dim];
3258     cellsize_limit_f *= DD_CELL_MARGIN;
3259     dist_min_f_hard        = grid_jump_limit(comm,d)/ddbox->box_size[dim];
3260     dist_min_f       = dist_min_f_hard * DD_CELL_MARGIN;
3261     if (ddbox->tric_dir[dim])
3262     {
3263         cellsize_limit_f /= ddbox->skew_fac[dim];
3264         dist_min_f       /= ddbox->skew_fac[dim];
3265     }
3266     if (bDynamicBox && d > 0)
3267     {
3268         dist_min_f *= DD_PRES_SCALE_MARGIN;
3269     }
3270     if (d > 0 && !bUniform)
3271     {
3272         /* Make sure that the grid is not shifted too much */
3273         for(i=1; i<ncd; i++) {
3274             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
3275             {
3276                 gmx_incons("Inconsistent DD boundary staggering limits!");
3277             }
3278             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3279             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3280             if (space > 0) {
3281                 root->bound_min[i] += 0.5*space;
3282             }
3283             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3284             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3285             if (space < 0) {
3286                 root->bound_max[i] += 0.5*space;
3287             }
3288             if (debug)
3289             {
3290                 fprintf(debug,
3291                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3292                         d,i,
3293                         root->cell_f_max0[i-1] + dist_min_f,
3294                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3295                         root->cell_f_min1[i] - dist_min_f);
3296             }
3297         }
3298     }
3299     range[1]=ncd;
3300     root->cell_f[0] = 0;
3301     root->cell_f[ncd] = 1;
3302     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3303
3304
3305     /* After the checks above, the cells should obey the cut-off
3306      * restrictions, but it does not hurt to check.
3307      */
3308     for(i=0; i<ncd; i++)
3309     {
3310         if (debug)
3311         {
3312             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3313                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3314         }
3315
3316         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3317             root->cell_f[i+1] - root->cell_f[i] <
3318             cellsize_limit_f/DD_CELL_MARGIN)
3319         {
3320             char buf[22];
3321             fprintf(stderr,
3322                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3323                     gmx_step_str(step,buf),dim2char(dim),i,
3324                     (root->cell_f[i+1] - root->cell_f[i])
3325                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3326         }
3327     }
3328     
3329     pos = ncd + 1;
3330     /* Store the cell boundaries of the lower dimensions at the end */
3331     for(d1=0; d1<d; d1++)
3332     {
3333         root->cell_f[pos++] = comm->cell_f0[d1];
3334         root->cell_f[pos++] = comm->cell_f1[d1];
3335     }
3336     
3337     if (d < comm->npmedecompdim)
3338     {
3339         /* The master determines the maximum shift for
3340          * the coordinate communication between separate PME nodes.
3341          */
3342         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3343     }
3344     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3345     if (d >= 1)
3346     {
3347         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3348     }
3349 }    
3350
3351 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3352                                              gmx_ddbox_t *ddbox,int dimind)
3353 {
3354     gmx_domdec_comm_t *comm;
3355     int dim;
3356
3357     comm = dd->comm;
3358
3359     /* Set the cell dimensions */
3360     dim = dd->dim[dimind];
3361     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3362     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3363     if (dim >= ddbox->nboundeddim)
3364     {
3365         comm->cell_x0[dim] += ddbox->box0[dim];
3366         comm->cell_x1[dim] += ddbox->box0[dim];
3367     }
3368 }
3369
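/* Illustrative layout (values below are examples only) of the cell_f_row
 * buffer that set_dd_cell_sizes_dlb_root packs and the function below
 * unpacks, for dimension index d with ncd = dd->nc[dim] cells:
 *   [0 .. ncd]       the ncd+1 relative cell boundaries of this dimension
 *   [ncd+1 ...]      cell_f0/cell_f1 pairs of the lower dimensions 0..d-1
 *   next entry       maxshift of ddpme[0]
 *   next entry       maxshift of ddpme[1] (only present when d >= 1)
 * E.g. for d = 1 with ncd = 4: 5 boundaries, 1 pair of fractions, 2 shifts.
 */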
3370 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3371                                          int d,int dim,real *cell_f_row,
3372                                          gmx_ddbox_t *ddbox)
3373 {
3374     gmx_domdec_comm_t *comm;
3375     int d1,dim1,pos;
3376
3377     comm = dd->comm;
3378
3379 #ifdef GMX_MPI
3380     /* Each node would only need to know two fractions,
3381      * but it is probably cheaper to broadcast the whole array.
3382      */
3383     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3384               0,comm->mpi_comm_load[d]);
3385 #endif
3386     /* Copy the fractions for this dimension from the buffer */
3387     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3388     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3389     /* The whole array was communicated, so set the buffer position */
3390     pos = dd->nc[dim] + 1;
3391     for(d1=0; d1<=d; d1++)
3392     {
3393         if (d1 < d)
3394         {
3395             /* Copy the cell fractions of the lower dimensions */
3396             comm->cell_f0[d1] = cell_f_row[pos++];
3397             comm->cell_f1[d1] = cell_f_row[pos++];
3398         }
3399         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3400     }
3401     /* Convert the communicated shift from float to int */
3402     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3403     if (d >= 1)
3404     {
3405         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3406     }
3407 }
3408
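/* Note on the row selection in the function below: a rank participates in
 * the cell-size communication for decomposition dimension d only when its
 * grid coordinate is 0 in all higher decomposition dimensions, and it is
 * the row root when its coordinate is 0 in dimension d as well.
 */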
3409 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3410                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3411                                          gmx_bool bUniform,gmx_large_int_t step)
3412 {
3413     gmx_domdec_comm_t *comm;
3414     int d,dim,d1;
3415     gmx_bool bRowMember,bRowRoot;
3416     real *cell_f_row;
3417     
3418     comm = dd->comm;
3419
3420     for(d=0; d<dd->ndim; d++)
3421     {
3422         dim = dd->dim[d];
3423         bRowMember = TRUE;
3424         bRowRoot = TRUE;
3425         for(d1=d; d1<dd->ndim; d1++)
3426         {
3427             if (dd->ci[dd->dim[d1]] > 0)
3428             {
3429                 if (d1 > d)
3430                 {
3431                     bRowMember = FALSE;
3432                 }
3433                 bRowRoot = FALSE;
3434             }
3435         }
3436         if (bRowMember)
3437         {
3438             if (bRowRoot)
3439             {
3440                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3441                                            ddbox,bDynamicBox,bUniform,step);
3442                 cell_f_row = comm->root[d]->cell_f;
3443             }
3444             else
3445             {
3446                 cell_f_row = comm->cell_f_row;
3447             }
3448             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3449         }
3450     }
3451 }    
3452
3453 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3454 {
3455     int d;
3456
3457     /* This function assumes that the relative cell fractions have not
3458      * changed since the last call to dd_partition_system; it only
3459      * converts them to absolute cell bounds, e.g. after a box change.
3460      */
3461     for(d=0; d<dd->ndim; d++)
3462     {
3463         relative_to_absolute_cell_bounds(dd,ddbox,d); 
3464     }
3465 }
3466
3467
3468
3469 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3470                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3471                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3472                                   gmx_wallcycle_t wcycle)
3473 {
3474     gmx_domdec_comm_t *comm;
3475     int dim;
3476
3477     comm = dd->comm;
3478     
3479     if (bDoDLB)
3480     {
3481         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3482         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3483         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3484     }
3485     else if (bDynamicBox)
3486     {
3487         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3488     }
3489     
3490     /* Set the dimensions for which no DD is used */
3491     for(dim=0; dim<DIM; dim++) {
3492         if (dd->nc[dim] == 1) {
3493             comm->cell_x0[dim] = 0;
3494             comm->cell_x1[dim] = ddbox->box_size[dim];
3495             if (dim >= ddbox->nboundeddim)
3496             {
3497                 comm->cell_x0[dim] += ddbox->box0[dim];
3498                 comm->cell_x1[dim] += ddbox->box0[dim];
3499             }
3500         }
3501     }
3502 }
3503
3504 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3505 {
3506     int d,np,i;
3507     gmx_domdec_comm_dim_t *cd;
3508     
3509     for(d=0; d<dd->ndim; d++)
3510     {
3511         cd = &dd->comm->cd[d];
3512         np = npulse[dd->dim[d]];
3513         if (np > cd->np_nalloc)
3514         {
3515             if (debug)
3516             {
3517                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3518                         dim2char(dd->dim[d]),np);
3519             }
3520             if (DDMASTER(dd) && cd->np_nalloc > 0)
3521             {
3522                 fprintf(stderr,"\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3523             }
3524             srenew(cd->ind,np);
3525             for(i=cd->np_nalloc; i<np; i++)
3526             {
3527                 cd->ind[i].index  = NULL;
3528                 cd->ind[i].nalloc = 0;
3529             }
3530             cd->np_nalloc = np;
3531         }
3532         cd->np = np;
3533     }
3534 }
3535
3536
3537 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3538                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3539                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3540                               gmx_wallcycle_t wcycle)
3541 {
3542     gmx_domdec_comm_t *comm;
3543     int  d;
3544     ivec npulse;
3545     
3546     comm = dd->comm;
3547
3548     /* Copy the old cell boundaries for the cg displacement check */
3549     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3550     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3551     
3552     if (comm->bDynLoadBal)
3553     {
3554         if (DDMASTER(dd))
3555         {
3556             check_box_size(dd,ddbox);
3557         }
3558         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3559     }
3560     else
3561     {
3562         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3563         realloc_comm_ind(dd,npulse);
3564     }
3565     
3566     if (debug)
3567     {
3568         for(d=0; d<DIM; d++)
3569         {
3570             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3571                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3572         }
3573     }
3574 }
3575
3576 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3577                                   gmx_ddbox_t *ddbox,
3578                                   rvec cell_ns_x0,rvec cell_ns_x1,
3579                                   gmx_large_int_t step)
3580 {
3581     gmx_domdec_comm_t *comm;
3582     int dim_ind,dim;
3583     
3584     comm = dd->comm;
3585
3586     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3587     {
3588         dim = dd->dim[dim_ind];
3589         
3590         /* Without PBC we don't have restrictions on the outer cells */
3591         if (!(dim >= ddbox->npbcdim && 
3592               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3593             comm->bDynLoadBal &&
3594             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3595             comm->cellsize_min[dim])
3596         {
3597             char buf[22];
3598             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3599                       gmx_step_str(step,buf),dim2char(dim),
3600                       comm->cell_x1[dim] - comm->cell_x0[dim],
3601                       ddbox->skew_fac[dim],
3602                       dd->comm->cellsize_min[dim],
3603                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3604         }
3605     }
3606     
3607     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3608     {
3609         /* Communicate the boundaries and update cell_ns_x0/1 */
3610         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3611         if (dd->bGridJump && dd->ndim > 1)
3612         {
3613             check_grid_jump(step,dd,ddbox);
3614         }
3615     }
3616 }
3617
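/* The matrix tcm built below converts a Cartesian position into a
 * box-relative position along each periodic dimension:
 *   pos_d = x[d] + sum_{j>d} x[j]*tcm[j][d]
 * Small worked example (numbers are illustrative only): with
 * box[YY][XX] = 1 and box[YY][YY] = 4 we get tcm[YY][XX] = -0.25,
 * so a point at y = 4 has its effective x reduced by the full xy skew of 1.
 */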
3618 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3619 {
3620     if (YY < npbcdim)
3621     {
3622         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3623     }
3624     else
3625     {
3626         tcm[YY][XX] = 0;
3627     }
3628     if (ZZ < npbcdim)
3629     {
3630         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3631         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3632     }
3633     else
3634     {
3635         tcm[ZZ][XX] = 0;
3636         tcm[ZZ][YY] = 0;
3637     }
3638 }
3639
3640 static void check_screw_box(matrix box)
3641 {
3642     /* Mathematical limitation */
3643     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3644     {
3645         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3646     }
3647     
3648     /* Limitation due to the asymmetry of the eighth shell method */
3649     if (box[ZZ][YY] != 0)
3650     {
3651         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3652     }
3653 }
3654
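/* Sketch of the master arrays produced by the function below: the loop at
 * its end builds a CSR-style index over the nodes, i.e.
 * ma->cg[ma->index[i] .. ma->index[i+1]) holds the global charge group
 * indices assigned to node i, while ma->ncg[i] and ma->nat[i] hold the
 * corresponding charge group and atom counts.
 */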
3655 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3656                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3657                           gmx_domdec_t *dd)
3658 {
3659     gmx_domdec_master_t *ma;
3660     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3661     int  i,icg,j,k,k0,k1,d,npbcdim;
3662     matrix tcm;
3663     rvec box_size,cg_cm;
3664     ivec ind;
3665     real nrcg,inv_ncg,pos_d;
3666     atom_id *cgindex;
3667     gmx_bool bUnbounded,bScrew;
3668
3669     ma = dd->ma;
3670     
3671     if (tmp_ind == NULL)
3672     {
3673         snew(tmp_nalloc,dd->nnodes);
3674         snew(tmp_ind,dd->nnodes);
3675         for(i=0; i<dd->nnodes; i++)
3676         {
3677             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3678             snew(tmp_ind[i],tmp_nalloc[i]);
3679         }
3680     }
3681     
3682     /* Clear the count */
3683     for(i=0; i<dd->nnodes; i++)
3684     {
3685         ma->ncg[i] = 0;
3686         ma->nat[i] = 0;
3687     }
3688     
3689     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3690     
3691     cgindex = cgs->index;
3692     
3693     /* Compute the center of geometry for all charge groups */
3694     for(icg=0; icg<cgs->nr; icg++)
3695     {
3696         k0      = cgindex[icg];
3697         k1      = cgindex[icg+1];
3698         nrcg    = k1 - k0;
3699         if (nrcg == 1)
3700         {
3701             copy_rvec(pos[k0],cg_cm);
3702         }
3703         else
3704         {
3705             inv_ncg = 1.0/nrcg;
3706             
3707             clear_rvec(cg_cm);
3708             for(k=k0; (k<k1); k++)
3709             {
3710                 rvec_inc(cg_cm,pos[k]);
3711             }
3712             for(d=0; (d<DIM); d++)
3713             {
3714                 cg_cm[d] *= inv_ncg;
3715             }
3716         }
3717         /* Put the charge group in the box and determine the cell index */
3718         for(d=DIM-1; d>=0; d--) {
3719             pos_d = cg_cm[d];
3720             if (d < dd->npbcdim)
3721             {
3722                 bScrew = (dd->bScrewPBC && d == XX);
3723                 if (tric_dir[d] && dd->nc[d] > 1)
3724                 {
3725                     /* Use triclinic coordinates for this dimension */
3726                     for(j=d+1; j<DIM; j++)
3727                     {
3728                         pos_d += cg_cm[j]*tcm[j][d];
3729                     }
3730                 }
3731                 while(pos_d >= box[d][d])
3732                 {
3733                     pos_d -= box[d][d];
3734                     rvec_dec(cg_cm,box[d]);
3735                     if (bScrew)
3736                     {
3737                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3738                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3739                     }
3740                     for(k=k0; (k<k1); k++)
3741                     {
3742                         rvec_dec(pos[k],box[d]);
3743                         if (bScrew)
3744                         {
3745                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3746                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3747                         }
3748                     }
3749                 }
3750                 while(pos_d < 0)
3751                 {
3752                     pos_d += box[d][d];
3753                     rvec_inc(cg_cm,box[d]);
3754                     if (bScrew)
3755                     {
3756                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3757                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3758                     }
3759                     for(k=k0; (k<k1); k++)
3760                     {
3761                         rvec_inc(pos[k],box[d]);
3762                         if (bScrew) {
3763                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3764                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3765                         }
3766                     }
3767                 }
3768             }
3769             /* This could be done more efficiently */
3770             ind[d] = 0;
3771             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3772             {
3773                 ind[d]++;
3774             }
3775         }
3776         i = dd_index(dd->nc,ind);
3777         if (ma->ncg[i] == tmp_nalloc[i])
3778         {
3779             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3780             srenew(tmp_ind[i],tmp_nalloc[i]);
3781         }
3782         tmp_ind[i][ma->ncg[i]] = icg;
3783         ma->ncg[i]++;
3784         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3785     }
3786     
3787     k1 = 0;
3788     for(i=0; i<dd->nnodes; i++)
3789     {
3790         ma->index[i] = k1;
3791         for(k=0; k<ma->ncg[i]; k++)
3792         {
3793             ma->cg[k1++] = tmp_ind[i][k];
3794         }
3795     }
3796     ma->index[dd->nnodes] = k1;
3797     
3798     for(i=0; i<dd->nnodes; i++)
3799     {
3800         sfree(tmp_ind[i]);
3801     }
3802     sfree(tmp_ind);
3803     sfree(tmp_nalloc);
3804     
3805     if (fplog)
3806     {
3807         char buf[22];
3808         fprintf(fplog,"Charge group distribution at step %s:",
3809                 gmx_step_str(step,buf));
3810         for(i=0; i<dd->nnodes; i++)
3811         {
3812             fprintf(fplog," %d",ma->ncg[i]);
3813         }
3814         fprintf(fplog,"\n");
3815     }
3816 }
3817
3818 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3819                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3820                                 rvec pos[])
3821 {
3822     gmx_domdec_master_t *ma=NULL;
3823     ivec npulse;
3824     int  i,cg_gl;
3825     int  *ibuf,buf2[2] = { 0, 0 };
3826     gmx_bool bMaster = DDMASTER(dd);
3827     if (bMaster)
3828     {
3829         ma = dd->ma;
3830         
3831         if (dd->bScrewPBC)
3832         {
3833             check_screw_box(box);
3834         }
3835     
3836         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3837     
3838         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3839         for(i=0; i<dd->nnodes; i++)
3840         {
3841             ma->ibuf[2*i]   = ma->ncg[i];
3842             ma->ibuf[2*i+1] = ma->nat[i];
3843         }
3844         ibuf = ma->ibuf;
3845     }
3846     else
3847     {
3848         ibuf = NULL;
3849     }
3850     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3851     
3852     dd->ncg_home = buf2[0];
3853     dd->nat_home = buf2[1];
3854     dd->ncg_tot  = dd->ncg_home;
3855     dd->nat_tot  = dd->nat_home;
3856     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
3857     {
3858         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
3859         srenew(dd->index_gl,dd->cg_nalloc);
3860         srenew(dd->cgindex,dd->cg_nalloc+1);
3861     }
3862     if (bMaster)
3863     {
3864         for(i=0; i<dd->nnodes; i++)
3865         {
3866             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
3867             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
3868         }
3869     }
3870     
3871     dd_scatterv(dd,
3872                 DDMASTER(dd) ? ma->ibuf : NULL,
3873                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
3874                 DDMASTER(dd) ? ma->cg : NULL,
3875                 dd->ncg_home*sizeof(int),dd->index_gl);
3876     
3877     /* Determine the home charge group sizes */
3878     dd->cgindex[0] = 0;
3879     for(i=0; i<dd->ncg_home; i++)
3880     {
3881         cg_gl = dd->index_gl[i];
3882         dd->cgindex[i+1] =
3883             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
3884     }
3885     
3886     if (debug)
3887     {
3888         fprintf(debug,"Home charge groups:\n");
3889         for(i=0; i<dd->ncg_home; i++)
3890         {
3891             fprintf(debug," %d",dd->index_gl[i]);
3892             if (i % 10 == 9) 
3893                 fprintf(debug,"\n");
3894         }
3895         fprintf(debug,"\n");
3896     }
3897 }
3898
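/* Layout sketch of the per-direction send buffer comm->cgcm_state[m], as
 * filled by compact_and_copy_vec_at below and by compact_and_copy_vec_cg:
 * for each charge group moving in direction m there is one rvec with the
 * charge group center, followed by nvec blocks of nrcg rvecs (x and, when
 * present, v, sd_X and cg_p).  compact_and_copy_vec_at fills block number
 * 'vec' and skips over the center and the other blocks.
 */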
3899 static int compact_and_copy_vec_at(int ncg,int *move,
3900                                    int *cgindex,
3901                                    int nvec,int vec,
3902                                    rvec *src,gmx_domdec_comm_t *comm,
3903                                    gmx_bool bCompact)
3904 {
3905     int m,icg,i,i0,i1,nrcg;
3906     int home_pos;
3907     int pos_vec[DIM*2];
3908     
3909     home_pos = 0;
3910
3911     for(m=0; m<DIM*2; m++)
3912     {
3913         pos_vec[m] = 0;
3914     }
3915     
3916     i0 = 0;
3917     for(icg=0; icg<ncg; icg++)
3918     {
3919         i1 = cgindex[icg+1];
3920         m = move[icg];
3921         if (m == -1)
3922         {
3923             if (bCompact)
3924             {
3925                 /* Compact the home array in place */
3926                 for(i=i0; i<i1; i++)
3927                 {
3928                     copy_rvec(src[i],src[home_pos++]);
3929                 }
3930             }
3931         }
3932         else
3933         {
3934             /* Copy to the communication buffer */
3935             nrcg = i1 - i0;
3936             pos_vec[m] += 1 + vec*nrcg;
3937             for(i=i0; i<i1; i++)
3938             {
3939                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
3940             }
3941             pos_vec[m] += (nvec - vec - 1)*nrcg;
3942         }
3943         if (!bCompact)
3944         {
3945             home_pos += i1 - i0;
3946         }
3947         i0 = i1;
3948     }
3949     
3950     return home_pos;
3951 }
3952
3953 static int compact_and_copy_vec_cg(int ncg,int *move,
3954                                    int *cgindex,
3955                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
3956                                    gmx_bool bCompact)
3957 {
3958     int m,icg,i0,i1,nrcg;
3959     int home_pos;
3960     int pos_vec[DIM*2];
3961     
3962     home_pos = 0;
3963     
3964     for(m=0; m<DIM*2; m++)
3965     {
3966         pos_vec[m] = 0;
3967     }
3968     
3969     i0 = 0;
3970     for(icg=0; icg<ncg; icg++)
3971     {
3972         i1 = cgindex[icg+1];
3973         m = move[icg];
3974         if (m == -1)
3975         {
3976             if (bCompact)
3977             {
3978                 /* Compact the home array in place */
3979                 copy_rvec(src[icg],src[home_pos++]);
3980             }
3981         }
3982         else
3983         {
3984             nrcg = i1 - i0;
3985             /* Copy to the communication buffer */
3986             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
3987             pos_vec[m] += 1 + nrcg*nvec;
3988         }
3989         i0 = i1;
3990     }
3991     if (!bCompact)
3992     {
3993         home_pos = ncg;
3994     }
3995     
3996     return home_pos;
3997 }
3998
3999 static int compact_ind(int ncg,int *move,
4000                        int *index_gl,int *cgindex,
4001                        int *gatindex,
4002                        gmx_ga2la_t ga2la,char *bLocalCG,
4003                        int *cginfo)
4004 {
4005     int cg,nat,a0,a1,a,a_gl;
4006     int home_pos;
4007
4008     home_pos = 0;
4009     nat = 0;
4010     for(cg=0; cg<ncg; cg++)
4011     {
4012         a0 = cgindex[cg];
4013         a1 = cgindex[cg+1];
4014         if (move[cg] == -1)
4015         {
4016             /* Compact the home arrays in place.
4017              * Anything that can be done here avoids access to global arrays.
4018              */
4019             cgindex[home_pos] = nat;
4020             for(a=a0; a<a1; a++)
4021             {
4022                 a_gl = gatindex[a];
4023                 gatindex[nat] = a_gl;
4024                 /* The cell number stays 0, so we don't need to set it */
4025                 ga2la_change_la(ga2la,a_gl,nat);
4026                 nat++;
4027             }
4028             index_gl[home_pos] = index_gl[cg];
4029             cginfo[home_pos]   = cginfo[cg];
4030             /* The charge group remains local, so bLocalCG does not change */
4031             home_pos++;
4032         }
4033         else
4034         {
4035             /* Clear the global indices */
4036             for(a=a0; a<a1; a++)
4037             {
4038                 ga2la_del(ga2la,gatindex[a]);
4039             }
4040             if (bLocalCG)
4041             {
4042                 bLocalCG[index_gl[cg]] = FALSE;
4043             }
4044         }
4045     }
4046     cgindex[home_pos] = nat;
4047     
4048     return home_pos;
4049 }
4050
4051 static void clear_and_mark_ind(int ncg,int *move,
4052                                int *index_gl,int *cgindex,int *gatindex,
4053                                gmx_ga2la_t ga2la,char *bLocalCG,
4054                                int *cell_index)
4055 {
4056     int cg,a0,a1,a;
4057     
4058     for(cg=0; cg<ncg; cg++)
4059     {
4060         if (move[cg] >= 0)
4061         {
4062             a0 = cgindex[cg];
4063             a1 = cgindex[cg+1];
4064             /* Clear the global indices */
4065             for(a=a0; a<a1; a++)
4066             {
4067                 ga2la_del(ga2la,gatindex[a]);
4068             }
4069             if (bLocalCG)
4070             {
4071                 bLocalCG[index_gl[cg]] = FALSE;
4072             }
4073             /* Signal that this cg has moved using the ns cell index.
4074              * Here we set it to -1.
4075              * fill_grid will change it from -1 to 4*grid->ncells.
4076              */
4077             cell_index[cg] = -1;
4078         }
4079     }
4080 }
4081
4082 static void print_cg_move(FILE *fplog,
4083                           gmx_domdec_t *dd,
4084                           gmx_large_int_t step,int cg,int dim,int dir,
4085                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4086                           rvec cm_old,rvec cm_new,real pos_d)
4087 {
4088     gmx_domdec_comm_t *comm;
4089     char buf[22];
4090
4091     comm = dd->comm;
4092
4093     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4094     if (bHaveLimitdAndCMOld)
4095     {
4096         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4097                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4098     }
4099     else
4100     {
4101         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4102                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4103     }
4104     fprintf(fplog,"distance out of cell %f\n",
4105             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4106     if (bHaveLimitdAndCMOld)
4107     {
4108         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4109                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4110     }
4111     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4112             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4113     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4114             dim2char(dim),
4115             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4116     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4117             dim2char(dim),
4118             comm->cell_x0[dim],comm->cell_x1[dim]);
4119 }
4120
4121 static void cg_move_error(FILE *fplog,
4122                           gmx_domdec_t *dd,
4123                           gmx_large_int_t step,int cg,int dim,int dir,
4124                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4125                           rvec cm_old,rvec cm_new,real pos_d)
4126 {
4127     if (fplog)
4128     {
4129         print_cg_move(fplog, dd,step,cg,dim,dir,
4130                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4131     }
4132     print_cg_move(stderr,dd,step,cg,dim,dir,
4133                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4134     gmx_fatal(FARGS,
4135               "A charge group moved too far between two domain decomposition steps\n"
4136               "This usually means that your system is not well equilibrated");
4137 }
4138
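/* With screw pbc a shift over the box in x is combined with a rotation of
 * 180 degrees around the x-axis.  The function below applies that rotation
 * to the distributed state vectors of atom a: positions are mirrored in the
 * y and z box lengths, while velocity-like quantities change sign in y and z.
 */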
4139 static void rotate_state_atom(t_state *state,int a)
4140 {
4141     int est;
4142
4143     for(est=0; est<estNR; est++)
4144     {
4145         if (EST_DISTR(est) && (state->flags & (1<<est))) {
4146             switch (est) {
4147             case estX:
4148                 /* Rotate the complete state; for a rectangular box only */
4149                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4150                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4151                 break;
4152             case estV:
4153                 state->v[a][YY] = -state->v[a][YY];
4154                 state->v[a][ZZ] = -state->v[a][ZZ];
4155                 break;
4156             case estSDX:
4157                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4158                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4159                 break;
4160             case estCGP:
4161                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4162                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4163                 break;
4164             case estDISRE_INITF:
4165             case estDISRE_RM3TAV:
4166             case estORIRE_INITF:
4167             case estORIRE_DTAV:
4168                 /* These are distances, so not affected by rotation */
4169                 break;
4170             default:
4171                 gmx_incons("Unknown state entry encountered in rotate_state_atom");            
4172             }
4173         }
4174     }
4175 }
4176
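/* Outline of the redistribution performed below: (1) compute the center of
 * each home charge group, put it back in the box and record in which
 * decomposition direction(s) it left the local cell, (2) pack the moving
 * charge groups and their state into per-direction buffers, (3) exchange
 * these buffers with the neighbors along each decomposition dimension and
 * (4) either keep the received charge groups as new home groups or forward
 * them along a remaining dimension.
 */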
4177 static int dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4178                               gmx_domdec_t *dd,ivec tric_dir,
4179                               t_state *state,rvec **f,
4180                               t_forcerec *fr,t_mdatoms *md,
4181                               gmx_bool bCompact,
4182                               t_nrnb *nrnb)
4183 {
4184     int  *move;
4185     int  npbcdim;
4186     int  ncg[DIM*2],nat[DIM*2];
4187     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4188     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4189     int  sbuf[2],rbuf[2];
4190     int  home_pos_cg,home_pos_at,ncg_stay_home,buf_pos;
4191     int  flag;
4192     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4193     gmx_bool bScrew;
4194     ivec dev;
4195     real inv_ncg,pos_d;
4196     matrix tcm;
4197     rvec *cg_cm,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4198     atom_id *cgindex;
4199     cginfo_mb_t *cginfo_mb;
4200     gmx_domdec_comm_t *comm;
4201     
4202     if (dd->bScrewPBC)
4203     {
4204         check_screw_box(state->box);
4205     }
4206     
4207     comm  = dd->comm;
4208     cg_cm = fr->cg_cm;
4209     
4210     for(i=0; i<estNR; i++)
4211     {
4212         if (EST_DISTR(i))
4213         {
4214             switch (i)
4215             {
4216             case estX:   /* Always present */            break;
4217             case estV:   bV   = (state->flags & (1<<i)); break;
4218             case estSDX: bSDX = (state->flags & (1<<i)); break;
4219             case estCGP: bCGP = (state->flags & (1<<i)); break;
4220             case estLD_RNG:
4221             case estLD_RNGI:
4222             case estDISRE_INITF:
4223             case estDISRE_RM3TAV:
4224             case estORIRE_INITF:
4225             case estORIRE_DTAV:
4226                 /* No processing required */
4227                 break;
4228             default:
4229                 gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4230             }
4231         }
4232     }
4233     
4234     if (dd->ncg_tot > comm->nalloc_int)
4235     {
4236         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4237         srenew(comm->buf_int,comm->nalloc_int);
4238     }
4239     move = comm->buf_int;
4240     
4241     /* Clear the count */
4242     for(c=0; c<dd->ndim*2; c++)
4243     {
4244         ncg[c] = 0;
4245         nat[c] = 0;
4246     }
4247
4248     npbcdim = dd->npbcdim;
4249
4250     for(d=0; (d<DIM); d++)
4251     {
4252         limitd[d] = dd->comm->cellsize_min[d];
4253         if (d >= npbcdim && dd->ci[d] == 0)
4254         {
4255             cell_x0[d] = -GMX_FLOAT_MAX;
4256         }
4257         else
4258         {
4259             cell_x0[d] = comm->cell_x0[d];
4260         }
4261         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4262         {
4263             cell_x1[d] = GMX_FLOAT_MAX;
4264         }
4265         else
4266         {
4267             cell_x1[d] = comm->cell_x1[d];
4268         }
4269         if (d < npbcdim)
4270         {
4271             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4272             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4273         }
4274         else
4275         {
4276             /* We check after communication if a charge group moved
4277              * more than one cell. Set the pre-comm check limit to float_max.
4278              */
4279             limit0[d] = -GMX_FLOAT_MAX;
4280             limit1[d] =  GMX_FLOAT_MAX;
4281         }
4282     }
4283     
4284     make_tric_corr_matrix(npbcdim,state->box,tcm);
4285     
4286     cgindex = dd->cgindex;
4287     
4288     /* Compute the center of geometry for all home charge groups
4289      * and put them in the box and determine where they should go.
4290      */
4291     for(cg=0; cg<dd->ncg_home; cg++)
4292     {
4293         k0   = cgindex[cg];
4294         k1   = cgindex[cg+1];
4295         nrcg = k1 - k0;
4296         if (nrcg == 1)
4297         {
4298             copy_rvec(state->x[k0],cm_new);
4299         }
4300         else
4301         {
4302             inv_ncg = 1.0/nrcg;
4303             
4304             clear_rvec(cm_new);
4305             for(k=k0; (k<k1); k++)
4306             {
4307                 rvec_inc(cm_new,state->x[k]);
4308             }
4309             for(d=0; (d<DIM); d++)
4310             {
4311                 cm_new[d] = inv_ncg*cm_new[d];
4312             }
4313         }
4314         
4315         clear_ivec(dev);
4316         /* Do pbc and check DD cell boundary crossings */
4317         for(d=DIM-1; d>=0; d--)
4318         {
4319             if (dd->nc[d] > 1)
4320             {
4321                 bScrew = (dd->bScrewPBC && d == XX);
4322                 /* Determine the location of this cg in lattice coordinates */
4323                 pos_d = cm_new[d];
4324                 if (tric_dir[d])
4325                 {
4326                     for(d2=d+1; d2<DIM; d2++)
4327                     {
4328                         pos_d += cm_new[d2]*tcm[d2][d];
4329                     }
4330                 }
4331                 /* Put the charge group in the triclinic unit-cell */
4332                 if (pos_d >= cell_x1[d])
4333                 {
4334                     if (pos_d >= limit1[d])
4335                     {
4336                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4337                                       cg_cm[cg],cm_new,pos_d);
4338                     }
4339                     dev[d] = 1;
4340                     if (dd->ci[d] == dd->nc[d] - 1)
4341                     {
4342                         rvec_dec(cm_new,state->box[d]);
4343                         if (bScrew)
4344                         {
4345                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4346                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4347                         }
4348                         for(k=k0; (k<k1); k++)
4349                         {
4350                             rvec_dec(state->x[k],state->box[d]);
4351                             if (bScrew)
4352                             {
4353                                 rotate_state_atom(state,k);
4354                             }
4355                         }
4356                     }
4357                 }
4358                 else if (pos_d < cell_x0[d])
4359                 {
4360                     if (pos_d < limit0[d])
4361                     {
4362                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4363                                       cg_cm[cg],cm_new,pos_d);
4364                     }
4365                     dev[d] = -1;
4366                     if (dd->ci[d] == 0)
4367                     {
4368                         rvec_inc(cm_new,state->box[d]);
4369                         if (bScrew)
4370                         {
4371                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4372                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4373                         }
4374                         for(k=k0; (k<k1); k++)
4375                         {
4376                             rvec_inc(state->x[k],state->box[d]);
4377                             if (bScrew)
4378                             {
4379                                 rotate_state_atom(state,k);
4380                             }
4381                         }
4382                     }
4383                 }
4384             }
4385             else if (d < npbcdim)
4386             {
4387                 /* Put the charge group in the rectangular unit-cell */
4388                 while (cm_new[d] >= state->box[d][d])
4389                 {
4390                     rvec_dec(cm_new,state->box[d]);
4391                     for(k=k0; (k<k1); k++)
4392                     {
4393                         rvec_dec(state->x[k],state->box[d]);
4394                     }
4395                 }
4396                 while (cm_new[d] < 0)
4397                 {
4398                     rvec_inc(cm_new,state->box[d]);
4399                     for(k=k0; (k<k1); k++)
4400                     {
4401                         rvec_inc(state->x[k],state->box[d]);
4402                     }
4403                 }
4404             }
4405         }
4406     
4407         copy_rvec(cm_new,cg_cm[cg]);
4408         
4409         /* Determine where this cg should go */
4410         flag = 0;
4411         mc = -1;
4412         for(d=0; d<dd->ndim; d++)
4413         {
4414             dim = dd->dim[d];
4415             if (dev[dim] == 1)
4416             {
4417                 flag |= DD_FLAG_FW(d);
4418                 if (mc == -1)
4419                 {
4420                     mc = d*2;
4421                 }
4422             }
4423             else if (dev[dim] == -1)
4424             {
4425                 flag |= DD_FLAG_BW(d);
4426                 if (mc == -1) {
4427                     if (dd->nc[dim] > 2)
4428                     {
4429                         mc = d*2 + 1;
4430                     }
4431                     else
4432                     {
4433                         mc = d*2;
4434                     }
4435                 }
4436             }
4437         }
4438         move[cg] = mc;
4439         if (mc >= 0)
4440         {
4441             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4442             {
4443                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4444                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4445             }
4446             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4447             /* We store the cg size in the lower 16 bits
4448              * and the place where the charge group should go
4449              * in the next 6 bits. This saves some communication volume.
4450              */
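            /* Illustrative example (the exact bit positions are set by the
             * DD_FLAG_FW/DD_FLAG_BW macros): a charge group of 3 atoms moving
             * forward in decomposition dimension 1 is encoded as
             * 3 | DD_FLAG_FW(1) and its size is recovered later with
             * flag & DD_FLAG_NRCG.
             */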
4451             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4452             ncg[mc] += 1;
4453             nat[mc] += nrcg;
4454         }
4455     }
4456     
4457     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4458     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4459     
4460     nvec = 1;
4461     if (bV)
4462     {
4463         nvec++;
4464     }
4465     if (bSDX)
4466     {
4467         nvec++;
4468     }
4469     if (bCGP)
4470     {
4471         nvec++;
4472     }
4473     
4474     /* Make sure the communication buffers are large enough */
4475     for(mc=0; mc<dd->ndim*2; mc++)
4476     {
4477         nvr = ncg[mc] + nat[mc]*nvec;
4478         if (nvr > comm->cgcm_state_nalloc[mc])
4479         {
4480             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4481             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4482         }
4483     }
4484     
4485     /* Recalculating cg_cm might be cheaper than communicating,
4486      * but that could give rise to rounding issues.
4487      */
4488     home_pos_cg =
4489         compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4490                                 nvec,cg_cm,comm,bCompact);
4491     
4492     vec = 0;
4493     home_pos_at =
4494         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4495                                 nvec,vec++,state->x,comm,bCompact);
4496     if (bV)
4497     {
4498         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4499                                 nvec,vec++,state->v,comm,bCompact);
4500     }
4501     if (bSDX)
4502     {
4503         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4504                                 nvec,vec++,state->sd_X,comm,bCompact);
4505     }
4506     if (bCGP)
4507     {
4508         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4509                                 nvec,vec++,state->cg_p,comm,bCompact);
4510     }
4511     
4512     if (bCompact)
4513     {
4514         compact_ind(dd->ncg_home,move,
4515                     dd->index_gl,dd->cgindex,dd->gatindex,
4516                     dd->ga2la,comm->bLocalCG,
4517                     fr->cginfo);
4518     }
4519     else
4520     {
4521         clear_and_mark_ind(dd->ncg_home,move,
4522                            dd->index_gl,dd->cgindex,dd->gatindex,
4523                            dd->ga2la,comm->bLocalCG,
4524                            fr->ns.grid->cell_index);
4525     }
4526     
4527     cginfo_mb = fr->cginfo_mb;
4528
4529     ncg_stay_home = home_pos_cg;
4530     for(d=0; d<dd->ndim; d++)
4531     {
4532         dim = dd->dim[d];
4533         ncg_recv = 0;
4534         nat_recv = 0;
4535         nvr      = 0;
4536         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4537         {
4538             cdd = d*2 + dir;
4539             /* Communicate the cg and atom counts */
4540             sbuf[0] = ncg[cdd];
4541             sbuf[1] = nat[cdd];
4542             if (debug)
4543             {
4544                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4545                         d,dir,sbuf[0],sbuf[1]);
4546             }
4547             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4548             
4549             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4550             {
4551                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4552                 srenew(comm->buf_int,comm->nalloc_int);
4553             }
4554             
4555             /* Communicate the charge group indices, sizes and flags */
4556             dd_sendrecv_int(dd, d, dir,
4557                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4558                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4559             
4560             nvs = ncg[cdd] + nat[cdd]*nvec;
4561             i   = rbuf[0]  + rbuf[1] *nvec;
4562             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4563             
4564             /* Communicate cgcm and state */
4565             dd_sendrecv_rvec(dd, d, dir,
4566                              comm->cgcm_state[cdd], nvs,
4567                              comm->vbuf.v+nvr, i);
4568             ncg_recv += rbuf[0];
4569             nat_recv += rbuf[1];
4570             nvr      += i;
4571         }
4572         
4573         /* Process the received charge groups */
4574         buf_pos = 0;
4575         for(cg=0; cg<ncg_recv; cg++)
4576         {
4577             flag = comm->buf_int[cg*DD_CGIBS+1];
4578
4579             if (dim >= npbcdim && dd->nc[dim] > 2)
4580             {
4581                 /* No pbc in this dim and more than one domain boundary.
4582                  * We do a separate check that a charge group did not move too far.
4583                  */
4584                 if (((flag & DD_FLAG_FW(d)) &&
4585                      comm->vbuf.v[buf_pos][d] > cell_x1[dim]) ||
4586                     ((flag & DD_FLAG_BW(d)) &&
4587                      comm->vbuf.v[buf_pos][d] < cell_x0[dim]))
4588                 {
4589                     cg_move_error(fplog,dd,step,cg,d,
4590                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4591                                    FALSE,0,
4592                                    comm->vbuf.v[buf_pos],
4593                                    comm->vbuf.v[buf_pos],
4594                                    comm->vbuf.v[buf_pos][d]);
4595                 }
4596             }
4597
4598             mc = -1;
4599             if (d < dd->ndim-1)
4600             {
4601                 /* Check which direction this cg should go */
4602                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4603                 {
4604                     if (dd->bGridJump)
4605                     {
4606                         /* The cell boundaries for dimension d2 are not equal
4607                          * for each cell row of the lower dimension(s),
4608                          * therefore we might need to redetermine where
4609                          * this cg should go.
4610                          */
4611                         dim2 = dd->dim[d2];
4612                         /* If this cg crosses the box boundary in dimension d2
4613                          * we can use the communicated flag, so we do not
4614                          * have to worry about pbc.
4615                          */
4616                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4617                                (flag & DD_FLAG_FW(d2))) ||
4618                               (dd->ci[dim2] == 0 &&
4619                                (flag & DD_FLAG_BW(d2)))))
4620                         {
4621                             /* Clear the two flags for this dimension */
4622                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4623                             /* Determine the location of this cg
4624                              * in lattice coordinates
4625                              */
4626                             pos_d = comm->vbuf.v[buf_pos][dim2];
4627                             if (tric_dir[dim2])
4628                             {
4629                                 for(d3=dim2+1; d3<DIM; d3++)
4630                                 {
4631                                     pos_d +=
4632                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4633                                 }
4634                             }
4635                             /* Check if we are not at the box edge.
4636                              * pbc is only handled in the first step above,
4637                              * but this check could move over pbc while
4638                              * the first step did not due to different rounding.
4639                              */
4640                             if (pos_d >= cell_x1[dim2] &&
4641                                 dd->ci[dim2] != dd->nc[dim2]-1)
4642                             {
4643                                 flag |= DD_FLAG_FW(d2);
4644                             }
4645                             else if (pos_d < cell_x0[dim2] &&
4646                                      dd->ci[dim2] != 0)
4647                             {
4648                                 flag |= DD_FLAG_BW(d2);
4649                             }
4650                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4651                         }
4652                     }
4653                     /* Set to which neighboring cell this cg should go */
4654                     if (flag & DD_FLAG_FW(d2))
4655                     {
4656                         mc = d2*2;
4657                     }
4658                     else if (flag & DD_FLAG_BW(d2))
4659                     {
4660                         if (dd->nc[dd->dim[d2]] > 2)
4661                         {
4662                             mc = d2*2+1;
4663                         }
4664                         else
4665                         {
4666                             mc = d2*2;
4667                         }
4668                     }
4669                 }
4670             }
4671             
4672             nrcg = flag & DD_FLAG_NRCG;
4673             if (mc == -1)
4674             {
4675                 if (home_pos_cg+1 > dd->cg_nalloc)
4676                 {
4677                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4678                     srenew(dd->index_gl,dd->cg_nalloc);
4679                     srenew(dd->cgindex,dd->cg_nalloc+1);
4680                 }
4681                 /* Set the global charge group index and size */
4682                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4683                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4684                 /* Copy the state from the buffer */
4685                 if (home_pos_cg >= fr->cg_nalloc)
4686                 {
4687                     dd_realloc_fr_cg(fr,home_pos_cg+1);
4688                     cg_cm = fr->cg_cm;
4689                 }
4690                 copy_rvec(comm->vbuf.v[buf_pos++],cg_cm[home_pos_cg]);
4691                 /* Set the cginfo */
4692                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4693                                                    dd->index_gl[home_pos_cg]);
4694                 if (comm->bLocalCG)
4695                 {
4696                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4697                 }
4698
4699                 if (home_pos_at+nrcg > state->nalloc)
4700                 {
4701                     dd_realloc_state(state,f,home_pos_at+nrcg);
4702                 }
4703                 for(i=0; i<nrcg; i++)
4704                 {
4705                     copy_rvec(comm->vbuf.v[buf_pos++],
4706                               state->x[home_pos_at+i]);
4707                 }
4708                 if (bV)
4709                 {
4710                     for(i=0; i<nrcg; i++)
4711                     {
4712                         copy_rvec(comm->vbuf.v[buf_pos++],
4713                                   state->v[home_pos_at+i]);
4714                     }
4715                 }
4716                 if (bSDX)
4717                 {
4718                     for(i=0; i<nrcg; i++)
4719                     {
4720                         copy_rvec(comm->vbuf.v[buf_pos++],
4721                                   state->sd_X[home_pos_at+i]);
4722                     }
4723                 }
4724                 if (bCGP)
4725                 {
4726                     for(i=0; i<nrcg; i++)
4727                     {
4728                         copy_rvec(comm->vbuf.v[buf_pos++],
4729                                   state->cg_p[home_pos_at+i]);
4730                     }
4731                 }
4732                 home_pos_cg += 1;
4733                 home_pos_at += nrcg;
4734             }
4735             else
4736             {
4737                 /* Reallocate the buffers if necessary  */
4738                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4739                 {
4740                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4741                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4742                 }
4743                 nvr = ncg[mc] + nat[mc]*nvec;
4744                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
4745                 {
4746                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
4747                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4748                 }
4749                 /* Copy from the receive to the send buffers */
4750                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
4751                        comm->buf_int + cg*DD_CGIBS,
4752                        DD_CGIBS*sizeof(int));
4753                 memcpy(comm->cgcm_state[mc][nvr],
4754                        comm->vbuf.v[buf_pos],
4755                        (1+nrcg*nvec)*sizeof(rvec));
4756                 buf_pos += 1 + nrcg*nvec;
4757                 ncg[mc] += 1;
4758                 nat[mc] += nrcg;
4759             }
4760         }
4761     }
4762     
4763     /* With sorting (!bCompact) the indices are now only partially up to date
4764      * and ncg_home and nat_home are not the real count, since there are
4765      * "holes" in the arrays for the charge groups that moved to neighbors.
4766      */
4767     dd->ncg_home = home_pos_cg;
4768     dd->nat_home = home_pos_at;
4769
4770     if (debug)
4771     {
4772         fprintf(debug,"Finished repartitioning\n");
4773     }
4774
4775     return ncg_stay_home;
4776 }
4777
4778 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
4779 {
4780     dd->comm->cycl[ddCycl] += cycles;
4781     dd->comm->cycl_n[ddCycl]++;
4782     if (cycles > dd->comm->cycl_max[ddCycl])
4783     {
4784         dd->comm->cycl_max[ddCycl] = cycles;
4785     }
4786 }
4787
4788 static double force_flop_count(t_nrnb *nrnb)
4789 {
4790     int i;
4791     double sum;
4792     const char *name;
4793
4794     sum = 0;
4795     for(i=eNR_NBKERNEL010; i<eNR_NBKERNEL_FREE_ENERGY; i++)
4796     {
4797         /* To get closer to the real timings, we halve the count
4798          * for the normal loops and halve it again for the water loops.
4799          */
4800         name = nrnb_str(i);
4801         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4802         {
4803             sum += nrnb->n[i]*0.25*cost_nrnb(i);
4804         }
4805         else
4806         {
4807             sum += nrnb->n[i]*0.50*cost_nrnb(i);
4808         }
4809     }
4810     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
4811     {
4812         name = nrnb_str(i);
4813         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
4814             sum += nrnb->n[i]*cost_nrnb(i);
4815     }
4816     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
4817     {
4818         sum += nrnb->n[i]*cost_nrnb(i);
4819     }
4820
4821     return sum;
4822 }
4823
4824 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
4825 {
4826     if (dd->comm->eFlop)
4827     {
4828         dd->comm->flop -= force_flop_count(nrnb);
4829     }
4830 }
4831 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
4832 {
4833     if (dd->comm->eFlop)
4834     {
4835         dd->comm->flop += force_flop_count(nrnb);
4836         dd->comm->flop_n++;
4837     }
4838 }  
4839
4840 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
4841 {
4842     int i;
4843     
4844     for(i=0; i<ddCyclNr; i++)
4845     {
4846         dd->comm->cycl[i] = 0;
4847         dd->comm->cycl_n[i] = 0;
4848         dd->comm->cycl_max[i] = 0;
4849     }
4850     dd->comm->flop = 0;
4851     dd->comm->flop_n = 0;
4852 }
4853
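/* Sketch of the load row gathered in the function below: depending on
 * dynamic load balancing, separate PME nodes and the dimension index d,
 * each rank packs some of
 *   load sum, load max, load used for scaling, relative cell volume,
 *   limit flags, cell_f_max0/cell_f_min1, PP-during-PME and PME cycle counts
 * into sbuf, and the row root accumulates the gathered rows into
 * comm->load[d].
 */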
4854 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
4855 {
4856     gmx_domdec_comm_t *comm;
4857     gmx_domdec_load_t *load;
4858     gmx_domdec_root_t *root=NULL;
4859     int  d,dim,cid,i,pos;
4860     float cell_frac=0,sbuf[DD_NLOAD_MAX];
4861     gmx_bool bSepPME;
4862     
4863     if (debug)
4864     {
4865         fprintf(debug,"get_load_distribution start\n");
4866     }
4867
4868     wallcycle_start(wcycle,ewcDDCOMMLOAD);
4869     
4870     comm = dd->comm;
4871     
4872     bSepPME = (dd->pme_nodeid >= 0);
4873     
4874     for(d=dd->ndim-1; d>=0; d--)
4875     {
4876         dim = dd->dim[d];
4877         /* Check if we participate in the communication in this dimension */
4878         if (d == dd->ndim-1 || 
4879             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
4880         {
4881             load = &comm->load[d];
4882             if (dd->bGridJump)
4883             {
4884                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
4885             }
4886             pos = 0;
4887             if (d == dd->ndim-1)
4888             {
4889                 sbuf[pos++] = dd_force_load(comm);
4890                 sbuf[pos++] = sbuf[0];
4891                 if (dd->bGridJump)
4892                 {
4893                     sbuf[pos++] = sbuf[0];
4894                     sbuf[pos++] = cell_frac;
4895                     if (d > 0)
4896                     {
4897                         sbuf[pos++] = comm->cell_f_max0[d];
4898                         sbuf[pos++] = comm->cell_f_min1[d];
4899                     }
4900                 }
4901                 if (bSepPME)
4902                 {
4903                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
4904                     sbuf[pos++] = comm->cycl[ddCyclPME];
4905                 }
4906             }
4907             else
4908             {
4909                 sbuf[pos++] = comm->load[d+1].sum;
4910                 sbuf[pos++] = comm->load[d+1].max;
4911                 if (dd->bGridJump)
4912                 {
4913                     sbuf[pos++] = comm->load[d+1].sum_m;
4914                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
4915                     sbuf[pos++] = comm->load[d+1].flags;
4916                     if (d > 0)
4917                     {
4918                         sbuf[pos++] = comm->cell_f_max0[d];
4919                         sbuf[pos++] = comm->cell_f_min1[d];
4920                     }
4921                 }
4922                 if (bSepPME)
4923                 {
4924                     sbuf[pos++] = comm->load[d+1].mdf;
4925                     sbuf[pos++] = comm->load[d+1].pme;
4926                 }
4927             }
4928             load->nload = pos;
4929             /* Communicate a row in DD direction d.
4930              * The communicators are set up such that the root always has rank 0.
4931              */
4932 #ifdef GMX_MPI
4933             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
4934                        load->load,load->nload*sizeof(float),MPI_BYTE,
4935                        0,comm->mpi_comm_load[d]);
4936 #endif
4937             if (dd->ci[dim] == dd->master_ci[dim])
4938             {
4939                 /* We are the root, process this row */
4940                 if (comm->bDynLoadBal)
4941                 {
4942                     root = comm->root[d];
4943                 }
4944                 load->sum = 0;
4945                 load->max = 0;
4946                 load->sum_m = 0;
4947                 load->cvol_min = 1;
4948                 load->flags = 0;
4949                 load->mdf = 0;
4950                 load->pme = 0;
4951                 pos = 0;
4952                 for(i=0; i<dd->nc[dim]; i++)
4953                 {
4954                     load->sum += load->load[pos++];
4955                     load->max = max(load->max,load->load[pos]);
4956                     pos++;
4957                     if (dd->bGridJump)
4958                     {
4959                         if (root->bLimited)
4960                         {
4961                             /* This direction could not be load balanced properly,
4962                              * therefore we need to use the maximum instead of the average load.
4963                              */
4964                             load->sum_m = max(load->sum_m,load->load[pos]);
4965                         }
4966                         else
4967                         {
4968                             load->sum_m += load->load[pos];
4969                         }
4970                         pos++;
4971                         load->cvol_min = min(load->cvol_min,load->load[pos]);
4972                         pos++;
4973                         if (d < dd->ndim-1)
4974                         {
4975                             load->flags = (int)(load->load[pos++] + 0.5);
4976                         }
4977                         if (d > 0)
4978                         {
4979                             root->cell_f_max0[i] = load->load[pos++];
4980                             root->cell_f_min1[i] = load->load[pos++];
4981                         }
4982                     }
4983                     if (bSepPME)
4984                     {
4985                         load->mdf = max(load->mdf,load->load[pos]);
4986                         pos++;
4987                         load->pme = max(load->pme,load->load[pos]);
4988                         pos++;
4989                     }
4990                 }
4991                 if (comm->bDynLoadBal && root->bLimited)
4992                 {
4993                     load->sum_m *= dd->nc[dim];
4994                     load->flags |= (1<<d);
4995                 }
4996             }
4997         }
4998     }
4999
5000     if (DDMASTER(dd))
5001     {
5002         comm->nload      += dd_load_count(comm);
5003         comm->load_step  += comm->cycl[ddCyclStep];
5004         comm->load_sum   += comm->load[0].sum;
5005         comm->load_max   += comm->load[0].max;
5006         if (comm->bDynLoadBal)
5007         {
5008             for(d=0; d<dd->ndim; d++)
5009             {
5010                 if (comm->load[0].flags & (1<<d))
5011                 {
5012                     comm->load_lim[d]++;
5013                 }
5014             }
5015         }
5016         if (bSepPME)
5017         {
5018             comm->load_mdf += comm->load[0].mdf;
5019             comm->load_pme += comm->load[0].pme;
5020         }
5021     }
5022
5023     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5024     
5025     if (debug)
5026     {
5027         fprintf(debug,"get_load_distribution finished\n");
5028     }
5029 }
5030
5031 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5032 {
5033     /* Return the relative performance loss on the total run time
5034      * due to the force calculation load imbalance.
5035      */
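    /* comm->load_max accumulates the maximum load over all ranks per load
     * measurement and comm->load_sum the sum, so load_max*nnodes - load_sum
     * is the total number of cycles spent waiting; dividing by the total
     * cycle count nnodes*load_step gives the lost fraction of the run time.
     */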
5036     if (dd->comm->nload > 0)
5037     {
5038         return
5039             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5040             (dd->comm->load_step*dd->nnodes);
5041     }
5042     else
5043     {
5044         return 0;
5045     }
5046 }
5047
5048 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5049 {
5050     char  buf[STRLEN];
5051     int   npp,npme,nnodes,d,limp;
5052     float imbal,pme_f_ratio,lossf,lossp=0;
5053     gmx_bool  bLim;
5054     gmx_domdec_comm_t *comm;
5055
5056     comm = dd->comm;
5057     if (DDMASTER(dd) && comm->nload > 0)
5058     {
5059         npp    = dd->nnodes;
5060         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5061         nnodes = npp + npme;
5062         imbal = comm->load_max*npp/comm->load_sum - 1;
5063         lossf = dd_force_imb_perf_loss(dd);
5064         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5065         fprintf(fplog,"%s",buf);
5066         fprintf(stderr,"\n");
5067         fprintf(stderr,"%s",buf);
5068         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5069         fprintf(fplog,"%s",buf);
5070         fprintf(stderr,"%s",buf);
5071         bLim = FALSE;
5072         if (comm->bDynLoadBal)
5073         {
5074             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5075             for(d=0; d<dd->ndim; d++)
5076             {
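                /* Percentage of load measurements in which dimension d hit its
                 * cell size limit, in integer arithmetic: e.g. load_lim = 3
                 * out of nload = 10 measurements gives limp = 30.
                 */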
5077                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5078                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5079                 if (limp >= 50)
5080                 {
5081                     bLim = TRUE;
5082                 }
5083             }
5084             sprintf(buf+strlen(buf),"\n");
5085             fprintf(fplog,"%s",buf);
5086             fprintf(stderr,"%s",buf);
5087         }
5088         if (npme > 0)
5089         {
5090             pme_f_ratio = comm->load_pme/comm->load_mdf;
5091             lossp = (comm->load_pme -comm->load_mdf)/comm->load_step;
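            /* When lossp <= 0 the PME nodes have less work and idle, so the
             * loss is weighted by the fraction of PME nodes; otherwise the PP
             * nodes wait for PME and the loss is weighted by the PP fraction.
             */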
5092             if (lossp <= 0)
5093             {
5094                 lossp *= (float)npme/(float)nnodes;
5095             }
5096             else
5097             {
5098                 lossp *= (float)npp/(float)nnodes;
5099             }
5100             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5101             fprintf(fplog,"%s",buf);
5102             fprintf(stderr,"%s",buf);
5103             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5104             fprintf(fplog,"%s",buf);
5105             fprintf(stderr,"%s",buf);
5106         }
5107         fprintf(fplog,"\n");
5108         fprintf(stderr,"\n");
5109         
5110         if (lossf >= DD_PERF_LOSS)
5111         {
5112             sprintf(buf,
5113                     "NOTE: %.1f %% performance was lost due to load imbalance\n"
5114                     "      in the domain decomposition.\n",lossf*100);
5115             if (!comm->bDynLoadBal)
5116             {
5117                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb).\n");
5118             }
5119             else if (bLim)
5120             {
5121                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5122             }
5123             fprintf(fplog,"%s\n",buf);
5124             fprintf(stderr,"%s\n",buf);
5125         }
5126         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5127         {
5128             sprintf(buf,
5129                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5130                     "      had %s work to do than the PP nodes.\n"
5131                     "      You might want to %s the number of PME nodes\n"
5132                     "      or %s the cut-off and the grid spacing.\n",
5133                     fabs(lossp*100),
5134                     (lossp < 0) ? "less"     : "more",
5135                     (lossp < 0) ? "decrease" : "increase",
5136                     (lossp < 0) ? "decrease" : "increase");
5137             fprintf(fplog,"%s\n",buf);
5138             fprintf(stderr,"%s\n",buf);
5139         }
5140     }
5141 }
5142
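/* Returns the minimum cell volume relative to the average cell volume
 * (the average relative volume is 1/nnodes), so a uniform grid gives 1.
 */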
5143 static float dd_vol_min(gmx_domdec_t *dd)
5144 {
5145     return dd->comm->load[0].cvol_min*dd->nnodes;
5146 }
5147
5148 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5149 {
5150     return dd->comm->load[0].flags;
5151 }
5152
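/* Returns the relative force-load imbalance: the maximum load over all ranks
 * divided by the average load, minus 1, so 0 means perfectly balanced.
 */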
5153 static float dd_f_imbal(gmx_domdec_t *dd)
5154 {
5155     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5156 }
5157
5158 static float dd_pme_f_ratio(gmx_domdec_t *dd)
5159 {
5160     return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5161 }
5162
5163 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5164 {
5165     int flags,d;
5166     char buf[22];
5167     
5168     flags = dd_load_flags(dd);
5169     if (flags)
5170     {
5171         fprintf(fplog,
5172                 "DD  load balancing is limited by minimum cell size in dimension");
5173         for(d=0; d<dd->ndim; d++)
5174         {
5175             if (flags & (1<<d))
5176             {
5177                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5178             }
5179         }
5180         fprintf(fplog,"\n");
5181     }
5182     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5183     if (dd->comm->bDynLoadBal)
5184     {
5185         fprintf(fplog,"  vol min/aver %5.3f%c",
5186                 dd_vol_min(dd),flags ? '!' : ' ');
5187     }
5188     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5189     if (dd->comm->cycl_n[ddCyclPME])
5190     {
5191         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5192     }
5193     fprintf(fplog,"\n\n");
5194 }
5195
5196 static void dd_print_load_verbose(gmx_domdec_t *dd)
5197 {
5198     if (dd->comm->bDynLoadBal)
5199     {
5200         fprintf(stderr,"vol %4.2f%c ",
5201                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5202     }
5203     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5204     if (dd->comm->cycl_n[ddCyclPME])
5205     {
5206         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5207     }
5208 }
5209
5210 #ifdef GMX_MPI
5211 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
5212 {
5213     MPI_Comm  c_row;
5214     int  dim, i, rank;
5215     ivec loc_c;
5216     gmx_domdec_root_t *root;
5217     gmx_bool bPartOfGroup = FALSE;
5218     
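    /* loc fixes the cell coordinates in all dimensions except dim; the ranks
     * whose own coordinates match loc in those dimensions form one row.
     * All ranks call MPI_Comm_split collectively below, but only the members
     * of this row pass color 0, so each call creates one row communicator.
     */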
5219     dim = dd->dim[dim_ind];
5220     copy_ivec(loc,loc_c);
5221     for(i=0; i<dd->nc[dim]; i++)
5222     {
5223         loc_c[dim] = i;
5224         rank = dd_index(dd->nc,loc_c);
5225         if (rank == dd->rank)
5226         {
5227             /* This process is part of the group */
5228             bPartOfGroup = TRUE;
5229         }
5230     }
5231     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup?0:MPI_UNDEFINED, dd->rank,
5232                    &c_row);
5233     if (bPartOfGroup)
5234     {
5235         dd->comm->mpi_comm_load[dim_ind] = c_row;
5236         if (dd->comm->eDLB != edlbNO)
5237         {
5238             if (dd->ci[dim] == dd->master_ci[dim])
5239             {
5240                 /* This is the root process of this row */
5241                 snew(dd->comm->root[dim_ind],1);
5242                 root = dd->comm->root[dim_ind];
5243                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5244                 snew(root->old_cell_f,dd->nc[dim]+1);
5245                 snew(root->bCellMin,dd->nc[dim]);
5246                 if (dim_ind > 0)
5247                 {
5248                     snew(root->cell_f_max0,dd->nc[dim]);
5249                     snew(root->cell_f_min1,dd->nc[dim]);
5250                     snew(root->bound_min,dd->nc[dim]);
5251                     snew(root->bound_max,dd->nc[dim]);
5252                 }
5253                 snew(root->buf_ncd,dd->nc[dim]);
5254             }
5255             else
5256             {
5257                 /* This is not a root process, we only need to receive cell_f */
5258                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5259             }
5260         }
5261         if (dd->ci[dim] == dd->master_ci[dim])
5262         {
5263             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5264         }
5265     }
5266 }
5267 #endif
5268
5269 static void make_load_communicators(gmx_domdec_t *dd)
5270 {
5271 #ifdef GMX_MPI
5272     int  dim0,dim1,i,j;
5273     ivec loc;
5274
5275     if (debug)
5276         fprintf(debug,"Making load communicators\n");
5277
5278     snew(dd->comm->load,dd->ndim);
5279     snew(dd->comm->mpi_comm_load,dd->ndim);
5280
5281     clear_ivec(loc);
5282     make_load_communicator(dd,0,loc);
5283     if (dd->ndim > 1) {
5284         dim0 = dd->dim[0];
5285         for(i=0; i<dd->nc[dim0]; i++) {
5286             loc[dim0] = i;
5287             make_load_communicator(dd,1,loc);
5288         }
5289     }
5290     if (dd->ndim > 2) {
5291         dim0 = dd->dim[0];
5292         for(i=0; i<dd->nc[dim0]; i++) {
5293             loc[dim0] = i;
5294             dim1 = dd->dim[1];
5295             for(j=0; j<dd->nc[dim1]; j++) {
5296                 loc[dim1] = j;
5297                 make_load_communicator(dd,2,loc);
5298             }
5299         }
5300     }
5301
5302     if (debug)
5303         fprintf(debug,"Finished making load communicators\n");
5304 #endif
5305 }
5306
5307 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5308 {
5309     gmx_bool bZYX;
5310     int  d,dim,i,j,m;
5311     ivec tmp,s;
5312     int  nzone,nzonep;
5313     ivec dd_zp[DD_MAXIZONE];
5314     gmx_domdec_zones_t *zones;
5315     gmx_domdec_ns_ranges_t *izone;
5316     
5317     for(d=0; d<dd->ndim; d++)
5318     {
5319         dim = dd->dim[d];
5320         copy_ivec(dd->ci,tmp);
5321         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5322         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5323         copy_ivec(dd->ci,tmp);
5324         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5325         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5326         if (debug)
5327         {
5328             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5329                     dd->rank,dim,
5330                     dd->neighbor[d][0],
5331                     dd->neighbor[d][1]);
5332         }
5333     }
5334     
5335     if (DDMASTER(dd))
5336     {
5337         fprintf(stderr,"Making %dD domain decomposition %d x %d x %d\n",
5338             dd->ndim,dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5339     }
5340     if (fplog)
5341     {
5342         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5343                 dd->ndim,
5344                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5345                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5346     }
5347     switch (dd->ndim)
5348     {
5349     case 3:
5350         nzone  = dd_z3n;
5351         nzonep = dd_zp3n;
5352         for(i=0; i<nzonep; i++)
5353         {
5354             copy_ivec(dd_zp3[i],dd_zp[i]);
5355         }
5356         break;
5357     case 2:
5358         nzone  = dd_z2n;
5359         nzonep = dd_zp2n;
5360         for(i=0; i<nzonep; i++)
5361         {
5362             copy_ivec(dd_zp2[i],dd_zp[i]);
5363         }
5364         break;
5365     case 1:
5366         nzone  = dd_z1n;
5367         nzonep = dd_zp1n;
5368         for(i=0; i<nzonep; i++)
5369         {
5370             copy_ivec(dd_zp1[i],dd_zp[i]);
5371         }
5372         break;
5373     default:
5374         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5375         nzone = 0;
5376         nzonep = 0;
5377     }
5378
5379     zones = &dd->comm->zones;
5380
5381     for(i=0; i<nzone; i++)
5382     {
5383         m = 0;
5384         clear_ivec(zones->shift[i]);
5385         for(d=0; d<dd->ndim; d++)
5386         {
5387             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5388         }
5389     }
5390     
5391     zones->n = nzone;
5392     for(i=0; i<nzone; i++)
5393     {
5394         for(d=0; d<DIM; d++)
5395         {
5396             s[d] = dd->ci[d] - zones->shift[i][d];
5397             if (s[d] < 0)
5398             {
5399                 s[d] += dd->nc[d];
5400             }
5401             else if (s[d] >= dd->nc[d])
5402             {
5403                 s[d] -= dd->nc[d];
5404             }
5405         }
5406     }
5407     zones->nizone = nzonep;
5408     for(i=0; i<zones->nizone; i++)
5409     {
5410         if (dd_zp[i][0] != i)
5411         {
5412             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5413         }
5414         izone = &zones->izone[i];
5415         izone->j0 = dd_zp[i][1];
5416         izone->j1 = dd_zp[i][2];
5417         for(dim=0; dim<DIM; dim++)
5418         {
5419             if (dd->nc[dim] == 1)
5420             {
5421                 /* All shifts should be allowed */
5422                 izone->shift0[dim] = -1;
5423                 izone->shift1[dim] = 1;
5424             }
5425             else
5426             {
5427                 /*
5428                   izone->shift0[d] = 0;
5429                   izone->shift1[d] = 0;
5430                   for(j=izone->j0; j<izone->j1; j++) {
5431                   if (dd->shift[j][d] > dd->shift[i][d])
5432                   izone->shift0[d] = -1;
5433                   if (dd->shift[j][d] < dd->shift[i][d])
5434                   izone->shift1[d] = 1;
5435                   }
5436                 */
5437                 
5438                 int shift_diff;
5439                 
5440                 /* Assume the shifts are not more than one cell */
5441                 izone->shift0[dim] = 1;
5442                 izone->shift1[dim] = -1;
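                /* shift0 and shift1 deliberately start at the opposite
                 * extremes: the loop below lowers shift0 to the smallest and
                 * raises shift1 to the largest shift difference encountered.
                 */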
5443                 for(j=izone->j0; j<izone->j1; j++)
5444                 {
5445                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5446                     if (shift_diff < izone->shift0[dim])
5447                     {
5448                         izone->shift0[dim] = shift_diff;
5449                     }
5450                     if (shift_diff > izone->shift1[dim])
5451                     {
5452                         izone->shift1[dim] = shift_diff;
5453                     }
5454                 }
5455             }
5456         }
5457     }
5458     
5459     if (dd->comm->eDLB != edlbNO)
5460     {
5461         snew(dd->comm->root,dd->ndim);
5462     }
5463     
5464     if (dd->comm->bRecordLoad)
5465     {
5466         make_load_communicators(dd);
5467     }
5468 }
5469
5470 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5471 {
5472     gmx_domdec_t *dd;
5473     gmx_domdec_comm_t *comm;
5474     int  i,rank,*buf;
5475     ivec periods;
5476 #ifdef GMX_MPI
5477     MPI_Comm comm_cart;
5478 #endif
5479     
5480     dd = cr->dd;
5481     comm = dd->comm;
5482     
5483 #ifdef GMX_MPI
5484     if (comm->bCartesianPP)
5485     {
5486         /* Set up cartesian communication for the particle-particle part */
5487         if (fplog)
5488         {
5489             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5490                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5491         }
5492         
5493         for(i=0; i<DIM; i++)
5494         {
5495             periods[i] = TRUE;
5496         }
5497         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5498                         &comm_cart);
5499         /* We overwrite the old communicator with the new cartesian one */
5500         cr->mpi_comm_mygroup = comm_cart;
5501     }
5502     
5503     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5504     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5505     
5506     if (comm->bCartesianPP_PME)
5507     {
5508         /* Since we want to use the original Cartesian setup for the simulation,
5509          * and not the one after splitting, we need to make an index.
5510          */
5511         snew(comm->ddindex2ddnodeid,dd->nnodes);
5512         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5513         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5514         /* Get the rank of the DD master,
5515          * above we made sure that the master node is a PP node.
5516          */
5517         if (MASTER(cr))
5518         {
5519             rank = dd->rank;
5520         }
5521         else
5522         {
5523             rank = 0;
5524         }
5525         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5526     }
5527     else if (comm->bCartesianPP)
5528     {
5529         if (cr->npmenodes == 0)
5530         {
5531             /* The PP communicator is also
5532              * the communicator for this simulation
5533              */
5534             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5535         }
5536         cr->nodeid = dd->rank;
5537         
5538         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5539         
5540         /* We need to make an index to go from the coordinates
5541          * to the nodeid of this simulation.
5542          */
5543         snew(comm->ddindex2simnodeid,dd->nnodes);
5544         snew(buf,dd->nnodes);
5545         if (cr->duty & DUTY_PP)
5546         {
5547             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5548         }
5549         /* Communicate the ddindex to simulation nodeid index */
5550         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5551                       cr->mpi_comm_mysim);
5552         sfree(buf);
5553         
5554         /* Determine the master coordinates and rank.
5555          * The DD master should be the same node as the master of this sim.
5556          */
5557         for(i=0; i<dd->nnodes; i++)
5558         {
5559             if (comm->ddindex2simnodeid[i] == 0)
5560             {
5561                 ddindex2xyz(dd->nc,i,dd->master_ci);
5562                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5563             }
5564         }
5565         if (debug)
5566         {
5567             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5568         }
5569     }
5570     else
5571     {
5572         /* No Cartesian communicators */
5573         /* We use the rank in dd->mpi_comm_all as the DD index */
5574         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5575         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5576         dd->masterrank = 0;
5577         clear_ivec(dd->master_ci);
5578     }
5579 #endif
5580   
5581     if (fplog)
5582     {
5583         fprintf(fplog,
5584                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5585                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5586     }
5587     if (debug)
5588     {
5589         fprintf(debug,
5590                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5591                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5592     }
5593 }
5594
5595 static void receive_ddindex2simnodeid(t_commrec *cr)
5596 {
5597     gmx_domdec_t *dd;
5598     
5599     gmx_domdec_comm_t *comm;
5600     int  *buf;
5601     
5602     dd = cr->dd;
5603     comm = dd->comm;
5604     
5605 #ifdef GMX_MPI
5606     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5607     {
5608         snew(comm->ddindex2simnodeid,dd->nnodes);
5609         snew(buf,dd->nnodes);
5610         if (cr->duty & DUTY_PP)
5611         {
5612             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5613         }
5614 #ifdef GMX_MPI
5615         /* Communicate the ddindex to simulation nodeid index */
5616         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5617                       cr->mpi_comm_mysim);
5618 #endif
5619         sfree(buf);
5620     }
5621 #endif
5622 }
5623
5624 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5625                                                      int ncg,int natoms)
5626 {
5627     gmx_domdec_master_t *ma;
5628     int i;
5629
5630     snew(ma,1);
5631     
5632     snew(ma->ncg,dd->nnodes);
5633     snew(ma->index,dd->nnodes+1);
5634     snew(ma->cg,ncg);
5635     snew(ma->nat,dd->nnodes);
5636     snew(ma->ibuf,dd->nnodes*2);
5637     snew(ma->cell_x,DIM);
5638     for(i=0; i<DIM; i++)
5639     {
5640         snew(ma->cell_x[i],dd->nc[i]+1);
5641     }
5642
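    /* With at most GMX_DD_NNODES_SENDRECV nodes the master distributes and
     * collects the state with plain send/receive calls elsewhere in this file,
     * so no gather/scatter buffer is needed; for larger node counts a buffer
     * holding all atoms is allocated.
     */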
5643     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5644     {
5645         ma->vbuf = NULL;
5646     }
5647     else
5648     {
5649         snew(ma->vbuf,natoms);
5650     }
5651
5652     return ma;
5653 }
5654
5655 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5656                                int reorder)
5657 {
5658     gmx_domdec_t *dd;
5659     gmx_domdec_comm_t *comm;
5660     int  i,rank;
5661     gmx_bool bDiv[DIM];
5662     ivec periods;
5663 #ifdef GMX_MPI
5664     MPI_Comm comm_cart;
5665 #endif
5666     
5667     dd = cr->dd;
5668     comm = dd->comm;
5669     
5670     if (comm->bCartesianPP)
5671     {
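        /* The PME ranks can only be appended along y or z if they form whole
         * planes there: (npmenodes*nc[i]) % nnodes == 0 is equivalent to
         * npmenodes being a multiple of the number of cells in the
         * perpendicular plane, dd->nnodes/dd->nc[i].
         */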
5672         for(i=1; i<DIM; i++)
5673         {
5674             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5675         }
5676         if (bDiv[YY] || bDiv[ZZ])
5677         {
5678             comm->bCartesianPP_PME = TRUE;
5679             /* If we have 2D PME decomposition, which is always in x+y,
5680              * we stack the PME-only nodes in z.
5681              * Otherwise we choose the direction that provides the thinnest slab
5682              * of PME-only nodes, as this will have the least effect
5683              * on the PP communication.
5684              * But for the PME communication the opposite might be better.
5685              */
5686             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5687                              !bDiv[YY] ||
5688                              dd->nc[YY] > dd->nc[ZZ]))
5689             {
5690                 comm->cartpmedim = ZZ;
5691             }
5692             else
5693             {
5694                 comm->cartpmedim = YY;
5695             }
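            /* Extend the Cartesian grid along cartpmedim to include the PME
             * ranks: (npmenodes*nc)/nnodes is the number of appended planes.
             */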
5696             comm->ntot[comm->cartpmedim]
5697                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5698         }
5699         else if (fplog)
5700         {
5701             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5702             fprintf(fplog,
5703                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5704         }
5705     }
5706     
5707 #ifdef GMX_MPI
5708     if (comm->bCartesianPP_PME)
5709     {
5710         if (fplog)
5711         {
5712             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5713         }
5714         
5715         for(i=0; i<DIM; i++)
5716         {
5717             periods[i] = TRUE;
5718         }
5719         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5720                         &comm_cart);
5721         
5722         MPI_Comm_rank(comm_cart,&rank);
5723         if (MASTERNODE(cr) && rank != 0)
5724         {
5725             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5726         }
5727         
5728         /* With this assignment we lose the link to the original communicator,
5729          * which will usually be MPI_COMM_WORLD, unless we are running a multi-simulation.
5730          */
5731         cr->mpi_comm_mysim = comm_cart;
5732         cr->sim_nodeid = rank;
5733         
5734         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
5735         
5736         if (fplog)
5737         {
5738             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
5739                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5740         }
5741         
5742         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
5743         {
5744             cr->duty = DUTY_PP;
5745         }
5746         if (cr->npmenodes == 0 ||
5747             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
5748         {
5749             cr->duty = DUTY_PME;
5750         }
5751         
5752         /* Split the sim communicator into PP and PME only nodes */
5753         MPI_Comm_split(cr->mpi_comm_mysim,
5754                        cr->duty,
5755                        dd_index(comm->ntot,dd->ci),
5756                        &cr->mpi_comm_mygroup);
5757     }
5758     else
5759     {
5760         switch (dd_node_order)
5761         {
5762         case ddnoPP_PME:
5763             if (fplog)
5764             {
5765                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
5766             }
5767             break;
5768         case ddnoINTERLEAVE:
5769             /* Interleave the PP-only and PME-only nodes,
5770              * as on clusters with dual-core machines this will double
5771              * the communication bandwidth of the PME processes
5772              * and thus speed up the PP <-> PME and inter PME communication.
5773              */
5774             if (fplog)
5775             {
5776                 fprintf(fplog,"Interleaving PP and PME nodes\n");
5777             }
5778             comm->pmenodes = dd_pmenodes(cr);
5779             break;
5780         case ddnoCARTESIAN:
5781             break;
5782         default:
5783             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
5784         }
5785     
5786         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
5787         {
5788             cr->duty = DUTY_PME;
5789         }
5790         else
5791         {
5792             cr->duty = DUTY_PP;
5793         }
5794         
5795         /* Split the sim communicator into PP and PME only nodes */
5796         MPI_Comm_split(cr->mpi_comm_mysim,
5797                        cr->duty,
5798                        cr->nodeid,
5799                        &cr->mpi_comm_mygroup);
5800         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
5801     }
5802 #endif
5803
5804     if (fplog)
5805     {
5806         fprintf(fplog,"This is a %s only node\n\n",
5807                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
5808     }
5809 }
5810
5811 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
5812 {
5813     gmx_domdec_t *dd;
5814     gmx_domdec_comm_t *comm;
5815     int CartReorder;
5816     
5817     dd = cr->dd;
5818     comm = dd->comm;
5819     
5820     copy_ivec(dd->nc,comm->ntot);
5821     
5822     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
5823     comm->bCartesianPP_PME = FALSE;
5824     
5825     /* Reorder the nodes by default. This might change the MPI ranks.
5826      * Real reordering is only supported on very few architectures;
5827      * Blue Gene is one of them.
5828      */
5829     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
5830     
5831     if (cr->npmenodes > 0)
5832     {
5833         /* Split the communicator into a PP and PME part */
5834         split_communicator(fplog,cr,dd_node_order,CartReorder);
5835         if (comm->bCartesianPP_PME)
5836         {
5837             /* We (possibly) reordered the nodes in split_communicator,
5838              * so it is no longer required in make_pp_communicator.
5839              */
5840             CartReorder = FALSE;
5841         }
5842     }
5843     else
5844     {
5845         /* All nodes do PP and PME */
5846 #ifdef GMX_MPI    
5847         /* We do not require separate communicators */
5848         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
5849 #endif
5850     }
5851     
5852     if (cr->duty & DUTY_PP)
5853     {
5854         /* Copy or make a new PP communicator */
5855         make_pp_communicator(fplog,cr,CartReorder);
5856     }
5857     else
5858     {
5859         receive_ddindex2simnodeid(cr);
5860     }
5861     
5862     if (!(cr->duty & DUTY_PME))
5863     {
5864         /* Set up the communication to our PME node */
5865         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
5866         dd->pme_receive_vir_ener = receive_vir_ener(cr);
5867         if (debug)
5868         {
5869             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
5870                     dd->pme_nodeid,dd->pme_receive_vir_ener);
5871         }
5872     }
5873     else
5874     {
5875         dd->pme_nodeid = -1;
5876     }
5877
5878     if (DDMASTER(dd))
5879     {
5880         dd->ma = init_gmx_domdec_master_t(dd,
5881                                           comm->cgs_gl.nr,
5882                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
5883     }
5884 }
5885
5886 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
5887 {
5888     real *slb_frac,tot;
5889     int  i,n;
5890     double dbl;
5891     
5892     slb_frac = NULL;
5893     if (nc > 1 && size_string != NULL)
5894     {
5895         if (fplog)
5896         {
5897             fprintf(fplog,"Using static load balancing for the %s direction\n",
5898                     dir);
5899         }
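        /* size_string is a whitespace-separated list of nc relative widths,
         * for example "0.5 1 1" for nc = 3; the values are normalized below
         * so that they sum to 1 (0.2 0.4 0.4 in this example).
         */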
5900         snew(slb_frac,nc);
5901         tot = 0;
5902         for (i=0; i<nc; i++)
5903         {
5904             dbl = 0;
5905             sscanf(size_string,"%lf%n",&dbl,&n);
5906             if (dbl == 0)
5907             {
5908                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
5909             }
5910             slb_frac[i] = dbl;
5911             size_string += n;
5912             tot += slb_frac[i];
5913         }
5914         /* Normalize */
5915         if (fplog)
5916         {
5917             fprintf(fplog,"Relative cell sizes:");
5918         }
5919         for (i=0; i<nc; i++)
5920         {
5921             slb_frac[i] /= tot;
5922             if (fplog)
5923             {
5924                 fprintf(fplog," %5.3f",slb_frac[i]);
5925             }
5926         }
5927         if (fplog)
5928         {
5929             fprintf(fplog,"\n");
5930         }
5931     }
5932     
5933     return slb_frac;
5934 }
5935
5936 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
5937 {
5938     int n,nmol,ftype;
5939     gmx_mtop_ilistloop_t iloop;
5940     t_ilist *il;
5941     
5942     n = 0;
5943     iloop = gmx_mtop_ilistloop_init(mtop);
5944     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
5945     {
5946         for(ftype=0; ftype<F_NRE; ftype++)
5947         {
5948             if ((interaction_function[ftype].flags & IF_BOND) &&
5949                 NRAL(ftype) >  2)
5950             {
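                /* Each interaction in il occupies 1 + NRAL(ftype) entries
                 * (the type index plus the atom indices), so dividing nr by
                 * 1 + NRAL(ftype) counts the multi-body bonded interactions
                 * per molecule.
                 */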
5951                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
5952             }
5953         }
5954     }
5955
5956     return n;
5957 }
5958
5959 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
5960 {
5961     char *val;
5962     int  nst;
5963     
5964     nst = def;
5965     val = getenv(env_var);
5966     if (val)
5967     {
5968         if (sscanf(val,"%d",&nst) <= 0)
5969         {
5970             nst = 1;
5971         }
5972         if (fplog)
5973         {
5974             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
5975                     env_var,val,nst);
5976         }
5977     }
5978     
5979     return nst;
5980 }
5981
5982 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
5983 {
5984     if (MASTER(cr))
5985     {
5986         fprintf(stderr,"\n%s\n",warn_string);
5987     }
5988     if (fplog)
5989     {
5990         fprintf(fplog,"\n%s\n",warn_string);
5991     }
5992 }
5993
5994 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
5995                                   t_inputrec *ir,FILE *fplog)
5996 {
5997     if (ir->ePBC == epbcSCREW &&
5998         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
5999     {
6000         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6001     }
6002
6003     if (ir->ns_type == ensSIMPLE)
6004     {
6005         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6006     }
6007
6008     if (ir->nstlist == 0)
6009     {
6010         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6011     }
6012
6013     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6014     {
6015         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6016     }
6017 }
6018
6019 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6020 {
6021     int  di,d;
6022     real r;
6023
6024     r = ddbox->box_size[XX];
6025     for(di=0; di<dd->ndim; di++)
6026     {
6027         d = dd->dim[di];
6028         /* Check using the initial average cell size */
6029         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6030     }
6031
6032     return r;
6033 }
6034
6035 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6036                              const char *dlb_opt,gmx_bool bRecordLoad,
6037                              unsigned long Flags,t_inputrec *ir)
6038 {
6039     gmx_domdec_t *dd;
6040     int  eDLB=-1;
6041     char buf[STRLEN];
6042
6043     switch (dlb_opt[0])
6044     {
6045     case 'a': eDLB = edlbAUTO; break;
6046     case 'n': eDLB = edlbNO;   break;
6047     case 'y': eDLB = edlbYES;  break;
6048     default: gmx_incons("Unknown dlb_opt");
6049     }
6050
6051     if (Flags & MD_RERUN)
6052     {
6053         return edlbNO;
6054     }
6055
6056     if (!EI_DYNAMICS(ir->eI))
6057     {
6058         if (eDLB == edlbYES)
6059         {
6060             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6061             dd_warning(cr,fplog,buf);
6062         }
6063             
6064         return edlbNO;
6065     }
6066
6067     if (!bRecordLoad)
6068     {
6069         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6070
6071         return edlbNO;
6072     }
6073
6074     if (Flags & MD_REPRODUCIBLE)
6075     {
6076         switch (eDLB)
6077         {
6078         case edlbNO:
6079             break;
6080         case edlbAUTO:
6081             dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6082             eDLB = edlbNO;
6083             break;
6084         case edlbYES:
6085             dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6086             break;
6087         default:
6088             gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6089             break;
6090         }
6091     }
6092
6093     return eDLB;
6094 }
6095
6096 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6097 {
6098     int dim;
6099
6100     dd->ndim = 0;
6101     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6102     {
6103         /* Decomposition order z,y,x */
6104         if (fplog)
6105         {
6106             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6107         }
6108         for(dim=DIM-1; dim>=0; dim--)
6109         {
6110             if (dd->nc[dim] > 1)
6111             {
6112                 dd->dim[dd->ndim++] = dim;
6113             }
6114         }
6115     }
6116     else
6117     {
6118         /* Decomposition order x,y,z */
6119         for(dim=0; dim<DIM; dim++)
6120         {
6121             if (dd->nc[dim] > 1)
6122             {
6123                 dd->dim[dd->ndim++] = dim;
6124             }
6125         }
6126     }
6127 }
6128
6129 static gmx_domdec_comm_t *init_dd_comm()
6130 {
6131     gmx_domdec_comm_t *comm;
6132     int  i;
6133
6134     snew(comm,1);
6135     snew(comm->cggl_flag,DIM*2);
6136     snew(comm->cgcm_state,DIM*2);
6137     for(i=0; i<DIM*2; i++)
6138     {
6139         comm->cggl_flag_nalloc[i]  = 0;
6140         comm->cgcm_state_nalloc[i] = 0;
6141     }
6142     
6143     comm->nalloc_int = 0;
6144     comm->buf_int    = NULL;
6145
6146     vec_rvec_init(&comm->vbuf);
6147
6148     comm->n_load_have    = 0;
6149     comm->n_load_collect = 0;
6150
6151     for(i=0; i<ddnatNR-ddnatZONE; i++)
6152     {
6153         comm->sum_nat[i] = 0;
6154     }
6155     comm->ndecomp = 0;
6156     comm->nload   = 0;
6157     comm->load_step = 0;
6158     comm->load_sum  = 0;
6159     comm->load_max  = 0;
6160     clear_ivec(comm->load_lim);
6161     comm->load_mdf  = 0;
6162     comm->load_pme  = 0;
6163
6164     return comm;
6165 }
6166
6167 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6168                                         unsigned long Flags,
6169                                         ivec nc,
6170                                         real comm_distance_min,real rconstr,
6171                                         const char *dlb_opt,real dlb_scale,
6172                                         const char *sizex,const char *sizey,const char *sizez,
6173                                         gmx_mtop_t *mtop,t_inputrec *ir,
6174                                         matrix box,rvec *x,
6175                                         gmx_ddbox_t *ddbox,
6176                                         int *npme_x,int *npme_y)
6177 {
6178     gmx_domdec_t *dd;
6179     gmx_domdec_comm_t *comm;
6180     int  recload;
6181     int  d,i,j;
6182     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6183     gmx_bool bC;
6184     char buf[STRLEN];
6185     
6186     if (fplog)
6187     {
6188         fprintf(fplog,
6189                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6190     }
6191     
6192     snew(dd,1);
6193
6194     dd->comm = init_dd_comm();
6195     comm = dd->comm;
6198
6199     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6200     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6201     
6202     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6203     comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6204     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6205     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6206     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6207     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6208     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6209     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6210
6211     dd->pme_recv_f_alloc = 0;
6212     dd->pme_recv_f_buf = NULL;
6213
6214     if (dd->bSendRecv2 && fplog)
6215     {
6216         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6217     }
6218     if (comm->eFlop)
6219     {
6220         if (fplog)
6221         {
6222             fprintf(fplog,"Will load balance based on FLOP count\n");
6223         }
6224         if (comm->eFlop > 1)
6225         {
6226             srand(1+cr->nodeid);
6227         }
6228         comm->bRecordLoad = TRUE;
6229     }
6230     else
6231     {
6232         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6233                              
6234     }
6235     
6236     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6237     
6238     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6239     if (fplog)
6240     {
6241         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6242     }
6243     dd->bGridJump = comm->bDynLoadBal;
6244     
6245     if (comm->nstSortCG)
6246     {
6247         if (fplog)
6248         {
6249             if (comm->nstSortCG == 1)
6250             {
6251                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6252             }
6253             else
6254             {
6255                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6256                         comm->nstSortCG);
6257             }
6258         }
6259         snew(comm->sort,1);
6260     }
6261     else
6262     {
6263         if (fplog)
6264         {
6265             fprintf(fplog,"Will not sort the charge groups\n");
6266         }
6267     }
6268     
6269     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6270     if (comm->bInterCGBondeds)
6271     {
6272         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6273     }
6274     else
6275     {
6276         comm->bInterCGMultiBody = FALSE;
6277     }
6278     
6279     dd->bInterCGcons = inter_charge_group_constraints(mtop);
6280
6281     if (ir->rlistlong == 0)
6282     {
6283         /* Set the cut-off to some very large value,
6284          * so we don't need if statements everywhere in the code.
6285          * We use sqrt, since the cut-off is squared in some places.
6286          */
6287         comm->cutoff   = GMX_CUTOFF_INF;
6288     }
6289     else
6290     {
6291         comm->cutoff   = ir->rlistlong;
6292     }
6293     comm->cutoff_mbody = 0;
6294     
6295     comm->cellsize_limit = 0;
6296     comm->bBondComm = FALSE;
6297
6298     if (comm->bInterCGBondeds)
6299     {
6300         if (comm_distance_min > 0)
6301         {
6302             comm->cutoff_mbody = comm_distance_min;
6303             if (Flags & MD_DDBONDCOMM)
6304             {
6305                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6306             }
6307             else
6308             {
6309                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6310             }
6311             r_bonded_limit = comm->cutoff_mbody;
6312         }
6313         else if (ir->bPeriodicMols)
6314         {
6315             /* Cannot easily determine the required cut-off */
6316             dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6317             comm->cutoff_mbody = comm->cutoff/2;
6318             r_bonded_limit = comm->cutoff_mbody;
6319         }
6320         else
6321         {
6322             if (MASTER(cr))
6323             {
6324                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6325                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6326             }
6327             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6328             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6329
6330             /* We use an initial margin of 10% for the minimum cell size,
6331              * except when we are just below the non-bonded cut-off.
6332              */
6333             if (Flags & MD_DDBONDCOMM)
6334             {
6335                 if (max(r_2b,r_mb) > comm->cutoff)
6336                 {
6337                     r_bonded       = max(r_2b,r_mb);
6338                     r_bonded_limit = 1.1*r_bonded;
6339                     comm->bBondComm = TRUE;
6340                 }
6341                 else
6342                 {
6343                     r_bonded       = r_mb;
6344                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6345                 }
6346                 /* We determine cutoff_mbody later */
6347             }
6348             else
6349             {
6350                 /* No special bonded communication,
6351                  * simply increase the DD cut-off.
6352                  */
6353                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6354                 comm->cutoff_mbody = r_bonded_limit;
6355                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
6356             }
6357         }
6358         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6359         if (fplog)
6360         {
6361             fprintf(fplog,
6362                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6363                     comm->cellsize_limit);
6364         }
6365     }
6366
6367     if (dd->bInterCGcons && rconstr <= 0)
6368     {
6369         /* There is a cell size limit due to the constraints (P-LINCS) */
6370         rconstr = constr_r_max(fplog,mtop,ir);
6371         if (fplog)
6372         {
6373             fprintf(fplog,
6374                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6375                     rconstr);
6376             if (rconstr > comm->cellsize_limit)
6377             {
6378                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6379             }
6380         }
6381     }
6382     else if (rconstr > 0 && fplog)
6383     {
6384         /* Here we do not check for dd->bInterCGcons,
6385          * because one can also set a cell size limit for virtual sites only
6386          * and at this point we don't know yet if there are intercg v-sites.
6387          */
6388         fprintf(fplog,
6389                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6390                 rconstr);
6391     }
6392     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6393
6394     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6395
6396     if (nc[XX] > 0)
6397     {
6398         copy_ivec(nc,dd->nc);
6399         set_dd_dim(fplog,dd);
6400         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6401
6402         if (cr->npmenodes == -1)
6403         {
6404             cr->npmenodes = 0;
6405         }
6406         acs = average_cellsize_min(dd,ddbox);
6407         if (acs < comm->cellsize_limit)
6408         {
6409             if (fplog)
6410             {
6411                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6412             }
6413             gmx_fatal_collective(FARGS,cr,NULL,
6414                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6415                                  acs,comm->cellsize_limit);
6416         }
6417     }
6418     else
6419     {
6420         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6421
6422         /* We need to choose the optimal DD grid and possibly PME nodes */
6423         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6424                                comm->eDLB!=edlbNO,dlb_scale,
6425                                comm->cellsize_limit,comm->cutoff,
6426                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6427         
6428         if (dd->nc[XX] == 0)
6429         {
6430             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6431             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6432                     !bC ? "-rdd" : "-rcon",
6433                     comm->eDLB!=edlbNO ? " or -dds" : "",
6434                     bC ? " or your LINCS settings" : "");
6435
6436             gmx_fatal_collective(FARGS,cr,NULL,
6437                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6438                                  "%s\n"
6439                                  "Look in the log file for details on the domain decomposition",
6440                                  cr->nnodes-cr->npmenodes,limit,buf);
6441         }
6442         set_dd_dim(fplog,dd);
6443     }
6444
6445     if (fplog)
6446     {
6447         fprintf(fplog,
6448                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6449                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6450     }
6451     
6452     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6453     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6454     {
6455         gmx_fatal_collective(FARGS,cr,NULL,
6456                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6457                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6458     }
6459     if (cr->npmenodes > dd->nnodes)
6460     {
6461         gmx_fatal_collective(FARGS,cr,NULL,
6462                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6463     }
6464     if (cr->npmenodes > 0)
6465     {
6466         comm->npmenodes = cr->npmenodes;
6467     }
6468     else
6469     {
6470         comm->npmenodes = dd->nnodes;
6471     }
6472
6473     if (EEL_PME(ir->coulombtype))
6474     {
6475         /* The following choices should match those
6476          * in comm_cost_est in domdec_setup.c.
6477          * Note that here the checks have to take into account
6478          * that the decomposition might occur in a different order than xyz
6479          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6480          * in which case they will not match those in comm_cost_est,
6481          * but since that is mainly for testing purposes that's fine.
6482          */
6483         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6484             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6485             getenv("GMX_PMEONEDD") == NULL)
6486         {
6487             comm->npmedecompdim = 2;
6488             comm->npmenodes_x   = dd->nc[XX];
6489             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6490         }
6491         else
6492         {
6493             /* In case nc is 1 in both x and y we could still choose to
6494              * decompose pme in y instead of x, but we use x for simplicity.
6495              */
6496             comm->npmedecompdim = 1;
6497             if (dd->dim[0] == YY)
6498             {
6499                 comm->npmenodes_x = 1;
6500                 comm->npmenodes_y = comm->npmenodes;
6501             }
6502             else
6503             {
6504                 comm->npmenodes_x = comm->npmenodes;
6505                 comm->npmenodes_y = 1;
6506             }
6507         }    
6508         if (fplog)
6509         {
6510             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6511                     comm->npmenodes_x,comm->npmenodes_y,1);
6512         }
6513     }
6514     else
6515     {
6516         comm->npmedecompdim = 0;
6517         comm->npmenodes_x   = 0;
6518         comm->npmenodes_y   = 0;
6519     }
6520     
6521     /* Technically we don't need both of these,
6522      * but keeping both simplifies the code by avoiding recalculation.
6523      */
6524     *npme_x = comm->npmenodes_x;
6525     *npme_y = comm->npmenodes_y;
6526         
6527     snew(comm->slb_frac,DIM);
6528     if (comm->eDLB == edlbNO)
6529     {
6530         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6531         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6532         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6533     }
6534
6535     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6536     {
6537         if (comm->bBondComm || comm->eDLB != edlbNO)
6538         {
6539             /* Set the bonded communication distance to halfway
6540              * the minimum and the maximum,
6541              * since the extra communication cost is nearly zero.
6542              */
6543             acs = average_cellsize_min(dd,ddbox);
6544             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6545             if (comm->eDLB != edlbNO)
6546             {
6547                 /* Check if this does not limit the scaling */
6548                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6549             }
6550             if (!comm->bBondComm)
6551             {
6552                 /* Without bBondComm do not go beyond the n.b. cut-off */
6553                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6554                 if (comm->cellsize_limit >= comm->cutoff)
6555                 {
6556                     /* We don't lose a lot of efficiency
6557                      * when increasing it to the n.b. cut-off.
6558                      * It can even be slightly faster, because we need
6559                      * less checks for the communication setup.
6560                      */
6561                     comm->cutoff_mbody = comm->cutoff;
6562                 }
6563             }
6564             /* Make sure we do not end up below our original limit */
6565             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6566
6567             if (comm->cutoff_mbody > comm->cellsize_limit)
6568             {
6569                 comm->cellsize_limit = comm->cutoff_mbody;
6570             }
6571         }
6572         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6573     }
6574
6575     if (debug)
6576     {
6577         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6578                 "cellsize limit %f\n",
6579                 comm->bBondComm,comm->cellsize_limit);
6580     }
6581     
6582     if (MASTER(cr))
6583     {
6584         check_dd_restrictions(cr,dd,ir,fplog);
6585     }
6586
6587     comm->globalcomm_step = INT_MIN;
6588     dd->ddp_count = 0;
6589
6590     clear_dd_cycle_counts(dd);
6591
6592     return dd;
6593 }
6594
6595 static void set_dlb_limits(gmx_domdec_t *dd)
6596
6597 {
6598     int d;
6599
6600     for(d=0; d<dd->ndim; d++)
6601     {
6602         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6603         dd->comm->cellsize_min[dd->dim[d]] =
6604             dd->comm->cellsize_min_dlb[dd->dim[d]];
6605     }
6606 }
6607
6608
6609 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6610 {
6611     gmx_domdec_t *dd;
6612     gmx_domdec_comm_t *comm;
6613     real cellsize_min;
6614     int  d,nc,i;
6615     char buf[STRLEN];
6616     
6617     dd = cr->dd;
6618     comm = dd->comm;
6619     
6620     if (fplog)
6621     {
6622         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6623     }
6624
6625     cellsize_min = comm->cellsize_min[dd->dim[0]];
6626     for(d=1; d<dd->ndim; d++)
6627     {
6628         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6629     }
6630
6631     if (cellsize_min < comm->cellsize_limit*1.05)
6632     {
6633         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6634
6635         /* Change DLB from "auto" to "no". */
6636         comm->eDLB = edlbNO;
6637
6638         return;
6639     }
6640
6641     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6642     comm->bDynLoadBal = TRUE;
6643     dd->bGridJump = TRUE;
6644     
6645     set_dlb_limits(dd);
6646
6647     /* We can set the required cell size info here,
6648      * so we do not need to communicate this.
6649      * The grid is completely uniform.
6650      */
6651     for(d=0; d<dd->ndim; d++)
6652     {
6653         if (comm->root[d])
6654         {
6655             comm->load[d].sum_m = comm->load[d].sum;
6656
6657             nc = dd->nc[dd->dim[d]];
6658             for(i=0; i<nc; i++)
6659             {
6660                 comm->root[d]->cell_f[i]    = i/(real)nc;
6661                 if (d > 0)
6662                 {
6663                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6664                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6665                 }
6666             }
6667             comm->root[d]->cell_f[nc] = 1.0;
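                 /* cell_f holds box-relative cell boundaries; for a uniform grid
                  * with, say, nc = 4 the entries are 0.0, 0.25, 0.5, 0.75, 1.0
                  * (illustrative values).
                  */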
6668         }
6669     }
6670 }
6671
6672 static char *init_bLocalCG(gmx_mtop_t *mtop)
6673 {
6674     int  ncg,cg;
6675     char *bLocalCG;
6676     
6677     ncg = ncg_mtop(mtop);
6678     snew(bLocalCG,ncg);
6679     for(cg=0; cg<ncg; cg++)
6680     {
6681         bLocalCG[cg] = FALSE;
6682     }
6683
6684     return bLocalCG;
6685 }
6686
6687 void dd_init_bondeds(FILE *fplog,
6688                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6689                      gmx_vsite_t *vsite,gmx_constr_t constr,
6690                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6691 {
6692     gmx_domdec_comm_t *comm;
6693     gmx_bool bBondComm;
6694     int  d;
6695
6696     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6697
6698     comm = dd->comm;
6699
6700     if (comm->bBondComm)
6701     {
6702         /* Communicate atoms beyond the cut-off for bonded interactions */
6703         comm = dd->comm;
6704
6705         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6706
6707         comm->bLocalCG = init_bLocalCG(mtop);
6708     }
6709     else
6710     {
6711         /* Only communicate atoms based on cut-off */
6712         comm->cglink   = NULL;
6713         comm->bLocalCG = NULL;
6714     }
6715 }
6716
6717 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6718                               t_inputrec *ir,
6719                               gmx_bool bDynLoadBal,real dlb_scale,
6720                               gmx_ddbox_t *ddbox)
6721 {
6722     gmx_domdec_comm_t *comm;
6723     int  d;
6724     ivec np;
6725     real limit,shrink;
6726     char buf[64];
6727
6728     if (fplog == NULL)
6729     {
6730         return;
6731     }
6732
6733     comm = dd->comm;
6734
6735     if (bDynLoadBal)
6736     {
6737         fprintf(fplog,"The maximum number of communication pulses is:");
6738         for(d=0; d<dd->ndim; d++)
6739         {
6740             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
6741         }
6742         fprintf(fplog,"\n");
6743         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
6744         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
6745         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
6746         for(d=0; d<DIM; d++)
6747         {
6748             if (dd->nc[d] > 1)
6749             {
6750                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
6751                 {
6752                     shrink = 0;
6753                 }
6754                 else
6755                 {
6756                     shrink =
6757                         comm->cellsize_min_dlb[d]/
6758                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6759                 }
6760                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
6761             }
6762         }
6763         fprintf(fplog,"\n");
6764     }
6765     else
6766     {
6767         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
6768         fprintf(fplog,"The initial number of communication pulses is:");
6769         for(d=0; d<dd->ndim; d++)
6770         {
6771             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
6772         }
6773         fprintf(fplog,"\n");
6774         fprintf(fplog,"The initial domain decomposition cell size is:");
6775         for(d=0; d<DIM; d++)
             {
6776             if (dd->nc[d] > 1)
6777             {
6778                 fprintf(fplog," %c %.2f nm",
6779                         dim2char(d),dd->comm->cellsize_min[d]);
6780             }
6781         }
6782         fprintf(fplog,"\n\n");
6783     }
6784     
6785     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
6786     {
6787         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
6788         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6789                 "non-bonded interactions","",comm->cutoff);
6790
6791         if (bDynLoadBal)
6792         {
6793             limit = dd->comm->cellsize_limit;
6794         }
6795         else
6796         {
6797             if (dynamic_dd_box(ddbox,ir))
6798             {
6799                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
6800             }
6801             limit = dd->comm->cellsize_min[XX];
6802             for(d=1; d<DIM; d++)
6803             {
6804                 limit = min(limit,dd->comm->cellsize_min[d]);
6805             }
6806         }
6807
6808         if (comm->bInterCGBondeds)
6809         {
6810             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6811                     "two-body bonded interactions","(-rdd)",
6812                     max(comm->cutoff,comm->cutoff_mbody));
6813             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6814                     "multi-body bonded interactions","(-rdd)",
6815                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
6816         }
6817         if (dd->vsite_comm)
6818         {
6819             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6820                     "virtual site constructions","(-rcon)",limit);
6821         }
6822         if (dd->constraint_comm)
6823         {
6824             sprintf(buf,"atoms separated by up to %d constraints",
6825                     1+ir->nProjOrder);
6826             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
6827                     buf,"(-rcon)",limit);
6828         }
6829         fprintf(fplog,"\n");
6830     }
6831     
6832     fflush(fplog);
6833 }
6834
6835 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
6836                        t_inputrec *ir,t_forcerec *fr,
6837                        gmx_ddbox_t *ddbox)
6838 {
6839     gmx_domdec_comm_t *comm;
6840     int  d,dim,npulse,npulse_d_max,npulse_d;
6841     gmx_bool bNoCutOff;
6842     int  natoms_tot;
6843     real vol_frac;
6844
6845     comm = dd->comm;
6846
6847     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
6848
6849     if (EEL_PME(ir->coulombtype))
6850     {
6851         init_ddpme(dd,&comm->ddpme[0],0);
6852         if (comm->npmedecompdim >= 2)
6853         {
6854             init_ddpme(dd,&comm->ddpme[1],1);
6855         }
6856     }
6857     else
6858     {
6859         comm->npmenodes = 0;
6860         if (dd->pme_nodeid >= 0)
6861         {
6862             gmx_fatal_collective(FARGS,NULL,dd,
6863                                  "Cannot have separate PME nodes without PME electrostatics");
6864         }
6865     }
6866     
6867     /* If each molecule is a single charge group
6868      * or we use domain decomposition for each periodic dimension,
6869      * we do not need to take pbc into account for the bonded interactions.
6870      */
6871     if (fr->ePBC == epbcNONE || !comm->bInterCGBondeds ||
6872         (dd->nc[XX]>1 && dd->nc[YY]>1 && (dd->nc[ZZ]>1 || fr->ePBC==epbcXY)))
6873     {
6874         fr->bMolPBC = FALSE;
6875     }
6876     else
6877     {
6878         fr->bMolPBC = TRUE;
6879     }
6880         
6881     if (debug)
6882     {
6883         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
6884     }
6885     if (comm->eDLB != edlbNO)
6886     {
6887         /* Determine the maximum number of comm. pulses in one dimension */
6888         
6889         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6890         
6891         /* Determine the maximum required number of grid pulses */
6892         if (comm->cellsize_limit >= comm->cutoff)
6893         {
6894             /* Only a single pulse is required */
6895             npulse = 1;
6896         }
6897         else if (!bNoCutOff && comm->cellsize_limit > 0)
6898         {
6899             /* We round down slightly here to avoid overhead due to the latency
6900              * of extra communication calls when the cut-off
6901              * would be only slightly longer than the cell size.
6902              * Later cellsize_limit is redetermined,
6903              * so we cannot miss interactions due to this rounding.
6904              */
6905             npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
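             /* Illustrative values (not from the source): with cutoff = 1.0 nm
              * and cellsize_limit = 0.97 nm the ratio is about 1.03; rounding up
              * would give 2 pulses, but 0.96 + 1.03 rounds down to 1 pulse.
              */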
6906         }
6907         else
6908         {
6909             /* There is no cell size limit */
6910             npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
6911         }
6912
6913         if (!bNoCutOff && npulse > 1)
6914         {
6915             /* See if we can do with fewer pulses, based on dlb_scale */
6916             npulse_d_max = 0;
6917             for(d=0; d<dd->ndim; d++)
6918             {
6919                 dim = dd->dim[d];
6920                 npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
6921                                  /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
6922                 npulse_d_max = max(npulse_d_max,npulse_d);
6923             }
6924             npulse = min(npulse,npulse_d_max);
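             /* The reasoning here, roughly: with DLB, cells are allowed to shrink
              * to about dlb_scale times the average cell size, so the cut-off can
              * span at most about npulse_d cells in each dimension and more
              * pulses than npulse_d_max would never be needed.
              */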
6925         }
6926         
6927         /* This env var can override npulse */
6928         d = dd_nst_env(fplog,"GMX_DD_NPULSE",0);
6929         if (d > 0)
6930         {
6931             npulse = d;
6932         }
6933
6934         comm->maxpulse = 1;
6935         comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
6936         for(d=0; d<dd->ndim; d++)
6937         {
6938             comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
6939             comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
6940             snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
6941             comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
6942             if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
6943             {
6944                 comm->bVacDLBNoLimit = FALSE;
6945             }
6946         }
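         /* bVacDLBNoLimit stays TRUE only without PBC and when every dimension
          * gets the full nc-1 pulses; in that case the cut-off imposes no extra
          * cell size limit below.
          */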
6947         
6948         /* cellsize_limit is set for LINCS in init_domain_decomposition */
6949         if (!comm->bVacDLBNoLimit)
6950         {
6951             comm->cellsize_limit = max(comm->cellsize_limit,
6952                                        comm->cutoff/comm->maxpulse);
6953         }
6954         comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
6955         /* Set the minimum cell size for each DD dimension */
6956         for(d=0; d<dd->ndim; d++)
6957         {
6958             if (comm->bVacDLBNoLimit ||
6959                 comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
6960             {
6961                 comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
6962             }
6963             else
6964             {
6965                 comm->cellsize_min_dlb[dd->dim[d]] =
6966                     comm->cutoff/comm->cd[d].np_dlb;
6967             }
6968         }
6969         if (comm->cutoff_mbody <= 0)
6970         {
6971             comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
6972         }
6973         if (comm->bDynLoadBal)
6974         {
6975             set_dlb_limits(dd);
6976         }
6977     }
6978     
6979     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
6980     if (comm->eDLB == edlbAUTO)
6981     {
6982         if (fplog)
6983         {
6984             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
6985         }
6986         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
6987     }
6988
6989     if (ir->ePBC == epbcNONE)
6990     {
6991         vol_frac = 1 - 1/(double)dd->nnodes;
6992     }
6993     else
6994     {
6995         vol_frac =
6996             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
6997     }
6998     if (debug)
6999     {
7000         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7001     }
7002     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7003    
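     /* The second argument is presumably the expected number of locally known
      * atoms; vol_frac*natoms_tot is a heuristic estimate of home plus zone
      * atoms used to size the global-to-local lookup.
      */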
7004     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7005 }
7006
7007 static void merge_cg_buffers(int ncell,
7008                              gmx_domdec_comm_dim_t *cd, int pulse,
7009                              int  *ncg_cell,
7010                              int  *index_gl, int  *recv_i,
7011                              rvec *cg_cm,    rvec *recv_vr,
7012                              int *cgindex,
7013                              cginfo_mb_t *cginfo_mb,int *cginfo)
7014 {
7015     gmx_domdec_ind_t *ind,*ind_p;
7016     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7017     int shift,shift_at;
7018     
7019     ind = &cd->ind[pulse];
7020     
7021     /* First correct the already stored data */
7022     shift = ind->nrecv[ncell];
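     /* The newly received charge groups of earlier zones are inserted in front
      * of the data already stored for later zones, so the existing entries of
      * each zone are shifted up by the number of new charge groups that will
      * precede them.
      */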
7023     for(cell=ncell-1; cell>=0; cell--)
7024     {
7025         shift -= ind->nrecv[cell];
7026         if (shift > 0)
7027         {
7028             /* Move the cg's already present from previous grid pulses */
7029             cg0 = ncg_cell[ncell+cell];
7030             cg1 = ncg_cell[ncell+cell+1];
7031             cgindex[cg1+shift] = cgindex[cg1];
7032             for(cg=cg1-1; cg>=cg0; cg--)
7033             {
7034                 index_gl[cg+shift] = index_gl[cg];
7035                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7036                 cgindex[cg+shift] = cgindex[cg];
7037                 cginfo[cg+shift] = cginfo[cg];
7038             }
7039             /* Correct the already stored send indices for the shift */
7040             for(p=1; p<=pulse; p++)
7041             {
7042                 ind_p = &cd->ind[p];
7043                 cg0 = 0;
7044                 for(c=0; c<cell; c++)
7045                 {
7046                     cg0 += ind_p->nsend[c];
7047                 }
7048                 cg1 = cg0 + ind_p->nsend[cell];
7049                 for(cg=cg0; cg<cg1; cg++)
7050                 {
7051                     ind_p->index[cg] += shift;
7052                 }
7053             }
7054         }
7055     }
7056
7057     /* Merge in the communicated buffers */
7058     shift = 0;
7059     shift_at = 0;
7060     cg0 = 0;
7061     for(cell=0; cell<ncell; cell++)
7062     {
7063         cg1 = ncg_cell[ncell+cell+1] + shift;
7064         if (shift_at > 0)
7065         {
7066             /* Correct the old cg indices */
7067             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7068             {
7069                 cgindex[cg+1] += shift_at;
7070             }
7071         }
7072         for(cg=0; cg<ind->nrecv[cell]; cg++)
7073         {
7074             /* Copy this charge group from the buffer */
7075             index_gl[cg1] = recv_i[cg0];
7076             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7077             /* Add it to the cgindex */
7078             cg_gl = index_gl[cg1];
7079             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7080             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7081             cgindex[cg1+1] = cgindex[cg1] + nat;
7082             cg0++;
7083             cg1++;
7084             shift_at += nat;
7085         }
7086         shift += ind->nrecv[cell];
7087         ncg_cell[ncell+cell+1] = cg1;
7088     }
7089 }
7090
7091 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7092                                int nzone,int cg0,const int *cgindex)
7093 {
7094     int cg,zone,p;
7095     
7096     /* Store the atom block boundaries for easy copying of communication buffers
7097      */
7098     cg = cg0;
7099     for(zone=0; zone<nzone; zone++)
7100     {
7101         for(p=0; p<cd->np; p++)
             {
7102             cd->ind[p].cell2at0[zone] = cgindex[cg];
7103             cg += cd->ind[p].nrecv[zone];
7104             cd->ind[p].cell2at1[zone] = cgindex[cg];
7105         }
7106     }
7107 }
7108
7109 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7110 {
7111     int  i;
7112     gmx_bool bMiss;
7113
7114     bMiss = FALSE;
7115     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7116     {
7117         if (!bLocalCG[link->a[i]])
7118         {
7119             bMiss = TRUE;
7120         }
7121     }
7122
7123     return bMiss;
7124 }
7125
7126 static void setup_dd_communication(gmx_domdec_t *dd,
7127                                    matrix box,gmx_ddbox_t *ddbox,t_forcerec *fr)
7128 {
7129     int dim_ind,dim,dim0,dim1=-1,dim2=-1,dimd,p,nat_tot;
7130     int nzone,nzone_send,zone,zonei,cg0,cg1;
7131     int c,i,j,cg,cg_gl,nrcg;
7132     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7133     gmx_domdec_comm_t *comm;
7134     gmx_domdec_zones_t *zones;
7135     gmx_domdec_comm_dim_t *cd;
7136     gmx_domdec_ind_t *ind;
7137     cginfo_mb_t *cginfo_mb;
7138     gmx_bool bBondComm,bDist2B,bDistMB,bDistMB_pulse,bDistBonded,bScrew;
7139     real r_mb,r_comm2,r_scomm2,r_bcomm2,r,r_0,r_1,r2,rb2,r2inc,inv_ncg,tric_sh;
7140     rvec rb,rn;
7141     real corner[DIM][4],corner_round_0=0,corner_round_1[4];
7142     real bcorner[DIM],bcorner_round_1=0;
7143     ivec tric_dist;
7144     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7145     real skew_fac2_d,skew_fac_01;
7146     rvec sf2_round;
7147     int  nsend,nat;
7148     
7149     if (debug)
7150     {
7151         fprintf(debug,"Setting up DD communication\n");
7152     }
7153     
7154     comm  = dd->comm;
7155     cg_cm = fr->cg_cm;
7156
7157     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7158     {
7159         dim = dd->dim[dim_ind];
7160
7161         /* Check if we need to use triclinic distances */
7162         tric_dist[dim_ind] = 0;
7163         for(i=0; i<=dim_ind; i++)
7164         {
7165             if (ddbox->tric_dir[dd->dim[i]])
7166             {
7167                 tric_dist[dim_ind] = 1;
7168             }
7169         }
7170     }
7171
7172     bBondComm = comm->bBondComm;
7173
7174     /* Do we need to determine extra distances for multi-body bondeds? */
7175     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7176     
7177     /* Do we need to determine extra distances for only two-body bondeds? */
7178     bDist2B = (bBondComm && !bDistMB);
7179
7180     r_comm2  = sqr(comm->cutoff);
7181     r_bcomm2 = sqr(comm->cutoff_mbody);
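     /* Distances are compared squared against these squared cut-offs,
      * which avoids taking a square root for every charge group.
      */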
7182
7183     if (debug)
7184     {
7185         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7186     }
7187
7188     zones = &comm->zones;
7189     
7190     dim0 = dd->dim[0];
7191     /* The first dimension is equal for all cells */
7192     corner[0][0] = comm->cell_x0[dim0];
7193     if (bDistMB)
7194     {
7195         bcorner[0] = corner[0][0];
7196     }
7197     if (dd->ndim >= 2)
7198     {
7199         dim1 = dd->dim[1];
7200         /* This cell row is only seen from the first row */
7201         corner[1][0] = comm->cell_x0[dim1];
7202         /* All rows can see this row */
7203         corner[1][1] = comm->cell_x0[dim1];
7204         if (dd->bGridJump)
7205         {
7206             corner[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7207             if (bDistMB)
7208             {
7209                 /* For the multi-body distance we need the maximum */
7210                 bcorner[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7211             }
7212         }
7213         /* Set the upper-right corner for rounding */
7214         corner_round_0 = comm->cell_x1[dim0];
7215         
7216         if (dd->ndim >= 3)
7217         {
7218             dim2 = dd->dim[2];
7219             for(j=0; j<4; j++)
7220             {
7221                 corner[2][j] = comm->cell_x0[dim2];
7222             }
7223             if (dd->bGridJump)
7224             {
7225                 /* Use the maximum of the i-cells that see a j-cell */
7226                 for(i=0; i<zones->nizone; i++)
7227                 {
7228                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7229                     {
7230                         if (j >= 4)
7231                         {
7232                             corner[2][j-4] =
7233                                 max(corner[2][j-4],
7234                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7235                         }
7236                     }
7237                 }
7238                 if (bDistMB)
7239                 {
7240                     /* For the multi-body distance we need the maximum */
7241                     bcorner[2] = comm->cell_x0[dim2];
7242                     for(i=0; i<2; i++)
7243                     {
7244                         for(j=0; j<2; j++)
7245                         {
7246                             bcorner[2] = max(bcorner[2],
7247                                              comm->zone_d2[i][j].p1_0);
7248                         }
7249                     }
7250                 }
7251             }
7252             
7253             /* Set the upper-right corner for rounding */
7254             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7255              * Only cell (0,0,0) can see cell 7 (1,1,1)
7256              */
7257             corner_round_1[0] = comm->cell_x1[dim1];
7258             corner_round_1[3] = comm->cell_x1[dim1];
7259             if (dd->bGridJump)
7260             {
7261                 corner_round_1[0] = max(comm->cell_x1[dim1],
7262                                         comm->zone_d1[1].mch1);
7263                 if (bDistMB)
7264                 {
7265                     /* For the multi-body distance we need the maximum */
7266                     bcorner_round_1 = max(comm->cell_x1[dim1],
7267                                           comm->zone_d1[1].p1_1);
7268                 }
7269             }
7270         }
7271     }
7272     
7273     /* Triclinic stuff */
7274     normal = ddbox->normal;
7275     skew_fac_01 = 0;
7276     if (dd->ndim >= 2)
7277     {
7278         v_0 = ddbox->v[dim0];
7279         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7280         {
7281             /* Determine the coupling coefficient for the distances
7282              * to the cell planes along dim0 and dim1 through dim2.
7283              * This is required for correct rounding.
7284              */
7285             skew_fac_01 =
7286                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7287             if (debug)
7288             {
7289                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7290             }
7291         }
7292     }
7293     if (dd->ndim >= 3)
7294     {
7295         v_1 = ddbox->v[dim1];
7296     }
7297     
7298     zone_cg_range = zones->cg_range;
7299     index_gl = dd->index_gl;
7300     cgindex  = dd->cgindex;
7301     cginfo_mb = fr->cginfo_mb;
7302     
7303     zone_cg_range[0]   = 0;
7304     zone_cg_range[1]   = dd->ncg_home;
7305     comm->zone_ncg1[0] = dd->ncg_home;
7306     pos_cg             = dd->ncg_home;
7307     
7308     nat_tot = dd->nat_home;
7309     nzone = 1;
7310     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7311     {
7312         dim = dd->dim[dim_ind];
7313         cd = &comm->cd[dim_ind];
7314         
7315         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
7316         {
7317             /* No pbc in this dimension, so the first node should not communicate */
7318             nzone_send = 0;
7319         }
7320         else
7321         {
7322             nzone_send = nzone;
7323         }
7324
7325         bScrew = (dd->bScrewPBC && dim == XX);
7326         
7327         v_d = ddbox->v[dim];
7328         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
7329
7330         cd->bInPlace = TRUE;
7331         for(p=0; p<cd->np; p++)
7332         {
7333             /* Only atoms communicated in the first pulse are used
7334              * for multi-body bonded interactions or for bBondComm.
7335              */
7336             bDistBonded   = ((bDistMB || bDist2B) && p == 0);
7337             bDistMB_pulse = (bDistMB && bDistBonded);
7338
7339             ind = &cd->ind[p];
7340             nsend = 0;
7341             nat = 0;
7342             for(zone=0; zone<nzone_send; zone++)
7343             {
7344                 if (tric_dist[dim_ind] && dim_ind > 0)
7345                 {
7346                     /* Determine slightly more optimized skew_fac's
7347                      * for rounding.
7348                      * This reduces the number of communicated atoms
7349                      * by about 10% for 3D DD of rhombic dodecahedra.
7350                      */
7351                     for(dimd=0; dimd<dim; dimd++)
7352                     {
7353                         sf2_round[dimd] = 1;
7354                         if (ddbox->tric_dir[dimd])
7355                         {
7356                             for(i=dd->dim[dimd]+1; i<DIM; i++)
7357                             {
7358                                 /* If we are shifted in dimension i
7359                                  * and the cell plane is tilted forward
7360                                  * in dimension i, skip this coupling.
7361                                  */
7362                                 if (!(zones->shift[nzone+zone][i] &&
7363                                       ddbox->v[dimd][i][dimd] >= 0))
7364                                 {
7365                                     sf2_round[dimd] +=
7366                                         sqr(ddbox->v[dimd][i][dimd]);
7367                                 }
7368                             }
7369                             sf2_round[dimd] = 1/sf2_round[dimd];
7370                         }
7371                     }
7372                 }
7373
7374                 zonei = zone_perm[dim_ind][zone];
7375                 if (p == 0)
7376                 {
7377                     /* Here we permute the zones to obtain a convenient order
7378                      * for neighbor searching
7379                      */
7380                     cg0 = zone_cg_range[zonei];
7381                     cg1 = zone_cg_range[zonei+1];
7382                 }
7383                 else
7384                 {
7385                     /* Look only at the cg's received in the previous grid pulse
7386                      */
7387                     cg1 = zone_cg_range[nzone+zone+1];
7388                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
7389                 }
7390                 ind->nsend[zone] = 0;
7391                 for(cg=cg0; cg<cg1; cg++)
7392                 {
7393                     r2  = 0;
7394                     rb2 = 0;
7395                     if (tric_dist[dim_ind] == 0)
7396                     {
7397                         /* Rectangular direction, easy */
7398                         r = cg_cm[cg][dim] - corner[dim_ind][zone];
7399                         if (r > 0)
7400                         {
7401                             r2 += r*r;
7402                         }
7403                         if (bDistMB_pulse)
7404                         {
7405                             r = cg_cm[cg][dim] - bcorner[dim_ind];
7406                             if (r > 0)
7407                             {
7408                                 rb2 += r*r;
7409                             }
7410                         }
7411                         /* Rounding gives at most a 16% reduction
7412                          * in communicated atoms
7413                          */
7414                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7415                         {
7416                             r = cg_cm[cg][dim0] - corner_round_0;
7417                             /* This is the first dimension, so always r >= 0 */
7418                             r2 += r*r;
7419                             if (bDistMB_pulse)
7420                             {
7421                                 rb2 += r*r;
7422                             }
7423                         }
7424                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7425                         {
7426                             r = cg_cm[cg][dim1] - corner_round_1[zone];
7427                             if (r > 0)
7428                             {
7429                                 r2 += r*r;
7430                             }
7431                             if (bDistMB_pulse)
7432                             {
7433                                 r = cg_cm[cg][dim1] - bcorner_round_1;
7434                                 if (r > 0)
7435                                 {
7436                                     rb2 += r*r;
7437                                 }
7438                             }
7439                         }
7440                     }
7441                     else
7442                     {
7443                         /* Triclinic direction, more complicated */
7444                         clear_rvec(rn);
7445                         clear_rvec(rb);
7446                         /* The rounding is conservative, as the skew_fac multiplication
7447                          * slightly underestimates the distance.
7448                          */
7449                         if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7450                         {
7451                             rn[dim0] = cg_cm[cg][dim0] - corner_round_0;
7452                             for(i=dim0+1; i<DIM; i++)
7453                             {
7454                                 rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7455                             }
7456                             r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7457                             if (bDistMB_pulse)
7458                             {
7459                                 rb[dim0] = rn[dim0];
7460                                 rb2 = r2;
7461                             }
7462                             /* Take care that the cell planes along dim0 might not
7463                              * be orthogonal to those along dim1 and dim2.
7464                              */
7465                             for(i=1; i<=dim_ind; i++)
7466                             {
7467                                 dimd = dd->dim[i];
7468                                 if (normal[dim0][dimd] > 0)
7469                                 {
7470                                     rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7471                                     if (bDistMB_pulse)
7472                                     {
7473                                         rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7474                                     }
7475                                 }
7476                             }
7477                         }
7478                         if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7479                         {
7480                             rn[dim1] += cg_cm[cg][dim1] - corner_round_1[zone];
7481                             tric_sh = 0;
7482                             for(i=dim1+1; i<DIM; i++)
7483                             {
7484                                 tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7485                             }
7486                             rn[dim1] += tric_sh;
7487                             if (rn[dim1] > 0)
7488                             {
7489                                 r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7490                                 /* Take care of coupling of the distances
7491                                  * to the planes along dim0 and dim1 through dim2.
7492                                  */
7493                                 r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7494                                 /* Take care that the cell planes along dim1
7495                                  * might not be orthogonal to that along dim2.
7496                                  */
7497                                 if (normal[dim1][dim2] > 0)
7498                                 {
7499                                     rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7500                                 }
7501                             }
7502                             if (bDistMB_pulse)
7503                             {
7504                                 rb[dim1] +=
7505                                     cg_cm[cg][dim1] - bcorner_round_1 + tric_sh;
7506                                 if (rb[dim1] > 0)
7507                                 {
7508                                     rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7509                                     /* Take care of coupling of the distances
7510                                      * to the planes along dim0 and dim1 through dim2.
7511                                      */
7512                                     rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7513                                     /* Take care that the cell planes along dim1
7514                                      * might not be orthogonal to that along dim2.
7515                                      */
7516                                     if (normal[dim1][dim2] > 0)
7517                                     {
7518                                         rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7519                                     }
7520                                 }
7521                             }
7522                         }
7523                         /* The distance along the communication direction */
7524                         rn[dim] += cg_cm[cg][dim] - corner[dim_ind][zone];
7525                         tric_sh = 0;
7526                         for(i=dim+1; i<DIM; i++)
7527                         {
7528                             tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7529                         }
7530                         rn[dim] += tric_sh;
7531                         if (rn[dim] > 0)
7532                         {
7533                             r2 += rn[dim]*rn[dim]*skew_fac2_d;
7534                             /* Take care of coupling of the distances
7535                              * to the planes along dim0 and dim1 through dim2.
7536                              */
7537                             if (dim_ind == 1 && zonei == 1)
7538                             {
7539                                 r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7540                             }
7541                         }
7542                         if (bDistMB_pulse)
7543                         {
7544                             clear_rvec(rb);
7545                             rb[dim] += cg_cm[cg][dim] - bcorner[dim_ind] + tric_sh;
7546                             if (rb[dim] > 0)
7547                             {
7548                                 rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7549                                 /* Take care of coupling of the distances
7550                                  * to the planes along dim0 and dim1 through dim2.
7551                                  */
7552                                 if (dim_ind == 1 && zonei == 1)
7553                                 {
7554                                     rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7555                                 }
7556                             }
7557                         }
7558                     }
7559                     
7560                     if (r2 < r_comm2 ||
7561                         (bDistBonded &&
7562                          ((bDistMB && rb2 < r_bcomm2) ||
7563                           (bDist2B && r2  < r_bcomm2)) &&
7564                          (!bBondComm ||
7565                           (GET_CGINFO_BOND_INTER(fr->cginfo[cg]) &&
7566                            missing_link(comm->cglink,index_gl[cg],
7567                                         comm->bLocalCG)))))
7568                     {
7569                         /* Make an index to the local charge groups */
7570                         if (nsend+1 > ind->nalloc)
7571                         {
7572                             ind->nalloc = over_alloc_large(nsend+1);
7573                             srenew(ind->index,ind->nalloc);
7574                         }
7575                         if (nsend+1 > comm->nalloc_int)
7576                         {
7577                             comm->nalloc_int = over_alloc_large(nsend+1);
7578                             srenew(comm->buf_int,comm->nalloc_int);
7579                         }
7580                         ind->index[nsend] = cg;
7581                         comm->buf_int[nsend] = index_gl[cg];
7582                         ind->nsend[zone]++;
7583                         vec_rvec_check_alloc(&comm->vbuf,nsend+1);
7584
7585                         if (dd->ci[dim] == 0)
7586                         {
7587                             /* Correct cg_cm for pbc */
7588                             rvec_add(cg_cm[cg],box[dim],comm->vbuf.v[nsend]);
7589                             if (bScrew)
7590                             {
7591                                 comm->vbuf.v[nsend][YY] =
7592                                     box[YY][YY]-comm->vbuf.v[nsend][YY];
7593                                 comm->vbuf.v[nsend][ZZ] =
7594                                     box[ZZ][ZZ]-comm->vbuf.v[nsend][ZZ];
7595                             }
7596                         }
7597                         else
7598                         {
7599                             copy_rvec(cg_cm[cg],comm->vbuf.v[nsend]);
7600                         }
7601                         nsend++;
7602                         nat += cgindex[cg+1] - cgindex[cg];
7603                     }
7604                 }
7605             }
7606             /* Clear the counts in case we do not have pbc */
7607             for(zone=nzone_send; zone<nzone; zone++)
7608             {
7609                 ind->nsend[zone] = 0;
7610             }
7611             ind->nsend[nzone]   = nsend;
7612             ind->nsend[nzone+1] = nat;
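             /* nsend[0..nzone-1] hold the per-zone charge group counts,
              * nsend[nzone] the total number of charge groups and
              * nsend[nzone+1] the total number of atoms to send.
              */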
7613             /* Communicate the number of cg's and atoms to receive */
7614             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7615                             ind->nsend, nzone+2,
7616                             ind->nrecv, nzone+2);
7617             
7618             /* The rvec buffer is also required for atom buffers of size nsend
7619              * in dd_move_x and dd_move_f.
7620              */
7621             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
7622
7623             if (p > 0)
7624             {
7625                 /* We can receive in place if only the last zone is not empty */
7626                 for(zone=0; zone<nzone-1; zone++)
7627                 {
7628                     if (ind->nrecv[zone] > 0)
7629                     {
7630                         cd->bInPlace = FALSE;
7631                     }
7632                 }
7633                 if (!cd->bInPlace)
7634                 {
7635                     /* The int buffer is only required here for the cg indices */
7636                     if (ind->nrecv[nzone] > comm->nalloc_int2)
7637                     {
7638                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
7639                         srenew(comm->buf_int2,comm->nalloc_int2);
7640                     }
7641                     /* The rvec buffer is also required for atom buffers
7642                      * of size nrecv in dd_move_x and dd_move_f.
7643                      */
7644                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
7645                     vec_rvec_check_alloc(&comm->vbuf2,i);
7646                 }
7647             }
7648             
7649             /* Make space for the global cg indices */
7650             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
7651                 || dd->cg_nalloc == 0)
7652             {
7653                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
7654                 srenew(index_gl,dd->cg_nalloc);
7655                 srenew(cgindex,dd->cg_nalloc+1);
7656             }
7657             /* Communicate the global cg indices */
7658             if (cd->bInPlace)
7659             {
7660                 recv_i = index_gl + pos_cg;
7661             }
7662             else
7663             {
7664                 recv_i = comm->buf_int2;
7665             }
7666             dd_sendrecv_int(dd, dim_ind, dddirBackward,
7667                             comm->buf_int, nsend,
7668                             recv_i,        ind->nrecv[nzone]);
7669
7670             /* Make space for cg_cm */
7671             if (pos_cg + ind->nrecv[nzone] > fr->cg_nalloc)
7672             {
7673                 dd_realloc_fr_cg(fr,pos_cg + ind->nrecv[nzone]);
7674                 cg_cm = fr->cg_cm;
7675             }
7676             /* Communicate cg_cm */
7677             if (cd->bInPlace)
7678             {
7679                 recv_vr = cg_cm + pos_cg;
7680             }
7681             else
7682             {
7683                 recv_vr = comm->vbuf2.v;
7684             }
7685             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
7686                              comm->vbuf.v, nsend,
7687                              recv_vr,      ind->nrecv[nzone]);
7688             
7689             /* Make the charge group index */
7690             if (cd->bInPlace)
7691             {
7692                 zone = (p == 0 ? 0 : nzone - 1);
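                 /* On the first pulse all zones received data; on later pulses
                  * in-place reception implies only the last zone received
                  * anything, so the index build can start there.
                  */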
7693                 while (zone < nzone)
7694                 {
7695                     for(cg=0; cg<ind->nrecv[zone]; cg++)
7696                     {
7697                         cg_gl = index_gl[pos_cg];
7698                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
7699                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
7700                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
7701                         if (bBondComm)
7702                         {
7703                             /* Update the charge group presence,
7704                              * so we can use it in the next pass of the loop.
7705                              */
7706                             comm->bLocalCG[cg_gl] = TRUE;
7707                         }
7708                         pos_cg++;
7709                     }
7710                     if (p == 0)
7711                     {
7712                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
7713                     }
7714                     zone++;
7715                     zone_cg_range[nzone+zone] = pos_cg;
7716                 }
7717             }
7718             else
7719             {
7720                 /* This part of the code is never executed with bBondComm. */
7721                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
7722                                  index_gl,recv_i,cg_cm,recv_vr,
7723                                  cgindex,fr->cginfo_mb,fr->cginfo);
7724                 pos_cg += ind->nrecv[nzone];
7725             }
7726             nat_tot += ind->nrecv[nzone+1];
7727         }
7728         if (!cd->bInPlace)
7729         {
7730             /* Store the atom block for easy copying of communication buffers */
7731             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
7732         }
7733         nzone += nzone;
7734     }
7735     dd->index_gl = index_gl;
7736     dd->cgindex  = cgindex;
7737     
7738     dd->ncg_tot = zone_cg_range[zones->n];
7739     dd->nat_tot = nat_tot;
7740     comm->nat[ddnatHOME] = dd->nat_home;
7741     for(i=ddnatZONE; i<ddnatNR; i++)
7742     {
7743         comm->nat[i] = dd->nat_tot;
7744     }
7745
7746     if (!bBondComm)
7747     {
7748         /* We don't need to update cginfo, since that was already done above.
7749          * So we pass NULL for the forcerec.
7750          */
7751         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
7752                       NULL,comm->bLocalCG);
7753     }
7754
7755     if (debug)
7756     {
7757         fprintf(debug,"Finished setting up DD communication, zones:");
7758         for(c=0; c<zones->n; c++)
7759         {
7760             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
7761         }
7762         fprintf(debug,"\n");
7763     }
7764 }
7765
7766 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
7767 {
7768     int c;
7769     
7770     for(c=0; c<zones->nizone; c++)
7771     {
7772         zones->izone[c].cg1  = zones->cg_range[c+1];
7773         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
7774         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
7775     }
7776 }
7777
7778 static int comp_cgsort(const void *a,const void *b)
7779 {
7780     int comp;
7781     
7782     gmx_cgsort_t *cga,*cgb;
7783     cga = (gmx_cgsort_t *)a;
7784     cgb = (gmx_cgsort_t *)b;
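     /* Sort primarily on the ns grid cell index and break ties with the
      * global charge group index, giving a deterministic ordering.
      */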
7785     
7786     comp = cga->nsc - cgb->nsc;
7787     if (comp == 0)
7788     {
7789         comp = cga->ind_gl - cgb->ind_gl;
7790     }
7791     
7792     return comp;
7793 }
7794
7795 static void order_int_cg(int n,gmx_cgsort_t *sort,
7796                          int *a,int *buf)
7797 {
7798     int i;
7799     
7800     /* Order the data */
7801     for(i=0; i<n; i++)
7802     {
7803         buf[i] = a[sort[i].ind];
7804     }
7805     
7806     /* Copy back to the original array */
7807     for(i=0; i<n; i++)
7808     {
7809         a[i] = buf[i];
7810     }
7811 }
7812
7813 static void order_vec_cg(int n,gmx_cgsort_t *sort,
7814                          rvec *v,rvec *buf)
7815 {
7816     int i;
7817     
7818     /* Order the data */
7819     for(i=0; i<n; i++)
7820     {
7821         copy_rvec(v[sort[i].ind],buf[i]);
7822     }
7823     
7824     /* Copy back to the original array */
7825     for(i=0; i<n; i++)
7826     {
7827         copy_rvec(buf[i],v[i]);
7828     }
7829 }
7830
7831 static void order_vec_atom(int ncg,int *cgindex,gmx_cgsort_t *sort,
7832                            rvec *v,rvec *buf)
7833 {
7834     int a,atot,cg,cg0,cg1,i;
7835     
7836     /* Order the data */
7837     a = 0;
7838     for(cg=0; cg<ncg; cg++)
7839     {
7840         cg0 = cgindex[sort[cg].ind];
7841         cg1 = cgindex[sort[cg].ind+1];
7842         for(i=cg0; i<cg1; i++)
7843         {
7844             copy_rvec(v[i],buf[a]);
7845             a++;
7846         }
7847     }
7848     atot = a;
7849     
7850     /* Copy back to the original array */
7851     for(a=0; a<atot; a++)
7852     {
7853         copy_rvec(buf[a],v[a]);
7854     }
7855 }
7856
7857 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
7858                          int nsort_new,gmx_cgsort_t *sort_new,
7859                          gmx_cgsort_t *sort1)
7860 {
7861     int i1,i2,i_new;
7862     
7863     /* The new indices are not very ordered, so we qsort them */
7864     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
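     /* Merge the freshly sorted list of moved charge groups with the already
      * ordered stationary list; only the (usually small) moved subset pays
      * the qsort cost.
      */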
7865     
7866     /* sort2 is already ordered, so now we can merge the two arrays */
7867     i1 = 0;
7868     i2 = 0;
7869     i_new = 0;
7870     while(i2 < nsort2 || i_new < nsort_new)
7871     {
7872         if (i2 == nsort2)
7873         {
7874             sort1[i1++] = sort_new[i_new++];
7875         }
7876         else if (i_new == nsort_new)
7877         {
7878             sort1[i1++] = sort2[i2++];
7879         }
7880         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
7881                  (sort2[i2].nsc == sort_new[i_new].nsc &&
7882                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
7883         {
7884             sort1[i1++] = sort2[i2++];
7885         }
7886         else
7887         {
7888             sort1[i1++] = sort_new[i_new++];
7889         }
7890     }
7891 }
7892
7893 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
7894                           rvec *cgcm,t_forcerec *fr,t_state *state,
7895                           int ncg_home_old)
7896 {
7897     gmx_domdec_sort_t *sort;
7898     gmx_cgsort_t *cgsort,*sort_i;
7899     int  ncg_new,nsort2,nsort_new,i,cell_index,*ibuf,cgsize;
7900     rvec *vbuf;
7901     
7902     sort = dd->comm->sort;
7903     
7904     if (dd->ncg_home > sort->sort_nalloc)
7905     {
7906         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
7907         srenew(sort->sort1,sort->sort_nalloc);
7908         srenew(sort->sort2,sort->sort_nalloc);
7909     }
7910     
7911     if (ncg_home_old >= 0)
7912     {
7913         /* The charge groups that remained in the same ns grid cell
7914          * are already ordered. So we can sort efficiently by sorting only
7915          * the charge groups that moved, then merging them into the stationary list.
7916          */
7917         ncg_new = 0;
7918         nsort2 = 0;
7919         nsort_new = 0;
7920         for(i=0; i<dd->ncg_home; i++)
7921         {
7922             /* Check if this cg did not move to another node */
7923             cell_index = fr->ns.grid->cell_index[i];
7924             if (cell_index !=  4*fr->ns.grid->ncells)
7925             {
7926                 if (i >= ncg_home_old || cell_index != sort->sort1[i].nsc)
7927                 {
7928                     /* This cg is new on this node or has moved to another ns grid cell */
7929                     if (nsort_new >= sort->sort_new_nalloc)
7930                     {
7931                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
7932                         srenew(sort->sort_new,sort->sort_new_nalloc);
7933                     }
7934                     sort_i = &(sort->sort_new[nsort_new++]);
7935                 }
7936                 else
7937                 {
7938                     /* This cg did not move */
7939                     sort_i = &(sort->sort2[nsort2++]);
7940                 }
7941                 /* Sort on the ns grid cell indices
7942                  * and the global topology index
7943                  */
7944                 sort_i->nsc    = cell_index;
7945                 sort_i->ind_gl = dd->index_gl[i];
7946                 sort_i->ind    = i;
7947                 ncg_new++;
7948             }
7949         }
7950         if (debug)
7951         {
7952             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
7953                     nsort2,nsort_new);
7954         }
7955         /* Sort efficiently */
7956         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,sort->sort1);
7957     }
7958     else
7959     {
7960         cgsort = sort->sort1;
7961         ncg_new = 0;
7962         for(i=0; i<dd->ncg_home; i++)
7963         {
7964             /* Sort on the ns grid cell indices
7965              * and the global topology index
7966              */
7967             cgsort[i].nsc    = fr->ns.grid->cell_index[i];
7968             cgsort[i].ind_gl = dd->index_gl[i];
7969             cgsort[i].ind    = i;
7970             if (cgsort[i].nsc != 4*fr->ns.grid->ncells)
7971             {
7972                 ncg_new++;
7973             }
7974         }
7975         if (debug)
7976         {
7977             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
7978         }
7979         /* Determine the order of the charge groups using qsort */
7980         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
7981     }
7982     cgsort = sort->sort1;
7983     
7984     /* We alloc with the old size, since cgindex is still old */
7985     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
7986     vbuf = dd->comm->vbuf.v;
7987     
7988     /* Remove the charge groups which are no longer at home here */
7989     dd->ncg_home = ncg_new;
7990     
7991     /* Reorder the state */
7992     for(i=0; i<estNR; i++)
7993     {
7994         if (EST_DISTR(i) && (state->flags & (1<<i)))
7995         {
7996             switch (i)
7997             {
7998             case estX:
7999                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->x,vbuf);
8000                 break;
8001             case estV:
8002                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->v,vbuf);
8003                 break;
8004             case estSDX:
8005                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->sd_X,vbuf);
8006                 break;
8007             case estCGP:
8008                 order_vec_atom(dd->ncg_home,dd->cgindex,cgsort,state->cg_p,vbuf);
8009                 break;
8010             case estLD_RNG:
8011             case estLD_RNGI:
8012             case estDISRE_INITF:
8013             case estDISRE_RM3TAV:
8014             case estORIRE_INITF:
8015             case estORIRE_DTAV:
8016                 /* No ordering required */
8017                 break;
8018             default:
8019                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8020                 break;
8021             }
8022         }
8023     }
8024     /* Reorder cgcm */
8025     order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8026     
8027     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8028     {
8029         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8030         srenew(sort->ibuf,sort->ibuf_nalloc);
8031     }
8032     ibuf = sort->ibuf;
8033     /* Reorder the global cg index */
8034     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8035     /* Reorder the cginfo */
8036     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8037     /* Rebuild the local cg index */
8038     ibuf[0] = 0;
8039     for(i=0; i<dd->ncg_home; i++)
8040     {
8041         cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8042         ibuf[i+1] = ibuf[i] + cgsize;
8043     }
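     /* Example with illustrative sizes: charge groups of 3, 1 and 2 atoms
      * give the prefix sums 0, 3, 4, 6 as the new local cg index.
      */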
8044     for(i=0; i<dd->ncg_home+1; i++)
8045     {
8046         dd->cgindex[i] = ibuf[i];
8047     }
8048     /* Set the home atom number */
8049     dd->nat_home = dd->cgindex[dd->ncg_home];
8050     
8051     /* Copy the sorted ns cell indices back to the ns grid struct */
8052     for(i=0; i<dd->ncg_home; i++)
8053     {
8054         fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8055     }
8056     fr->ns.grid->nr = dd->ncg_home;
8057 }
8058
8059 static void add_dd_statistics(gmx_domdec_t *dd)
8060 {
8061     gmx_domdec_comm_t *comm;
8062     int ddnat;
8063     
8064     comm = dd->comm;
8065     
8066     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8067     {
8068         comm->sum_nat[ddnat-ddnatZONE] +=
8069             comm->nat[ddnat] - comm->nat[ddnat-1];
8070     }
8071     comm->ndecomp++;
8072 }
8073
8074 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8075 {
8076     gmx_domdec_comm_t *comm;
8077     int ddnat;
8078     
8079     comm = dd->comm;
8080
8081     /* Reset all the statistics and counters for total run counting */
8082     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8083     {
8084         comm->sum_nat[ddnat-ddnatZONE] = 0;
8085     }
8086     comm->ndecomp = 0;
8087     comm->nload = 0;
8088     comm->load_step = 0;
8089     comm->load_sum = 0;
8090     comm->load_max = 0;
8091     clear_ivec(comm->load_lim);
8092     comm->load_mdf = 0;
8093     comm->load_pme = 0;
8094 }
8095
8096 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
8097 {
8098     gmx_domdec_comm_t *comm;
8099     int ddnat;
8100     double av;
8101    
8102     comm = cr->dd->comm;
8103     
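         /* Sum the counters over all ranks, so the averages printed below
          * refer to the whole system rather than a single node.
          */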
8104     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
8105     
8106     if (fplog == NULL)
8107     {
8108         return;
8109     }
8110     
8111     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
8112             
8113     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8114     {
8115         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
8116         switch(ddnat)
8117         {
8118         case ddnatZONE:
8119             fprintf(fplog,
8120                     " av. #atoms communicated per step for force:  %d x %.1f\n",
8121                     2,av);
8122             break;
8123         case ddnatVSITE:
8124             if (cr->dd->vsite_comm)
8125             {
8126                 fprintf(fplog,
8127                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
8128                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
8129                         av);
8130             }
8131             break;
8132         case ddnatCON:
8133             if (cr->dd->constraint_comm)
8134             {
8135                 fprintf(fplog,
8136                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
8137                         1 + ir->nLincsIter,av);
8138             }
8139             break;
8140         default:
8141             gmx_incons(" Unknown type for DD statistics");
8142         }
8143     }
8144     fprintf(fplog,"\n");
8145     
8146     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
8147     {
8148         print_dd_load_av(fplog,cr->dd);
8149     }
8150 }
8151
8152 void dd_partition_system(FILE            *fplog,
8153                          gmx_large_int_t      step,
8154                          t_commrec       *cr,
8155                          gmx_bool            bMasterState,
8156                          int             nstglobalcomm,
8157                          t_state         *state_global,
8158                          gmx_mtop_t      *top_global,
8159                          t_inputrec      *ir,
8160                          t_state         *state_local,
8161                          rvec            **f,
8162                          t_mdatoms       *mdatoms,
8163                          gmx_localtop_t  *top_local,
8164                          t_forcerec      *fr,
8165                          gmx_vsite_t     *vsite,
8166                          gmx_shellfc_t   shellfc,
8167                          gmx_constr_t    constr,
8168                          t_nrnb          *nrnb,
8169                          gmx_wallcycle_t wcycle,
8170                          gmx_bool            bVerbose)
8171 {
8172     gmx_domdec_t *dd;
8173     gmx_domdec_comm_t *comm;
8174     gmx_ddbox_t ddbox={0};
8175     t_block *cgs_gl;
8176     gmx_large_int_t step_pcoupl;
8177     rvec cell_ns_x0,cell_ns_x1;
8178     int  i,j,n,cg0=0,ncg_home_old=-1,nat_f_novirsum;
8179     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
8180     gmx_bool bRedist,bSortCG,bResortAll;
8181     ivec ncells_old,np;
8182     real grid_density;
8183     char sbuf[22];
8184         
8185     dd = cr->dd;
8186     comm = dd->comm;
8187
8188     bBoxChanged = (bMasterState || DEFORM(*ir));
8189     if (ir->epc != epcNO)
8190     {
8191         /* With nstpcouple > 1, pressure coupling happens
8192          * one step after calculating the pressure.
8193          * Box scaling happens at the end of the MD step,
8194          * after the DD partitioning.
8195          * We therefore have to do DLB in the first partitioning
8196          * after an MD step where P-coupling occurred.
8197          * We need to determine the last step in which p-coupling occurred.
8198          * MRS -- need to validate this for vv?
8199          */
8200         n = ir->nstpcouple;
8201         if (n == 1)
8202         {
8203             step_pcoupl = step - 1;
8204         }
8205         else
8206         {
8207             step_pcoupl = ((step - 1)/n)*n + 1;
8208         }
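             /* For example, with n = nstpcouple = 5 and step = 23 this gives
              * step_pcoupl = ((23 - 1)/5)*5 + 1 = 4*5 + 1 = 21, the most
              * recent step of the form k*n + 1 at or before the current step.
              */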
8209         if (step_pcoupl >= comm->globalcomm_step)
8210         {
8211             bBoxChanged = TRUE;
8212         }
8213     }
8214
8215     bNStGlobalComm = (step >= comm->globalcomm_step + nstglobalcomm);
8216
8217     if (!comm->bDynLoadBal)
8218     {
8219         bDoDLB = FALSE;
8220     }
8221     else
8222     {
8223         /* Should we do dynamic load balancing this step?
8224          * Since it requires (possibly expensive) global communication,
8225          * we might want to do DLB less frequently.
8226          */
8227         if (bBoxChanged || ir->epc != epcNO)
8228         {
8229             bDoDLB = bBoxChanged;
8230         }
8231         else
8232         {
8233             bDoDLB = bNStGlobalComm;
8234         }
8235     }
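         /* Net effect: with DLB active, load balancing is done whenever the
          * box changed; with pressure coupling but an unchanged box it is
          * postponed, and otherwise it is done on global communication steps.
          */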
8236
8237     /* Check if we have recorded loads on the nodes */
8238     if (comm->bRecordLoad && dd_load_count(comm))
8239     {
8240         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
8241         {
8242             /* Check if we should use DLB at the second partitioning
8243              * and every 100 partitionings,
8244              * so the extra communication cost is negligible.
8245              */
8246             n = max(100,nstglobalcomm);
8247             bCheckDLB = (comm->n_load_collect == 0 ||
8248                          comm->n_load_have % n == n-1);
8249         }
8250         else
8251         {
8252             bCheckDLB = FALSE;
8253         }
8254         
8255         /* Print load every nstlog, first and last step to the log file */
8256         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
8257                     comm->n_load_collect == 0 ||
8258                     (ir->nsteps >= 0 &&
8259                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
8260
8261         /* Avoid extra communication due to verbose screen output
8262          * when nstglobalcomm is set.
8263          */
8264         if (bDoDLB || bLogLoad || bCheckDLB ||
8265             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
8266         {
8267             get_load_distribution(dd,wcycle);
8268             if (DDMASTER(dd))
8269             {
8270                 if (bLogLoad)
8271                 {
8272                     dd_print_load(fplog,dd,step-1);
8273                 }
8274                 if (bVerbose)
8275                 {
8276                     dd_print_load_verbose(dd);
8277                 }
8278             }
8279             comm->n_load_collect++;
8280
8281             if (bCheckDLB) {
8282                 /* Since the timings are node dependent, the master decides */
8283                 if (DDMASTER(dd))
8284                 {
8285                     bTurnOnDLB =
8286                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
8287                     if (debug)
8288                     {
8289                         fprintf(debug,"step %s, imb loss %f\n",
8290                                 gmx_step_str(step,sbuf),
8291                                 dd_force_imb_perf_loss(dd));
8292                     }
8293                 }
8294                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
8295                 if (bTurnOnDLB)
8296                 {
8297                     turn_on_dlb(fplog,cr,step);
8298                     bDoDLB = TRUE;
8299                 }
8300             }
8301         }
8302         comm->n_load_have++;
8303     }
8304
8305     cgs_gl = &comm->cgs_gl;
8306
8307     bRedist = FALSE;
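         /* The home charge groups are (re)built in one of three ways:
          * 1) bMasterState: the master rank distributes the complete state,
          * 2) the local state carries an older DD partition count than dd:
          *    rebuild the local cg index from the global indices in the state,
          * 3) otherwise: keep the state and only redistribute the charge
          *    groups that have moved to neighboring cells.
          */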
8308     if (bMasterState)
8309     {
8310         /* Clear the old state */
8311         clear_dd_indices(dd,0,0);
8312
8313         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
8314                   TRUE,cgs_gl,state_global->x,&ddbox);
8315     
8316         get_cg_distribution(fplog,step,dd,cgs_gl,
8317                             state_global->box,&ddbox,state_global->x);
8318         
8319         dd_distribute_state(dd,cgs_gl,
8320                             state_global,state_local,f);
8321         
8322         dd_make_local_cgs(dd,&top_local->cgs);
8323         
8324         if (dd->ncg_home > fr->cg_nalloc)
8325         {
8326             dd_realloc_fr_cg(fr,dd->ncg_home);
8327         }
8328         calc_cgcm(fplog,0,dd->ncg_home,
8329                   &top_local->cgs,state_local->x,fr->cg_cm);
8330         
8331         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8332         
8333         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8334
8335         cg0 = 0;
8336     }
8337     else if (state_local->ddp_count != dd->ddp_count)
8338     {
8339         if (state_local->ddp_count > dd->ddp_count)
8340         {
8341             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
8342         }
8343         
8344         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
8345         {
8346             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
8347         }
8348         
8349         /* Clear the old state */
8350         clear_dd_indices(dd,0,0);
8351         
8352         /* Build the new indices */
8353         rebuild_cgindex(dd,cgs_gl->index,state_local);
8354         make_dd_indices(dd,cgs_gl->index,0);
8355         
8356         /* Redetermine the cg COMs */
8357         calc_cgcm(fplog,0,dd->ncg_home,
8358                   &top_local->cgs,state_local->x,fr->cg_cm);
8359         
8360         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
8361
8362         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
8363
8364         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8365                   TRUE,&top_local->cgs,state_local->x,&ddbox);
8366
8367         bRedist = comm->bDynLoadBal;
8368     }
8369     else
8370     {
8371         /* We have the full state, only redistribute the cgs */
8372
8373         /* Clear the non-home indices */
8374         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
8375
8376         /* Avoid global communication for dimensions without pbc and -gcom */
8377         if (!bNStGlobalComm)
8378         {
8379             copy_rvec(comm->box0    ,ddbox.box0    );
8380             copy_rvec(comm->box_size,ddbox.box_size);
8381         }
8382         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
8383                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
8384
8385         bBoxChanged = TRUE;
8386         bRedist = TRUE;
8387     }
8388     /* Store the box origin and size for dimensions without pbc and -gcom */
8389     copy_rvec(ddbox.box0    ,comm->box0    );
8390     copy_rvec(ddbox.box_size,comm->box_size);
8391     
8392     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
8393                       step,wcycle);
8394     
8395     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
8396     {
8397         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
8398     }
8399     
8400     /* Check if we should sort the charge groups */
8401     if (comm->nstSortCG > 0)
8402     {
8403         bSortCG = (bMasterState ||
8404                    (bRedist && (step % comm->nstSortCG == 0)));
8405     }
8406     else
8407     {
8408         bSortCG = FALSE;
8409     }
8410
8411     ncg_home_old = dd->ncg_home;
8412
8413     if (bRedist)
8414     {
8415         cg0 = dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
8416                                  state_local,f,fr,mdatoms,
8417                                  !bSortCG,nrnb);
8418     }
8419     
8420     get_nsgrid_boundaries(fr->ns.grid,dd,
8421                           state_local->box,&ddbox,&comm->cell_x0,&comm->cell_x1,
8422                           dd->ncg_home,fr->cg_cm,
8423                           cell_ns_x0,cell_ns_x1,&grid_density);
8424
8425     if (bBoxChanged)
8426     {
8427         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
8428     }
8429
8430     copy_ivec(fr->ns.grid->n,ncells_old);
8431     grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
8432                state_local->box,cell_ns_x0,cell_ns_x1,
8433                fr->rlistlong,grid_density);
8434     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
8435     copy_ivec(ddbox.tric_dir,comm->tric_dir);
8436
8437     if (bSortCG)
8438     {
8439         /* Sort the state on charge group position.
8440          * This enables exact restarts from this step.
8441          * It also improves performance by about 15% with larger numbers
8442          * of atoms per node.
8443          */
8444         
8445         /* Fill the ns grid with the home cell,
8446          * so we can sort with the indices.
8447          */
8448         set_zones_ncg_home(dd);
8449         fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
8450                   0,dd->ncg_home,fr->cg_cm);
8451         
8452         /* Check if we can use the old order and ns grid cell indices
8453          * of the charge groups to sort the charge groups efficiently.
8454          */
8455         bResortAll = (bMasterState ||
8456                       fr->ns.grid->n[XX] != ncells_old[XX] ||
8457                       fr->ns.grid->n[YY] != ncells_old[YY] ||
8458                       fr->ns.grid->n[ZZ] != ncells_old[ZZ]);
8459
8460         if (debug)
8461         {
8462             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
8463                     gmx_step_str(step,sbuf),dd->ncg_home);
8464         }
8465         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
8466                       bResortAll ? -1 : ncg_home_old);
8467         /* Rebuild all the indices */
8468         cg0 = 0;
8469         ga2la_clear(dd->ga2la);
8470     }
8471     
8472     /* Set up the communication and communicate the coordinates */
8473     setup_dd_communication(dd,state_local->box,&ddbox,fr);
8474     
8475     /* Set the indices */
8476     make_dd_indices(dd,cgs_gl->index,cg0);
8477
8478     /* Set the charge group boundaries for neighbor searching */
8479     set_cg_boundaries(&comm->zones);
8480     
8481     /*
8482     write_dd_pdb("dd_home",step,"dump",top_global,cr,
8483                  -1,state_local->x,state_local->box);
8484     */
8485     
8486     /* Extract a local topology from the global topology */
8487     for(i=0; i<dd->ndim; i++)
8488     {
8489         np[dd->dim[i]] = comm->cd[i].np;
8490     }
8491     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
8492                       comm->cellsize_min,np,
8493                       fr,vsite,top_global,top_local);
8494     
8495     /* Set up the special atom communication */
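         /* comm->nat[] holds cumulative atom counts: ddnatZONE is the number
          * of zone atoms, ddnatVSITE additionally includes the atoms
          * communicated for constructing vsites and ddnatCON also those
          * needed for the constraints.
          */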
8496     n = comm->nat[ddnatZONE];
8497     for(i=ddnatZONE+1; i<ddnatNR; i++)
8498     {
8499         switch(i)
8500         {
8501         case ddnatVSITE:
8502             if (vsite && vsite->n_intercg_vsite)
8503             {
8504                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
8505             }
8506             break;
8507         case ddnatCON:
8508             if (dd->bInterCGcons)
8509             {
8510                 /* Only for inter-cg constraints do we need special code */
8511                 n = dd_make_local_constraints(dd,n,top_global,
8512                                               constr,ir->nProjOrder,
8513                                               &top_local->idef.il[F_CONSTR]);
8514             }
8515             break;
8516         default:
8517             gmx_incons("Unknown special atom type setup");
8518         }
8519         comm->nat[i] = n;
8520     }
8521     
8522     /* Make space for the extra coordinates for virtual site
8523      * or constraint communication.
8524      */
8525     state_local->natoms = comm->nat[ddnatNR-1];
8526     if (state_local->natoms > state_local->nalloc)
8527     {
8528         dd_realloc_state(state_local,f,state_local->natoms);
8529     }
8530
8531     if (fr->bF_NoVirSum)
8532     {
8533         if (vsite && vsite->n_intercg_vsite)
8534         {
8535             nat_f_novirsum = comm->nat[ddnatVSITE];
8536         }
8537         else
8538         {
8539             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
8540             {
8541                 nat_f_novirsum = dd->nat_tot;
8542             }
8543             else
8544             {
8545                 nat_f_novirsum = dd->nat_home;
8546             }
8547         }
8548     }
8549     else
8550     {
8551         nat_f_novirsum = 0;
8552     }
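         /* nat_f_novirsum thus covers the vsite construction atoms when
          * inter-cg vsites are present, all local plus communicated atoms
          * with full electrostatics and inter-cg exclusions, only the home
          * atoms otherwise, and no atoms when fr->bF_NoVirSum is not set.
          */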
8553
8554     /* Set the number of atoms required for the force calculation.
8555      * Forces need to be constrained when using a twin-range setup
8556      * or with energy minimization. For simple simulations we could
8557      * avoid some allocation, zeroing and copying, but this is
8558      * probably not worth the complications and checking.
8559      */
8560     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
8561                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
8562
8563     /* We make all the mdatoms up to nat_tot_con.
8564      * We could save some work by only setting invmass
8565      * between nat_tot and nat_tot_con.
8566      */
8567     /* This call also sets the new number of home particles to dd->nat_home */
8568     atoms2md(top_global,ir,
8569              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
8570
8571     /* Now we have the charges we can sort the FE interactions */
8572     dd_sort_local_top(dd,mdatoms,top_local);
8573
8574     if (shellfc)
8575     {
8576         /* Make the local shell stuff, currently no communication is done */
8577         make_local_shells(cr,mdatoms,shellfc);
8578     }
8579     
8580     if (ir->implicit_solvent)
8581     {
8582         make_local_gb(cr,fr->born,ir->gb_algorithm);
8583     }
8584     
8585     if (!(cr->duty & DUTY_PME))
8586     {
8587         /* Send the charges to our PME-only node */
8588         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
8589                        mdatoms->chargeA,mdatoms->chargeB,
8590                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
8591     }
8592     
8593     if (constr)
8594     {
8595         set_constraints(constr,top_local,ir,mdatoms,cr);
8596     }
8597     
8598     if (ir->ePull != epullNO)
8599     {
8600         /* Update the local pull groups */
8601         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
8602     }
8603     
8604     if (ir->bRot)
8605     {
8606         /* Update the local rotation groups */
8607         dd_make_local_rotation_groups(dd,ir->rot);
8608     }
8609
8610
8611     add_dd_statistics(dd);
8612     
8613     /* Make sure we only count the cycles for this DD partitioning */
8614     clear_dd_cycle_counts(dd);
8615     
8616     /* Because the order of the atoms might have changed since
8617      * the last vsite construction, we need to communicate the constructing
8618      * atom coordinates again (for spreading the forces this MD step).
8619      */
8620     dd_move_x_vsites(dd,state_local->box,state_local->x);
8621     
8622     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
8623     {
8624         dd_move_x(dd,state_local->box,state_local->x);
8625         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
8626                      -1,state_local->x,state_local->box);
8627     }
8628
8629     if (bNStGlobalComm)
8630     {
8631         /* Store the global communication step */
8632         comm->globalcomm_step = step;
8633     }
8634     
8635     /* Increase the DD partitioning counter */
8636     dd->ddp_count++;
8637     /* The state currently matches this DD partitioning count, store it */
8638     state_local->ddp_count = dd->ddp_count;
8639     if (bMasterState)
8640     {
8641         /* The DD master node knows the complete cg distribution,
8642          * store the count so we can possibly skip the cg info communication.
8643          */
8644         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
8645     }
8646
8647     if (comm->DD_debug > 0)
8648     {
8649         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
8650         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
8651                                 "after partitioning");
8652     }
8653 }