Fixing copyright issues and code contributors
[alexxy/gromacs.git] / src / mdlib / domdec.c
1 /*
2  * This file is part of the GROMACS molecular simulation package.
3  *
4  * Copyright (c) 1991-2008
5  * Copyright (c) 2012,2013, by the GROMACS development team, led by
6  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
7  * others, as listed in the AUTHORS file in the top-level source
8  * directory and at http://www.gromacs.org.
9  *
10  * GROMACS is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public License
12  * as published by the Free Software Foundation; either version 2.1
13  * of the License, or (at your option) any later version.
14  *
15  * GROMACS is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with GROMACS; if not, see
22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
24  *
25  * If you want to redistribute modifications to GROMACS, please
26  * consider that scientific software is very special. Version
27  * control is crucial - bugs must be traceable. We will be happy to
28  * consider code for inclusion in the official distribution, but
29  * derived work must not be called official GROMACS. Details are found
30  * in the README & COPYING files - if they are missing, get the
31  * official version at http://www.gromacs.org.
32  *
33  * To help us fund GROMACS development, we humbly ask that you cite
34  * the research papers on the package. Check out http://www.gromacs.org.
35  */
36
37 #ifdef HAVE_CONFIG_H
38 #include <config.h>
39 #endif
40
41 #include <stdio.h>
42 #include <time.h>
43 #include <math.h>
44 #include <string.h>
45 #include <stdlib.h>
46 #include "typedefs.h"
47 #include "smalloc.h"
48 #include "gmx_fatal.h"
49 #include "gmx_fatal_collective.h"
50 #include "vec.h"
51 #include "domdec.h"
52 #include "domdec_network.h"
53 #include "nrnb.h"
54 #include "pbc.h"
55 #include "chargegroup.h"
56 #include "constr.h"
57 #include "mdatoms.h"
58 #include "names.h"
59 #include "pdbio.h"
60 #include "futil.h"
61 #include "force.h"
62 #include "pme.h"
63 #include "pull.h"
64 #include "pull_rotation.h"
65 #include "gmx_wallcycle.h"
66 #include "mdrun.h"
67 #include "nsgrid.h"
68 #include "shellfc.h"
69 #include "mtop_util.h"
70 #include "gmxfio.h"
71 #include "gmx_ga2la.h"
72 #include "gmx_sort.h"
73 #include "nbnxn_search.h"
74 #include "bondf.h"
75 #include "gmx_omp_nthreads.h"
76
77 #ifdef GMX_LIB_MPI
78 #include <mpi.h>
79 #endif
80 #ifdef GMX_THREAD_MPI
81 #include "tmpi.h"
82 #endif
83
84 #define DDRANK(dd,rank)    (rank)
85 #define DDMASTERRANK(dd)   (dd->masterrank)
86
87 typedef struct gmx_domdec_master
88 {
89     /* The cell boundaries */
90     real **cell_x;
91     /* The global charge group division */
92     int  *ncg;     /* Number of home charge groups for each node */
93     int  *index;   /* Index (size nnodes+1) into cg */
94     int  *cg;      /* Global charge group index */
95     int  *nat;     /* Number of home atoms for each node. */
96     int  *ibuf;    /* Buffer for communication */
97     rvec *vbuf;    /* Buffer for state scattering and gathering */
98 } gmx_domdec_master_t;
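/* Illustrative sketch (added for clarity, not part of the original source):
 * ma->index is cumulative over nodes, so node n owns entries
 * ma->index[n] .. ma->index[n+1]-1 of ma->cg, and ma->ncg[n] equals
 * ma->index[n+1] - ma->index[n].  The hypothetical helper below only
 * demonstrates that traversal; it is excluded from compilation.
 */
#if 0
static void example_print_master_cg_layout(const gmx_domdec_master_t *ma, int nnodes)
{
    int n, i;

    for (n = 0; n < nnodes; n++)
    {
        fprintf(stderr, "node %d: %d home charge groups, %d home atoms\n",
                n, ma->ncg[n], ma->nat[n]);
        for (i = ma->index[n]; i < ma->index[n+1]; i++)
        {
            fprintf(stderr, "  global charge group %d\n", ma->cg[i]);
        }
    }
}
#endif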
99
100 typedef struct
101 {
102     /* The numbers of charge groups to send and receive for each cell
103      * that requires communication; the last entry contains the total
104      * number of atoms that need to be communicated.
105      */
106     int nsend[DD_MAXIZONE+2];
107     int nrecv[DD_MAXIZONE+2];
108     /* The charge groups to send */
109     int *index;
110     int nalloc;
111     /* The atom range for non-in-place communication */
112     int cell2at0[DD_MAXIZONE];
113     int cell2at1[DD_MAXIZONE];
114 } gmx_domdec_ind_t;
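/* Illustrative sketch (added for clarity, not part of the original source):
 * for a pulse that serves nzone i-zones, nsend[0..nzone-1] hold per-zone
 * charge-group counts, nsend[nzone] the total charge-group count and
 * nsend[nzone+1] the total atom count, mirroring how dd_move_x() below
 * reads this structure.  Hypothetical helper, excluded from compilation.
 */
#if 0
static void example_ind_totals(const gmx_domdec_ind_t *ind, int nzone,
                               int *ncg_send, int *nat_send)
{
    *ncg_send = ind->nsend[nzone];
    *nat_send = ind->nsend[nzone+1];
}
#endif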
115
116 typedef struct
117 {
118     int  np;                   /* Number of grid pulses in this dimension */
119     int  np_dlb;               /* For dlb, for use with edlbAUTO          */
120     gmx_domdec_ind_t *ind;     /* The indices to communicate, size np     */
121     int  np_nalloc;
122     gmx_bool bInPlace;             /* Can we communicate in place?            */
123 } gmx_domdec_comm_dim_t;
124
125 typedef struct
126 {
127     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
128     real *cell_f;      /* State var.: cell boundaries, box relative      */
129     real *old_cell_f;  /* Temp. var.: old cell size                      */
130     real *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
131     real *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
132     real *bound_min;   /* Temp. var.: lower limit for cell boundary      */
133     real *bound_max;   /* Temp. var.: upper limit for cell boundary      */
134     gmx_bool bLimited;     /* State var.: is DLB limited in this dim and row */
135     real *buf_ncd;     /* Temp. var.                                     */
136 } gmx_domdec_root_t;
137
138 #define DD_NLOAD_MAX 9
139
140 /* Here floats are accurate enough, since these variables
141  * only influence the load balancing, not the actual MD results.
142  */
143 typedef struct
144 {
145     int  nload;
146     float *load;
147     float sum;
148     float max;
149     float sum_m;
150     float cvol_min;
151     float mdf;
152     float pme;
153     int   flags;
154 } gmx_domdec_load_t;
155
156 typedef struct
157 {
158     int  nsc;
159     int  ind_gl;
160     int  ind;
161 } gmx_cgsort_t;
162
163 typedef struct
164 {
165     gmx_cgsort_t *sort;
166     gmx_cgsort_t *sort2;
167     int  sort_nalloc;
168     gmx_cgsort_t *sort_new;
169     int  sort_new_nalloc;
170     int  *ibuf;
171     int  ibuf_nalloc;
172 } gmx_domdec_sort_t;
173
174 typedef struct
175 {
176     rvec *v;
177     int  nalloc;
178 } vec_rvec_t;
179
180 /* This enum determines the order of the coordinates.
181  * ddnatHOME and ddnatZONE should be first and second;
182  * the others can be ordered as desired.
183  */
184 enum { ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR };
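/* Illustrative sketch (added for clarity, not part of the original source):
 * these values index the cumulative atom counts comm->nat[] declared further
 * down, so the atoms of type t occupy the half-open range
 * [nat[t-1], nat[t]), as dd_get_constraint_range() below illustrates for
 * ddnatCON.  Hypothetical helper, excluded from compilation.
 */
#if 0
static void example_ddnat_range(const int nat[ddnatNR], int t,
                                int *at_start, int *at_end)
{
    *at_start = (t == ddnatHOME) ? 0 : nat[t-1];
    *at_end   = nat[t];
}
#endif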
185
186 enum { edlbAUTO, edlbNO, edlbYES, edlbNR };
187 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
188
189 typedef struct
190 {
191     int  dim;      /* The dimension                                          */
192     gmx_bool dim_match;/* Tells if DD and PME dims match                         */
193     int  nslab;    /* The number of PME slabs in this dimension              */
194     real *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
195     int  *pp_min;  /* The minimum pp node location, size nslab               */
196     int  *pp_max;  /* The maximum pp node location, size nslab               */
197     int  maxshift; /* The maximum shift for coordinate redistribution in PME */
198 } gmx_ddpme_t;
199
200 typedef struct
201 {
202     real min0;    /* The minimum bottom of this zone                        */
203     real max1;    /* The maximum top of this zone                           */
204     real min1;    /* The minimum top of this zone                           */
205     real mch0;    /* The maximum bottom communication height for this zone  */
206     real mch1;    /* The maximum top communication height for this zone     */
207     real p1_0;    /* The bottom value of the first cell in this zone        */
208     real p1_1;    /* The top value of the first cell in this zone           */
209 } gmx_ddzone_t;
210
211 typedef struct
212 {
213     gmx_domdec_ind_t ind;
214     int *ibuf;
215     int ibuf_nalloc;
216     vec_rvec_t vbuf;
217     int nsend;
218     int nat;
219     int nsend_zone;
220 } dd_comm_setup_work_t;
221
222 typedef struct gmx_domdec_comm
223 {
224     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
225      * unless stated otherwise.
226      */
227
228     /* The number of decomposition dimensions for PME, 0: no PME */
229     int  npmedecompdim;
230     /* The number of nodes doing PME (PP/PME or only PME) */
231     int  npmenodes;
232     int  npmenodes_x;
233     int  npmenodes_y;
234     /* The communication setup including the PME only nodes */
235     gmx_bool bCartesianPP_PME;
236     ivec ntot;
237     int  cartpmedim;
238     int  *pmenodes;          /* size npmenodes                         */
239     int  *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
240                               * but without bCartesianPP_PME           */
241     gmx_ddpme_t ddpme[2];
242     
243     /* The DD particle-particle nodes only */
244     gmx_bool bCartesianPP;
245     int  *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
246     
247     /* The global charge groups */
248     t_block cgs_gl;
249
250     /* Should we sort the cgs */
251     int  nstSortCG;
252     gmx_domdec_sort_t *sort;
253     
254     /* Are there charge groups? */
255     gmx_bool bCGs;
256
257     /* Are there bonded and multi-body interactions between charge groups? */
258     gmx_bool bInterCGBondeds;
259     gmx_bool bInterCGMultiBody;
260
261     /* Data for the optional bonded interaction atom communication range */
262     gmx_bool bBondComm;
263     t_blocka *cglink;
264     char *bLocalCG;
265
266     /* The DLB option */
267     int  eDLB;
268     /* Are we actually using DLB? */
269     gmx_bool bDynLoadBal;
270
271     /* Cell sizes for static load balancing, first index cartesian */
272     real **slb_frac;
273
274     /* The width of the communicated boundaries */
275     real cutoff_mbody;
276     real cutoff;
277     /* The minimum cell size (including triclinic correction) */
278     rvec cellsize_min;
279     /* For dlb, for use with edlbAUTO */
280     rvec cellsize_min_dlb;
281     /* The lower limit for the DD cell size with DLB */
282     real cellsize_limit;
283     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
284     gmx_bool bVacDLBNoLimit;
285
286     /* With PME load balancing we set limits on DLB */
287     gmx_bool bPMELoadBalDLBLimits;
288     /* DLB needs to take into account that we want to allow this maximum
289      * cut-off (for PME load balancing); this could limit cell boundaries.
290      */
291     real PMELoadBal_max_cutoff;
292
293     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
294     ivec tric_dir;
295     /* box0 and box_size are required for dimensions without pbc and with -gcom */
296     rvec box0;
297     rvec box_size;
298     
299     /* The cell boundaries */
300     rvec cell_x0;
301     rvec cell_x1;
302
303     /* The old location of the cell boundaries, to check cg displacements */
304     rvec old_cell_x0;
305     rvec old_cell_x1;
306
307     /* The communication setup and charge group boundaries for the zones */
308     gmx_domdec_zones_t zones;
309     
310     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
311      * cell boundaries of neighboring cells for dynamic load balancing.
312      */
313     gmx_ddzone_t zone_d1[2];
314     gmx_ddzone_t zone_d2[2][2];
315     
316     /* The coordinate/force communication setup and indices */
317     gmx_domdec_comm_dim_t cd[DIM];
318     /* The maximum number of cells to communicate with in one dimension */
319     int  maxpulse;
320     
321     /* Which cg distribution is stored on the master node */
322     int master_cg_ddp_count;
323     
324     /* The number of cg's received from the direct neighbors */
325     int  zone_ncg1[DD_MAXZONE];
326     
327     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
328     int  nat[ddnatNR];
329
330     /* Array for signalling if atoms have moved to another domain */
331     int  *moved;
332     int  moved_nalloc;
333     
334     /* Communication buffer for general use */
335     int  *buf_int;
336     int  nalloc_int;
337
338     /* Communication buffer for general use */
339     vec_rvec_t vbuf;
340
341     /* Temporary storage for thread parallel communication setup */
342     int nth;
343     dd_comm_setup_work_t *dth;
344
345     /* Communication buffers only used with multiple grid pulses */
346     int  *buf_int2;
347     int  nalloc_int2;
348     vec_rvec_t vbuf2;
349     
350     /* Communication buffers for local redistribution */
351     int  **cggl_flag;
352     int  cggl_flag_nalloc[DIM*2];
353     rvec **cgcm_state;
354     int  cgcm_state_nalloc[DIM*2];
355     
356     /* Cell sizes for dynamic load balancing */
357     gmx_domdec_root_t **root;
358     real *cell_f_row;
359     real cell_f0[DIM];
360     real cell_f1[DIM];
361     real cell_f_max0[DIM];
362     real cell_f_min1[DIM];
363     
364     /* Stuff for load communication */
365     gmx_bool bRecordLoad;
366     gmx_domdec_load_t *load;
367 #ifdef GMX_MPI
368     MPI_Comm *mpi_comm_load;
369 #endif
370
371     /* Maximum DLB scaling per load balancing step in percent */
372     int dlb_scale_lim;
373
374     /* Cycle counters */
375     float cycl[ddCyclNr];
376     int   cycl_n[ddCyclNr];
377     float cycl_max[ddCyclNr];
378     /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
379     int eFlop;
380     double flop;
381     int    flop_n;
382     /* How often we have had load measurements */
383     int    n_load_have;
384     /* How often we have collected the load measurements */
385     int    n_load_collect;
386     
387     /* Statistics */
388     double sum_nat[ddnatNR-ddnatZONE];
389     int    ndecomp;
390     int    nload;
391     double load_step;
392     double load_sum;
393     double load_max;
394     ivec   load_lim;
395     double load_mdf;
396     double load_pme;
397
398     /* The last partition step */
399     gmx_large_int_t partition_step;
400
401     /* Debugging */
402     int  nstDDDump;
403     int  nstDDDumpGrid;
404     int  DD_debug;
405 } gmx_domdec_comm_t;
406
407 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
408 #define DD_CGIBS 2
409
410 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
411 #define DD_FLAG_NRCG  65535
412 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
413 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
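/* Illustrative sketch (added for clarity, not part of the original source):
 * a cggl_flag word keeps the charge-group atom count in its low 16 bits
 * (masked by DD_FLAG_NRCG) and marks a forward/backward move along DD
 * dimension d with DD_FLAG_FW(d)/DD_FLAG_BW(d).  Hypothetical helpers,
 * excluded from compilation.
 */
#if 0
static int example_pack_cg_flag(int nrcg, int d, gmx_bool bForward)
{
    /* nrcg is assumed to fit in the low 16 bits */
    return nrcg | (bForward ? DD_FLAG_FW(d) : DD_FLAG_BW(d));
}

static int example_cg_flag_nrcg(int flag)
{
    return flag & DD_FLAG_NRCG;
}
#endif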
414
415 /* Zone permutation required to obtain consecutive charge groups
416  * for neighbor searching.
417  */
418 static const int zone_perm[3][4] = { {0,0,0,0},{1,0,0,0},{3,0,1,2} };
419
420 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
421  * components see only j zones with that component 0.
422  */
423
424 /* The DD zone order */
425 static const ivec dd_zo[DD_MAXZONE] =
426   {{0,0,0},{1,0,0},{1,1,0},{0,1,0},{0,1,1},{0,0,1},{1,0,1},{1,1,1}};
427
428 /* The 3D setup */
429 #define dd_z3n  8
430 #define dd_zp3n 4
431 static const ivec dd_zp3[dd_zp3n] = {{0,0,8},{1,3,6},{2,5,6},{3,5,7}};
432
433 /* The 2D setup */
434 #define dd_z2n  4
435 #define dd_zp2n 2
436 static const ivec dd_zp2[dd_zp2n] = {{0,0,4},{1,3,4}};
437
438 /* The 1D setup */
439 #define dd_z1n  2
440 #define dd_zp1n 1
441 static const ivec dd_zp1[dd_zp1n] = {{0,0,2}};
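/* Illustrative note (added for clarity, not part of the original source):
 * each dd_zp* row appears to read {i-zone, first j-zone, last j-zone + 1};
 * e.g. in the 3D setup i-zone 1 ({1,0,0}) sees j-zones 3..5, all of which
 * have an x component of 0, consistent with the rule stated above.
 * Hypothetical helper, excluded from compilation.
 */
#if 0
static void example_zone_pair_range(const ivec zp_row, int *izone, int *j0, int *j1)
{
    *izone = zp_row[0];
    *j0    = zp_row[1];
    *j1    = zp_row[2];
}
#endif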
442
443 /* Factors used to avoid problems due to rounding issues */
444 #define DD_CELL_MARGIN       1.0001
445 #define DD_CELL_MARGIN2      1.00005
446 /* Factor to account for pressure scaling during nstlist steps */
447 #define DD_PRES_SCALE_MARGIN 1.02
448
449 /* Allowed performance loss before we turn on DLB or warn */
450 #define DD_PERF_LOSS 0.05
451
452 #define DD_CELL_F_SIZE(dd,di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
453
454 /* Use separate MPI send and receive commands
455  * when nnodes <= GMX_DD_NNODES_SENDRECV.
456  * This saves memory (and some copying for small nnodes).
457  * For high parallelization, scatter and gather calls are used.
458  */
459 #define GMX_DD_NNODES_SENDRECV 4
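/* Illustrative sketch (added for clarity, not part of the original source):
 * the collect/distribute routines further down pick the communication
 * pattern from this threshold, roughly as follows.  Hypothetical helper,
 * excluded from compilation.
 */
#if 0
static gmx_bool example_use_sendrecv(const gmx_domdec_t *dd)
{
    return (dd->nnodes <= GMX_DD_NNODES_SENDRECV);
}
#endif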
460
461
462 /*
463 #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
464
465 static void index2xyz(ivec nc,int ind,ivec xyz)
466 {
467   xyz[XX] = ind % nc[XX];
468   xyz[YY] = (ind / nc[XX]) % nc[YY];
469   xyz[ZZ] = ind / (nc[YY]*nc[XX]);
470 }
471 */
472
473 /* This order is required to minimize the coordinate communication in PME
474  * which uses decomposition in the x direction.
475  */
476 #define dd_index(n,i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
477
478 static void ddindex2xyz(ivec nc,int ind,ivec xyz)
479 {
480     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
481     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
482     xyz[ZZ] = ind % nc[ZZ];
483 }
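/* Illustrative sketch (added for clarity, not part of the original source):
 * dd_index() and ddindex2xyz() are inverses for any coordinate inside nc,
 * e.g. for nc = {4,3,2} the coordinate {1,0,1} maps to index
 * (1*3 + 0)*2 + 1 = 7 and back again.  Hypothetical helper, excluded from
 * compilation.
 */
#if 0
static void example_ddindex_roundtrip(void)
{
    ivec nc  = { 4, 3, 2 };
    ivec xyz = { 1, 0, 1 };
    ivec xyz2;
    int  ind;

    ind = dd_index(nc, xyz);       /* (1*3 + 0)*2 + 1 = 7 */
    ddindex2xyz(nc, ind, xyz2);    /* recovers {1, 0, 1} */
}
#endif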
484
485 static int ddcoord2ddnodeid(gmx_domdec_t *dd,ivec c)
486 {
487     int ddindex;
488     int ddnodeid=-1;
489     
490     ddindex = dd_index(dd->nc,c);
491     if (dd->comm->bCartesianPP_PME)
492     {
493         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
494     }
495     else if (dd->comm->bCartesianPP)
496     {
497 #ifdef GMX_MPI
498         MPI_Cart_rank(dd->mpi_comm_all,c,&ddnodeid);
499 #endif
500     }
501     else
502     {
503         ddnodeid = ddindex;
504     }
505     
506     return ddnodeid;
507 }
508
509 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox,t_inputrec *ir)
510 {
511     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
512 }
513
514 int ddglatnr(gmx_domdec_t *dd,int i)
515 {
516     int atnr;
517     
518     if (dd == NULL)
519     {
520         atnr = i + 1;
521     }
522     else
523     {
524         if (i >= dd->comm->nat[ddnatNR-1])
525         {
526             gmx_fatal(FARGS,"glatnr called with %d, which is larger than the local number of atoms (%d)",i,dd->comm->nat[ddnatNR-1]);
527         }
528         atnr = dd->gatindex[i] + 1;
529     }
530     
531     return atnr;
532 }
533
534 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
535 {
536     return &dd->comm->cgs_gl;
537 }
538
539 static void vec_rvec_init(vec_rvec_t *v)
540 {
541     v->nalloc = 0;
542     v->v      = NULL;
543 }
544
545 static void vec_rvec_check_alloc(vec_rvec_t *v,int n)
546 {
547     if (n > v->nalloc)
548     {
549         v->nalloc = over_alloc_dd(n);
550         srenew(v->v,v->nalloc);
551     }
552 }
553
554 void dd_store_state(gmx_domdec_t *dd,t_state *state)
555 {
556     int i;
557     
558     if (state->ddp_count != dd->ddp_count)
559     {
560         gmx_incons("The state does not match the domain decomposition state");
561     }
562     
563     state->ncg_gl = dd->ncg_home;
564     if (state->ncg_gl > state->cg_gl_nalloc)
565     {
566         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
567         srenew(state->cg_gl,state->cg_gl_nalloc);
568     }
569     for(i=0; i<state->ncg_gl; i++)
570     {
571         state->cg_gl[i] = dd->index_gl[i];
572     }
573     
574     state->ddp_count_cg_gl = dd->ddp_count;
575 }
576
577 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
578 {
579     return &dd->comm->zones;
580 }
581
582 void dd_get_ns_ranges(gmx_domdec_t *dd,int icg,
583                       int *jcg0,int *jcg1,ivec shift0,ivec shift1)
584 {
585     gmx_domdec_zones_t *zones;
586     int izone,d,dim;
587
588     zones = &dd->comm->zones;
589
590     izone = 0;
591     while (icg >= zones->izone[izone].cg1)
592     {
593         izone++;
594     }
595     
596     if (izone == 0)
597     {
598         *jcg0 = icg;
599     }
600     else if (izone < zones->nizone)
601     {
602         *jcg0 = zones->izone[izone].jcg0;
603     }
604     else
605     {
606         gmx_fatal(FARGS,"DD icg %d out of range: izone (%d) >= nizone (%d)",
607                   icg,izone,zones->nizone);
608     }
609         
610     *jcg1 = zones->izone[izone].jcg1;
611     
612     for(d=0; d<dd->ndim; d++)
613     {
614         dim = dd->dim[d];
615         shift0[dim] = zones->izone[izone].shift0[dim];
616         shift1[dim] = zones->izone[izone].shift1[dim];
617         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
618         {
619             /* A conservative approach, this can be optimized */
620             shift0[dim] -= 1;
621             shift1[dim] += 1;
622         }
623     }
624 }
625
626 int dd_natoms_vsite(gmx_domdec_t *dd)
627 {
628     return dd->comm->nat[ddnatVSITE];
629 }
630
631 void dd_get_constraint_range(gmx_domdec_t *dd,int *at_start,int *at_end)
632 {
633     *at_start = dd->comm->nat[ddnatCON-1];
634     *at_end   = dd->comm->nat[ddnatCON];
635 }
636
637 void dd_move_x(gmx_domdec_t *dd,matrix box,rvec x[])
638 {
639     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
640     int  *index,*cgindex;
641     gmx_domdec_comm_t *comm;
642     gmx_domdec_comm_dim_t *cd;
643     gmx_domdec_ind_t *ind;
644     rvec shift={0,0,0},*buf,*rbuf;
645     gmx_bool bPBC,bScrew;
646     
647     comm = dd->comm;
648     
649     cgindex = dd->cgindex;
650     
651     buf = comm->vbuf.v;
652
653     nzone = 1;
654     nat_tot = dd->nat_home;
655     for(d=0; d<dd->ndim; d++)
656     {
657         bPBC   = (dd->ci[dd->dim[d]] == 0);
658         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
659         if (bPBC)
660         {
661             copy_rvec(box[dd->dim[d]],shift);
662         }
663         cd = &comm->cd[d];
664         for(p=0; p<cd->np; p++)
665         {
666             ind = &cd->ind[p];
667             index = ind->index;
668             n = 0;
669             if (!bPBC)
670             {
671                 for(i=0; i<ind->nsend[nzone]; i++)
672                 {
673                     at0 = cgindex[index[i]];
674                     at1 = cgindex[index[i]+1];
675                     for(j=at0; j<at1; j++)
676                     {
677                         copy_rvec(x[j],buf[n]);
678                         n++;
679                     }
680                 }
681             }
682             else if (!bScrew)
683             {
684                 for(i=0; i<ind->nsend[nzone]; i++)
685                 {
686                     at0 = cgindex[index[i]];
687                     at1 = cgindex[index[i]+1];
688                     for(j=at0; j<at1; j++)
689                     {
690                         /* We need to shift the coordinates */
691                         rvec_add(x[j],shift,buf[n]);
692                         n++;
693                     }
694                 }
695             }
696             else
697             {
698                 for(i=0; i<ind->nsend[nzone]; i++)
699                 {
700                     at0 = cgindex[index[i]];
701                     at1 = cgindex[index[i]+1];
702                     for(j=at0; j<at1; j++)
703                     {
704                         /* Shift x */
705                         buf[n][XX] = x[j][XX] + shift[XX];
706                         /* Rotate y and z.
707                          * This operation requires a special shift force
708                          * treatment, which is performed in calc_vir.
709                          */
710                         buf[n][YY] = box[YY][YY] - x[j][YY];
711                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
712                         n++;
713                     }
714                 }
715             }
716             
717             if (cd->bInPlace)
718             {
719                 rbuf = x + nat_tot;
720             }
721             else
722             {
723                 rbuf = comm->vbuf2.v;
724             }
725             /* Send and receive the coordinates */
726             dd_sendrecv_rvec(dd, d, dddirBackward,
727                              buf,  ind->nsend[nzone+1],
728                              rbuf, ind->nrecv[nzone+1]);
729             if (!cd->bInPlace)
730             {
731                 j = 0;
732                 for(zone=0; zone<nzone; zone++)
733                 {
734                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
735                     {
736                         copy_rvec(rbuf[j],x[i]);
737                         j++;
738                     }
739                 }
740             }
741             nat_tot += ind->nrecv[nzone+1];
742         }
743         nzone += nzone;
744     }
745 }
746
747 void dd_move_f(gmx_domdec_t *dd,rvec f[],rvec *fshift)
748 {
749     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
750     int  *index,*cgindex;
751     gmx_domdec_comm_t *comm;
752     gmx_domdec_comm_dim_t *cd;
753     gmx_domdec_ind_t *ind;
754     rvec *buf,*sbuf;
755     ivec vis;
756     int  is;
757     gmx_bool bPBC,bScrew;
758     
759     comm = dd->comm;
760     
761     cgindex = dd->cgindex;
762
763     buf = comm->vbuf.v;
764
765     n = 0;
766     nzone = comm->zones.n/2;
767     nat_tot = dd->nat_tot;
768     for(d=dd->ndim-1; d>=0; d--)
769     {
770         bPBC   = (dd->ci[dd->dim[d]] == 0);
771         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
772         if (fshift == NULL && !bScrew)
773         {
774             bPBC = FALSE;
775         }
776         /* Determine which shift vector we need */
777         clear_ivec(vis);
778         vis[dd->dim[d]] = 1;
779         is = IVEC2IS(vis);
780         
781         cd = &comm->cd[d];
782         for(p=cd->np-1; p>=0; p--) {
783             ind = &cd->ind[p];
784             nat_tot -= ind->nrecv[nzone+1];
785             if (cd->bInPlace)
786             {
787                 sbuf = f + nat_tot;
788             }
789             else
790             {
791                 sbuf = comm->vbuf2.v;
792                 j = 0;
793                 for(zone=0; zone<nzone; zone++)
794                 {
795                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
796                     {
797                         copy_rvec(f[i],sbuf[j]);
798                         j++;
799                     }
800                 }
801             }
802             /* Communicate the forces */
803             dd_sendrecv_rvec(dd, d, dddirForward,
804                              sbuf, ind->nrecv[nzone+1],
805                              buf,  ind->nsend[nzone+1]);
806             index = ind->index;
807             /* Add the received forces */
808             n = 0;
809             if (!bPBC)
810             {
811                 for(i=0; i<ind->nsend[nzone]; i++)
812                 {
813                     at0 = cgindex[index[i]];
814                     at1 = cgindex[index[i]+1];
815                     for(j=at0; j<at1; j++)
816                     {
817                         rvec_inc(f[j],buf[n]);
818                         n++;
819                     }
820                 } 
821             }
822             else if (!bScrew)
823             {
824                 for(i=0; i<ind->nsend[nzone]; i++)
825                 {
826                     at0 = cgindex[index[i]];
827                     at1 = cgindex[index[i]+1];
828                     for(j=at0; j<at1; j++)
829                     {
830                         rvec_inc(f[j],buf[n]);
831                         /* Add this force to the shift force */
832                         rvec_inc(fshift[is],buf[n]);
833                         n++;
834                     }
835                 }
836             }
837             else
838             {
839                 for(i=0; i<ind->nsend[nzone]; i++)
840                 {
841                     at0 = cgindex[index[i]];
842                     at1 = cgindex[index[i]+1];
843                     for(j=at0; j<at1; j++)
844                     {
845                         /* Rotate the force */
846                         f[j][XX] += buf[n][XX];
847                         f[j][YY] -= buf[n][YY];
848                         f[j][ZZ] -= buf[n][ZZ];
849                         if (fshift)
850                         {
851                             /* Add this force to the shift force */
852                             rvec_inc(fshift[is],buf[n]);
853                         }
854                         n++;
855                     }
856                 }
857             }
858         }
859         nzone /= 2;
860     }
861 }
862
863 void dd_atom_spread_real(gmx_domdec_t *dd,real v[])
864 {
865     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
866     int  *index,*cgindex;
867     gmx_domdec_comm_t *comm;
868     gmx_domdec_comm_dim_t *cd;
869     gmx_domdec_ind_t *ind;
870     real *buf,*rbuf;
871     
872     comm = dd->comm;
873     
874     cgindex = dd->cgindex;
875     
876     buf = &comm->vbuf.v[0][0];
877
878     nzone = 1;
879     nat_tot = dd->nat_home;
880     for(d=0; d<dd->ndim; d++)
881     {
882         cd = &comm->cd[d];
883         for(p=0; p<cd->np; p++)
884         {
885             ind = &cd->ind[p];
886             index = ind->index;
887             n = 0;
888             for(i=0; i<ind->nsend[nzone]; i++)
889             {
890                 at0 = cgindex[index[i]];
891                 at1 = cgindex[index[i]+1];
892                 for(j=at0; j<at1; j++)
893                 {
894                     buf[n] = v[j];
895                     n++;
896                 }
897             }
898             
899             if (cd->bInPlace)
900             {
901                 rbuf = v + nat_tot;
902             }
903             else
904             {
905                 rbuf = &comm->vbuf2.v[0][0];
906             }
907             /* Send and receive the values */
908             dd_sendrecv_real(dd, d, dddirBackward,
909                              buf,  ind->nsend[nzone+1],
910                              rbuf, ind->nrecv[nzone+1]);
911             if (!cd->bInPlace)
912             {
913                 j = 0;
914                 for(zone=0; zone<nzone; zone++)
915                 {
916                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
917                     {
918                         v[i] = rbuf[j];
919                         j++;
920                     }
921                 }
922             }
923             nat_tot += ind->nrecv[nzone+1];
924         }
925         nzone += nzone;
926     }
927 }
928
929 void dd_atom_sum_real(gmx_domdec_t *dd,real v[])
930 {
931     int  nzone,nat_tot,n,d,p,i,j,at0,at1,zone;
932     int  *index,*cgindex;
933     gmx_domdec_comm_t *comm;
934     gmx_domdec_comm_dim_t *cd;
935     gmx_domdec_ind_t *ind;
936     real *buf,*sbuf;
937     
938     comm = dd->comm;
939     
940     cgindex = dd->cgindex;
941
942     buf = &comm->vbuf.v[0][0];
943
944     n = 0;
945     nzone = comm->zones.n/2;
946     nat_tot = dd->nat_tot;
947     for(d=dd->ndim-1; d>=0; d--)
948     {
949         cd = &comm->cd[d];
950         for(p=cd->np-1; p>=0; p--) {
951             ind = &cd->ind[p];
952             nat_tot -= ind->nrecv[nzone+1];
953             if (cd->bInPlace)
954             {
955                 sbuf = v + nat_tot;
956             }
957             else
958             {
959                 sbuf = &comm->vbuf2.v[0][0];
960                 j = 0;
961                 for(zone=0; zone<nzone; zone++)
962                 {
963                     for(i=ind->cell2at0[zone]; i<ind->cell2at1[zone]; i++)
964                     {
965                         sbuf[j] = v[i];
966                         j++;
967                     }
968                 }
969             }
970             /* Communicate the values */
971             dd_sendrecv_real(dd, d, dddirForward,
972                              sbuf, ind->nrecv[nzone+1],
973                              buf,  ind->nsend[nzone+1]);
974             index = ind->index;
975             /* Add the received values */
976             n = 0;
977             for(i=0; i<ind->nsend[nzone]; i++)
978             {
979                 at0 = cgindex[index[i]];
980                 at1 = cgindex[index[i]+1];
981                 for(j=at0; j<at1; j++)
982                 {
983                     v[j] += buf[n];
984                     n++;
985                 }
986             } 
987         }
988         nzone /= 2;
989     }
990 }
991
992 static void print_ddzone(FILE *fp,int d,int i,int j,gmx_ddzone_t *zone)
993 {
994     fprintf(fp,"zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
995             d,i,j,
996             zone->min0,zone->max1,
997             zone->mch0,zone->mch1,
998             zone->p1_0,zone->p1_1);
999 }
1000
1001
1002 #define DDZONECOMM_MAXZONE  5
1003 #define DDZONECOMM_BUFSIZE  3
1004
1005 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
1006                                int ddimind,int direction,
1007                                gmx_ddzone_t *buf_s,int n_s,
1008                                gmx_ddzone_t *buf_r,int n_r)
1009 {
1010 #define ZBS  DDZONECOMM_BUFSIZE
1011     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1012     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1013     int i;
1014
1015     for(i=0; i<n_s; i++)
1016     {
1017         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1018         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1019         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1020         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1021         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1022         vbuf_s[i*ZBS+1][2] = 0;
1023         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1024         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1025         vbuf_s[i*ZBS+2][2] = 0;
1026     }
1027
1028     dd_sendrecv_rvec(dd, ddimind, direction,
1029                      vbuf_s, n_s*ZBS,
1030                      vbuf_r, n_r*ZBS);
1031
1032     for(i=0; i<n_r; i++)
1033     {
1034         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1035         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1036         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1037         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1038         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1039         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1040         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1041     }
1042
1043 #undef ZBS
1044 }
1045
1046 static void dd_move_cellx(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
1047                           rvec cell_ns_x0,rvec cell_ns_x1)
1048 {
1049     int  d,d1,dim,dim1,pos,buf_size,i,j,k,p,npulse,npulse_min;
1050     gmx_ddzone_t *zp;
1051     gmx_ddzone_t buf_s[DDZONECOMM_MAXZONE];
1052     gmx_ddzone_t buf_r[DDZONECOMM_MAXZONE];
1053     gmx_ddzone_t buf_e[DDZONECOMM_MAXZONE];
1054     rvec extr_s[2],extr_r[2];
1055     rvec dh;
1056     real dist_d,c=0,det;
1057     gmx_domdec_comm_t *comm;
1058     gmx_bool bPBC,bUse;
1059
1060     comm = dd->comm;
1061
1062     for(d=1; d<dd->ndim; d++)
1063     {
1064         dim = dd->dim[d];
1065         zp = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1066         zp->min0 = cell_ns_x0[dim];
1067         zp->max1 = cell_ns_x1[dim];
1068         zp->min1 = cell_ns_x1[dim];
1069         zp->mch0 = cell_ns_x0[dim];
1070         zp->mch1 = cell_ns_x1[dim];
1071         zp->p1_0 = cell_ns_x0[dim];
1072         zp->p1_1 = cell_ns_x1[dim];
1073     }
1074     
1075     for(d=dd->ndim-2; d>=0; d--)
1076     {
1077         dim  = dd->dim[d];
1078         bPBC = (dim < ddbox->npbcdim);
1079
1080         /* Use an rvec to store three reals */
1081         extr_s[d][0] = comm->cell_f0[d+1];
1082         extr_s[d][1] = comm->cell_f1[d+1];
1083         extr_s[d][2] = comm->cell_f1[d+1];
1084
1085         pos = 0;
1086         /* Store the extremes in the backward sending buffer,
1087          * so they get updated separately from the forward communication.
1088          */
1089         for(d1=d; d1<dd->ndim-1; d1++)
1090         {
1091             /* We invert the order to be able to use the same loop for buf_e */
1092             buf_s[pos].min0 = extr_s[d1][1];
1093             buf_s[pos].max1 = extr_s[d1][0];
1094             buf_s[pos].min1 = extr_s[d1][2];
1095             buf_s[pos].mch0 = 0;
1096             buf_s[pos].mch1 = 0;
1097             /* Store the cell corner of the dimension we communicate along */
1098             buf_s[pos].p1_0 = comm->cell_x0[dim];
1099             buf_s[pos].p1_1 = 0;
1100             pos++;
1101         }
1102
1103         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1104         pos++;
1105
1106         if (dd->ndim == 3 && d == 0)
1107         {
1108             buf_s[pos] = comm->zone_d2[0][1];
1109             pos++;
1110             buf_s[pos] = comm->zone_d1[0];
1111             pos++;
1112         }
1113
1114         /* We only need to communicate the extremes
1115          * in the forward direction
1116          */
1117         npulse = comm->cd[d].np;
1118         if (bPBC)
1119         {
1120             /* Take the minimum to avoid double communication */
1121             npulse_min = min(npulse,dd->nc[dim]-1-npulse);
1122         }
1123         else
1124         {
1125             /* Without PBC we should really not communicate over
1126              * the boundaries, but implementing that complicates
1127              * the communication setup and therefore we simply
1128              * do all communication, but ignore some data.
1129              */
1130             npulse_min = npulse;
1131         }
1132         for(p=0; p<npulse_min; p++)
1133         {
1134             /* Communicate the extremes forward */
1135             bUse = (bPBC || dd->ci[dim] > 0);
1136
1137             dd_sendrecv_rvec(dd, d, dddirForward,
1138                              extr_s+d, dd->ndim-d-1,
1139                              extr_r+d, dd->ndim-d-1);
1140
1141             if (bUse)
1142             {
1143                 for(d1=d; d1<dd->ndim-1; d1++)
1144                 {
1145                     extr_s[d1][0] = max(extr_s[d1][0],extr_r[d1][0]);
1146                     extr_s[d1][1] = min(extr_s[d1][1],extr_r[d1][1]);
1147                     extr_s[d1][2] = min(extr_s[d1][2],extr_r[d1][2]);
1148                 }
1149             }
1150         }
1151
1152         buf_size = pos;
1153         for(p=0; p<npulse; p++)
1154         {
1155             /* Communicate all the zone information backward */
1156             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1157
1158             dd_sendrecv_ddzone(dd, d, dddirBackward,
1159                                buf_s, buf_size,
1160                                buf_r, buf_size);
1161
1162             clear_rvec(dh);
1163             if (p > 0)
1164             {
1165                 for(d1=d+1; d1<dd->ndim; d1++)
1166                 {
1167                      * communication height along d1 due to the distance along d;
1168                      * communication height along d1 due to the distance along d,
1169                      * this avoids a lot of useless atom communication.
1170                      */
1171                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1172
1173                     if (ddbox->tric_dir[dim])
1174                     {
1175                         /* c is the off-diagonal coupling between the cell planes
1176                          * along directions d and d1.
1177                          */
1178                         c = ddbox->v[dim][dd->dim[d1]][dim];
1179                     }
1180                     else
1181                     {
1182                         c = 0;
1183                     }
1184                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1185                     if (det > 0)
1186                     {
1187                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1188                     }
1189                     else
1190                     {
1191                         /* A negative value signals out of range */
1192                         dh[d1] = -1;
1193                     }
1194                 }
1195             }
1196
1197             /* Accumulate the extremes over all pulses */
1198             for(i=0; i<buf_size; i++)
1199             {
1200                 if (p == 0)
1201                 {
1202                     buf_e[i] = buf_r[i];
1203                 }
1204                 else
1205                 {
1206                     if (bUse)
1207                     {
1208                         buf_e[i].min0 = min(buf_e[i].min0,buf_r[i].min0);
1209                         buf_e[i].max1 = max(buf_e[i].max1,buf_r[i].max1);
1210                         buf_e[i].min1 = min(buf_e[i].min1,buf_r[i].min1);
1211                     }
1212
1213                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1214                     {
1215                         d1 = 1;
1216                     }
1217                     else
1218                     {
1219                         d1 = d + 1;
1220                     }
1221                     if (bUse && dh[d1] >= 0)
1222                     {
1223                         buf_e[i].mch0 = max(buf_e[i].mch0,buf_r[i].mch0-dh[d1]);
1224                         buf_e[i].mch1 = max(buf_e[i].mch1,buf_r[i].mch1-dh[d1]);
1225                     }
1226                 }
1227                 /* Copy the received buffer to the send buffer,
1228                  * to pass the data through with the next pulse.
1229                  */
1230                 buf_s[i] = buf_r[i];
1231             }
1232             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1233                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1234             {
1235                 /* Store the extremes */ 
1236                 pos = 0;
1237
1238                 for(d1=d; d1<dd->ndim-1; d1++)
1239                 {
1240                     extr_s[d1][1] = min(extr_s[d1][1],buf_e[pos].min0);
1241                     extr_s[d1][0] = max(extr_s[d1][0],buf_e[pos].max1);
1242                     extr_s[d1][2] = min(extr_s[d1][2],buf_e[pos].min1);
1243                     pos++;
1244                 }
1245
1246                 if (d == 1 || (d == 0 && dd->ndim == 3))
1247                 {
1248                     for(i=d; i<2; i++)
1249                     {
1250                         comm->zone_d2[1-d][i] = buf_e[pos];
1251                         pos++;
1252                     }
1253                 }
1254                 if (d == 0)
1255                 {
1256                     comm->zone_d1[1] = buf_e[pos];
1257                     pos++;
1258                 }
1259             }
1260         }
1261     }
1262     
1263     if (dd->ndim >= 2)
1264     {
1265         dim = dd->dim[1];
1266         for(i=0; i<2; i++)
1267         {
1268             if (debug)
1269             {
1270                 print_ddzone(debug,1,i,0,&comm->zone_d1[i]);
1271             }
1272             cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d1[i].min0);
1273             cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d1[i].max1);
1274         }
1275     }
1276     if (dd->ndim >= 3)
1277     {
1278         dim = dd->dim[2];
1279         for(i=0; i<2; i++)
1280         {
1281             for(j=0; j<2; j++)
1282             {
1283                 if (debug)
1284                 {
1285                     print_ddzone(debug,2,i,j,&comm->zone_d2[i][j]);
1286                 }
1287                 cell_ns_x0[dim] = min(cell_ns_x0[dim],comm->zone_d2[i][j].min0);
1288                 cell_ns_x1[dim] = max(cell_ns_x1[dim],comm->zone_d2[i][j].max1);
1289             }
1290         }
1291     }
1292     for(d=1; d<dd->ndim; d++)
1293     {
1294         comm->cell_f_max0[d] = extr_s[d-1][0];
1295         comm->cell_f_min1[d] = extr_s[d-1][1];
1296         if (debug)
1297         {
1298             fprintf(debug,"Cell fraction d %d, max0 %f, min1 %f\n",
1299                     d,comm->cell_f_max0[d],comm->cell_f_min1[d]);
1300         }
1301     }
1302 }
1303
1304 static void dd_collect_cg(gmx_domdec_t *dd,
1305                           t_state *state_local)
1306 {
1307     gmx_domdec_master_t *ma=NULL;
1308     int buf2[2],*ibuf,i,ncg_home=0,*cg=NULL,nat_home=0;
1309     t_block *cgs_gl;
1310
1311     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1312     {
1313         /* The master has the correct distribution */
1314         return;
1315     }
1316     
1317     if (state_local->ddp_count == dd->ddp_count)
1318     {
1319         ncg_home = dd->ncg_home;
1320         cg       = dd->index_gl;
1321         nat_home = dd->nat_home;
1322     } 
1323     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1324     {
1325         cgs_gl = &dd->comm->cgs_gl;
1326
1327         ncg_home = state_local->ncg_gl;
1328         cg       = state_local->cg_gl;
1329         nat_home = 0;
1330         for(i=0; i<ncg_home; i++)
1331         {
1332             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1333         }
1334     }
1335     else
1336     {
1337         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1338     }
1339     
1340     buf2[0] = ncg_home;
1341     buf2[1] = nat_home;
1342     if (DDMASTER(dd))
1343     {
1344         ma = dd->ma;
1345         ibuf = ma->ibuf;
1346     }
1347     else
1348     {
1349         ibuf = NULL;
1350     }
1351     /* Collect the charge group and atom counts on the master */
1352     dd_gather(dd,2*sizeof(int),buf2,ibuf);
1353     
1354     if (DDMASTER(dd))
1355     {
1356         ma->index[0] = 0;
1357         for(i=0; i<dd->nnodes; i++)
1358         {
1359             ma->ncg[i] = ma->ibuf[2*i];
1360             ma->nat[i] = ma->ibuf[2*i+1];
1361             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1362             
1363         }
1364         /* Make byte counts and indices */
1365         for(i=0; i<dd->nnodes; i++)
1366         {
1367             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
1368             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1369         }
1370         if (debug)
1371         {
1372             fprintf(debug,"Collected charge group distribution: ");
1373             for(i=0; i<dd->nnodes; i++)
1374                 fprintf(debug," %d",ma->ncg[i]);
1375             fprintf(debug,"\n");
1376         }
1377     }
1378     
1379     /* Collect the charge group indices on the master */
1380     dd_gatherv(dd,
1381                ncg_home*sizeof(int),cg,
1382                DDMASTER(dd) ? ma->ibuf : NULL,
1383                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1384                DDMASTER(dd) ? ma->cg : NULL);
1385     
1386     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1387 }
1388
1389 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1390                                     rvec *lv,rvec *v)
1391 {
1392     gmx_domdec_master_t *ma;
1393     int  n,i,c,a,nalloc=0;
1394     rvec *buf=NULL;
1395     t_block *cgs_gl;
1396
1397     ma = dd->ma;
1398     
1399     if (!DDMASTER(dd))
1400     {
1401 #ifdef GMX_MPI
1402         MPI_Send(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1403                  dd->rank,dd->mpi_comm_all);
1404 #endif
1405     } else {
1406         /* Copy the master coordinates to the global array */
1407         cgs_gl = &dd->comm->cgs_gl;
1408
1409         n = DDMASTERRANK(dd);
1410         a = 0;
1411         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1412         {
1413             for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1414             {
1415                 copy_rvec(lv[a++],v[c]);
1416             }
1417         }
1418         
1419         for(n=0; n<dd->nnodes; n++)
1420         {
1421             if (n != dd->rank)
1422             {
1423                 if (ma->nat[n] > nalloc)
1424                 {
1425                     nalloc = over_alloc_dd(ma->nat[n]);
1426                     srenew(buf,nalloc);
1427                 }
1428 #ifdef GMX_MPI
1429                 MPI_Recv(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,DDRANK(dd,n),
1430                          n,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1431 #endif
1432                 a = 0;
1433                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1434                 {
1435                     for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1436                     {
1437                         copy_rvec(buf[a++],v[c]);
1438                     }
1439                 }
1440             }
1441         }
1442         sfree(buf);
1443     }
1444 }
1445
1446 static void get_commbuffer_counts(gmx_domdec_t *dd,
1447                                   int **counts,int **disps)
1448 {
1449     gmx_domdec_master_t *ma;
1450     int n;
1451
1452     ma = dd->ma;
1453     
1454     /* Make the rvec count and displacement arrays */
1455     *counts  = ma->ibuf;
1456     *disps   = ma->ibuf + dd->nnodes;
1457     for(n=0; n<dd->nnodes; n++)
1458     {
1459         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1460         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1461     }
1462 }
1463
1464 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1465                                    rvec *lv,rvec *v)
1466 {
1467     gmx_domdec_master_t *ma;
1468     int  *rcounts=NULL,*disps=NULL;
1469     int  n,i,c,a;
1470     rvec *buf=NULL;
1471     t_block *cgs_gl;
1472     
1473     ma = dd->ma;
1474     
1475     if (DDMASTER(dd))
1476     {
1477         get_commbuffer_counts(dd,&rcounts,&disps);
1478
1479         buf = ma->vbuf;
1480     }
1481     
1482     dd_gatherv(dd,dd->nat_home*sizeof(rvec),lv,rcounts,disps,buf);
1483
1484     if (DDMASTER(dd))
1485     {
1486         cgs_gl = &dd->comm->cgs_gl;
1487
1488         a = 0;
1489         for(n=0; n<dd->nnodes; n++)
1490         {
1491             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1492             {
1493                 for(c=cgs_gl->index[ma->cg[i]]; c<cgs_gl->index[ma->cg[i]+1]; c++)
1494                 {
1495                     copy_rvec(buf[a++],v[c]);
1496                 }
1497             }
1498         }
1499     }
1500 }
1501
1502 void dd_collect_vec(gmx_domdec_t *dd,
1503                     t_state *state_local,rvec *lv,rvec *v)
1504 {
1505     gmx_domdec_master_t *ma;
1506     int  n,i,c,a,nalloc=0;
1507     rvec *buf=NULL;
1508     
1509     dd_collect_cg(dd,state_local);
1510
1511     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1512     {
1513         dd_collect_vec_sendrecv(dd,lv,v);
1514     }
1515     else
1516     {
1517         dd_collect_vec_gatherv(dd,lv,v);
1518     }
1519 }
1520
1521
1522 void dd_collect_state(gmx_domdec_t *dd,
1523                       t_state *state_local,t_state *state)
1524 {
1525     int est,i,j,nh;
1526
1527     nh = state->nhchainlength;
1528
1529     if (DDMASTER(dd))
1530     {
1531         for (i=0;i<efptNR;i++) {
1532             state->lambda[i] = state_local->lambda[i];
1533         }
1534         state->fep_state = state_local->fep_state;
1535         state->veta = state_local->veta;
1536         state->vol0 = state_local->vol0;
1537         copy_mat(state_local->box,state->box);
1538         copy_mat(state_local->boxv,state->boxv);
1539         copy_mat(state_local->svir_prev,state->svir_prev);
1540         copy_mat(state_local->fvir_prev,state->fvir_prev);
1541         copy_mat(state_local->pres_prev,state->pres_prev);
1542
1543
1544         for(i=0; i<state_local->ngtc; i++)
1545         {
1546             for(j=0; j<nh; j++) {
1547                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1548                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1549             }
1550             state->therm_integral[i] = state_local->therm_integral[i];            
1551         }
1552         for(i=0; i<state_local->nnhpres; i++) 
1553         {
1554             for(j=0; j<nh; j++) {
1555                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1556                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1557             }
1558         }
1559     }
1560     for(est=0; est<estNR; est++)
1561     {
1562         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1563         {
1564             switch (est) {
1565             case estX:
1566                 dd_collect_vec(dd,state_local,state_local->x,state->x);
1567                 break;
1568             case estV:
1569                 dd_collect_vec(dd,state_local,state_local->v,state->v);
1570                 break;
1571             case estSDX:
1572                 dd_collect_vec(dd,state_local,state_local->sd_X,state->sd_X);
1573                 break;
1574             case estCGP:
1575                 dd_collect_vec(dd,state_local,state_local->cg_p,state->cg_p);
1576                 break;
1577             case estLD_RNG:
1578                 if (state->nrngi == 1)
1579                 {
1580                     if (DDMASTER(dd))
1581                     {
1582                         for(i=0; i<state_local->nrng; i++)
1583                         {
1584                             state->ld_rng[i] = state_local->ld_rng[i];
1585                         }
1586                     }
1587                 }
1588                 else
1589                 {
1590                     dd_gather(dd,state_local->nrng*sizeof(state->ld_rng[0]),
1591                               state_local->ld_rng,state->ld_rng);
1592                 }
1593                 break;
1594             case estLD_RNGI:
1595                 if (state->nrngi == 1)
1596                 {
1597                    if (DDMASTER(dd))
1598                     {
1599                         state->ld_rngi[0] = state_local->ld_rngi[0];
1600                     } 
1601                 }
1602                 else
1603                 {
1604                     dd_gather(dd,sizeof(state->ld_rngi[0]),
1605                               state_local->ld_rngi,state->ld_rngi);
1606                 }
1607                 break;
1608             case estDISRE_INITF:
1609             case estDISRE_RM3TAV:
1610             case estORIRE_INITF:
1611             case estORIRE_DTAV:
1612                 break;
1613             default:
1614                 gmx_incons("Unknown state entry encountered in dd_collect_state");
1615             }
1616         }
1617     }
1618 }
1619
1620 static void dd_realloc_state(t_state *state,rvec **f,int nalloc)
1621 {
1622     int est;
1623
1624     if (debug)
1625     {
1626         fprintf(debug,"Reallocating state: currently %d, required %d, allocating %d\n",state->nalloc,nalloc,over_alloc_dd(nalloc));
1627     }
1628
1629     state->nalloc = over_alloc_dd(nalloc);
1630     
1631     for(est=0; est<estNR; est++)
1632     {
1633         if (EST_DISTR(est) && (state->flags & (1<<est)))
1634         {
1635             switch(est) {
1636             case estX:
1637                 srenew(state->x,state->nalloc);
1638                 break;
1639             case estV:
1640                 srenew(state->v,state->nalloc);
1641                 break;
1642             case estSDX:
1643                 srenew(state->sd_X,state->nalloc);
1644                 break;
1645             case estCGP:
1646                 srenew(state->cg_p,state->nalloc);
1647                 break;
1648             case estLD_RNG:
1649             case estLD_RNGI:
1650             case estDISRE_INITF:
1651             case estDISRE_RM3TAV:
1652             case estORIRE_INITF:
1653             case estORIRE_DTAV:
1654                 /* No reallocation required */
1655                 break;
1656             default:
1657                 gmx_incons("Unknown state entry encountered in dd_realloc_state");            
1658             }
1659         }
1660     }
1661     
1662     if (f != NULL)
1663     {
1664         srenew(*f,state->nalloc);
1665     }
1666 }
1667
1668 static void dd_check_alloc_ncg(t_forcerec *fr,t_state *state,rvec **f,
1669                                int nalloc)
1670 {
1671     if (nalloc > fr->cg_nalloc)
1672     {
1673         if (debug)
1674         {
1675             fprintf(debug,"Reallocating forcerec: currently %d, required %d, allocating %d\n",fr->cg_nalloc,nalloc,over_alloc_dd(nalloc));
1676         }
1677         fr->cg_nalloc = over_alloc_dd(nalloc);
1678         srenew(fr->cginfo,fr->cg_nalloc);
1679         if (fr->cutoff_scheme == ecutsGROUP)
1680         {
1681             srenew(fr->cg_cm,fr->cg_nalloc);
1682         }
1683     }
1684     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1685     {
1686         /* We don't use charge groups, we use x in state to set up
1687          * the atom communication.
1688          */
1689         dd_realloc_state(state,f,nalloc);
1690     }
1691 }
1692
1693 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd,t_block *cgs,
1694                                        rvec *v,rvec *lv)
1695 {
1696     gmx_domdec_master_t *ma;
1697     int  n,i,c,a,nalloc=0;
1698     rvec *buf=NULL;
1699     
1700     if (DDMASTER(dd))
1701     {
1702         ma  = dd->ma;
1703         
1704         for(n=0; n<dd->nnodes; n++)
1705         {
1706             if (n != dd->rank)
1707             {
1708                 if (ma->nat[n] > nalloc)
1709                 {
1710                     nalloc = over_alloc_dd(ma->nat[n]);
1711                     srenew(buf,nalloc);
1712                 }
1713                 /* Copy the data for node n into the temporary send buffer */
1714                 a = 0;
1715                 for(i=ma->index[n]; i<ma->index[n+1]; i++)
1716                 {
1717                     for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1718                     {
1719                         copy_rvec(v[c],buf[a++]);
1720                     }
1721                 }
1722                 if (a != ma->nat[n])
1723                 {
1724                     gmx_fatal(FARGS,"Internal error a (%d) != nat (%d)",
1725                               a,ma->nat[n]);
1726                 }
1727                 
1728 #ifdef GMX_MPI
1729                 MPI_Send(buf,ma->nat[n]*sizeof(rvec),MPI_BYTE,
1730                          DDRANK(dd,n),n,dd->mpi_comm_all);
1731 #endif
1732             }
1733         }
1734         sfree(buf);
1735         n = DDMASTERRANK(dd);
1736         a = 0;
1737         for(i=ma->index[n]; i<ma->index[n+1]; i++)
1738         {
1739             for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1740             {
1741                 copy_rvec(v[c],lv[a++]);
1742             }
1743         }
1744     }
1745     else
1746     {
1747 #ifdef GMX_MPI
1748         MPI_Recv(lv,dd->nat_home*sizeof(rvec),MPI_BYTE,DDMASTERRANK(dd),
1749                  MPI_ANY_TAG,dd->mpi_comm_all,MPI_STATUS_IGNORE);
1750 #endif
1751     }
1752 }
1753
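     /* Distribute the global vector v over all DD nodes using a single
      * MPI scatterv; each node receives its home part in lv.
      */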
1754 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd,t_block *cgs,
1755                                        rvec *v,rvec *lv)
1756 {
1757     gmx_domdec_master_t *ma;
1758     int  *scounts=NULL,*disps=NULL;
1759     int  n,i,c,a,nalloc=0;
1760     rvec *buf=NULL;
1761     
1762     if (DDMASTER(dd))
1763     {
1764         ma  = dd->ma;
1765      
1766         get_commbuffer_counts(dd,&scounts,&disps);
1767
1768         buf = ma->vbuf;
1769         a = 0;
1770         for(n=0; n<dd->nnodes; n++)
1771         {
1772             for(i=ma->index[n]; i<ma->index[n+1]; i++)
1773             {
1774                 for(c=cgs->index[ma->cg[i]]; c<cgs->index[ma->cg[i]+1]; c++)
1775                 {
1776                     copy_rvec(v[c],buf[a++]);
1777                 }
1778             }
1779         }
1780     }
1781
1782     dd_scatterv(dd,scounts,disps,buf,dd->nat_home*sizeof(rvec),lv);
1783 }
1784
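     /* Distribute a global vector over the DD nodes: send/receive is used
      * for few nodes, scatterv for more than GMX_DD_NNODES_SENDRECV nodes.
      */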
1785 static void dd_distribute_vec(gmx_domdec_t *dd,t_block *cgs,rvec *v,rvec *lv)
1786 {
1787     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1788     {
1789         dd_distribute_vec_sendrecv(dd,cgs,v,lv);
1790     }
1791     else
1792     {
1793         dd_distribute_vec_scatterv(dd,cgs,v,lv);
1794     }
1795 }
1796
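     /* Distribute the global state over the local states: the small, global
      * entries are broadcast to all nodes, while the per-atom arrays are
      * distributed according to the home charge groups of each node.
      */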
1797 static void dd_distribute_state(gmx_domdec_t *dd,t_block *cgs,
1798                                 t_state *state,t_state *state_local,
1799                                 rvec **f)
1800 {
1801     int  i,j,nh;
1802
1803     nh = state->nhchainlength;
1804
1805     if (DDMASTER(dd))
1806     {
1807         for(i=0;i<efptNR;i++)
1808         {
1809             state_local->lambda[i] = state->lambda[i];
1810         }
1811         state_local->fep_state = state->fep_state;
1812         state_local->veta   = state->veta;
1813         state_local->vol0   = state->vol0;
1814         copy_mat(state->box,state_local->box);
1815         copy_mat(state->box_rel,state_local->box_rel);
1816         copy_mat(state->boxv,state_local->boxv);
1817         copy_mat(state->svir_prev,state_local->svir_prev);
1818         copy_mat(state->fvir_prev,state_local->fvir_prev);
1819         for(i=0; i<state_local->ngtc; i++)
1820         {
1821             for(j=0; j<nh; j++) {
1822                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1823                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1824             }
1825             state_local->therm_integral[i] = state->therm_integral[i];
1826         }
1827         for(i=0; i<state_local->nnhpres; i++)
1828         {
1829             for(j=0; j<nh; j++) {
1830                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1831                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1832             }
1833         }
1834     }
1835     dd_bcast(dd,((efptNR)*sizeof(real)),state_local->lambda);
1836     dd_bcast(dd,sizeof(int),&state_local->fep_state);
1837     dd_bcast(dd,sizeof(real),&state_local->veta);
1838     dd_bcast(dd,sizeof(real),&state_local->vol0);
1839     dd_bcast(dd,sizeof(state_local->box),state_local->box);
1840     dd_bcast(dd,sizeof(state_local->box_rel),state_local->box_rel);
1841     dd_bcast(dd,sizeof(state_local->boxv),state_local->boxv);
1842     dd_bcast(dd,sizeof(state_local->svir_prev),state_local->svir_prev);
1843     dd_bcast(dd,sizeof(state_local->fvir_prev),state_local->fvir_prev);
1844     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_xi);
1845     dd_bcast(dd,((state_local->ngtc*nh)*sizeof(double)),state_local->nosehoover_vxi);
1846     dd_bcast(dd,state_local->ngtc*sizeof(double),state_local->therm_integral);
1847     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_xi);
1848     dd_bcast(dd,((state_local->nnhpres*nh)*sizeof(double)),state_local->nhpres_vxi);
1849
1850     if (dd->nat_home > state_local->nalloc)
1851     {
1852         dd_realloc_state(state_local,f,dd->nat_home);
1853     }
1854     for(i=0; i<estNR; i++)
1855     {
1856         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1857         {
1858             switch (i) {
1859             case estX:
1860                 dd_distribute_vec(dd,cgs,state->x,state_local->x);
1861                 break;
1862             case estV:
1863                 dd_distribute_vec(dd,cgs,state->v,state_local->v);
1864                 break;
1865             case estSDX:
1866                 dd_distribute_vec(dd,cgs,state->sd_X,state_local->sd_X);
1867                 break;
1868             case estCGP:
1869                 dd_distribute_vec(dd,cgs,state->cg_p,state_local->cg_p);
1870                 break;
1871             case estLD_RNG:
1872                 if (state->nrngi == 1)
1873                 {
1874                     dd_bcastc(dd,
1875                               state_local->nrng*sizeof(state_local->ld_rng[0]),
1876                               state->ld_rng,state_local->ld_rng);
1877                 }
1878                 else
1879                 {
1880                     dd_scatter(dd,
1881                                state_local->nrng*sizeof(state_local->ld_rng[0]),
1882                                state->ld_rng,state_local->ld_rng);
1883                 }
1884                 break;
1885             case estLD_RNGI:
1886                 if (state->nrngi == 1)
1887                 {
1888                     dd_bcastc(dd,sizeof(state_local->ld_rngi[0]),
1889                               state->ld_rngi,state_local->ld_rngi);
1890                 }
1891                 else
1892                 {
1893                     dd_scatter(dd,sizeof(state_local->ld_rngi[0]),
1894                                state->ld_rngi,state_local->ld_rngi);
1895                 }
1896                 break;
1897             case estDISRE_INITF:
1898             case estDISRE_RM3TAV:
1899             case estORIRE_INITF:
1900             case estORIRE_DTAV:
1901                 /* Not implemented yet */
1902                 break;
1903             default:
1904                 gmx_incons("Unknown state entry encountered in dd_distribute_state");
1905             }
1906         }
1907     }
1908 }
1909
1910 static char dim2char(int dim)
1911 {
1912     char c='?';
1913     
1914     switch (dim)
1915     {
1916     case XX: c = 'X'; break;
1917     case YY: c = 'Y'; break;
1918     case ZZ: c = 'Z'; break;
1919     default: gmx_fatal(FARGS,"Unknown dim %d",dim);
1920     }
1921     
1922     return c;
1923 }
1924
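     /* Write the DD cell boundaries of all nodes to a PDB file, with the
      * corners of each cell as atoms and CONECT records along the edges,
      * for visual inspection of the decomposition.
      */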
1925 static void write_dd_grid_pdb(const char *fn,gmx_large_int_t step,
1926                               gmx_domdec_t *dd,matrix box,gmx_ddbox_t *ddbox)
1927 {
1928     rvec grid_s[2],*grid_r=NULL,cx,r;
1929     char fname[STRLEN],format[STRLEN],buf[22];
1930     FILE *out;
1931     int  a,i,d,z,y,x;
1932     matrix tric;
1933     real vol;
1934
1935     copy_rvec(dd->comm->cell_x0,grid_s[0]);
1936     copy_rvec(dd->comm->cell_x1,grid_s[1]);
1937     
1938     if (DDMASTER(dd))
1939     {
1940         snew(grid_r,2*dd->nnodes);
1941     }
1942     
1943     dd_gather(dd,2*sizeof(rvec),grid_s[0],DDMASTER(dd) ? grid_r[0] : NULL);
1944     
1945     if (DDMASTER(dd))
1946     {
1947         for(d=0; d<DIM; d++)
1948         {
1949             for(i=0; i<DIM; i++)
1950             {
1951                 if (d == i)
1952                 {
1953                     tric[d][i] = 1;
1954                 }
1955                 else
1956                 {
1957                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1958                     {
1959                         tric[d][i] = box[i][d]/box[i][i];
1960                     }
1961                     else
1962                     {
1963                         tric[d][i] = 0;
1964                     }
1965                 }
1966             }
1967         }
1968         sprintf(fname,"%s_%s.pdb",fn,gmx_step_str(step,buf));
1969         sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
1970         out = gmx_fio_fopen(fname,"w");
1971         gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
1972         a = 1;
1973         for(i=0; i<dd->nnodes; i++)
1974         {
1975             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1976             for(d=0; d<DIM; d++)
1977             {
1978                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1979             }
1980             for(z=0; z<2; z++)
1981             {
1982                 for(y=0; y<2; y++)
1983                 {
1984                     for(x=0; x<2; x++)
1985                     {
1986                         cx[XX] = grid_r[i*2+x][XX];
1987                         cx[YY] = grid_r[i*2+y][YY];
1988                         cx[ZZ] = grid_r[i*2+z][ZZ];
1989                         mvmul(tric,cx,r);
1990                         fprintf(out,format,"ATOM",a++,"CA","GLY",' ',1+i,
1991                                 10*r[XX],10*r[YY],10*r[ZZ],1.0,vol);
1992                     }
1993                 }
1994             }
1995             for(d=0; d<DIM; d++)
1996             {
1997                 for(x=0; x<4; x++)
1998                 {
1999                     switch(d)
2000                     {
2001                     case 0: y = 1 + i*8 + 2*x; break;
2002                     case 1: y = 1 + i*8 + 2*x - (x % 2); break;
2003                     case 2: y = 1 + i*8 + x; break;
2004                     }
2005                     fprintf(out,"%6s%5d%5d\n","CONECT",y,y+(1<<d));
2006                 }
2007             }
2008         }
2009         gmx_fio_fclose(out);
2010         sfree(grid_r);
2011     }
2012 }
2013
2014 void write_dd_pdb(const char *fn,gmx_large_int_t step,const char *title,
2015                   gmx_mtop_t *mtop,t_commrec *cr,
2016                   int natoms,rvec x[],matrix box)
2017 {
2018     char fname[STRLEN],format[STRLEN],format4[STRLEN],buf[22];
2019     FILE *out;
2020     int  i,ii,resnr,c;
2021     char *atomname,*resname;
2022     real b;
2023     gmx_domdec_t *dd;
2024     
2025     dd = cr->dd;
2026     if (natoms == -1)
2027     {
2028         natoms = dd->comm->nat[ddnatVSITE];
2029     }
2030     
2031     sprintf(fname,"%s_%s_n%d.pdb",fn,gmx_step_str(step,buf),cr->sim_nodeid);
2032     
2033     sprintf(format,"%s%s\n",pdbformat,"%6.2f%6.2f");
2034     sprintf(format4,"%s%s\n",pdbformat4,"%6.2f%6.2f");
2035     
2036     out = gmx_fio_fopen(fname,"w");
2037     
2038     fprintf(out,"TITLE     %s\n",title);
2039     gmx_write_pdb_box(out,dd->bScrewPBC ? epbcSCREW : epbcXYZ,box);
2040     for(i=0; i<natoms; i++)
2041     {
2042         ii = dd->gatindex[i];
2043         gmx_mtop_atominfo_global(mtop,ii,&atomname,&resnr,&resname);
2044         if (i < dd->comm->nat[ddnatZONE])
2045         {
2046             c = 0;
2047             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2048             {
2049                 c++;
2050             }
2051             b = c;
2052         }
2053         else if (i < dd->comm->nat[ddnatVSITE])
2054         {
2055             b = dd->comm->zones.n;
2056         }
2057         else
2058         {
2059             b = dd->comm->zones.n + 1;
2060         }
2061         fprintf(out,strlen(atomname)<4 ? format : format4,
2062                 "ATOM",(ii+1)%100000,
2063                 atomname,resname,' ',resnr%10000,' ',
2064                 10*x[i][XX],10*x[i][YY],10*x[i][ZZ],1.0,b);
2065     }
2066     fprintf(out,"TER\n");
2067     
2068     gmx_fio_fclose(out);
2069 }
2070
2071 real dd_cutoff_mbody(gmx_domdec_t *dd)
2072 {
2073     gmx_domdec_comm_t *comm;
2074     int  di;
2075     real r;
2076
2077     comm = dd->comm;
2078
2079     r = -1;
2080     if (comm->bInterCGBondeds)
2081     {
2082         if (comm->cutoff_mbody > 0)
2083         {
2084             r = comm->cutoff_mbody;
2085         }
2086         else
2087         {
2088             /* cutoff_mbody=0 means we do not have DLB */
2089             r = comm->cellsize_min[dd->dim[0]];
2090             for(di=1; di<dd->ndim; di++)
2091             {
2092                 r = min(r,comm->cellsize_min[dd->dim[di]]);
2093             }
2094             if (comm->bBondComm)
2095             {
2096                 r = max(r,comm->cutoff_mbody);
2097             }
2098             else
2099             {
2100                 r = min(r,comm->cutoff);
2101             }
2102         }
2103     }
2104
2105     return r;
2106 }
2107
2108 real dd_cutoff_twobody(gmx_domdec_t *dd)
2109 {
2110     real r_mb;
2111
2112     r_mb = dd_cutoff_mbody(dd);
2113
2114     return max(dd->comm->cutoff,r_mb);
2115 }
2116
2117
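     /* Map the Cartesian coordinate of a PP node to the coordinate,
      * along cartpmedim, of the PME node it communicates with.
      */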
2118 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd,ivec coord,ivec coord_pme)
2119 {
2120     int nc,ntot;
2121     
2122     nc   = dd->nc[dd->comm->cartpmedim];
2123     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2124     copy_ivec(coord,coord_pme);
2125     coord_pme[dd->comm->cartpmedim] =
2126         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2127 }
2128
2129 static int low_ddindex2pmeindex(int ndd,int npme,int ddindex)
2130 {
2131     /* Here we assign a PME node to communicate with this DD node
2132      * by assuming that the major index of both is x.
2133      * We add npme/2 to obtain an even distribution.
2134      */
2135     return (ddindex*npme + npme/2)/ndd;
2136 }
2137
2138 static int ddindex2pmeindex(const gmx_domdec_t *dd,int ddindex)
2139 {
2140     return low_ddindex2pmeindex(dd->nnodes,dd->comm->npmenodes,ddindex);
2141 }
2142
2143 static int cr_ddindex2pmeindex(const t_commrec *cr,int ddindex)
2144 {
2145     return low_ddindex2pmeindex(cr->dd->nnodes,cr->npmenodes,ddindex);
2146 }
2147
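     /* Return the list of simulation ranks that act as PME-only nodes,
      * assuming the PME ranks are interleaved with the PP ranks.
      */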
2148 static int *dd_pmenodes(t_commrec *cr)
2149 {
2150     int *pmenodes;
2151     int n,i,p0,p1;
2152     
2153     snew(pmenodes,cr->npmenodes);
2154     n = 0;
2155     for(i=0; i<cr->dd->nnodes; i++) {
2156         p0 = cr_ddindex2pmeindex(cr,i);
2157         p1 = cr_ddindex2pmeindex(cr,i+1);
2158         if (i+1 == cr->dd->nnodes || p1 > p0) {
2159             if (debug)
2160                 fprintf(debug,"pmenode[%d] = %d\n",n,i+1+n);
2161             pmenodes[n] = i + 1 + n;
2162             n++;
2163         }
2164     }
2165
2166     return pmenodes;
2167 }
2168
2169 static int gmx_ddcoord2pmeindex(t_commrec *cr,int x,int y,int z)
2170 {
2171     gmx_domdec_t *dd;
2172     ivec coords,coords_pme,nc;
2173     int  slab;
2174     
2175     dd = cr->dd;
2176     /*
2177       if (dd->comm->bCartesian) {
2178       gmx_ddindex2xyz(dd->nc,ddindex,coords);
2179       dd_coords2pmecoords(dd,coords,coords_pme);
2180       copy_ivec(dd->ntot,nc);
2181       nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2182       coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2183       
2184       slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2185       } else {
2186       slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2187       }
2188     */
2189     coords[XX] = x;
2190     coords[YY] = y;
2191     coords[ZZ] = z;
2192     slab = ddindex2pmeindex(dd,dd_index(dd->nc,coords));
2193     
2194     return slab;
2195 }
2196
2197 static int ddcoord2simnodeid(t_commrec *cr,int x,int y,int z)
2198 {
2199     gmx_domdec_comm_t *comm;
2200     ivec coords;
2201     int  ddindex,nodeid=-1;
2202     
2203     comm = cr->dd->comm;
2204     
2205     coords[XX] = x;
2206     coords[YY] = y;
2207     coords[ZZ] = z;
2208     if (comm->bCartesianPP_PME)
2209     {
2210 #ifdef GMX_MPI
2211         MPI_Cart_rank(cr->mpi_comm_mysim,coords,&nodeid);
2212 #endif
2213     }
2214     else
2215     {
2216         ddindex = dd_index(cr->dd->nc,coords);
2217         if (comm->bCartesianPP)
2218         {
2219             nodeid = comm->ddindex2simnodeid[ddindex];
2220         }
2221         else
2222         {
2223             if (comm->pmenodes)
2224             {
2225                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr,x,y,z);
2226             }
2227             else
2228             {
2229                 nodeid = ddindex;
2230             }
2231         }
2232     }
2233   
2234     return nodeid;
2235 }
2236
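     /* Return the simulation rank of the PME node that PP node sim_nodeid
      * sends to, or -1 when sim_nodeid is itself a PME-only node.
      */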
2237 static int dd_simnode2pmenode(t_commrec *cr,int sim_nodeid)
2238 {
2239     gmx_domdec_t *dd;
2240     gmx_domdec_comm_t *comm;
2241     ivec coord,coord_pme;
2242     int  i;
2243     int  pmenode=-1;
2244     
2245     dd = cr->dd;
2246     comm = dd->comm;
2247     
2248     /* This assumes a uniform x domain decomposition grid cell size */
2249     if (comm->bCartesianPP_PME)
2250     {
2251 #ifdef GMX_MPI
2252         MPI_Cart_coords(cr->mpi_comm_mysim,sim_nodeid,DIM,coord);
2253         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2254         {
2255             /* This is a PP node */
2256             dd_cart_coord2pmecoord(dd,coord,coord_pme);
2257             MPI_Cart_rank(cr->mpi_comm_mysim,coord_pme,&pmenode);
2258         }
2259 #endif
2260     }
2261     else if (comm->bCartesianPP)
2262     {
2263         if (sim_nodeid < dd->nnodes)
2264         {
2265             pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2266         }
2267     }
2268     else
2269     {
2270         /* This assumes DD cells with identical x coordinates
2271          * are numbered sequentially.
2272          */
2273         if (dd->comm->pmenodes == NULL)
2274         {
2275             if (sim_nodeid < dd->nnodes)
2276             {
2277                 /* The DD index equals the nodeid */
2278                 pmenode = dd->nnodes + ddindex2pmeindex(dd,sim_nodeid);
2279             }
2280         }
2281         else
2282         {
2283             i = 0;
2284             while (sim_nodeid > dd->comm->pmenodes[i])
2285             {
2286                 i++;
2287             }
2288             if (sim_nodeid < dd->comm->pmenodes[i])
2289             {
2290                 pmenode = dd->comm->pmenodes[i];
2291             }
2292         }
2293     }
2294     
2295     return pmenode;
2296 }
2297
2298 gmx_bool gmx_pmeonlynode(t_commrec *cr,int sim_nodeid)
2299 {
2300     gmx_bool bPMEOnlyNode;
2301     
2302     if (DOMAINDECOMP(cr))
2303     {
2304         bPMEOnlyNode = (dd_simnode2pmenode(cr,sim_nodeid) == -1);
2305     }
2306     else
2307     {
2308         bPMEOnlyNode = FALSE;
2309     }
2310     
2311     return bPMEOnlyNode;
2312 }
2313
2314 void get_pme_ddnodes(t_commrec *cr,int pmenodeid,
2315                      int *nmy_ddnodes,int **my_ddnodes,int *node_peer)
2316 {
2317     gmx_domdec_t *dd;
2318     int x,y,z;
2319     ivec coord,coord_pme;
2320     
2321     dd = cr->dd;
2322     
2323     snew(*my_ddnodes,(dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2324     
2325     *nmy_ddnodes = 0;
2326     for(x=0; x<dd->nc[XX]; x++)
2327     {
2328         for(y=0; y<dd->nc[YY]; y++)
2329         {
2330             for(z=0; z<dd->nc[ZZ]; z++)
2331             {
2332                 if (dd->comm->bCartesianPP_PME)
2333                 {
2334                     coord[XX] = x;
2335                     coord[YY] = y;
2336                     coord[ZZ] = z;
2337                     dd_cart_coord2pmecoord(dd,coord,coord_pme);
2338                     if (dd->ci[XX] == coord_pme[XX] &&
2339                         dd->ci[YY] == coord_pme[YY] &&
2340                         dd->ci[ZZ] == coord_pme[ZZ])
2341                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2342                 }
2343                 else
2344                 {
2345                     /* The slab corresponds to the nodeid in the PME group */
2346                     if (gmx_ddcoord2pmeindex(cr,x,y,z) == pmenodeid)
2347                     {
2348                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr,x,y,z);
2349                     }
2350                 }
2351             }
2352         }
2353     }
2354     
2355     /* The last PP-only node is the peer node */
2356     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2357     
2358     if (debug)
2359     {
2360         fprintf(debug,"Receive coordinates from PP nodes:");
2361         for(x=0; x<*nmy_ddnodes; x++)
2362         {
2363             fprintf(debug," %d",(*my_ddnodes)[x]);
2364         }
2365         fprintf(debug,"\n");
2366     }
2367 }
2368
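     /* Return TRUE when this PP node should receive the virial and energy
      * contributions from its PME node; only the last PP node sharing a
      * given PME node receives them.
      */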
2369 static gmx_bool receive_vir_ener(t_commrec *cr)
2370 {
2371     gmx_domdec_comm_t *comm;
2372     int  pmenode,coords[DIM],rank;
2373     gmx_bool bReceive;
2374     
2375     bReceive = TRUE;
2376     if (cr->npmenodes < cr->dd->nnodes)
2377     {
2378         comm = cr->dd->comm;
2379         if (comm->bCartesianPP_PME)
2380         {
2381             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2382 #ifdef GMX_MPI
2383             MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,coords);
2384             coords[comm->cartpmedim]++;
2385             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2386             {
2387                 MPI_Cart_rank(cr->mpi_comm_mysim,coords,&rank);
2388                 if (dd_simnode2pmenode(cr,rank) == pmenode)
2389                 {
2390                     /* This is not the last PP node for pmenode */
2391                     bReceive = FALSE;
2392                 }
2393             }
2394 #endif  
2395         }
2396         else
2397         {
2398             pmenode = dd_simnode2pmenode(cr,cr->sim_nodeid);
2399             if (cr->sim_nodeid+1 < cr->nnodes &&
2400                 dd_simnode2pmenode(cr,cr->sim_nodeid+1) == pmenode)
2401             {
2402                 /* This is not the last PP node for pmenode */
2403                 bReceive = FALSE;
2404             }
2405         }
2406     }
2407     
2408     return bReceive;
2409 }
2410
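     /* Set the charge group ranges of all zones such that only the home
      * zone (zone 0) contains charge groups; the other zones are empty.
      */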
2411 static void set_zones_ncg_home(gmx_domdec_t *dd)
2412 {
2413     gmx_domdec_zones_t *zones;
2414     int i;
2415
2416     zones = &dd->comm->zones;
2417
2418     zones->cg_range[0] = 0;
2419     for(i=1; i<zones->n+1; i++)
2420     {
2421         zones->cg_range[i] = dd->ncg_home;
2422     }
2423 }
2424
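     /* Rebuild the local charge group index (dd->cgindex) and the
      * local-to-global charge group index (dd->index_gl) from the
      * global charge group indices stored in the state.
      */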
2425 static void rebuild_cgindex(gmx_domdec_t *dd,
2426                             const int *gcgs_index,t_state *state)
2427 {
2428     int nat,i,*ind,*dd_cg_gl,*cgindex,cg_gl;
2429     
2430     ind = state->cg_gl;
2431     dd_cg_gl = dd->index_gl;
2432     cgindex  = dd->cgindex;
2433     nat = 0;
2434     cgindex[0] = nat;
2435     for(i=0; i<state->ncg_gl; i++)
2436     {
2437         cgindex[i] = nat;
2438         cg_gl = ind[i];
2439         dd_cg_gl[i] = cg_gl;
2440         nat += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2441     }
2442     cgindex[i] = nat;
2443     
2444     dd->ncg_home = state->ncg_gl;
2445     dd->nat_home = nat;
2446
2447     set_zones_ncg_home(dd);
2448 }
2449
2450 static int ddcginfo(const cginfo_mb_t *cginfo_mb,int cg)
2451 {
2452     while (cg >= cginfo_mb->cg_end)
2453     {
2454         cginfo_mb++;
2455     }
2456
2457     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2458 }
2459
2460 static void dd_set_cginfo(int *index_gl,int cg0,int cg1,
2461                           t_forcerec *fr,char *bLocalCG)
2462 {
2463     cginfo_mb_t *cginfo_mb;
2464     int *cginfo;
2465     int cg;
2466
2467     if (fr != NULL)
2468     {
2469         cginfo_mb = fr->cginfo_mb;
2470         cginfo    = fr->cginfo;
2471
2472         for(cg=cg0; cg<cg1; cg++)
2473         {
2474             cginfo[cg] = ddcginfo(cginfo_mb,index_gl[cg]);
2475         }
2476     }
2477
2478     if (bLocalCG != NULL)
2479     {
2480         for(cg=cg0; cg<cg1; cg++)
2481         {
2482             bLocalCG[index_gl[cg]] = TRUE;
2483         }
2484     }
2485 }
2486
2487 static void make_dd_indices(gmx_domdec_t *dd,
2488                             const int *gcgs_index,int cg_start)
2489 {
2490     int nzone,zone,zone1,cg0,cg1,cg1_p1,cg,cg_gl,a,a_gl;
2491     int *zone2cg,*zone_ncg1,*index_gl,*gatindex;
2492     gmx_ga2la_t *ga2la;
2493     char *bLocalCG;
2494     gmx_bool bCGs;
2495
2496     bLocalCG = dd->comm->bLocalCG;
2497
2498     if (dd->nat_tot > dd->gatindex_nalloc)
2499     {
2500         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2501         srenew(dd->gatindex,dd->gatindex_nalloc);
2502     }
2503
2504     nzone      = dd->comm->zones.n;
2505     zone2cg    = dd->comm->zones.cg_range;
2506     zone_ncg1  = dd->comm->zone_ncg1;
2507     index_gl   = dd->index_gl;
2508     gatindex   = dd->gatindex;
2509     bCGs       = dd->comm->bCGs;
2510
2511     if (zone2cg[1] != dd->ncg_home)
2512     {
2513         gmx_incons("dd->ncg_zone is not up to date");
2514     }
2515     
2516     /* Make the local to global and global to local atom index */
2517     a = dd->cgindex[cg_start];
2518     for(zone=0; zone<nzone; zone++)
2519     {
2520         if (zone == 0)
2521         {
2522             cg0 = cg_start;
2523         }
2524         else
2525         {
2526             cg0 = zone2cg[zone];
2527         }
2528         cg1    = zone2cg[zone+1];
2529         cg1_p1 = cg0 + zone_ncg1[zone];
2530
2531         for(cg=cg0; cg<cg1; cg++)
2532         {
2533             zone1 = zone;
2534             if (cg >= cg1_p1)
2535             {
2536                 /* Signal that this cg is from more than one pulse away */
2537                 zone1 += nzone;
2538             }
2539             cg_gl = index_gl[cg];
2540             if (bCGs)
2541             {
2542                 for(a_gl=gcgs_index[cg_gl]; a_gl<gcgs_index[cg_gl+1]; a_gl++)
2543                 {
2544                     gatindex[a] = a_gl;
2545                     ga2la_set(dd->ga2la,a_gl,a,zone1);
2546                     a++;
2547                 }
2548             }
2549             else
2550             {
2551                 gatindex[a] = cg_gl;
2552                 ga2la_set(dd->ga2la,cg_gl,a,zone1);
2553                 a++;
2554             }
2555         }
2556     }
2557 }
2558
2559 static int check_bLocalCG(gmx_domdec_t *dd,int ncg_sys,const char *bLocalCG,
2560                           const char *where)
2561 {
2562     int ncg,i,ngl,nerr;
2563
2564     nerr = 0;
2565     if (bLocalCG == NULL)
2566     {
2567         return nerr;
2568     }
2569     for(i=0; i<dd->ncg_tot; i++)
2570     {
2571         if (!bLocalCG[dd->index_gl[i]])
2572         {
2573             fprintf(stderr,
2574                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n",dd->rank,where,i+1,dd->index_gl[i]+1,dd->ncg_home);
2575             nerr++;
2576         }
2577     }
2578     ngl = 0;
2579     for(i=0; i<ncg_sys; i++)
2580     {
2581         if (bLocalCG[i])
2582         {
2583             ngl++;
2584         }
2585     }
2586     if (ngl != dd->ncg_tot)
2587     {
2588         fprintf(stderr,"DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n",dd->rank,where,ngl,dd->ncg_tot);
2589         nerr++;
2590     }
2591
2592     return nerr;
2593 }
2594
2595 static void check_index_consistency(gmx_domdec_t *dd,
2596                                     int natoms_sys,int ncg_sys,
2597                                     const char *where)
2598 {
2599     int  nerr,ngl,i,a,cell;
2600     int  *have;
2601
2602     nerr = 0;
2603
2604     if (dd->comm->DD_debug > 1)
2605     {
2606         snew(have,natoms_sys);
2607         for(a=0; a<dd->nat_tot; a++)
2608         {
2609             if (have[dd->gatindex[a]] > 0)
2610             {
2611                 fprintf(stderr,"DD node %d: global atom %d occurs twice: index %d and %d\n",dd->rank,dd->gatindex[a]+1,have[dd->gatindex[a]],a+1);
2612             }
2613             else
2614             {
2615                 have[dd->gatindex[a]] = a + 1;
2616             }
2617         }
2618         sfree(have);
2619     }
2620
2621     snew(have,dd->nat_tot);
2622
2623     ngl  = 0;
2624     for(i=0; i<natoms_sys; i++)
2625     {
2626         if (ga2la_get(dd->ga2la,i,&a,&cell))
2627         {
2628             if (a >= dd->nat_tot)
2629             {
2630                 fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n",dd->rank,i+1,a+1,dd->nat_tot);
2631                 nerr++;
2632             }
2633             else
2634             {
2635                 have[a] = 1;
2636                 if (dd->gatindex[a] != i)
2637                 {
2638                     fprintf(stderr,"DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n",dd->rank,i+1,a+1,dd->gatindex[a]+1);
2639                     nerr++;
2640                 }
2641             }
2642             ngl++;
2643         }
2644     }
2645     if (ngl != dd->nat_tot)
2646     {
2647         fprintf(stderr,
2648                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2649                 dd->rank,where,ngl,dd->nat_tot);
2650     }
2651     for(a=0; a<dd->nat_tot; a++)
2652     {
2653         if (have[a] == 0)
2654         {
2655             fprintf(stderr,
2656                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2657                     dd->rank,where,a+1,dd->gatindex[a]+1);
2658         }
2659     }
2660     sfree(have);
2661
2662     nerr += check_bLocalCG(dd,ncg_sys,dd->comm->bLocalCG,where);
2663
2664     if (nerr > 0) {
2665         gmx_fatal(FARGS,"DD node %d, %s: %d atom/cg index inconsistencies",
2666                   dd->rank,where,nerr);
2667     }
2668 }
2669
2670 static void clear_dd_indices(gmx_domdec_t *dd,int cg_start,int a_start)
2671 {
2672     int  i;
2673     char *bLocalCG;
2674
2675     if (a_start == 0)
2676     {
2677         /* Clear the whole list without searching */
2678         ga2la_clear(dd->ga2la);
2679     }
2680     else
2681     {
2682         for(i=a_start; i<dd->nat_tot; i++)
2683         {
2684             ga2la_del(dd->ga2la,dd->gatindex[i]);
2685         }
2686     }
2687
2688     bLocalCG = dd->comm->bLocalCG;
2689     if (bLocalCG)
2690     {
2691         for(i=cg_start; i<dd->ncg_tot; i++)
2692         {
2693             bLocalCG[dd->index_gl[i]] = FALSE;
2694         }
2695     }
2696
2697     dd_clear_local_vsite_indices(dd);
2698     
2699     if (dd->constraints)
2700     {
2701         dd_clear_local_constraint_indices(dd);
2702     }
2703 }
2704
2705 /* This function returns the minimum cell size to use when moving the domain
2706  * boundaries during DLB. It takes the initially set limit comm->cellsize_min,
2707  * based on the bonded and initial non-bonded cut-offs, and, possibly,
2708  * a longer cut-off limit set for PME load balancing.
2709  */
2710 static real cellsize_min_dlb(gmx_domdec_comm_t *comm,int dim_ind,int dim)
2711 {
2712     real cellsize_min;
2713
2714     cellsize_min = comm->cellsize_min[dim];
2715
2716     if (!comm->bVacDLBNoLimit && comm->bPMELoadBalDLBLimits)
2717     {
2718         cellsize_min = max(cellsize_min,
2719                            comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2720     }
2721
2722     return cellsize_min;
2723 }
2724
2725 static real grid_jump_limit(gmx_domdec_comm_t *comm,real cutoff,
2726                             int dim_ind)
2727 {
2728     real grid_jump_limit;
2729
2730     /* The distance between the boundaries of cells at distance
2731      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2732      * and by the fact that cells should not be shifted by more than
2733      * half their size, such that cg's only shift by one cell
2734      * at redecomposition.
2735      */
2736     grid_jump_limit = comm->cellsize_limit;
2737     if (!comm->bVacDLBNoLimit)
2738     {
2739         if (comm->bPMELoadBalDLBLimits)
2740         {
2741             cutoff = max(cutoff,comm->PMELoadBal_max_cutoff);
2742         }
2743         grid_jump_limit = max(grid_jump_limit,
2744                               cutoff/comm->cd[dim_ind].np);
2745     }
2746
2747     return grid_jump_limit;
2748 }
2749
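     /* Check whether the staggered DLB cell boundaries have shifted further
      * than the grid jump limit allows; returns TRUE when the grid is invalid
      * and, with bFatal, aborts the run.
      */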
2750 static gmx_bool check_grid_jump(gmx_large_int_t step,
2751                                 gmx_domdec_t *dd,
2752                                 real cutoff,
2753                                 gmx_ddbox_t *ddbox,
2754                                 gmx_bool bFatal)
2755 {
2756     gmx_domdec_comm_t *comm;
2757     int  d,dim;
2758     real limit,bfac;
2759     gmx_bool bInvalid;
2760
2761     bInvalid = FALSE;
2762
2763     comm = dd->comm;
2764     
2765     for(d=1; d<dd->ndim; d++)
2766     {
2767         dim = dd->dim[d];
2768         limit = grid_jump_limit(comm,cutoff,d);
2769         bfac = ddbox->box_size[dim];
2770         if (ddbox->tric_dir[dim])
2771         {
2772             bfac *= ddbox->skew_fac[dim];
2773         }
2774         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2775             (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2776         {
2777             bInvalid = TRUE;
2778
2779             if (bFatal)
2780             {
2781                 char buf[22];
2782
2783                 /* This error should never be triggered under normal
2784                  * circumstances, but you never know ...
2785                  */
2786                 gmx_fatal(FARGS,"Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer nodes might avoid this issue.",
2787                           gmx_step_str(step,buf),
2788                           dim2char(dim),dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
2789             }
2790         }
2791     }
2792
2793     return bInvalid;
2794 }
2795
2796 static int dd_load_count(gmx_domdec_comm_t *comm)
2797 {
2798     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2799 }
2800
2801 static float dd_force_load(gmx_domdec_comm_t *comm)
2802 {
2803     float load;
2804     
2805     if (comm->eFlop)
2806     {
2807         load = comm->flop;
2808         if (comm->eFlop > 1)
2809         {
2810             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2811         }
2812     } 
2813     else
2814     {
2815         load = comm->cycl[ddCyclF];
2816         if (comm->cycl_n[ddCyclF] > 1)
2817         {
2818             /* Subtract the maximum of the last n cycle counts
2819              * to get rid of possible high counts due to other sources,
2820              * for instance system activity, that would otherwise
2821              * affect the dynamic load balancing.
2822              */
2823             load -= comm->cycl_max[ddCyclF];
2824         }
2825     }
2826     
2827     return load;
2828 }
2829
2830 static void set_slb_pme_dim_f(gmx_domdec_t *dd,int dim,real **dim_f)
2831 {
2832     gmx_domdec_comm_t *comm;
2833     int i;
2834     
2835     comm = dd->comm;
2836     
2837     snew(*dim_f,dd->nc[dim]+1);
2838     (*dim_f)[0] = 0;
2839     for(i=1; i<dd->nc[dim]; i++)
2840     {
2841         if (comm->slb_frac[dim])
2842         {
2843             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2844         }
2845         else
2846         {
2847             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2848         }
2849     }
2850     (*dim_f)[dd->nc[dim]] = 1;
2851 }
2852
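     /* Set up, for one PME decomposition dimension, the mapping from PME
      * slabs to the range of PP cell indices they overlap with.
      */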
2853 static void init_ddpme(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,int dimind)
2854 {
2855     int  pmeindex,slab,nso,i;
2856     ivec xyz;
2857     
2858     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2859     {
2860         ddpme->dim = YY;
2861     }
2862     else
2863     {
2864         ddpme->dim = dimind;
2865     }
2866     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2867     
2868     ddpme->nslab = (ddpme->dim == 0 ?
2869                     dd->comm->npmenodes_x :
2870                     dd->comm->npmenodes_y);
2871
2872     if (ddpme->nslab <= 1)
2873     {
2874         return;
2875     }
2876
2877     nso = dd->comm->npmenodes/ddpme->nslab;
2878     /* Determine for each PME slab the PP location range for dimension dim */
2879     snew(ddpme->pp_min,ddpme->nslab);
2880     snew(ddpme->pp_max,ddpme->nslab);
2881     for(slab=0; slab<ddpme->nslab; slab++) {
2882         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2883         ddpme->pp_max[slab] = 0;
2884     }
2885     for(i=0; i<dd->nnodes; i++) {
2886         ddindex2xyz(dd->nc,i,xyz);
2887         /* For y only use our y/z slab.
2888          * This assumes that the PME x grid size matches the DD grid size.
2889          */
2890         if (dimind == 0 || xyz[XX] == dd->ci[XX]) {
2891             pmeindex = ddindex2pmeindex(dd,i);
2892             if (dimind == 0) {
2893                 slab = pmeindex/nso;
2894             } else {
2895                 slab = pmeindex % ddpme->nslab;
2896             }
2897             ddpme->pp_min[slab] = min(ddpme->pp_min[slab],xyz[dimind]);
2898             ddpme->pp_max[slab] = max(ddpme->pp_max[slab],xyz[dimind]);
2899         }
2900     }
2901
2902     set_slb_pme_dim_f(dd,ddpme->dim,&ddpme->slb_dim_f);
2903 }
2904
2905 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2906 {
2907     if (dd->comm->ddpme[0].dim == XX)
2908     {
2909         return dd->comm->ddpme[0].maxshift;
2910     }
2911     else
2912     {
2913         return 0;
2914     }
2915 }
2916
2917 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2918 {
2919     if (dd->comm->ddpme[0].dim == YY)
2920     {
2921         return dd->comm->ddpme[0].maxshift;
2922     }
2923     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2924     {
2925         return dd->comm->ddpme[1].maxshift;
2926     }
2927     else
2928     {
2929         return 0;
2930     }
2931 }
2932
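     /* Determine ddpme->maxshift: the maximum number of PME slabs beyond the
      * matching slab that a PP cell might need to communicate with, given the
      * current cell boundaries.
      */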
2933 static void set_pme_maxshift(gmx_domdec_t *dd,gmx_ddpme_t *ddpme,
2934                              gmx_bool bUniform,gmx_ddbox_t *ddbox,real *cell_f)
2935 {
2936     gmx_domdec_comm_t *comm;
2937     int  nc,ns,s;
2938     int  *xmin,*xmax;
2939     real range,pme_boundary;
2940     int  sh;
2941     
2942     comm = dd->comm;
2943     nc  = dd->nc[ddpme->dim];
2944     ns  = ddpme->nslab;
2945     
2946     if (!ddpme->dim_match)
2947     {
2948         /* PP decomposition is not along dim: the worst situation */
2949         sh = ns/2;
2950     }
2951     else if (ns <= 3 || (bUniform && ns == nc))
2952     {
2953         /* The optimal situation */
2954         sh = 1;
2955     }
2956     else
2957     {
2958         /* We need to check, for each PME node, which PP nodes it
2959          * could possibly need to communicate with.
2960          */
2961         xmin = ddpme->pp_min;
2962         xmax = ddpme->pp_max;
2963         /* Allow for atoms to be maximally 2/3 times the cut-off
2964          * out of their DD cell. This is a reasonable balance between
2965          * performance and support for most charge-group/cut-off
2966          * combinations.
2967          */
2968         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2969         /* Avoid extra communication when we are exactly at a boundary */
2970         range *= 0.999;
2971         
2972         sh = 1;
2973         for(s=0; s<ns; s++)
2974         {
2975             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2976             pme_boundary = (real)s/ns;
2977             while (sh+1 < ns &&
2978                    ((s-(sh+1) >= 0 &&
2979                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2980                     (s-(sh+1) <  0 &&
2981                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2982             {
2983                 sh++;
2984             }
2985             pme_boundary = (real)(s+1)/ns;
2986             while (sh+1 < ns &&
2987                    ((s+(sh+1) <  ns &&
2988                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
2989                     (s+(sh+1) >= ns &&
2990                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
2991             {
2992                 sh++;
2993             }
2994         }
2995     }
2996     
2997     ddpme->maxshift = sh;
2998     
2999     if (debug)
3000     {
3001         fprintf(debug,"PME slab communication range for dim %d is %d\n",
3002                 ddpme->dim,ddpme->maxshift);
3003     }
3004 }
3005
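     /* Check that the box is large enough in all decomposed dimensions to
      * accommodate the requested number of cells at the minimum cell size.
      */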
3006 static void check_box_size(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3007 {
3008     int d,dim;
3009     
3010     for(d=0; d<dd->ndim; d++)
3011     {
3012         dim = dd->dim[d];
3013         if (dim < ddbox->nboundeddim &&
3014             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3015             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3016         {
3017             gmx_fatal(FARGS,"The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3018                       dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3019                       dd->nc[dim],dd->comm->cellsize_limit);
3020         }
3021     }
3022 }
3023
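     /* Set the cell boundaries for static (non-DLB) load balancing, either
      * uniform or according to the slb_frac fractions, and determine the
      * number of communication pulses npulse for each dimension.
      */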
3024 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd,gmx_ddbox_t *ddbox,
3025                                   gmx_bool bMaster,ivec npulse)
3026 {
3027     gmx_domdec_comm_t *comm;
3028     int  d,j;
3029     rvec cellsize_min;
3030     real *cell_x,cell_dx,cellsize;
3031     
3032     comm = dd->comm;
3033     
3034     for(d=0; d<DIM; d++)
3035     {
3036         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3037         npulse[d] = 1;
3038         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3039         {
3040             /* Uniform grid */
3041             cell_dx = ddbox->box_size[d]/dd->nc[d];
3042             if (bMaster)
3043             {
3044                 for(j=0; j<dd->nc[d]+1; j++)
3045                 {
3046                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3047                 }
3048             }
3049             else
3050             {
3051                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3052                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3053             }
3054             cellsize = cell_dx*ddbox->skew_fac[d];
3055             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3056             {
3057                 npulse[d]++;
3058             }
3059             cellsize_min[d] = cellsize;
3060         }
3061         else
3062         {
3063             /* Statically load balanced grid */
3064             /* Even when we are not doing a master distribution, we determine
3065              * all cell borders in a loop, both to obtain values identical to
3066              * the master distribution case and to determine npulse.
3067              */
3068             if (bMaster)
3069             {
3070                 cell_x = dd->ma->cell_x[d];
3071             }
3072             else
3073             {
3074                 snew(cell_x,dd->nc[d]+1);
3075             }
3076             cell_x[0] = ddbox->box0[d];
3077             for(j=0; j<dd->nc[d]; j++)
3078             {
3079                 cell_dx = ddbox->box_size[d]*comm->slb_frac[d][j];
3080                 cell_x[j+1] = cell_x[j] + cell_dx;
3081                 cellsize = cell_dx*ddbox->skew_fac[d];
3082                 while (cellsize*npulse[d] < comm->cutoff &&
3083                        npulse[d] < dd->nc[d]-1)
3084                 {
3085                     npulse[d]++;
3086                 }
3087                 cellsize_min[d] = min(cellsize_min[d],cellsize);
3088             }
3089             if (!bMaster)
3090             {
3091                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3092                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3093                 sfree(cell_x);
3094             }
3095         }
3096         /* The following limitation prevents a cell from receiving some of
3097          * its own home charge groups back over the periodic boundary.
3098          * Duplicate charge groups cause trouble with the global indices.
3099          */
3100         if (d < ddbox->npbcdim &&
3101             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3102         {
3103             gmx_fatal_collective(FARGS,NULL,dd,
3104                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3105                                  dim2char(d),ddbox->box_size[d],ddbox->skew_fac[d],
3106                                  comm->cutoff,
3107                                  dd->nc[d],dd->nc[d],
3108                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
3109         }
3110     }
3111     
3112     if (!comm->bDynLoadBal)
3113     {
3114         copy_rvec(cellsize_min,comm->cellsize_min);
3115     }
3116    
3117     for(d=0; d<comm->npmedecompdim; d++)
3118     {
3119         set_pme_maxshift(dd,&comm->ddpme[d],
3120                          comm->slb_frac[dd->dim[d]]==NULL,ddbox,
3121                          comm->ddpme[d].slb_dim_f);
3122     }
3123 }
3124
3125
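     /* Enforce the minimum cell size and staggering limits on the cell
      * boundaries in root->cell_f within [range[0],range[1]); calls itself
      * recursively on sub-ranges when boundary limits are violated.
      */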
3126 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3127                                        int d,int dim,gmx_domdec_root_t *root,
3128                                        gmx_ddbox_t *ddbox,
3129                                        gmx_bool bUniform,gmx_large_int_t step, real cellsize_limit_f, int range[])
3130 {
3131     gmx_domdec_comm_t *comm;
3132     int  ncd,i,j,nmin,nmin_old;
3133     gmx_bool bLimLo,bLimHi;
3134     real *cell_size;
3135     real fac,halfway,cellsize_limit_f_i,region_size;
3136     gmx_bool bPBC,bLastHi=FALSE;
3137     int nrange[]={range[0],range[1]};
3138
3139     region_size= root->cell_f[range[1]]-root->cell_f[range[0]];  
3140
3141     comm = dd->comm;
3142
3143     ncd = dd->nc[dim];
3144
3145     bPBC = (dim < ddbox->npbcdim);
3146
3147     cell_size = root->buf_ncd;
3148
3149     if (debug) 
3150     {
3151         fprintf(debug,"enforce_limits: %d %d\n",range[0],range[1]);
3152     }
3153
3154     /* First we need to check that the scaling does not make cells
3155      * smaller than the smallest allowed size.
3156      * We need to do this iteratively, since if a cell is too small,
3157      * it needs to be enlarged, which makes all the other cells smaller,
3158      * which could in turn make another cell smaller than allowed.
3159      */
3160     for(i=range[0]; i<range[1]; i++)
3161     {
3162         root->bCellMin[i] = FALSE;
3163     }
3164     nmin = 0;
3165     do
3166     {
3167         nmin_old = nmin;
3168         /* We need the total for normalization */
3169         fac = 0;
3170         for(i=range[0]; i<range[1]; i++)
3171         {
3172             if (root->bCellMin[i] == FALSE)
3173             {
3174                 fac += cell_size[i];
3175             }
3176         }
3177         fac = (region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
3178         /* Determine the cell boundaries */
3179         for(i=range[0]; i<range[1]; i++)
3180         {
3181             if (root->bCellMin[i] == FALSE)
3182             {
3183                 cell_size[i] *= fac;
3184                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3185                 {
3186                     cellsize_limit_f_i = 0;
3187                 }
3188                 else
3189                 {
3190                     cellsize_limit_f_i = cellsize_limit_f;
3191                 }
3192                 if (cell_size[i] < cellsize_limit_f_i)
3193                 {
3194                     root->bCellMin[i] = TRUE;
3195                     cell_size[i] = cellsize_limit_f_i;
3196                     nmin++;
3197                 }
3198             }
3199             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3200         }
3201     }
3202     while (nmin > nmin_old);
3203     
3204     i=range[1]-1;
3205     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3206     /* For this check we should not use DD_CELL_MARGIN,
3207      * but a slightly smaller factor,
3208      * since rounding could get us below the limit.
3209      */
3210     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3211     {
3212         char buf[22];
3213         gmx_fatal(FARGS,"Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3214                   gmx_step_str(step,buf),
3215                   dim2char(dim),ddbox->box_size[dim],ddbox->skew_fac[dim],
3216                   ncd,comm->cellsize_min[dim]);
3217     }
3218     
3219     root->bLimited = (nmin > 0) || (range[0]>0) || (range[1]<ncd);
3220     
3221     if (!bUniform)
3222     {
3223         /* Check that no boundary has moved by more than half the width of
3224          * either of the cells it bounds, as this could cause problems,
3225          * especially when the differences between cell sizes are large.
3226          * If changes are applied, they will not make cells smaller
3227          * than the cut-off, as we check all the boundaries which
3228          * might be affected by a change and if the old state was ok,
3229          * the cells will at most be shrunk back to their old size.
3230          */
3231         for(i=range[0]+1; i<range[1]; i++)
3232         {
3233             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3234             if (root->cell_f[i] < halfway)
3235             {
3236                 root->cell_f[i] = halfway;
3237                 /* Check if the change also causes shifts of the next boundaries */
3238                 for(j=i+1; j<range[1]; j++)
3239                 {
3240                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3241                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3242                 }
3243             }
3244             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3245             if (root->cell_f[i] > halfway)
3246             {
3247                 root->cell_f[i] = halfway;
3248                 /* Check if the change also causes shifts of the next boundaries */
3249                 for(j=i-1; j>=range[0]+1; j--)
3250                 {
3251                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3252                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3253                 }
3254             }
3255         }
3256     }
3257     
3258     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3259     /* Find the highest LimLo violation (a) and the LimHi violation that follows it,
3260      * i.e. the lowest one after a, (b); then call enforce_limits for (oldb,a) and (a,b).
3261      * In the next step: (b,nexta). oldb and nexta can be the range boundaries; nrange is used to pass a and b. */
3262     if (d > 0)
3263     {
3264         /* Take care of the staggering of the cell boundaries */
3265         if (bUniform)
3266         {
3267             for(i=range[0]; i<range[1]; i++)
3268             {
3269                 root->cell_f_max0[i] = root->cell_f[i];
3270                 root->cell_f_min1[i] = root->cell_f[i+1];
3271             }
3272         }
3273         else
3274         {
3275             for(i=range[0]+1; i<range[1]; i++)
3276             {
3277                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3278                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3279                 if (bLimLo && bLimHi)
3280                 {
3281                     /* Both limits violated, try the best we can */
3282                     /* In this case we split the original range (range) into two parts and deal with the other limitations in the next iteration. */
3283                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3284                     nrange[0]=range[0];
3285                     nrange[1]=i;
3286                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3287
3288                     nrange[0]=i;
3289                     nrange[1]=range[1];
3290                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3291
3292                     return;
3293                 }
3294                 else if (bLimLo)
3295                 {
3296                     /* root->cell_f[i] = root->bound_min[i]; */
3297                     nrange[1]=i;  /* Only store the violation location; a LimLo violation with a higher index could still follow */
3298                     bLastHi=FALSE;
3299                 }
3300                 else if (bLimHi && !bLastHi)
3301                 {
3302                     bLastHi=TRUE;
3303                     if (nrange[1] < range[1])   /* found a LimLo before */
3304                     {
3305                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3306                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3307                         nrange[0]=nrange[1];
3308                     }
3309                     root->cell_f[i] = root->bound_max[i];
3310                     nrange[1]=i; 
3311                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3312                     nrange[0]=i;
3313                     nrange[1]=range[1];
3314                 }
3315             }
3316             if (nrange[1] < range[1])   /* the last violation found was a LimLo */
3317             {
3318                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3319                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3320                 nrange[0]=nrange[1];
3321                 nrange[1]=range[1];
3322                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3323             } 
3324             else if (nrange[0] > range[0]) /* found at least one LimHi */
3325             {
3326                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3327             }
3328         }
3329     }
3330 }
3331
3332
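     /* On the root node of a dimension: rescale the cell sizes based on the
      * measured loads, limit the change per step, and enforce the minimum
      * cell size and boundary staggering limits.
      */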
3333 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3334                                        int d,int dim,gmx_domdec_root_t *root,
3335                                        gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3336                                        gmx_bool bUniform,gmx_large_int_t step)
3337 {
3338     gmx_domdec_comm_t *comm;
3339     int  ncd,d1,i,j,pos;
3340     real *cell_size;
3341     real load_aver,load_i,imbalance,change,change_max,sc;
3342     real cellsize_limit_f,dist_min_f,dist_min_f_hard,space;
3343     real change_limit;
3344     real relax = 0.5;
3345     gmx_bool bPBC;
3346     int range[] = { 0, 0 };
3347
3348     comm = dd->comm;
3349
3350     /* Convert the maximum change from the input percentage to a fraction */
3351     change_limit = comm->dlb_scale_lim*0.01;
3352
3353     ncd = dd->nc[dim];
3354
3355     bPBC = (dim < ddbox->npbcdim);
3356
3357     cell_size = root->buf_ncd;
3358
3359     /* Store the original boundaries */
3360     for(i=0; i<ncd+1; i++)
3361     {
3362         root->old_cell_f[i] = root->cell_f[i];
3363     }
3364     if (bUniform) {
3365         for(i=0; i<ncd; i++)
3366         {
3367             cell_size[i] = 1.0/ncd;
3368         }
3369     }
3370     else if (dd_load_count(comm))
3371     {
3372         load_aver = comm->load[d].sum_m/ncd;
3373         change_max = 0;
3374         for(i=0; i<ncd; i++)
3375         {
3376             /* Determine the relative imbalance of cell i */
3377             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3378             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3379             /* Determine the change of the cell size using underrelaxation */
3380             change = -relax*imbalance;
3381             change_max = max(change_max,max(change,-change));
3382         }
3383         /* Limit the amount of scaling.
3384          * We need to use the same rescaling for all cells in one row,
3385          * otherwise the load balancing might not converge.
3386          */
3387         sc = relax;
3388         if (change_max > change_limit)
3389         {
3390             sc *= change_limit/change_max;
3391         }
3392         for(i=0; i<ncd; i++)
3393         {
3394             /* Determine the relative imbalance of cell i */
3395             load_i = comm->load[d].load[i*comm->load[d].nload+2];
3396             imbalance = (load_i - load_aver)/(load_aver>0 ? load_aver : 1);
3397             /* Determine the change of the cell size using underrelaxation */
3398             change = -sc*imbalance;
3399             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3400         }
3401     }
3402     
3403     cellsize_limit_f  = cellsize_min_dlb(comm,d,dim)/ddbox->box_size[dim];
3404     cellsize_limit_f *= DD_CELL_MARGIN;
3405     dist_min_f_hard   = grid_jump_limit(comm,comm->cutoff,d)/ddbox->box_size[dim];
3406     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3407     if (ddbox->tric_dir[dim])
3408     {
3409         cellsize_limit_f /= ddbox->skew_fac[dim];
3410         dist_min_f       /= ddbox->skew_fac[dim];
3411     }
3412     if (bDynamicBox && d > 0)
3413     {
3414         dist_min_f *= DD_PRES_SCALE_MARGIN;
3415     }
3416     if (d > 0 && !bUniform)
3417     {
3418         /* Make sure that the grid is not shifted too much */
3419         for(i=1; i<ncd; i++) {
3420             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard) 
3421             {
3422                 gmx_incons("Inconsistent DD boundary staggering limits!");
3423             }
3424             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3425             space = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3426             if (space > 0) {
3427                 root->bound_min[i] += 0.5*space;
3428             }
3429             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3430             space = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3431             if (space < 0) {
3432                 root->bound_max[i] += 0.5*space;
3433             }
3434             if (debug)
3435             {
3436                 fprintf(debug,
3437                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3438                         d,i,
3439                         root->cell_f_max0[i-1] + dist_min_f,
3440                         root->bound_min[i],root->cell_f[i],root->bound_max[i],
3441                         root->cell_f_min1[i] - dist_min_f);
3442             }
3443         }
3444     }
3445     range[1]=ncd;
3446     root->cell_f[0] = 0;
3447     root->cell_f[ncd] = 1;
3448     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3449
3450
3451     /* After the checks above, the cells should obey the cut-off
3452      * restrictions, but it does not hurt to check.
3453      */
3454     for(i=0; i<ncd; i++)
3455     {
3456         if (debug)
3457         {
3458             fprintf(debug,"Relative bounds dim %d  cell %d: %f %f\n",
3459                     dim,i,root->cell_f[i],root->cell_f[i+1]);
3460         }
3461
3462         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3463             root->cell_f[i+1] - root->cell_f[i] <
3464             cellsize_limit_f/DD_CELL_MARGIN)
3465         {
3466             char buf[22];
3467             fprintf(stderr,
3468                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3469                     gmx_step_str(step,buf),dim2char(dim),i,
3470                     (root->cell_f[i+1] - root->cell_f[i])
3471                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3472         }
3473     }
3474     
3475     pos = ncd + 1;
3476     /* Store the cell boundaries of the lower dimensions at the end */
3477     for(d1=0; d1<d; d1++)
3478     {
3479         root->cell_f[pos++] = comm->cell_f0[d1];
3480         root->cell_f[pos++] = comm->cell_f1[d1];
3481     }
3482     
3483     if (d < comm->npmedecompdim)
3484     {
3485         /* The master determines the maximum shift for
3486          * the coordinate communication between separate PME nodes.
3487          */
3488         set_pme_maxshift(dd,&comm->ddpme[d],bUniform,ddbox,root->cell_f);
3489     }
3490     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3491     if (d >= 1)
3492     {
3493         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3494     }
3495 }    
3496
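/* Convert the relative cell boundaries cell_f0/cell_f1 of decomposition
 * index dimind to the absolute boundaries cell_x0/cell_x1, using the box
 * size and, for dimensions that are not bounded, the box origin box0.
 */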
3497 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3498                                              gmx_ddbox_t *ddbox,int dimind)
3499 {
3500     gmx_domdec_comm_t *comm;
3501     int dim;
3502
3503     comm = dd->comm;
3504
3505     /* Set the cell dimensions */
3506     dim = dd->dim[dimind];
3507     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3508     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3509     if (dim >= ddbox->nboundeddim)
3510     {
3511         comm->cell_x0[dim] += ddbox->box0[dim];
3512         comm->cell_x1[dim] += ddbox->box0[dim];
3513     }
3514 }
3515
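/* Broadcast the row of relative cell boundaries from the row root and
 * extract the parts relevant for this rank: its own cell_f0/cell_f1,
 * the boundaries of the lower dimensions and the PME maxshift values,
 * converting the fractions to absolute cell boundaries.
 */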
3516 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3517                                          int d,int dim,real *cell_f_row,
3518                                          gmx_ddbox_t *ddbox)
3519 {
3520     gmx_domdec_comm_t *comm;
3521     int d1,dim1,pos;
3522
3523     comm = dd->comm;
3524
3525 #ifdef GMX_MPI
3526     /* Each node would only need to know two fractions,
3527      * but it is probably cheaper to broadcast the whole array.
3528      */
3529     MPI_Bcast(cell_f_row,DD_CELL_F_SIZE(dd,d)*sizeof(real),MPI_BYTE,
3530               0,comm->mpi_comm_load[d]);
3531 #endif
3532     /* Copy the fractions for this dimension from the buffer */
3533     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3534     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3535     /* The whole array was communicated, so set the buffer position */
3536     pos = dd->nc[dim] + 1;
3537     for(d1=0; d1<=d; d1++)
3538     {
3539         if (d1 < d)
3540         {
3541             /* Copy the cell fractions of the lower dimensions */
3542             comm->cell_f0[d1] = cell_f_row[pos++];
3543             comm->cell_f1[d1] = cell_f_row[pos++];
3544         }
3545         relative_to_absolute_cell_bounds(dd,ddbox,d1);
3546     }
3547     /* Convert the communicated shift from float to int */
3548     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3549     if (d >= 1)
3550     {
3551         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3552     }
3553 }
3554
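/* Determine the new cell boundaries for all decomposition dimensions with
 * dynamic load balancing: the root of each cell row computes the new
 * boundaries and all row members receive them via
 * distribute_dd_cell_sizes_dlb.
 */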
3555 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3556                                          gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3557                                          gmx_bool bUniform,gmx_large_int_t step)
3558 {
3559     gmx_domdec_comm_t *comm;
3560     int d,dim,d1;
3561     gmx_bool bRowMember,bRowRoot;
3562     real *cell_f_row;
3563     
3564     comm = dd->comm;
3565
3566     for(d=0; d<dd->ndim; d++)
3567     {
3568         dim = dd->dim[d];
3569         bRowMember = TRUE;
3570         bRowRoot = TRUE;
3571         for(d1=d; d1<dd->ndim; d1++)
3572         {
3573             if (dd->ci[dd->dim[d1]] > 0)
3574             {
3575                 if (d1 > d)
3576                 {
3577                     bRowMember = FALSE;
3578                 }
3579                 bRowRoot = FALSE;
3580             }
3581         }
3582         if (bRowMember)
3583         {
3584             if (bRowRoot)
3585             {
3586                 set_dd_cell_sizes_dlb_root(dd,d,dim,comm->root[d],
3587                                            ddbox,bDynamicBox,bUniform,step);
3588                 cell_f_row = comm->root[d]->cell_f;
3589             }
3590             else
3591             {
3592                 cell_f_row = comm->cell_f_row;
3593             }
3594             distribute_dd_cell_sizes_dlb(dd,d,dim,cell_f_row,ddbox);
3595         }
3596     }
3597 }    
3598
3599 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
3600 {
3601     int d;
3602
3603     /* The relative cell boundaries are kept unchanged here; only the
3604      * absolute boundaries are recomputed, which is needed when the box
3605      * has changed since the last call to dd_partition_system.
3606      */
3607     for(d=0; d<dd->ndim; d++)
3608     {
3609         relative_to_absolute_cell_bounds(dd,ddbox,d); 
3610     }
3611 }
3612
3613
3614
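/* Set the local cell boundaries with dynamic load balancing enabled.
 * With bDoDLB the relative boundaries are recomputed from the measured
 * load; otherwise, with a dynamic box, only the absolute boundaries are
 * updated. Dimensions without decomposition simply span the whole box.
 */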
3615 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3616                                   gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3617                                   gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3618                                   gmx_wallcycle_t wcycle)
3619 {
3620     gmx_domdec_comm_t *comm;
3621     int dim;
3622
3623     comm = dd->comm;
3624     
3625     if (bDoDLB)
3626     {
3627         wallcycle_start(wcycle,ewcDDCOMMBOUND);
3628         set_dd_cell_sizes_dlb_change(dd,ddbox,bDynamicBox,bUniform,step);
3629         wallcycle_stop(wcycle,ewcDDCOMMBOUND);
3630     }
3631     else if (bDynamicBox)
3632     {
3633         set_dd_cell_sizes_dlb_nochange(dd,ddbox);
3634     }
3635     
3636     /* Set the dimensions for which no DD is used */
3637     for(dim=0; dim<DIM; dim++) {
3638         if (dd->nc[dim] == 1) {
3639             comm->cell_x0[dim] = 0;
3640             comm->cell_x1[dim] = ddbox->box_size[dim];
3641             if (dim >= ddbox->nboundeddim)
3642             {
3643                 comm->cell_x0[dim] += ddbox->box0[dim];
3644                 comm->cell_x1[dim] += ddbox->box0[dim];
3645             }
3646         }
3647     }
3648 }
3649
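/* Make sure the communication setup can hold npulse pulses for each
 * decomposition dimension, reallocating and clearing the new index
 * entries when the number of pulses has increased.
 */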
3650 static void realloc_comm_ind(gmx_domdec_t *dd,ivec npulse)
3651 {
3652     int d,np,i;
3653     gmx_domdec_comm_dim_t *cd;
3654     
3655     for(d=0; d<dd->ndim; d++)
3656     {
3657         cd = &dd->comm->cd[d];
3658         np = npulse[dd->dim[d]];
3659         if (np > cd->np_nalloc)
3660         {
3661             if (debug)
3662             {
3663                 fprintf(debug,"(Re)allocing cd for %c to %d pulses\n",
3664                         dim2char(dd->dim[d]),np);
3665             }
3666             if (DDMASTER(dd) && cd->np_nalloc > 0)
3667             {
3668                 fprintf(stderr,"\nIncreasing the number of cells to communicate in dimension %c to %d for the first time\n",dim2char(dd->dim[d]),np);
3669             }
3670             srenew(cd->ind,np);
3671             for(i=cd->np_nalloc; i<np; i++)
3672             {
3673                 cd->ind[i].index  = NULL;
3674                 cd->ind[i].nalloc = 0;
3675             }
3676             cd->np_nalloc = np;
3677         }
3678         cd->np = np;
3679     }
3680 }
3681
3682
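/* Set the local cell boundaries for this step, either through dynamic
 * load balancing or with static slab sizes, after storing the old
 * boundaries for the charge group displacement check.
 */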
3683 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3684                               gmx_ddbox_t *ddbox,gmx_bool bDynamicBox,
3685                               gmx_bool bUniform,gmx_bool bDoDLB,gmx_large_int_t step,
3686                               gmx_wallcycle_t wcycle)
3687 {
3688     gmx_domdec_comm_t *comm;
3689     int  d;
3690     ivec npulse;
3691     
3692     comm = dd->comm;
3693
3694     /* Copy the old cell boundaries for the cg displacement check */
3695     copy_rvec(comm->cell_x0,comm->old_cell_x0);
3696     copy_rvec(comm->cell_x1,comm->old_cell_x1);
3697     
3698     if (comm->bDynLoadBal)
3699     {
3700         if (DDMASTER(dd))
3701         {
3702             check_box_size(dd,ddbox);
3703         }
3704         set_dd_cell_sizes_dlb(dd,ddbox,bDynamicBox,bUniform,bDoDLB,step,wcycle);
3705     }
3706     else
3707     {
3708         set_dd_cell_sizes_slb(dd,ddbox,FALSE,npulse);
3709         realloc_comm_ind(dd,npulse);
3710     }
3711     
3712     if (debug)
3713     {
3714         for(d=0; d<DIM; d++)
3715         {
3716             fprintf(debug,"cell_x[%d] %f - %f skew_fac %f\n",
3717                     d,comm->cell_x0[d],comm->cell_x1[d],ddbox->skew_fac[d]);
3718         }
3719     }
3720 }
3721
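/* Check that the local cell is not smaller than the minimum allowed size
 * and, with staggered grids or unbounded dimensions, communicate the
 * neighboring cell boundaries to obtain the limits cell_ns_x0/cell_ns_x1
 * used for neighbor searching.
 */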
3722 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3723                                   gmx_ddbox_t *ddbox,
3724                                   rvec cell_ns_x0,rvec cell_ns_x1,
3725                                   gmx_large_int_t step)
3726 {
3727     gmx_domdec_comm_t *comm;
3728     int dim_ind,dim;
3729     
3730     comm = dd->comm;
3731
3732     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
3733     {
3734         dim = dd->dim[dim_ind];
3735         
3736         /* Without PBC we don't have restrictions on the outer cells */
3737         if (!(dim >= ddbox->npbcdim && 
3738               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3739             comm->bDynLoadBal &&
3740             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3741             comm->cellsize_min[dim])
3742         {
3743             char buf[22];
3744             gmx_fatal(FARGS,"Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3745                       gmx_step_str(step,buf),dim2char(dim),
3746                       comm->cell_x1[dim] - comm->cell_x0[dim],
3747                       ddbox->skew_fac[dim],
3748                       dd->comm->cellsize_min[dim],
3749                       dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
3750         }
3751     }
3752     
3753     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3754     {
3755         /* Communicate the boundaries and update cell_ns_x0/1 */
3756         dd_move_cellx(dd,ddbox,cell_ns_x0,cell_ns_x1);
3757         if (dd->bGridJump && dd->ndim > 1)
3758         {
3759             check_grid_jump(step,dd,dd->comm->cutoff,ddbox,TRUE);
3760         }
3761     }
3762 }
3763
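/* Build the triclinic correction matrix tcm: the off-diagonal factors
 * that convert a Cartesian position into the scaled lattice coordinate
 * of each periodic dimension (box diagonal element times the fractional
 * coordinate), as used for assigning charge groups to cells.
 */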
3764 static void make_tric_corr_matrix(int npbcdim,matrix box,matrix tcm)
3765 {
3766     if (YY < npbcdim)
3767     {
3768         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3769     }
3770     else
3771     {
3772         tcm[YY][XX] = 0;
3773     }
3774     if (ZZ < npbcdim)
3775     {
3776         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3777         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3778     }
3779     else
3780     {
3781         tcm[ZZ][XX] = 0;
3782         tcm[ZZ][YY] = 0;
3783     }
3784 }
3785
3786 static void check_screw_box(matrix box)
3787 {
3788     /* Mathematical limitation */
3789     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3790     {
3791         gmx_fatal(FARGS,"With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3792     }
3793     
3794     /* Limitation due to the asymmetry of the eighth shell method */
3795     if (box[ZZ][YY] != 0)
3796     {
3797         gmx_fatal(FARGS,"pbc=screw with non-zero box_zy is not supported");
3798     }
3799 }
3800
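/* Master-only routine: compute the center of geometry of every charge
 * group, put it in the box (with triclinic and screw-pbc handling) and
 * assign it to a domain decomposition cell; the per-node counts and
 * charge group lists are stored in dd->ma for scattering.
 */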
3801 static void distribute_cg(FILE *fplog,gmx_large_int_t step,
3802                           matrix box,ivec tric_dir,t_block *cgs,rvec pos[],
3803                           gmx_domdec_t *dd)
3804 {
3805     gmx_domdec_master_t *ma;
3806     int **tmp_ind=NULL,*tmp_nalloc=NULL;
3807     int  i,icg,j,k,k0,k1,d,npbcdim;
3808     matrix tcm;
3809     rvec box_size,cg_cm;
3810     ivec ind;
3811     real nrcg,inv_ncg,pos_d;
3812     atom_id *cgindex;
3813     gmx_bool bUnbounded,bScrew;
3814
3815     ma = dd->ma;
3816     
3817     if (tmp_ind == NULL)
3818     {
3819         snew(tmp_nalloc,dd->nnodes);
3820         snew(tmp_ind,dd->nnodes);
3821         for(i=0; i<dd->nnodes; i++)
3822         {
3823             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3824             snew(tmp_ind[i],tmp_nalloc[i]);
3825         }
3826     }
3827     
3828     /* Clear the count */
3829     for(i=0; i<dd->nnodes; i++)
3830     {
3831         ma->ncg[i] = 0;
3832         ma->nat[i] = 0;
3833     }
3834     
3835     make_tric_corr_matrix(dd->npbcdim,box,tcm);
3836     
3837     cgindex = cgs->index;
3838     
3839     /* Compute the center of geometry for all charge groups */
3840     for(icg=0; icg<cgs->nr; icg++)
3841     {
3842         k0      = cgindex[icg];
3843         k1      = cgindex[icg+1];
3844         nrcg    = k1 - k0;
3845         if (nrcg == 1)
3846         {
3847             copy_rvec(pos[k0],cg_cm);
3848         }
3849         else
3850         {
3851             inv_ncg = 1.0/nrcg;
3852             
3853             clear_rvec(cg_cm);
3854             for(k=k0; (k<k1); k++)
3855             {
3856                 rvec_inc(cg_cm,pos[k]);
3857             }
3858             for(d=0; (d<DIM); d++)
3859             {
3860                 cg_cm[d] *= inv_ncg;
3861             }
3862         }
3863         /* Put the charge group in the box and determine the cell index */
3864         for(d=DIM-1; d>=0; d--) {
3865             pos_d = cg_cm[d];
3866             if (d < dd->npbcdim)
3867             {
3868                 bScrew = (dd->bScrewPBC && d == XX);
3869                 if (tric_dir[d] && dd->nc[d] > 1)
3870                 {
3871                     /* Use triclinic coordinates for this dimension */
3872                     for(j=d+1; j<DIM; j++)
3873                     {
3874                         pos_d += cg_cm[j]*tcm[j][d];
3875                     }
3876                 }
3877                 while(pos_d >= box[d][d])
3878                 {
3879                     pos_d -= box[d][d];
3880                     rvec_dec(cg_cm,box[d]);
3881                     if (bScrew)
3882                     {
3883                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3884                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3885                     }
3886                     for(k=k0; (k<k1); k++)
3887                     {
3888                         rvec_dec(pos[k],box[d]);
3889                         if (bScrew)
3890                         {
3891                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3892                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3893                         }
3894                     }
3895                 }
3896                 while(pos_d < 0)
3897                 {
3898                     pos_d += box[d][d];
3899                     rvec_inc(cg_cm,box[d]);
3900                     if (bScrew)
3901                     {
3902                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3903                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3904                     }
3905                     for(k=k0; (k<k1); k++)
3906                     {
3907                         rvec_inc(pos[k],box[d]);
3908                         if (bScrew) {
3909                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3910                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3911                         }
3912                     }
3913                 }
3914             }
3915             /* This could be done more efficiently */
3916             ind[d] = 0;
3917             while(ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3918             {
3919                 ind[d]++;
3920             }
3921         }
3922         i = dd_index(dd->nc,ind);
3923         if (ma->ncg[i] == tmp_nalloc[i])
3924         {
3925             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3926             srenew(tmp_ind[i],tmp_nalloc[i]);
3927         }
3928         tmp_ind[i][ma->ncg[i]] = icg;
3929         ma->ncg[i]++;
3930         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3931     }
3932     
3933     k1 = 0;
3934     for(i=0; i<dd->nnodes; i++)
3935     {
3936         ma->index[i] = k1;
3937         for(k=0; k<ma->ncg[i]; k++)
3938         {
3939             ma->cg[k1++] = tmp_ind[i][k];
3940         }
3941     }
3942     ma->index[dd->nnodes] = k1;
3943     
3944     for(i=0; i<dd->nnodes; i++)
3945     {
3946         sfree(tmp_ind[i]);
3947     }
3948     sfree(tmp_ind);
3949     sfree(tmp_nalloc);
3950     
3951     if (fplog)
3952     {
3953         char buf[22];
3954         fprintf(fplog,"Charge group distribution at step %s:",
3955                 gmx_step_str(step,buf));
3956         for(i=0; i<dd->nnodes; i++)
3957         {
3958             fprintf(fplog," %d",ma->ncg[i]);
3959         }
3960         fprintf(fplog,"\n");
3961     }
3962 }
3963
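/* Distribute all charge groups over the nodes: the master assigns them
 * with distribute_cg and scatters the counts and global charge group
 * indices, after which each rank builds its local cgindex array with the
 * home charge group sizes.
 */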
3964 static void get_cg_distribution(FILE *fplog,gmx_large_int_t step,gmx_domdec_t *dd,
3965                                 t_block *cgs,matrix box,gmx_ddbox_t *ddbox,
3966                                 rvec pos[])
3967 {
3968     gmx_domdec_master_t *ma=NULL;
3969     ivec npulse;
3970     int  i,cg_gl;
3971     int  *ibuf,buf2[2] = { 0, 0 };
3972     gmx_bool bMaster = DDMASTER(dd);
3973     if (bMaster)
3974     {
3975         ma = dd->ma;
3976         
3977         if (dd->bScrewPBC)
3978         {
3979             check_screw_box(box);
3980         }
3981     
3982         set_dd_cell_sizes_slb(dd,ddbox,TRUE,npulse);
3983     
3984         distribute_cg(fplog,step,box,ddbox->tric_dir,cgs,pos,dd);
3985         for(i=0; i<dd->nnodes; i++)
3986         {
3987             ma->ibuf[2*i]   = ma->ncg[i];
3988             ma->ibuf[2*i+1] = ma->nat[i];
3989         }
3990         ibuf = ma->ibuf;
3991     }
3992     else
3993     {
3994         ibuf = NULL;
3995     }
3996     dd_scatter(dd,2*sizeof(int),ibuf,buf2);
3997     
3998     dd->ncg_home = buf2[0];
3999     dd->nat_home = buf2[1];
4000     dd->ncg_tot  = dd->ncg_home;
4001     dd->nat_tot  = dd->nat_home;
4002     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4003     {
4004         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4005         srenew(dd->index_gl,dd->cg_nalloc);
4006         srenew(dd->cgindex,dd->cg_nalloc+1);
4007     }
4008     if (bMaster)
4009     {
4010         for(i=0; i<dd->nnodes; i++)
4011         {
4012             ma->ibuf[i] = ma->ncg[i]*sizeof(int);
4013             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4014         }
4015     }
4016     
4017     dd_scatterv(dd,
4018                 DDMASTER(dd) ? ma->ibuf : NULL,
4019                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
4020                 DDMASTER(dd) ? ma->cg : NULL,
4021                 dd->ncg_home*sizeof(int),dd->index_gl);
4022     
4023     /* Determine the home charge group sizes */
4024     dd->cgindex[0] = 0;
4025     for(i=0; i<dd->ncg_home; i++)
4026     {
4027         cg_gl = dd->index_gl[i];
4028         dd->cgindex[i+1] =
4029             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4030     }
4031     
4032     if (debug)
4033     {
4034         fprintf(debug,"Home charge groups:\n");
4035         for(i=0; i<dd->ncg_home; i++)
4036         {
4037             fprintf(debug," %d",dd->index_gl[i]);
4038             if (i % 10 == 9) 
4039                 fprintf(debug,"\n");
4040         }
4041         fprintf(debug,"\n");
4042     }
4043 }
4044
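/* For one state vector (x, v, ...): when bCompact, compact the atoms of
 * charge groups that stay home in place, and copy the atoms of moving
 * charge groups into the per-direction communication buffers at their
 * interleaved positions; returns the new number of home atoms.
 */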
4045 static int compact_and_copy_vec_at(int ncg,int *move,
4046                                    int *cgindex,
4047                                    int nvec,int vec,
4048                                    rvec *src,gmx_domdec_comm_t *comm,
4049                                    gmx_bool bCompact)
4050 {
4051     int m,icg,i,i0,i1,nrcg;
4052     int home_pos;
4053     int pos_vec[DIM*2];
4054     
4055     home_pos = 0;
4056
4057     for(m=0; m<DIM*2; m++)
4058     {
4059         pos_vec[m] = 0;
4060     }
4061     
4062     i0 = 0;
4063     for(icg=0; icg<ncg; icg++)
4064     {
4065         i1 = cgindex[icg+1];
4066         m = move[icg];
4067         if (m == -1)
4068         {
4069             if (bCompact)
4070             {
4071                 /* Compact the home array in place */
4072                 for(i=i0; i<i1; i++)
4073                 {
4074                     copy_rvec(src[i],src[home_pos++]);
4075                 }
4076             }
4077         }
4078         else
4079         {
4080             /* Copy to the communication buffer */
4081             nrcg = i1 - i0;
4082             pos_vec[m] += 1 + vec*nrcg;
4083             for(i=i0; i<i1; i++)
4084             {
4085                 copy_rvec(src[i],comm->cgcm_state[m][pos_vec[m]++]);
4086             }
4087             pos_vec[m] += (nvec - vec - 1)*nrcg;
4088         }
4089         if (!bCompact)
4090         {
4091             home_pos += i1 - i0;
4092         }
4093         i0 = i1;
4094     }
4095     
4096     return home_pos;
4097 }
4098
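/* As compact_and_copy_vec_at, but for the per-charge-group center vector:
 * compact the entries of home charge groups in place and copy the centers
 * of moving groups to the first slot of their block in the send buffers;
 * returns the new number of home charge groups.
 */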
4099 static int compact_and_copy_vec_cg(int ncg,int *move,
4100                                    int *cgindex,
4101                                    int nvec,rvec *src,gmx_domdec_comm_t *comm,
4102                                    gmx_bool bCompact)
4103 {
4104     int m,icg,i0,i1,nrcg;
4105     int home_pos;
4106     int pos_vec[DIM*2];
4107     
4108     home_pos = 0;
4109     
4110     for(m=0; m<DIM*2; m++)
4111     {
4112         pos_vec[m] = 0;
4113     }
4114     
4115     i0 = 0;
4116     for(icg=0; icg<ncg; icg++)
4117     {
4118         i1 = cgindex[icg+1];
4119         m = move[icg];
4120         if (m == -1)
4121         {
4122             if (bCompact)
4123             {
4124                 /* Compact the home array in place */
4125                 copy_rvec(src[icg],src[home_pos++]);
4126             }
4127         }
4128         else
4129         {
4130             nrcg = i1 - i0;
4131             /* Copy to the communication buffer */
4132             copy_rvec(src[icg],comm->cgcm_state[m][pos_vec[m]]);
4133             pos_vec[m] += 1 + nrcg*nvec;
4134         }
4135         i0 = i1;
4136     }
4137     if (!bCompact)
4138     {
4139         home_pos = ncg;
4140     }
4141     
4142     return home_pos;
4143 }
4144
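/* Compact the home charge group and atom index arrays in place, removing
 * the entries of charge groups that move away and updating the global to
 * local atom lookup; returns the new number of home charge groups.
 */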
4145 static int compact_ind(int ncg,int *move,
4146                        int *index_gl,int *cgindex,
4147                        int *gatindex,
4148                        gmx_ga2la_t ga2la,char *bLocalCG,
4149                        int *cginfo)
4150 {
4151     int cg,nat,a0,a1,a,a_gl;
4152     int home_pos;
4153
4154     home_pos = 0;
4155     nat = 0;
4156     for(cg=0; cg<ncg; cg++)
4157     {
4158         a0 = cgindex[cg];
4159         a1 = cgindex[cg+1];
4160         if (move[cg] == -1)
4161         {
4162             /* Compact the home arrays in place.
4163              * Anything that can be done here avoids access to global arrays.
4164              */
4165             cgindex[home_pos] = nat;
4166             for(a=a0; a<a1; a++)
4167             {
4168                 a_gl = gatindex[a];
4169                 gatindex[nat] = a_gl;
4170                 /* The cell number stays 0, so we don't need to set it */
4171                 ga2la_change_la(ga2la,a_gl,nat);
4172                 nat++;
4173             }
4174             index_gl[home_pos] = index_gl[cg];
4175             cginfo[home_pos]   = cginfo[cg];
4176             /* The charge group remains local, so bLocalCG does not change */
4177             home_pos++;
4178         }
4179         else
4180         {
4181             /* Clear the global indices */
4182             for(a=a0; a<a1; a++)
4183             {
4184                 ga2la_del(ga2la,gatindex[a]);
4185             }
4186             if (bLocalCG)
4187             {
4188                 bLocalCG[index_gl[cg]] = FALSE;
4189             }
4190         }
4191     }
4192     cgindex[home_pos] = nat;
4193     
4194     return home_pos;
4195 }
4196
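/* Non-compacting counterpart of compact_ind: for each charge group that
 * moves, remove its atoms from the global to local lookup, clear its
 * bLocalCG entry and mark it as moved in the ns grid cell index.
 */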
4197 static void clear_and_mark_ind(int ncg,int *move,
4198                                int *index_gl,int *cgindex,int *gatindex,
4199                                gmx_ga2la_t ga2la,char *bLocalCG,
4200                                int *cell_index)
4201 {
4202     int cg,a0,a1,a;
4203     
4204     for(cg=0; cg<ncg; cg++)
4205     {
4206         if (move[cg] >= 0)
4207         {
4208             a0 = cgindex[cg];
4209             a1 = cgindex[cg+1];
4210             /* Clear the global indices */
4211             for(a=a0; a<a1; a++)
4212             {
4213                 ga2la_del(ga2la,gatindex[a]);
4214             }
4215             if (bLocalCG)
4216             {
4217                 bLocalCG[index_gl[cg]] = FALSE;
4218             }
4219             /* Signal that this cg has moved using the ns cell index.
4220              * Here we set it to -1. fill_grid will change it
4221              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4222              */
4223             cell_index[cg] = -1;
4224         }
4225     }
4226 }
4227
4228 static void print_cg_move(FILE *fplog,
4229                           gmx_domdec_t *dd,
4230                           gmx_large_int_t step,int cg,int dim,int dir,
4231                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4232                           rvec cm_old,rvec cm_new,real pos_d)
4233 {
4234     gmx_domdec_comm_t *comm;
4235     char buf[22];
4236
4237     comm = dd->comm;
4238
4239     fprintf(fplog,"\nStep %s:\n",gmx_step_str(step,buf));
4240     if (bHaveLimitdAndCMOld)
4241     {
4242         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4243                 ddglatnr(dd,dd->cgindex[cg]),limitd,dim2char(dim));
4244     }
4245     else
4246     {
4247         fprintf(fplog,"The charge group starting at atom %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4248                 ddglatnr(dd,dd->cgindex[cg]),dim2char(dim));
4249     }
4250     fprintf(fplog,"distance out of cell %f\n",
4251             dir==1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4252     if (bHaveLimitdAndCMOld)
4253     {
4254         fprintf(fplog,"Old coordinates: %8.3f %8.3f %8.3f\n",
4255                 cm_old[XX],cm_old[YY],cm_old[ZZ]);
4256     }
4257     fprintf(fplog,"New coordinates: %8.3f %8.3f %8.3f\n",
4258             cm_new[XX],cm_new[YY],cm_new[ZZ]);
4259     fprintf(fplog,"Old cell boundaries in direction %c: %8.3f %8.3f\n",
4260             dim2char(dim),
4261             comm->old_cell_x0[dim],comm->old_cell_x1[dim]);
4262     fprintf(fplog,"New cell boundaries in direction %c: %8.3f %8.3f\n",
4263             dim2char(dim),
4264             comm->cell_x0[dim],comm->cell_x1[dim]);
4265 }
4266
4267 static void cg_move_error(FILE *fplog,
4268                           gmx_domdec_t *dd,
4269                           gmx_large_int_t step,int cg,int dim,int dir,
4270                           gmx_bool bHaveLimitdAndCMOld,real limitd,
4271                           rvec cm_old,rvec cm_new,real pos_d)
4272 {
4273     if (fplog)
4274     {
4275         print_cg_move(fplog, dd,step,cg,dim,dir,
4276                       bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4277     }
4278     print_cg_move(stderr,dd,step,cg,dim,dir,
4279                   bHaveLimitdAndCMOld,limitd,cm_old,cm_new,pos_d);
4280     gmx_fatal(FARGS,
4281               "A charge group moved too far between two domain decomposition steps\n"
4282               "This usually means that your system is not well equilibrated");
4283 }
4284
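/* Apply the screw-pbc rotation to all distributed state entries of atom a:
 * the coordinates are mirrored in y and z with respect to the box, while
 * velocity-like vectors change sign in y and z.
 */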
4285 static void rotate_state_atom(t_state *state,int a)
4286 {
4287     int est;
4288
4289     for(est=0; est<estNR; est++)
4290     {
4291         if (EST_DISTR(est) && (state->flags & (1<<est))) {
4292             switch (est) {
4293             case estX:
4294                 /* Rotate the complete state; for a rectangular box only */
4295                 state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4296                 state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4297                 break;
4298             case estV:
4299                 state->v[a][YY] = -state->v[a][YY];
4300                 state->v[a][ZZ] = -state->v[a][ZZ];
4301                 break;
4302             case estSDX:
4303                 state->sd_X[a][YY] = -state->sd_X[a][YY];
4304                 state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4305                 break;
4306             case estCGP:
4307                 state->cg_p[a][YY] = -state->cg_p[a][YY];
4308                 state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4309                 break;
4310             case estDISRE_INITF:
4311             case estDISRE_RM3TAV:
4312             case estORIRE_INITF:
4313             case estORIRE_DTAV:
4314                 /* These are distances, so not affected by rotation */
4315                 break;
4316             default:
4317                 gmx_incons("Unknown state entry encountered in rotate_state_atom");            
4318             }
4319         }
4320     }
4321 }
4322
4323 static int *get_moved(gmx_domdec_comm_t *comm,int natoms)
4324 {
4325     if (natoms > comm->moved_nalloc)
4326     {
4327         /* Contents should be preserved here */
4328         comm->moved_nalloc = over_alloc_dd(natoms);
4329         srenew(comm->moved,comm->moved_nalloc);
4330     }
4331
4332     return comm->moved;
4333 }
4334
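/* Compute the new center of geometry of each home charge group in the
 * range cg_start to cg_end, apply pbc, and determine whether the group has
 * left the local cell; the destination and crossing flags are encoded in
 * move[cg]. Called in parallel over thread-local charge group ranges.
 */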
4335 static void calc_cg_move(FILE *fplog,gmx_large_int_t step,
4336                          gmx_domdec_t *dd,
4337                          t_state *state,
4338                          ivec tric_dir,matrix tcm,
4339                          rvec cell_x0,rvec cell_x1,
4340                          rvec limitd,rvec limit0,rvec limit1,
4341                          const int *cgindex,
4342                          int cg_start,int cg_end,
4343                          rvec *cg_cm,
4344                          int *move)
4345 {
4346     int  npbcdim;
4347     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4348     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4349     int  flag;
4350     gmx_bool bScrew;
4351     ivec dev;
4352     real inv_ncg,pos_d;
4353     rvec cm_new;
4354
4355     npbcdim = dd->npbcdim;
4356
4357     for(cg=cg_start; cg<cg_end; cg++)
4358     {
4359         k0   = cgindex[cg];
4360         k1   = cgindex[cg+1];
4361         nrcg = k1 - k0;
4362         if (nrcg == 1)
4363         {
4364             copy_rvec(state->x[k0],cm_new);
4365         }
4366         else
4367         {
4368             inv_ncg = 1.0/nrcg;
4369             
4370             clear_rvec(cm_new);
4371             for(k=k0; (k<k1); k++)
4372             {
4373                 rvec_inc(cm_new,state->x[k]);
4374             }
4375             for(d=0; (d<DIM); d++)
4376             {
4377                 cm_new[d] = inv_ncg*cm_new[d];
4378             }
4379         }
4380         
4381         clear_ivec(dev);
4382         /* Do pbc and check DD cell boundary crossings */
4383         for(d=DIM-1; d>=0; d--)
4384         {
4385             if (dd->nc[d] > 1)
4386             {
4387                 bScrew = (dd->bScrewPBC && d == XX);
4388                 /* Determine the location of this cg in lattice coordinates */
4389                 pos_d = cm_new[d];
4390                 if (tric_dir[d])
4391                 {
4392                     for(d2=d+1; d2<DIM; d2++)
4393                     {
4394                         pos_d += cm_new[d2]*tcm[d2][d];
4395                     }
4396                 }
4397                 /* Put the charge group in the triclinic unit-cell */
4398                 if (pos_d >= cell_x1[d])
4399                 {
4400                     if (pos_d >= limit1[d])
4401                     {
4402                         cg_move_error(fplog,dd,step,cg,d,1,TRUE,limitd[d],
4403                                       cg_cm[cg],cm_new,pos_d);
4404                     }
4405                     dev[d] = 1;
4406                     if (dd->ci[d] == dd->nc[d] - 1)
4407                     {
4408                         rvec_dec(cm_new,state->box[d]);
4409                         if (bScrew)
4410                         {
4411                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4412                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4413                         }
4414                         for(k=k0; (k<k1); k++)
4415                         {
4416                             rvec_dec(state->x[k],state->box[d]);
4417                             if (bScrew)
4418                             {
4419                                 rotate_state_atom(state,k);
4420                             }
4421                         }
4422                     }
4423                 }
4424                 else if (pos_d < cell_x0[d])
4425                 {
4426                     if (pos_d < limit0[d])
4427                     {
4428                         cg_move_error(fplog,dd,step,cg,d,-1,TRUE,limitd[d],
4429                                       cg_cm[cg],cm_new,pos_d);
4430                     }
4431                     dev[d] = -1;
4432                     if (dd->ci[d] == 0)
4433                     {
4434                         rvec_inc(cm_new,state->box[d]);
4435                         if (bScrew)
4436                         {
4437                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4438                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4439                         }
4440                         for(k=k0; (k<k1); k++)
4441                         {
4442                             rvec_inc(state->x[k],state->box[d]);
4443                             if (bScrew)
4444                             {
4445                                 rotate_state_atom(state,k);
4446                             }
4447                         }
4448                     }
4449                 }
4450             }
4451             else if (d < npbcdim)
4452             {
4453                 /* Put the charge group in the rectangular unit-cell */
4454                 while (cm_new[d] >= state->box[d][d])
4455                 {
4456                     rvec_dec(cm_new,state->box[d]);
4457                     for(k=k0; (k<k1); k++)
4458                     {
4459                         rvec_dec(state->x[k],state->box[d]);
4460                     }
4461                 }
4462                 while (cm_new[d] < 0)
4463                 {
4464                     rvec_inc(cm_new,state->box[d]);
4465                     for(k=k0; (k<k1); k++)
4466                     {
4467                         rvec_inc(state->x[k],state->box[d]);
4468                     }
4469                 }
4470             }
4471         }
4472     
4473         copy_rvec(cm_new,cg_cm[cg]);
4474         
4475         /* Determine where this cg should go */
4476         flag = 0;
4477         mc = -1;
4478         for(d=0; d<dd->ndim; d++)
4479         {
4480             dim = dd->dim[d];
4481             if (dev[dim] == 1)
4482             {
4483                 flag |= DD_FLAG_FW(d);
4484                 if (mc == -1)
4485                 {
4486                     mc = d*2;
4487                 }
4488             }
4489             else if (dev[dim] == -1)
4490             {
4491                 flag |= DD_FLAG_BW(d);
4492                 if (mc == -1) {
4493                     if (dd->nc[dim] > 2)
4494                     {
4495                         mc = d*2 + 1;
4496                     }
4497                     else
4498                     {
4499                         mc = d*2;
4500                     }
4501                 }
4502             }
4503         }
4504         /* Temporarily store the flag in move */
4505         move[cg] = mc + flag;
4506     }
4507 }
4508
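/* Redistribute the charge groups over the domains after the particles have
 * moved: determine which home charge groups leave the local cell, pack
 * their state into per-direction buffers, exchange them with the neighbors
 * along all decomposition dimensions (forwarding groups that still have to
 * move further), and append the received groups to the home state.
 */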
4509 static void dd_redistribute_cg(FILE *fplog,gmx_large_int_t step,
4510                                gmx_domdec_t *dd,ivec tric_dir,
4511                                t_state *state,rvec **f,
4512                                t_forcerec *fr,t_mdatoms *md,
4513                                gmx_bool bCompact,
4514                                t_nrnb *nrnb,
4515                                int *ncg_stay_home,
4516                                int *ncg_moved)
4517 {
4518     int  *move;
4519     int  npbcdim;
4520     int  ncg[DIM*2],nat[DIM*2];
4521     int  c,i,cg,k,k0,k1,d,dim,dim2,dir,d2,d3,d4,cell_d;
4522     int  mc,cdd,nrcg,ncg_recv,nat_recv,nvs,nvr,nvec,vec;
4523     int  sbuf[2],rbuf[2];
4524     int  home_pos_cg,home_pos_at,buf_pos;
4525     int  flag;
4526     gmx_bool bV=FALSE,bSDX=FALSE,bCGP=FALSE;
4527     gmx_bool bScrew;
4528     ivec dev;
4529     real inv_ncg,pos_d;
4530     matrix tcm;
4531     rvec *cg_cm=NULL,cell_x0,cell_x1,limitd,limit0,limit1,cm_new;
4532     atom_id *cgindex;
4533     cginfo_mb_t *cginfo_mb;
4534     gmx_domdec_comm_t *comm;
4535     int  *moved;
4536     int  nthread,thread;
4537     
4538     if (dd->bScrewPBC)
4539     {
4540         check_screw_box(state->box);
4541     }
4542     
4543     comm  = dd->comm;
4544     if (fr->cutoff_scheme == ecutsGROUP)
4545     {
4546         cg_cm = fr->cg_cm;
4547     }
4548     
4549     for(i=0; i<estNR; i++)
4550     {
4551         if (EST_DISTR(i))
4552         {
4553             switch (i)
4554             {
4555             case estX:   /* Always present */            break;
4556             case estV:   bV   = (state->flags & (1<<i)); break;
4557             case estSDX: bSDX = (state->flags & (1<<i)); break;
4558             case estCGP: bCGP = (state->flags & (1<<i)); break;
4559             case estLD_RNG:
4560             case estLD_RNGI:
4561             case estDISRE_INITF:
4562             case estDISRE_RM3TAV:
4563             case estORIRE_INITF:
4564             case estORIRE_DTAV:
4565                 /* No processing required */
4566                 break;
4567             default:
4568                 gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4569             }
4570         }
4571     }
4572     
4573     if (dd->ncg_tot > comm->nalloc_int)
4574     {
4575         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4576         srenew(comm->buf_int,comm->nalloc_int);
4577     }
4578     move = comm->buf_int;
4579     
4580     /* Clear the count */
4581     for(c=0; c<dd->ndim*2; c++)
4582     {
4583         ncg[c] = 0;
4584         nat[c] = 0;
4585     }
4586
4587     npbcdim = dd->npbcdim;
4588
4589     for(d=0; (d<DIM); d++)
4590     {
4591         limitd[d] = dd->comm->cellsize_min[d];
4592         if (d >= npbcdim && dd->ci[d] == 0)
4593         {
4594             cell_x0[d] = -GMX_FLOAT_MAX;
4595         }
4596         else
4597         {
4598             cell_x0[d] = comm->cell_x0[d];
4599         }
4600         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4601         {
4602             cell_x1[d] = GMX_FLOAT_MAX;
4603         }
4604         else
4605         {
4606             cell_x1[d] = comm->cell_x1[d];
4607         }
4608         if (d < npbcdim)
4609         {
4610             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4611             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4612         }
4613         else
4614         {
4615             /* We check after communication if a charge group moved
4616              * more than one cell. Set the pre-comm check limit to float_max.
4617              */
4618             limit0[d] = -GMX_FLOAT_MAX;
4619             limit1[d] =  GMX_FLOAT_MAX;
4620         }
4621     }
4622     
4623     make_tric_corr_matrix(npbcdim,state->box,tcm);
4624     
4625     cgindex = dd->cgindex;
4626
4627     nthread = gmx_omp_nthreads_get(emntDomdec);
4628
4629     /* Compute the center of geometry for all home charge groups
4630      * and put them in the box and determine where they should go.
4631      */
4632 #pragma omp parallel for num_threads(nthread) schedule(static)
4633     for(thread=0; thread<nthread; thread++)
4634     {
4635         calc_cg_move(fplog,step,dd,state,tric_dir,tcm,
4636                      cell_x0,cell_x1,limitd,limit0,limit1,
4637                      cgindex,
4638                      ( thread   *dd->ncg_home)/nthread,
4639                      ((thread+1)*dd->ncg_home)/nthread,
4640                      fr->cutoff_scheme==ecutsGROUP ? cg_cm : state->x,
4641                      move);
4642     }
4643
4644     for(cg=0; cg<dd->ncg_home; cg++)
4645     {
4646         if (move[cg] >= 0)
4647         {
4648             mc = move[cg];
4649             flag     = mc & ~DD_FLAG_NRCG;
4650             mc       = mc & DD_FLAG_NRCG;
4651             move[cg] = mc;
4652
4653             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4654             {
4655                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4656                 srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4657             }
4658             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4659             /* We store the cg size in the lower 16 bits
4660              * and the place where the charge group should go
4661              * in the next 6 bits. This saves some communication volume.
4662              */
4663             nrcg = cgindex[cg+1] - cgindex[cg];
4664             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4665             ncg[mc] += 1;
4666             nat[mc] += nrcg;
4667         }
4668     }
4669     
4670     inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
4671     inc_nrnb(nrnb,eNR_RESETX,dd->ncg_home);
4672
4673     *ncg_moved = 0;
4674     for(i=0; i<dd->ndim*2; i++)
4675     {
4676         *ncg_moved += ncg[i];
4677     }
4678     
4679     nvec = 1;
4680     if (bV)
4681     {
4682         nvec++;
4683     }
4684     if (bSDX)
4685     {
4686         nvec++;
4687     }
4688     if (bCGP)
4689     {
4690         nvec++;
4691     }
4692     
4693     /* Make sure the communication buffers are large enough */
4694     for(mc=0; mc<dd->ndim*2; mc++)
4695     {
4696         nvr = ncg[mc] + nat[mc]*nvec;
4697         if (nvr > comm->cgcm_state_nalloc[mc])
4698         {
4699             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4700             srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
4701         }
4702     }
4703     
4704     switch (fr->cutoff_scheme)
4705     {
4706     case ecutsGROUP:
4707         /* Recalculating cg_cm might be cheaper than communicating,
4708          * but that could give rise to rounding issues.
4709          */
4710         home_pos_cg =
4711             compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4712                                     nvec,cg_cm,comm,bCompact);
4713         break;
4714     case ecutsVERLET:
4715         /* Without charge groups we send the moved atom coordinates
4716          * over twice. This is so the code below can be used without
4717          * many conditionals, both with and without charge groups.
4718          */
4719         home_pos_cg =
4720             compact_and_copy_vec_cg(dd->ncg_home,move,cgindex,
4721                                     nvec,state->x,comm,FALSE);
4722         if (bCompact)
4723         {
4724             home_pos_cg -= *ncg_moved;
4725         }
4726         break;
4727     default:
4728         gmx_incons("unimplemented");
4729         home_pos_cg = 0;
4730     }
4731     
4732     vec = 0;
4733     home_pos_at =
4734         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4735                                 nvec,vec++,state->x,comm,bCompact);
4736     if (bV)
4737     {
4738         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4739                                 nvec,vec++,state->v,comm,bCompact);
4740     }
4741     if (bSDX)
4742     {
4743         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4744                                 nvec,vec++,state->sd_X,comm,bCompact);
4745     }
4746     if (bCGP)
4747     {
4748         compact_and_copy_vec_at(dd->ncg_home,move,cgindex,
4749                                 nvec,vec++,state->cg_p,comm,bCompact);
4750     }
4751     
4752     if (bCompact)
4753     {
4754         compact_ind(dd->ncg_home,move,
4755                     dd->index_gl,dd->cgindex,dd->gatindex,
4756                     dd->ga2la,comm->bLocalCG,
4757                     fr->cginfo);
4758     }
4759     else
4760     {
4761         if (fr->cutoff_scheme == ecutsVERLET)
4762         {
4763             moved = get_moved(comm,dd->ncg_home);
4764
4765             for(k=0; k<dd->ncg_home; k++)
4766             {
4767                 moved[k] = 0;
4768             }
4769         }
4770         else
4771         {
4772             moved = fr->ns.grid->cell_index;
4773         }
4774
4775         clear_and_mark_ind(dd->ncg_home,move,
4776                            dd->index_gl,dd->cgindex,dd->gatindex,
4777                            dd->ga2la,comm->bLocalCG,
4778                            moved);
4779     }
4780     
4781     cginfo_mb = fr->cginfo_mb;
4782
4783     *ncg_stay_home = home_pos_cg;
4784     for(d=0; d<dd->ndim; d++)
4785     {
4786         dim = dd->dim[d];
4787         ncg_recv = 0;
4788         nat_recv = 0;
4789         nvr      = 0;
4790         for(dir=0; dir<(dd->nc[dim]==2 ? 1 : 2); dir++)
4791         {
4792             cdd = d*2 + dir;
4793             /* Communicate the cg and atom counts */
4794             sbuf[0] = ncg[cdd];
4795             sbuf[1] = nat[cdd];
4796             if (debug)
4797             {
4798                 fprintf(debug,"Sending ddim %d dir %d: ncg %d nat %d\n",
4799                         d,dir,sbuf[0],sbuf[1]);
4800             }
4801             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4802             
4803             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4804             {
4805                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4806                 srenew(comm->buf_int,comm->nalloc_int);
4807             }
4808             
4809             /* Communicate the charge group indices, sizes and flags */
4810             dd_sendrecv_int(dd, d, dir,
4811                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4812                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4813             
4814             nvs = ncg[cdd] + nat[cdd]*nvec;
4815             i   = rbuf[0]  + rbuf[1] *nvec;
4816             vec_rvec_check_alloc(&comm->vbuf,nvr+i);
4817             
4818             /* Communicate cgcm and state */
4819             dd_sendrecv_rvec(dd, d, dir,
4820                              comm->cgcm_state[cdd], nvs,
4821                              comm->vbuf.v+nvr, i);
4822             ncg_recv += rbuf[0];
4823             nat_recv += rbuf[1];
4824             nvr      += i;
4825         }
4826         
4827         /* Process the received charge groups */
4828         buf_pos = 0;
4829         for(cg=0; cg<ncg_recv; cg++)
4830         {
4831             flag = comm->buf_int[cg*DD_CGIBS+1];
4832
4833             if (dim >= npbcdim && dd->nc[dim] > 2)
4834             {
4835                 /* No pbc in this dim and more than one domain boundary.
4836                  * We do a separate check that a charge group did not move too far.
4837                  */
4838                 if (((flag & DD_FLAG_FW(d)) &&
4839                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4840                     ((flag & DD_FLAG_BW(d)) &&
4841                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4842                 {
4843                     cg_move_error(fplog,dd,step,cg,dim,
4844                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4845                                    FALSE,0,
4846                                    comm->vbuf.v[buf_pos],
4847                                    comm->vbuf.v[buf_pos],
4848                                    comm->vbuf.v[buf_pos][dim]);
4849                 }
4850             }
4851
4852             mc = -1;
4853             if (d < dd->ndim-1)
4854             {
4855                 /* Check which direction this cg should go */
4856                 for(d2=d+1; (d2<dd->ndim && mc==-1); d2++)
4857                 {
4858                     if (dd->bGridJump)
4859                     {
4860                         /* The cell boundaries for dimension d2 are not equal
4861                          * for each cell row of the lower dimension(s),
4862                          * therefore we might need to redetermine where
4863                          * this cg should go.
4864                          */
4865                         dim2 = dd->dim[d2];
4866                         /* If this cg crosses the box boundary in dimension d2
4867                          * we can use the communicated flag, so we do not
4868                          * have to worry about pbc.
4869                          */
4870                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4871                                (flag & DD_FLAG_FW(d2))) ||
4872                               (dd->ci[dim2] == 0 &&
4873                                (flag & DD_FLAG_BW(d2)))))
4874                         {
4875                             /* Clear the two flags for this dimension */
4876                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4877                             /* Determine the location of this cg
4878                              * in lattice coordinates
4879                              */
4880                             pos_d = comm->vbuf.v[buf_pos][dim2];
4881                             if (tric_dir[dim2])
4882                             {
4883                                 for(d3=dim2+1; d3<DIM; d3++)
4884                                 {
4885                                     pos_d +=
4886                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4887                                 }
4888                             }
4889                             /* Check if we are not at the box edge.
4890                              * pbc is only handled in the first step above,
4891                              * but this check could move over pbc while
4892                              * the first step did not due to different rounding.
4893                              */
4894                             if (pos_d >= cell_x1[dim2] &&
4895                                 dd->ci[dim2] != dd->nc[dim2]-1)
4896                             {
4897                                 flag |= DD_FLAG_FW(d2);
4898                             }
4899                             else if (pos_d < cell_x0[dim2] &&
4900                                      dd->ci[dim2] != 0)
4901                             {
4902                                 flag |= DD_FLAG_BW(d2);
4903                             }
4904                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4905                         }
4906                     }
4907                     /* Set to which neighboring cell this cg should go */
4908                     if (flag & DD_FLAG_FW(d2))
4909                     {
4910                         mc = d2*2;
4911                     }
4912                     else if (flag & DD_FLAG_BW(d2))
4913                     {
4914                         if (dd->nc[dd->dim[d2]] > 2)
4915                         {
4916                             mc = d2*2+1;
4917                         }
4918                         else
4919                         {
4920                             mc = d2*2;
4921                         }
4922                     }
4923                 }
4924             }
4925             
4926             nrcg = flag & DD_FLAG_NRCG;
4927             if (mc == -1)
4928             {
4929                 if (home_pos_cg+1 > dd->cg_nalloc)
4930                 {
4931                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4932                     srenew(dd->index_gl,dd->cg_nalloc);
4933                     srenew(dd->cgindex,dd->cg_nalloc+1);
4934                 }
4935                 /* Set the global charge group index and size */
4936                 dd->index_gl[home_pos_cg] = comm->buf_int[cg*DD_CGIBS];
4937                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4938                 /* Copy the state from the buffer */
4939                 dd_check_alloc_ncg(fr,state,f,home_pos_cg+1);
4940                 if (fr->cutoff_scheme == ecutsGROUP)
4941                 {
4942                     cg_cm = fr->cg_cm;
4943                     copy_rvec(comm->vbuf.v[buf_pos],cg_cm[home_pos_cg]);
4944                 }
4945                 buf_pos++;
4946
4947                 /* Set the cginfo */
4948                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4949                                                    dd->index_gl[home_pos_cg]);
4950                 if (comm->bLocalCG)
4951                 {
4952                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4953                 }
4954
4955                 if (home_pos_at+nrcg > state->nalloc)
4956                 {
4957                     dd_realloc_state(state,f,home_pos_at+nrcg);
4958                 }
4959                 for(i=0; i<nrcg; i++)
4960                 {
4961                     copy_rvec(comm->vbuf.v[buf_pos++],
4962                               state->x[home_pos_at+i]);
4963                 }
4964                 if (bV)
4965                 {
4966                     for(i=0; i<nrcg; i++)
4967                     {
4968                         copy_rvec(comm->vbuf.v[buf_pos++],
4969                                   state->v[home_pos_at+i]);
4970                     }
4971                 }
4972                 if (bSDX)
4973                 {
4974                     for(i=0; i<nrcg; i++)
4975                     {
4976                         copy_rvec(comm->vbuf.v[buf_pos++],
4977                                   state->sd_X[home_pos_at+i]);
4978                     }
4979                 }
4980                 if (bCGP)
4981                 {
4982                     for(i=0; i<nrcg; i++)
4983                     {
4984                         copy_rvec(comm->vbuf.v[buf_pos++],
4985                                   state->cg_p[home_pos_at+i]);
4986                     }
4987                 }
4988                 home_pos_cg += 1;
4989                 home_pos_at += nrcg;
4990             }
4991             else
4992             {
4993                 /* Reallocate the buffers if necessary  */
4994                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4995                 {
4996                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4997                     srenew(comm->cggl_flag[mc],comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4998                 }
4999                 nvr = ncg[mc] + nat[mc]*nvec;
5000                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
5001                 {
5002                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
5003                     srenew(comm->cgcm_state[mc],comm->cgcm_state_nalloc[mc]);
5004                 }
5005                 /* Copy from the receive to the send buffers */
5006                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
5007                        comm->buf_int + cg*DD_CGIBS,
5008                        DD_CGIBS*sizeof(int));
5009                 memcpy(comm->cgcm_state[mc][nvr],
5010                        comm->vbuf.v[buf_pos],
5011                        (1+nrcg*nvec)*sizeof(rvec));
5012                 buf_pos += 1 + nrcg*nvec;
5013                 ncg[mc] += 1;
5014                 nat[mc] += nrcg;
5015             }
5016         }
5017     }
5018     
5019     /* With sorting (!bCompact) the indices are now only partially up to date
5020      * and ncg_home and nat_home are not the real count, since there are
5021      * "holes" in the arrays for the charge groups that moved to neighbors.
5022      */
5023     if (fr->cutoff_scheme == ecutsVERLET)
5024     {
5025         moved = get_moved(comm,home_pos_cg);
5026
5027         for(i=dd->ncg_home; i<home_pos_cg; i++)
5028         {
5029             moved[i] = 0;
5030         }
5031     }
5032     dd->ncg_home = home_pos_cg;
5033     dd->nat_home = home_pos_at;
5034
5035     if (debug)
5036     {
5037         fprintf(debug,
5038                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5039                 *ncg_moved,dd->ncg_home-*ncg_moved);
5040                 
5041     }
5042 }
5043
5044 void dd_cycles_add(gmx_domdec_t *dd,float cycles,int ddCycl)
5045 {
5046     dd->comm->cycl[ddCycl] += cycles;
5047     dd->comm->cycl_n[ddCycl]++;
5048     if (cycles > dd->comm->cycl_max[ddCycl])
5049     {
5050         dd->comm->cycl_max[ddCycl] = cycles;
5051     }
5052 }
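/* Example use (a sketch only; it assumes the gmx_cycles_read() counter from
 * gmx_cyclecounter.h, and the actual call sites and phase bookkeeping in
 * mdrun may differ):
 *
 *     gmx_cycles_t c0 = gmx_cycles_read();
 *     ... do the work belonging to this DD phase ...
 *     dd_cycles_add(dd, (float)(gmx_cycles_read() - c0), ddCyclF);
 *
 * Each call adds to the running sum, increments the sample count and
 * updates the per-phase maximum used later for load balancing.
 */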
5053
5054 static double force_flop_count(t_nrnb *nrnb)
5055 {
5056     int i;
5057     double sum;
5058     const char *name;
5059
5060     sum = 0;
5061     for(i=0; i<eNR_NBKERNEL_FREE_ENERGY; i++)
5062     {
5063         /* To get closer to the real timings, we halve the count
5064          * for the normal loops, and halve it again for the water loops.
5065          */
5066         name = nrnb_str(i);
5067         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5068         {
5069             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5070         }
5071         else
5072         {
5073             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5074         }
5075     }
5076     for(i=eNR_NBKERNEL_FREE_ENERGY; i<=eNR_NB14; i++)
5077     {
5078         name = nrnb_str(i);
5079         if (strstr(name,"W3") != NULL || strstr(name,"W4") != NULL)
5080             sum += nrnb->n[i]*cost_nrnb(i);
5081     }
5082     for(i=eNR_BONDS; i<=eNR_WALLS; i++)
5083     {
5084         sum += nrnb->n[i]*cost_nrnb(i);
5085     }
5086
5087     return sum;
5088 }
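/* Example of the weighting (made-up counts; only the 0.25/0.50/1.0 factors
 * come from the code above): a regular non-bonded kernel with n[i] = 1000
 * calls and cost_nrnb(i) = 38 contributes 1000*0.50*38 = 19000, a W3/W4
 * water kernel with the same counts contributes 1000*0.25*38 = 9500, and
 * the bonded terms in the last loop are counted at full cost.
 */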
5089
5090 void dd_force_flop_start(gmx_domdec_t *dd,t_nrnb *nrnb)
5091 {
5092     if (dd->comm->eFlop)
5093     {
5094         dd->comm->flop -= force_flop_count(nrnb);
5095     }
5096 }
5097 void dd_force_flop_stop(gmx_domdec_t *dd,t_nrnb *nrnb)
5098 {
5099     if (dd->comm->eFlop)
5100     {
5101         dd->comm->flop += force_flop_count(nrnb);
5102         dd->comm->flop_n++;
5103     }
5104 }  
5105
5106 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5107 {
5108     int i;
5109     
5110     for(i=0; i<ddCyclNr; i++)
5111     {
5112         dd->comm->cycl[i] = 0;
5113         dd->comm->cycl_n[i] = 0;
5114         dd->comm->cycl_max[i] = 0;
5115     }
5116     dd->comm->flop = 0;
5117     dd->comm->flop_n = 0;
5118 }
5119
5120 static void get_load_distribution(gmx_domdec_t *dd,gmx_wallcycle_t wcycle)
5121 {
5122     gmx_domdec_comm_t *comm;
5123     gmx_domdec_load_t *load;
5124     gmx_domdec_root_t *root=NULL;
5125     int  d,dim,cid,i,pos;
5126     float cell_frac=0,sbuf[DD_NLOAD_MAX];
5127     gmx_bool bSepPME;
5128     
5129     if (debug)
5130     {
5131         fprintf(debug,"get_load_distribution start\n");
5132     }
5133
5134     wallcycle_start(wcycle,ewcDDCOMMLOAD);
5135     
5136     comm = dd->comm;
5137     
5138     bSepPME = (dd->pme_nodeid >= 0);
5139     
5140     for(d=dd->ndim-1; d>=0; d--)
5141     {
5142         dim = dd->dim[d];
5143         /* Check if we participate in the communication in this dimension */
5144         if (d == dd->ndim-1 || 
5145             (dd->ci[dd->dim[d+1]]==0 && dd->ci[dd->dim[dd->ndim-1]]==0))
5146         {
5147             load = &comm->load[d];
5148             if (dd->bGridJump)
5149             {
5150                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5151             }
5152             pos = 0;
5153             if (d == dd->ndim-1)
5154             {
5155                 sbuf[pos++] = dd_force_load(comm);
5156                 sbuf[pos++] = sbuf[0];
5157                 if (dd->bGridJump)
5158                 {
5159                     sbuf[pos++] = sbuf[0];
5160                     sbuf[pos++] = cell_frac;
5161                     if (d > 0)
5162                     {
5163                         sbuf[pos++] = comm->cell_f_max0[d];
5164                         sbuf[pos++] = comm->cell_f_min1[d];
5165                     }
5166                 }
5167                 if (bSepPME)
5168                 {
5169                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5170                     sbuf[pos++] = comm->cycl[ddCyclPME];
5171                 }
5172             }
5173             else
5174             {
5175                 sbuf[pos++] = comm->load[d+1].sum;
5176                 sbuf[pos++] = comm->load[d+1].max;
5177                 if (dd->bGridJump)
5178                 {
5179                     sbuf[pos++] = comm->load[d+1].sum_m;
5180                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5181                     sbuf[pos++] = comm->load[d+1].flags;
5182                     if (d > 0)
5183                     {
5184                         sbuf[pos++] = comm->cell_f_max0[d];
5185                         sbuf[pos++] = comm->cell_f_min1[d];
5186                     }
5187                 }
5188                 if (bSepPME)
5189                 {
5190                     sbuf[pos++] = comm->load[d+1].mdf;
5191                     sbuf[pos++] = comm->load[d+1].pme;
5192                 }
5193             }
5194             load->nload = pos;
5195             /* Communicate a row in DD direction d.
5196              * The communicators are set up such that the root always has rank 0.
5197              */
5198 #ifdef GMX_MPI
5199             MPI_Gather(sbuf      ,load->nload*sizeof(float),MPI_BYTE,
5200                        load->load,load->nload*sizeof(float),MPI_BYTE,
5201                        0,comm->mpi_comm_load[d]);
5202 #endif
5203             if (dd->ci[dim] == dd->master_ci[dim])
5204             {
5205                 /* We are the root, process this row */
5206                 if (comm->bDynLoadBal)
5207                 {
5208                     root = comm->root[d];
5209                 }
5210                 load->sum = 0;
5211                 load->max = 0;
5212                 load->sum_m = 0;
5213                 load->cvol_min = 1;
5214                 load->flags = 0;
5215                 load->mdf = 0;
5216                 load->pme = 0;
5217                 pos = 0;
5218                 for(i=0; i<dd->nc[dim]; i++)
5219                 {
5220                     load->sum += load->load[pos++];
5221                     load->max = max(load->max,load->load[pos]);
5222                     pos++;
5223                     if (dd->bGridJump)
5224                     {
5225                         if (root->bLimited)
5226                         {
5227                             /* This direction could not be load balanced properly,
5228                              * so we need to use the maximum instead of the average load.
5229                              */
5230                             load->sum_m = max(load->sum_m,load->load[pos]);
5231                         }
5232                         else
5233                         {
5234                             load->sum_m += load->load[pos];
5235                         }
5236                         pos++;
5237                         load->cvol_min = min(load->cvol_min,load->load[pos]);
5238                         pos++;
5239                         if (d < dd->ndim-1)
5240                         {
5241                             load->flags = (int)(load->load[pos++] + 0.5);
5242                         }
5243                         if (d > 0)
5244                         {
5245                             root->cell_f_max0[i] = load->load[pos++];
5246                             root->cell_f_min1[i] = load->load[pos++];
5247                         }
5248                     }
5249                     if (bSepPME)
5250                     {
5251                         load->mdf = max(load->mdf,load->load[pos]);
5252                         pos++;
5253                         load->pme = max(load->pme,load->load[pos]);
5254                         pos++;
5255                     }
5256                 }
5257                 if (comm->bDynLoadBal && root->bLimited)
5258                 {
5259                     load->sum_m *= dd->nc[dim];
5260                     load->flags |= (1<<d);
5261                 }
5262             }
5263         }
5264     }
5265
5266     if (DDMASTER(dd))
5267     {
5268         comm->nload      += dd_load_count(comm);
5269         comm->load_step  += comm->cycl[ddCyclStep];
5270         comm->load_sum   += comm->load[0].sum;
5271         comm->load_max   += comm->load[0].max;
5272         if (comm->bDynLoadBal)
5273         {
5274             for(d=0; d<dd->ndim; d++)
5275             {
5276                 if (comm->load[0].flags & (1<<d))
5277                 {
5278                     comm->load_lim[d]++;
5279                 }
5280             }
5281         }
5282         if (bSepPME)
5283         {
5284             comm->load_mdf += comm->load[0].mdf;
5285             comm->load_pme += comm->load[0].pme;
5286         }
5287     }
5288
5289     wallcycle_stop(wcycle,ewcDDCOMMLOAD);
5290     
5291     if (debug)
5292     {
5293         fprintf(debug,"get_load_distribution finished\n");
5294     }
5295 }
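/* Sketch of one rank's sbuf record as filled above (the actual length,
 * load->nload, depends on bGridJump, bSepPME and the dimension index d):
 *   [0] load sum, [1] load max,
 *   with DLB:                sum_m, min cell volume (times cell_frac),
 *                            limit flags (only when d < ndim-1),
 *                            cell_f_max0/cell_f_min1 (only when d > 0),
 *   with separate PME nodes: PP-during-PME cycles, PME cycles.
 * For the innermost dimension (d == ndim-1) sum, max and sum_m all start
 * from this rank's own force load; the row root gathers these records and
 * reduces them in the loop over dd->nc[dim].
 */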
5296
5297 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5298 {
5299     /* Return the relative performance loss on the total run time
5300      * due to the force calculation load imbalance.
5301      */
5302     if (dd->comm->nload > 0)
5303     {
5304         return
5305             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5306             (dd->comm->load_step*dd->nnodes);
5307     }
5308     else
5309     {
5310         return 0;
5311     }
5312 }
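/* Worked example with made-up numbers: on 4 PP nodes, with accumulated
 * load_max = 120, load_sum = 400 and load_step = 150, the loss is
 * (120*4 - 400)/(150*4) = 80/600, i.e. roughly 13% of the run time is
 * spent waiting because of force-load imbalance.
 */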
5313
5314 static void print_dd_load_av(FILE *fplog,gmx_domdec_t *dd)
5315 {
5316     char  buf[STRLEN];
5317     int   npp,npme,nnodes,d,limp;
5318     float imbal,pme_f_ratio,lossf,lossp=0;
5319     gmx_bool  bLim;
5320     gmx_domdec_comm_t *comm;
5321
5322     comm = dd->comm;
5323     if (DDMASTER(dd) && comm->nload > 0)
5324     {
5325         npp    = dd->nnodes;
5326         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5327         nnodes = npp + npme;
5328         imbal = comm->load_max*npp/comm->load_sum - 1;
5329         lossf = dd_force_imb_perf_loss(dd);
5330         sprintf(buf," Average load imbalance: %.1f %%\n",imbal*100);
5331         fprintf(fplog,"%s",buf);
5332         fprintf(stderr,"\n");
5333         fprintf(stderr,"%s",buf);
5334         sprintf(buf," Part of the total run time spent waiting due to load imbalance: %.1f %%\n",lossf*100);
5335         fprintf(fplog,"%s",buf);
5336         fprintf(stderr,"%s",buf);
5337         bLim = FALSE;
5338         if (comm->bDynLoadBal)
5339         {
5340             sprintf(buf," Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5341             for(d=0; d<dd->ndim; d++)
5342             {
5343                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5344                 sprintf(buf+strlen(buf)," %c %d %%",dim2char(dd->dim[d]),limp);
5345                 if (limp >= 50)
5346                 {
5347                     bLim = TRUE;
5348                 }
5349             }
5350             sprintf(buf+strlen(buf),"\n");
5351             fprintf(fplog,"%s",buf);
5352             fprintf(stderr,"%s",buf);
5353         }
5354         if (npme > 0)
5355         {
5356             pme_f_ratio = comm->load_pme/comm->load_mdf;
5357             lossp = (comm->load_pme - comm->load_mdf)/comm->load_step;
5358             if (lossp <= 0)
5359             {
5360                 lossp *= (float)npme/(float)nnodes;
5361             }
5362             else
5363             {
5364                 lossp *= (float)npp/(float)nnodes;
5365             }
5366             sprintf(buf," Average PME mesh/force load: %5.3f\n",pme_f_ratio);
5367             fprintf(fplog,"%s",buf);
5368             fprintf(stderr,"%s",buf);
5369             sprintf(buf," Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n",fabs(lossp)*100);
5370             fprintf(fplog,"%s",buf);
5371             fprintf(stderr,"%s",buf);
5372         }
5373         fprintf(fplog,"\n");
5374         fprintf(stderr,"\n");
5375         
5376         if (lossf >= DD_PERF_LOSS)
5377         {
5378             sprintf(buf,
5379                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5380                     "      in the domain decomposition.\n",lossf*100);
5381             if (!comm->bDynLoadBal)
5382             {
5383                 sprintf(buf+strlen(buf),"      You might want to use dynamic load balancing (option -dlb).\n");
5384             }
5385             else if (bLim)
5386             {
5387                 sprintf(buf+strlen(buf),"      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5388             }
5389             fprintf(fplog,"%s\n",buf);
5390             fprintf(stderr,"%s\n",buf);
5391         }
5392         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5393         {
5394             sprintf(buf,
5395                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5396                     "      had %s work to do than the PP nodes.\n"
5397                     "      You might want to %s the number of PME nodes\n"
5398                     "      or %s the cut-off and the grid spacing.\n",
5399                     fabs(lossp*100),
5400                     (lossp < 0) ? "less"     : "more",
5401                     (lossp < 0) ? "decrease" : "increase",
5402                     (lossp < 0) ? "decrease" : "increase");
5403             fprintf(fplog,"%s\n",buf);
5404             fprintf(stderr,"%s\n",buf);
5405         }
5406     }
5407 }
5408
5409 static float dd_vol_min(gmx_domdec_t *dd)
5410 {
5411     return dd->comm->load[0].cvol_min*dd->nnodes;
5412 }
5413
5414 static int dd_load_flags(gmx_domdec_t *dd)
5415 {
5416     return dd->comm->load[0].flags;
5417 }
5418
5419 static float dd_f_imbal(gmx_domdec_t *dd)
5420 {
5421     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5422 }
5423
5424 float dd_pme_f_ratio(gmx_domdec_t *dd)
5425 {
5426     if (dd->comm->cycl_n[ddCyclPME] > 0)
5427     {
5428         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5429     }
5430     else
5431     {
5432         return -1.0;
5433     }
5434 }
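/* Interpretation of the ratio: a value around 1 means the separate PME
 * nodes use about as many cycles as the PP force work; well above 1 the
 * PME nodes are the bottleneck (the note printed in print_dd_load_av then
 * suggests more PME nodes or a larger cut-off with a coarser grid), well
 * below 1 the opposite.  -1 signals that no PME cycles were recorded yet.
 */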
5435
5436 static void dd_print_load(FILE *fplog,gmx_domdec_t *dd,gmx_large_int_t step)
5437 {
5438     int flags,d;
5439     char buf[22];
5440     
5441     flags = dd_load_flags(dd);
5442     if (flags)
5443     {
5444         fprintf(fplog,
5445                 "DD  load balancing is limited by minimum cell size in dimension");
5446         for(d=0; d<dd->ndim; d++)
5447         {
5448             if (flags & (1<<d))
5449             {
5450                 fprintf(fplog," %c",dim2char(dd->dim[d]));
5451             }
5452         }
5453         fprintf(fplog,"\n");
5454     }
5455     fprintf(fplog,"DD  step %s",gmx_step_str(step,buf));
5456     if (dd->comm->bDynLoadBal)
5457     {
5458         fprintf(fplog,"  vol min/aver %5.3f%c",
5459                 dd_vol_min(dd),flags ? '!' : ' ');
5460     }
5461     fprintf(fplog," load imb.: force %4.1f%%",dd_f_imbal(dd)*100);
5462     if (dd->comm->cycl_n[ddCyclPME])
5463     {
5464         fprintf(fplog,"  pme mesh/force %5.3f",dd_pme_f_ratio(dd));
5465     }
5466     fprintf(fplog,"\n\n");
5467 }
5468
5469 static void dd_print_load_verbose(gmx_domdec_t *dd)
5470 {
5471     if (dd->comm->bDynLoadBal)
5472     {
5473         fprintf(stderr,"vol %4.2f%c ",
5474                 dd_vol_min(dd),dd_load_flags(dd) ? '!' : ' ');
5475     }
5476     fprintf(stderr,"imb F %2d%% ",(int)(dd_f_imbal(dd)*100+0.5));
5477     if (dd->comm->cycl_n[ddCyclPME])
5478     {
5479         fprintf(stderr,"pme/F %4.2f ",dd_pme_f_ratio(dd));
5480     }
5481 }
5482
5483 #ifdef GMX_MPI
5484 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind,ivec loc)
5485 {
5486     MPI_Comm  c_row;
5487     int  dim, i, rank;
5488     ivec loc_c;
5489     gmx_domdec_root_t *root;
5490     gmx_bool bPartOfGroup = FALSE;
5491     
5492     dim = dd->dim[dim_ind];
5493     copy_ivec(loc,loc_c);
5494     for(i=0; i<dd->nc[dim]; i++)
5495     {
5496         loc_c[dim] = i;
5497         rank = dd_index(dd->nc,loc_c);
5498         if (rank == dd->rank)
5499         {
5500             /* This process is part of the group */
5501             bPartOfGroup = TRUE;
5502         }
5503     }
5504     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup?0:MPI_UNDEFINED, dd->rank,
5505                    &c_row);
5506     if (bPartOfGroup)
5507     {
5508         dd->comm->mpi_comm_load[dim_ind] = c_row;
5509         if (dd->comm->eDLB != edlbNO)
5510         {
5511             if (dd->ci[dim] == dd->master_ci[dim])
5512             {
5513                 /* This is the root process of this row */
5514                 snew(dd->comm->root[dim_ind],1);
5515                 root = dd->comm->root[dim_ind];
5516                 snew(root->cell_f,DD_CELL_F_SIZE(dd,dim_ind));
5517                 snew(root->old_cell_f,dd->nc[dim]+1);
5518                 snew(root->bCellMin,dd->nc[dim]);
5519                 if (dim_ind > 0)
5520                 {
5521                     snew(root->cell_f_max0,dd->nc[dim]);
5522                     snew(root->cell_f_min1,dd->nc[dim]);
5523                     snew(root->bound_min,dd->nc[dim]);
5524                     snew(root->bound_max,dd->nc[dim]);
5525                 }
5526                 snew(root->buf_ncd,dd->nc[dim]);
5527             }
5528             else
5529             {
5530                 /* This is not a root process, we only need to receive cell_f */
5531                 snew(dd->comm->cell_f_row,DD_CELL_F_SIZE(dd,dim_ind));
5532             }
5533         }
5534         if (dd->ci[dim] == dd->master_ci[dim])
5535         {
5536             snew(dd->comm->load[dim_ind].load,dd->nc[dim]*DD_NLOAD_MAX);
5537         }
5538     }
5539 }
5540 #endif
5541
5542 static void make_load_communicators(gmx_domdec_t *dd)
5543 {
5544 #ifdef GMX_MPI
5545   int  dim0,dim1,i,j;
5546   ivec loc;
5547
5548   if (debug)
5549     fprintf(debug,"Making load communicators\n");
5550
5551   snew(dd->comm->load,dd->ndim);
5552   snew(dd->comm->mpi_comm_load,dd->ndim);
5553   
5554   clear_ivec(loc);
5555   make_load_communicator(dd,0,loc);
5556   if (dd->ndim > 1) {
5557     dim0 = dd->dim[0];
5558     for(i=0; i<dd->nc[dim0]; i++) {
5559       loc[dim0] = i;
5560       make_load_communicator(dd,1,loc);
5561     }
5562   }
5563   if (dd->ndim > 2) {
5564     dim0 = dd->dim[0];
5565     for(i=0; i<dd->nc[dim0]; i++) {
5566       loc[dim0] = i;
5567       dim1 = dd->dim[1];
5568       for(j=0; j<dd->nc[dim1]; j++) {
5569           loc[dim1] = j;
5570           make_load_communicator(dd,2,loc);
5571       }
5572     }
5573   }
5574
5575   if (debug)
5576     fprintf(debug,"Finished making load communicators\n");
5577 #endif
5578 }
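/* Example for a hypothetical 4x2x1 grid (so dd->dim = {XX,YY}): for
 * dim_ind 1 every x-column gets its own 2-rank communicator along y,
 * while for dim_ind 0 only the 4 ranks with y = 0 obtain
 * mpi_comm_load[0]; the remaining ranks pass MPI_UNDEFINED to
 * MPI_Comm_split and do not take part in that level of the load
 * collection in get_load_distribution.
 */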
5579
5580 void setup_dd_grid(FILE *fplog,gmx_domdec_t *dd)
5581 {
5582     gmx_bool bZYX;
5583     int  d,dim,i,j,m;
5584     ivec tmp,s;
5585     int  nzone,nzonep;
5586     ivec dd_zp[DD_MAXIZONE];
5587     gmx_domdec_zones_t *zones;
5588     gmx_domdec_ns_ranges_t *izone;
5589     
5590     for(d=0; d<dd->ndim; d++)
5591     {
5592         dim = dd->dim[d];
5593         copy_ivec(dd->ci,tmp);
5594         tmp[dim] = (tmp[dim] + 1) % dd->nc[dim];
5595         dd->neighbor[d][0] = ddcoord2ddnodeid(dd,tmp);
5596         copy_ivec(dd->ci,tmp);
5597         tmp[dim] = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5598         dd->neighbor[d][1] = ddcoord2ddnodeid(dd,tmp);
5599         if (debug)
5600         {
5601             fprintf(debug,"DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5602                     dd->rank,dim,
5603                     dd->neighbor[d][0],
5604                     dd->neighbor[d][1]);
5605         }
5606     }
5607     
5608     if (fplog)
5609     {
5610         fprintf(fplog,"\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5611                 dd->ndim,
5612                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],
5613                 dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5614     }
5615     switch (dd->ndim)
5616     {
5617     case 3:
5618         nzone  = dd_z3n;
5619         nzonep = dd_zp3n;
5620         for(i=0; i<nzonep; i++)
5621         {
5622             copy_ivec(dd_zp3[i],dd_zp[i]);
5623         }
5624         break;
5625     case 2:
5626         nzone  = dd_z2n;
5627         nzonep = dd_zp2n;
5628         for(i=0; i<nzonep; i++)
5629         {
5630             copy_ivec(dd_zp2[i],dd_zp[i]);
5631         }
5632         break;
5633     case 1:
5634         nzone  = dd_z1n;
5635         nzonep = dd_zp1n;
5636         for(i=0; i<nzonep; i++)
5637         {
5638             copy_ivec(dd_zp1[i],dd_zp[i]);
5639         }
5640         break;
5641     default:
5642         gmx_fatal(FARGS,"Can only do 1, 2 or 3D domain decomposition");
5643         nzone = 0;
5644         nzonep = 0;
5645     }
5646
5647     zones = &dd->comm->zones;
5648
5649     for(i=0; i<nzone; i++)
5650     {
5651         m = 0;
5652         clear_ivec(zones->shift[i]);
5653         for(d=0; d<dd->ndim; d++)
5654         {
5655             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5656         }
5657     }
5658     
5659     zones->n = nzone;
5660     for(i=0; i<nzone; i++)
5661     {
5662         for(d=0; d<DIM; d++)
5663         {
5664             s[d] = dd->ci[d] - zones->shift[i][d];
5665             if (s[d] < 0)
5666             {
5667                 s[d] += dd->nc[d];
5668             }
5669             else if (s[d] >= dd->nc[d])
5670             {
5671                 s[d] -= dd->nc[d];
5672             }
5673         }
5674     }
5675     zones->nizone = nzonep;
5676     for(i=0; i<zones->nizone; i++)
5677     {
5678         if (dd_zp[i][0] != i)
5679         {
5680             gmx_fatal(FARGS,"Internal inconsistency in the dd grid setup");
5681         }
5682         izone = &zones->izone[i];
5683         izone->j0 = dd_zp[i][1];
5684         izone->j1 = dd_zp[i][2];
5685         for(dim=0; dim<DIM; dim++)
5686         {
5687             if (dd->nc[dim] == 1)
5688             {
5689                 /* All shifts should be allowed */
5690                 izone->shift0[dim] = -1;
5691                 izone->shift1[dim] = 1;
5692             }
5693             else
5694             {
5695                 /*
5696                   izone->shift0[d] = 0;
5697                   izone->shift1[d] = 0;
5698                   for(j=izone->j0; j<izone->j1; j++) {
5699                   if (dd->shift[j][d] > dd->shift[i][d])
5700                   izone->shift0[d] = -1;
5701                   if (dd->shift[j][d] < dd->shift[i][d])
5702                   izone->shift1[d] = 1;
5703                   }
5704                 */
5705                 
5706                 int shift_diff;
5707                 
5708                 /* Assume the shifts are not more than one cell */
5709                 izone->shift0[dim] = 1;
5710                 izone->shift1[dim] = -1;
5711                 for(j=izone->j0; j<izone->j1; j++)
5712                 {
5713                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5714                     if (shift_diff < izone->shift0[dim])
5715                     {
5716                         izone->shift0[dim] = shift_diff;
5717                     }
5718                     if (shift_diff > izone->shift1[dim])
5719                     {
5720                         izone->shift1[dim] = shift_diff;
5721                     }
5722                 }
5723             }
5724         }
5725     }
5726     
5727     if (dd->comm->eDLB != edlbNO)
5728     {
5729         snew(dd->comm->root,dd->ndim);
5730     }
5731     
5732     if (dd->comm->bRecordLoad)
5733     {
5734         make_load_communicators(dd);
5735     }
5736 }
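/* Example of the neighbor setup at the top of setup_dd_grid, for a
 * hypothetical 4x2x1 grid: home cell ci = {3,1,0} gets forward x-neighbor
 * {0,1,0} and backward x-neighbor {2,1,0}; the +x neighbor wraps around
 * the periodic grid because of the modulo with dd->nc[dim].
 */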
5737
5738 static void make_pp_communicator(FILE *fplog,t_commrec *cr,int reorder)
5739 {
5740     gmx_domdec_t *dd;
5741     gmx_domdec_comm_t *comm;
5742     int  i,rank,*buf;
5743     ivec periods;
5744 #ifdef GMX_MPI
5745     MPI_Comm comm_cart;
5746 #endif
5747     
5748     dd = cr->dd;
5749     comm = dd->comm;
5750     
5751 #ifdef GMX_MPI
5752     if (comm->bCartesianPP)
5753     {
5754         /* Set up cartesian communication for the particle-particle part */
5755         if (fplog)
5756         {
5757             fprintf(fplog,"Will use a Cartesian communicator: %d x %d x %d\n",
5758                     dd->nc[XX],dd->nc[YY],dd->nc[ZZ]);
5759         }
5760         
5761         for(i=0; i<DIM; i++)
5762         {
5763             periods[i] = TRUE;
5764         }
5765         MPI_Cart_create(cr->mpi_comm_mygroup,DIM,dd->nc,periods,reorder,
5766                         &comm_cart);
5767         /* We overwrite the old communicator with the new cartesian one */
5768         cr->mpi_comm_mygroup = comm_cart;
5769     }
5770     
5771     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5772     MPI_Comm_rank(dd->mpi_comm_all,&dd->rank);
5773     
5774     if (comm->bCartesianPP_PME)
5775     {
5776         /* Since we want to use the original cartesian setup for sim,
5777          * and not the one after split, we need to make an index.
5778          */
5779         snew(comm->ddindex2ddnodeid,dd->nnodes);
5780         comm->ddindex2ddnodeid[dd_index(dd->nc,dd->ci)] = dd->rank;
5781         gmx_sumi(dd->nnodes,comm->ddindex2ddnodeid,cr);
5782         /* Get the rank of the DD master;
5783          * above we made sure that the master node is a PP node.
5784          */
5785         if (MASTER(cr))
5786         {
5787             rank = dd->rank;
5788         }
5789         else
5790         {
5791             rank = 0;
5792         }
5793         MPI_Allreduce(&rank,&dd->masterrank,1,MPI_INT,MPI_SUM,dd->mpi_comm_all);
5794     }
5795     else if (comm->bCartesianPP)
5796     {
5797         if (cr->npmenodes == 0)
5798         {
5799             /* The PP communicator is also
5800              * the communicator for this simulation
5801              */
5802             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5803         }
5804         cr->nodeid = dd->rank;
5805         
5806         MPI_Cart_coords(dd->mpi_comm_all,dd->rank,DIM,dd->ci);
5807         
5808         /* We need to make an index to go from the coordinates
5809          * to the nodeid of this simulation.
5810          */
5811         snew(comm->ddindex2simnodeid,dd->nnodes);
5812         snew(buf,dd->nnodes);
5813         if (cr->duty & DUTY_PP)
5814         {
5815             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5816         }
5817         /* Communicate the ddindex to simulation nodeid index */
5818         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5819                       cr->mpi_comm_mysim);
5820         sfree(buf);
5821         
5822         /* Determine the master coordinates and rank.
5823          * The DD master should be the same node as the master of this sim.
5824          */
5825         for(i=0; i<dd->nnodes; i++)
5826         {
5827             if (comm->ddindex2simnodeid[i] == 0)
5828             {
5829                 ddindex2xyz(dd->nc,i,dd->master_ci);
5830                 MPI_Cart_rank(dd->mpi_comm_all,dd->master_ci,&dd->masterrank);
5831             }
5832         }
5833         if (debug)
5834         {
5835             fprintf(debug,"The master rank is %d\n",dd->masterrank);
5836         }
5837     }
5838     else
5839     {
5840         /* No Cartesian communicators */
5841         /* We use the rank in dd->comm->all as DD index */
5842         ddindex2xyz(dd->nc,dd->rank,dd->ci);
5843         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5844         dd->masterrank = 0;
5845         clear_ivec(dd->master_ci);
5846     }
5847 #endif
5848   
5849     if (fplog)
5850     {
5851         fprintf(fplog,
5852                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5853                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5854     }
5855     if (debug)
5856     {
5857         fprintf(debug,
5858                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5859                 dd->rank,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
5860     }
5861 }
5862
5863 static void receive_ddindex2simnodeid(t_commrec *cr)
5864 {
5865     gmx_domdec_t *dd;
5866     
5867     gmx_domdec_comm_t *comm;
5868     int  *buf;
5869     
5870     dd = cr->dd;
5871     comm = dd->comm;
5872     
5873 #ifdef GMX_MPI
5874     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5875     {
5876         snew(comm->ddindex2simnodeid,dd->nnodes);
5877         snew(buf,dd->nnodes);
5878         if (cr->duty & DUTY_PP)
5879         {
5880             buf[dd_index(dd->nc,dd->ci)] = cr->sim_nodeid;
5881         }
5882 #ifdef GMX_MPI
5883         /* Communicate the ddindex to simulation nodeid index */
5884         MPI_Allreduce(buf,comm->ddindex2simnodeid,dd->nnodes,MPI_INT,MPI_SUM,
5885                       cr->mpi_comm_mysim);
5886 #endif
5887         sfree(buf);
5888     }
5889 #endif
5890 }
5891
5892 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5893                                                      int ncg,int natoms)
5894 {
5895     gmx_domdec_master_t *ma;
5896     int i;
5897
5898     snew(ma,1);
5899     
5900     snew(ma->ncg,dd->nnodes);
5901     snew(ma->index,dd->nnodes+1);
5902     snew(ma->cg,ncg);
5903     snew(ma->nat,dd->nnodes);
5904     snew(ma->ibuf,dd->nnodes*2);
5905     snew(ma->cell_x,DIM);
5906     for(i=0; i<DIM; i++)
5907     {
5908         snew(ma->cell_x[i],dd->nc[i]+1);
5909     }
5910
5911     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5912     {
5913         ma->vbuf = NULL;
5914     }
5915     else
5916     {
5917         snew(ma->vbuf,natoms);
5918     }
5919
5920     return ma;
5921 }
5922
5923 static void split_communicator(FILE *fplog,t_commrec *cr,int dd_node_order,
5924                                int reorder)
5925 {
5926     gmx_domdec_t *dd;
5927     gmx_domdec_comm_t *comm;
5928     int  i,rank;
5929     gmx_bool bDiv[DIM];
5930     ivec periods;
5931 #ifdef GMX_MPI
5932     MPI_Comm comm_cart;
5933 #endif
5934     
5935     dd = cr->dd;
5936     comm = dd->comm;
5937     
5938     if (comm->bCartesianPP)
5939     {
5940         for(i=1; i<DIM; i++)
5941         {
5942             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5943         }
5944         if (bDiv[YY] || bDiv[ZZ])
5945         {
5946             comm->bCartesianPP_PME = TRUE;
5947             /* If we have 2D PME decomposition, which is always in x+y,
5948              * we stack the PME only nodes in z.
5949              * Otherwise we choose the direction that provides the thinnest slab
5950              * of PME only nodes as this will have the least effect
5951              * on the PP communication.
5952              * But for the PME communication the opposite might be better.
5953              */
5954             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5955                              !bDiv[YY] ||
5956                              dd->nc[YY] > dd->nc[ZZ]))
5957             {
5958                 comm->cartpmedim = ZZ;
5959             }
5960             else
5961             {
5962                 comm->cartpmedim = YY;
5963             }
5964             comm->ntot[comm->cartpmedim]
5965                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
5966         }
5967         else if (fplog)
5968         {
5969             fprintf(fplog,"#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n",cr->npmenodes,dd->nc[XX],dd->nc[YY],dd->nc[XX],dd->nc[ZZ]);
5970             fprintf(fplog,
5971                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
5972         }
5973     }
5974     
5975 #ifdef GMX_MPI
5976     if (comm->bCartesianPP_PME)
5977     {
5978         if (fplog)
5979         {
5980             fprintf(fplog,"Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n",comm->ntot[XX],comm->ntot[YY],comm->ntot[ZZ]);
5981         }
5982         
5983         for(i=0; i<DIM; i++)
5984         {
5985             periods[i] = TRUE;
5986         }
5987         MPI_Cart_create(cr->mpi_comm_mysim,DIM,comm->ntot,periods,reorder,
5988                         &comm_cart);
5989         
5990         MPI_Comm_rank(comm_cart,&rank);
5991         if (MASTERNODE(cr) && rank != 0)
5992         {
5993             gmx_fatal(FARGS,"MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
5994         }
5995         
5996         /* With this assignment we lose the link to the original communicator,
5997          * which will usually be MPI_COMM_WORLD, unless we have multisim.
5998          */
5999         cr->mpi_comm_mysim = comm_cart;
6000         cr->sim_nodeid = rank;
6001         
6002         MPI_Cart_coords(cr->mpi_comm_mysim,cr->sim_nodeid,DIM,dd->ci);
6003         
6004         if (fplog)
6005         {
6006             fprintf(fplog,"Cartesian nodeid %d, coordinates %d %d %d\n\n",
6007                     cr->sim_nodeid,dd->ci[XX],dd->ci[YY],dd->ci[ZZ]);
6008         }
6009         
6010         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
6011         {
6012             cr->duty = DUTY_PP;
6013         }
6014         if (cr->npmenodes == 0 ||
6015             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
6016         {
6017             cr->duty = DUTY_PME;
6018         }
6019         
6020         /* Split the sim communicator into PP and PME only nodes */
6021         MPI_Comm_split(cr->mpi_comm_mysim,
6022                        cr->duty,
6023                        dd_index(comm->ntot,dd->ci),
6024                        &cr->mpi_comm_mygroup);
6025     }
6026     else
6027     {
6028         switch (dd_node_order)
6029         {
6030         case ddnoPP_PME:
6031             if (fplog)
6032             {
6033                 fprintf(fplog,"Order of the nodes: PP first, PME last\n");
6034             }
6035             break;
6036         case ddnoINTERLEAVE:
6037             /* Interleave the PP-only and PME-only nodes,
6038              * as on clusters with dual-core machines this will double
6039              * the communication bandwidth of the PME processes
6040              * and thus speed up the PP <-> PME and inter PME communication.
6041              */
6042             if (fplog)
6043             {
6044                 fprintf(fplog,"Interleaving PP and PME nodes\n");
6045             }
6046             comm->pmenodes = dd_pmenodes(cr);
6047             break;
6048         case ddnoCARTESIAN:
6049             break;
6050         default:
6051             gmx_fatal(FARGS,"Unknown dd_node_order=%d",dd_node_order);
6052         }
6053     
6054         if (dd_simnode2pmenode(cr,cr->sim_nodeid) == -1)
6055         {
6056             cr->duty = DUTY_PME;
6057         }
6058         else
6059         {
6060             cr->duty = DUTY_PP;
6061         }
6062         
6063         /* Split the sim communicator into PP and PME only nodes */
6064         MPI_Comm_split(cr->mpi_comm_mysim,
6065                        cr->duty,
6066                        cr->nodeid,
6067                        &cr->mpi_comm_mygroup);
6068         MPI_Comm_rank(cr->mpi_comm_mygroup,&cr->nodeid);
6069     }
6070 #endif
6071
6072     if (fplog)
6073     {
6074         fprintf(fplog,"This is a %s only node\n\n",
6075                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6076     }
6077 }
6078
6079 void make_dd_communicators(FILE *fplog,t_commrec *cr,int dd_node_order)
6080 {
6081     gmx_domdec_t *dd;
6082     gmx_domdec_comm_t *comm;
6083     int CartReorder;
6084     
6085     dd = cr->dd;
6086     comm = dd->comm;
6087     
6088     copy_ivec(dd->nc,comm->ntot);
6089     
6090     comm->bCartesianPP = (dd_node_order == ddnoCARTESIAN);
6091     comm->bCartesianPP_PME = FALSE;
6092     
6093     /* Reorder the nodes by default. This might change the MPI ranks.
6094      * Real reordering is only supported on very few architectures;
6095      * Blue Gene is one of them.
6096      */
6097     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6098     
6099     if (cr->npmenodes > 0)
6100     {
6101         /* Split the communicator into a PP and PME part */
6102         split_communicator(fplog,cr,dd_node_order,CartReorder);
6103         if (comm->bCartesianPP_PME)
6104         {
6105             /* We (possibly) reordered the nodes in split_communicator,
6106              * so it is no longer required in make_pp_communicator.
6107              */
6108             CartReorder = FALSE;
6109         }
6110     }
6111     else
6112     {
6113         /* All nodes do PP and PME */
6114 #ifdef GMX_MPI    
6115         /* We do not require separate communicators */
6116         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6117 #endif
6118     }
6119     
6120     if (cr->duty & DUTY_PP)
6121     {
6122         /* Copy or make a new PP communicator */
6123         make_pp_communicator(fplog,cr,CartReorder);
6124     }
6125     else
6126     {
6127         receive_ddindex2simnodeid(cr);
6128     }
6129     
6130     if (!(cr->duty & DUTY_PME))
6131     {
6132         /* Set up the communication to our PME node */
6133         dd->pme_nodeid = dd_simnode2pmenode(cr,cr->sim_nodeid);
6134         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6135         if (debug)
6136         {
6137             fprintf(debug,"My pme_nodeid %d receive ener %d\n",
6138                     dd->pme_nodeid,dd->pme_receive_vir_ener);
6139         }
6140     }
6141     else
6142     {
6143         dd->pme_nodeid = -1;
6144     }
6145
6146     if (DDMASTER(dd))
6147     {
6148         dd->ma = init_gmx_domdec_master_t(dd,
6149                                           comm->cgs_gl.nr,
6150                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6151     }
6152 }
6153
6154 static real *get_slb_frac(FILE *fplog,const char *dir,int nc,const char *size_string)
6155 {
6156     real *slb_frac,tot;
6157     int  i,n;
6158     double dbl;
6159     
6160     slb_frac = NULL;
6161     if (nc > 1 && size_string != NULL)
6162     {
6163         if (fplog)
6164         {
6165             fprintf(fplog,"Using static load balancing for the %s direction\n",
6166                     dir);
6167         }
6168         snew(slb_frac,nc);
6169         tot = 0;
6170         for (i=0; i<nc; i++)
6171         {
6172             dbl = 0;
6173             sscanf(size_string,"%lf%n",&dbl,&n);
6174             if (dbl == 0)
6175             {
6176                 gmx_fatal(FARGS,"Incorrect or not enough DD cell size entries for direction %s: '%s'",dir,size_string);
6177             }
6178             slb_frac[i] = dbl;
6179             size_string += n;
6180             tot += slb_frac[i];
6181         }
6182         /* Normalize */
6183         if (fplog)
6184         {
6185             fprintf(fplog,"Relative cell sizes:");
6186         }
6187         for (i=0; i<nc; i++)
6188         {
6189             slb_frac[i] /= tot;
6190             if (fplog)
6191             {
6192                 fprintf(fplog," %5.3f",slb_frac[i]);
6193             }
6194         }
6195         if (fplog)
6196         {
6197             fprintf(fplog,"\n");
6198         }
6199     }
6200     
6201     return slb_frac;
6202 }
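/* Example: with nc = 3 and a size string of "2 1 1" (as could be given
 * with mdrun -ddcsx for the x direction), the normalization above yields
 * relative cell sizes of 2/4, 1/4 and 1/4, printed as 0.500 0.250 0.250.
 */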
6203
6204 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6205 {
6206     int n,nmol,ftype;
6207     gmx_mtop_ilistloop_t iloop;
6208     t_ilist *il;
6209     
6210     n = 0;
6211     iloop = gmx_mtop_ilistloop_init(mtop);
6212     while (gmx_mtop_ilistloop_next(iloop,&il,&nmol))
6213     {
6214         for(ftype=0; ftype<F_NRE; ftype++)
6215         {
6216             if ((interaction_function[ftype].flags & IF_BOND) &&
6217                 NRAL(ftype) >  2)
6218             {
6219                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6220             }
6221         }
6222     }
6223
6224     return n;
6225 }
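/* Example of the counting: for an angle interaction NRAL(ftype) is 3, so
 * each angle occupies 1 + 3 = 4 entries in il[ftype] (the parameter-type
 * index plus three atom indices); dividing il[ftype].nr by 4 and
 * multiplying by nmol gives the number of such multi-body bondeds
 * contributed by that molecule block.
 */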
6226
6227 static int dd_nst_env(FILE *fplog,const char *env_var,int def)
6228 {
6229     char *val;
6230     int  nst;
6231     
6232     nst = def;
6233     val = getenv(env_var);
6234     if (val)
6235     {
6236         if (sscanf(val,"%d",&nst) <= 0)
6237         {
6238             nst = 1;
6239         }
6240         if (fplog)
6241         {
6242             fprintf(fplog,"Found env.var. %s = %s, using value %d\n",
6243                     env_var,val,nst);
6244         }
6245     }
6246     
6247     return nst;
6248 }
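/* Example: running with the environment variable set, e.g.
 *     GMX_DD_DEBUG=1 mdrun ...
 * makes the sscanf above read 1, so the corresponding setting (here e.g.
 * comm->DD_debug) becomes 1, while an unset variable leaves the
 * compiled-in default (0 for this option).
 */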
6249
6250 static void dd_warning(t_commrec *cr,FILE *fplog,const char *warn_string)
6251 {
6252     if (MASTER(cr))
6253     {
6254         fprintf(stderr,"\n%s\n",warn_string);
6255     }
6256     if (fplog)
6257     {
6258         fprintf(fplog,"\n%s\n",warn_string);
6259     }
6260 }
6261
6262 static void check_dd_restrictions(t_commrec *cr,gmx_domdec_t *dd,
6263                                   t_inputrec *ir,FILE *fplog)
6264 {
6265     if (ir->ePBC == epbcSCREW &&
6266         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6267     {
6268         gmx_fatal(FARGS,"With pbc=%s can only do domain decomposition in the x-direction",epbc_names[ir->ePBC]);
6269     }
6270
6271     if (ir->ns_type == ensSIMPLE)
6272     {
6273         gmx_fatal(FARGS,"Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6274     }
6275
6276     if (ir->nstlist == 0)
6277     {
6278         gmx_fatal(FARGS,"Domain decomposition does not work with nstlist=0");
6279     }
6280
6281     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6282     {
6283         dd_warning(cr,fplog,"comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6284     }
6285 }
6286
6287 static real average_cellsize_min(gmx_domdec_t *dd,gmx_ddbox_t *ddbox)
6288 {
6289     int  di,d;
6290     real r;
6291
6292     r = ddbox->box_size[XX];
6293     for(di=0; di<dd->ndim; di++)
6294     {
6295         d = dd->dim[di];
6296         /* Check using the initial average cell size */
6297         r = min(r,ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6298     }
6299
6300     return r;
6301 }
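/* Worked example with made-up numbers: for a 9 x 6 x 6 nm rectangular box
 * (skew_fac = 1 in all directions) decomposed as 3x2x1, only x and y are
 * decomposed dimensions, with average cell sizes of 3 nm each, so this
 * function returns 3 nm; that value is later checked against
 * comm->cellsize_limit.
 */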
6302
6303 static int check_dlb_support(FILE *fplog,t_commrec *cr,
6304                              const char *dlb_opt,gmx_bool bRecordLoad,
6305                              unsigned long Flags,t_inputrec *ir)
6306 {
6307     gmx_domdec_t *dd;
6308     int  eDLB=-1;
6309     char buf[STRLEN];
6310
6311     switch (dlb_opt[0])
6312     {
6313     case 'a': eDLB = edlbAUTO; break;
6314     case 'n': eDLB = edlbNO;   break;
6315     case 'y': eDLB = edlbYES;  break;
6316     default: gmx_incons("Unknown dlb_opt");
6317     }
6318
6319     if (Flags & MD_RERUN)
6320     {
6321         return edlbNO;
6322     }
6323
6324     if (!EI_DYNAMICS(ir->eI))
6325     {
6326         if (eDLB == edlbYES)
6327         {
6328             sprintf(buf,"NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n",EI(ir->eI));
6329             dd_warning(cr,fplog,buf);
6330         }
6331             
6332         return edlbNO;
6333     }
6334
6335     if (!bRecordLoad)
6336     {
6337         dd_warning(cr,fplog,"NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6338
6339         return edlbNO;
6340     }
6341
6342     if (Flags & MD_REPRODUCIBLE)
6343     {
6344         switch (eDLB)
6345         {
6346         case edlbNO:
6347             break;
6348         case edlbAUTO:
6349             dd_warning(cr,fplog,"NOTE: reproducibility requested, will not use dynamic load balancing\n");
6350             eDLB = edlbNO;
6351             break;
6352         case edlbYES:
6353             dd_warning(cr,fplog,"WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6354             break;
6355         default:
6356             gmx_fatal(FARGS,"Death horror: undefined case (%d) for load balancing choice",eDLB);
6357             break;
6358         }
6359     }
6360
6361     return eDLB;
6362 }
6363
6364 static void set_dd_dim(FILE *fplog,gmx_domdec_t *dd)
6365 {
6366     int dim;
6367
6368     dd->ndim = 0;
6369     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6370     {
6371         /* Decomposition order z,y,x */
6372         if (fplog)
6373         {
6374             fprintf(fplog,"Using domain decomposition order z, y, x\n");
6375         }
6376         for(dim=DIM-1; dim>=0; dim--)
6377         {
6378             if (dd->nc[dim] > 1)
6379             {
6380                 dd->dim[dd->ndim++] = dim;
6381             }
6382         }
6383     }
6384     else
6385     {
6386         /* Decomposition order x,y,z */
6387         for(dim=0; dim<DIM; dim++)
6388         {
6389             if (dd->nc[dim] > 1)
6390             {
6391                 dd->dim[dd->ndim++] = dim;
6392             }
6393         }
6394     }
6395 }
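/* Example: with dd->nc = {4,4,1} the default order gives dd->ndim = 2 and
 * dd->dim = {XX,YY}; setting GMX_DD_ORDER_ZYX in the environment reverses
 * the scan so that dd->dim = {YY,XX}.  Dimensions with a single cell are
 * never entered into dd->dim.
 */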
6396
6397 static gmx_domdec_comm_t *init_dd_comm()
6398 {
6399     gmx_domdec_comm_t *comm;
6400     int  i;
6401
6402     snew(comm,1);
6403     snew(comm->cggl_flag,DIM*2);
6404     snew(comm->cgcm_state,DIM*2);
6405     for(i=0; i<DIM*2; i++)
6406     {
6407         comm->cggl_flag_nalloc[i]  = 0;
6408         comm->cgcm_state_nalloc[i] = 0;
6409     }
6410     
6411     comm->nalloc_int = 0;
6412     comm->buf_int    = NULL;
6413
6414     vec_rvec_init(&comm->vbuf);
6415
6416     comm->n_load_have    = 0;
6417     comm->n_load_collect = 0;
6418
6419     for(i=0; i<ddnatNR-ddnatZONE; i++)
6420     {
6421         comm->sum_nat[i] = 0;
6422     }
6423     comm->ndecomp = 0;
6424     comm->nload   = 0;
6425     comm->load_step = 0;
6426     comm->load_sum  = 0;
6427     comm->load_max  = 0;
6428     clear_ivec(comm->load_lim);
6429     comm->load_mdf  = 0;
6430     comm->load_pme  = 0;
6431
6432     return comm;
6433 }
6434
6435 gmx_domdec_t *init_domain_decomposition(FILE *fplog,t_commrec *cr,
6436                                         unsigned long Flags,
6437                                         ivec nc,
6438                                         real comm_distance_min,real rconstr,
6439                                         const char *dlb_opt,real dlb_scale,
6440                                         const char *sizex,const char *sizey,const char *sizez,
6441                                         gmx_mtop_t *mtop,t_inputrec *ir,
6442                                         matrix box,rvec *x,
6443                                         gmx_ddbox_t *ddbox,
6444                                         int *npme_x,int *npme_y)
6445 {
6446     gmx_domdec_t *dd;
6447     gmx_domdec_comm_t *comm;
6448     int  recload;
6449     int  d,i,j;
6450     real r_2b,r_mb,r_bonded=-1,r_bonded_limit=-1,limit,acs;
6451     gmx_bool bC;
6452     char buf[STRLEN];
6453     
6454     if (fplog)
6455     {
6456         fprintf(fplog,
6457                 "\nInitializing Domain Decomposition on %d nodes\n",cr->nnodes);
6458     }
6459     
6460     snew(dd,1);
6461
6462     dd->comm = init_dd_comm();
6463     comm = dd->comm;
6466
6467     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6468     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6469     
6470     dd->bSendRecv2      = dd_nst_env(fplog,"GMX_DD_SENDRECV2",0);
6471     comm->dlb_scale_lim = dd_nst_env(fplog,"GMX_DLB_MAX",10);
6472     comm->eFlop         = dd_nst_env(fplog,"GMX_DLB_FLOP",0);
6473     recload             = dd_nst_env(fplog,"GMX_DD_LOAD",1);
6474     comm->nstSortCG     = dd_nst_env(fplog,"GMX_DD_SORT",1);
6475     comm->nstDDDump     = dd_nst_env(fplog,"GMX_DD_DUMP",0);
6476     comm->nstDDDumpGrid = dd_nst_env(fplog,"GMX_DD_DUMP_GRID",0);
6477     comm->DD_debug      = dd_nst_env(fplog,"GMX_DD_DEBUG",0);
6478
6479     dd->pme_recv_f_alloc = 0;
6480     dd->pme_recv_f_buf = NULL;
6481
6482     if (dd->bSendRecv2 && fplog)
6483     {
6484         fprintf(fplog,"Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6485     }
6486     if (comm->eFlop)
6487     {
6488         if (fplog)
6489         {
6490             fprintf(fplog,"Will load balance based on FLOP count\n");
6491         }
6492         if (comm->eFlop > 1)
6493         {
6494             srand(1+cr->nodeid);
6495         }
6496         comm->bRecordLoad = TRUE;
6497     }
6498     else
6499     {
6500         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6501                              
6502     }
6503     
6504     comm->eDLB = check_dlb_support(fplog,cr,dlb_opt,comm->bRecordLoad,Flags,ir);
6505     
6506     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6507     if (fplog)
6508     {
6509         fprintf(fplog,"Dynamic load balancing: %s\n",edlb_names[comm->eDLB]);
6510     }
6511     dd->bGridJump = comm->bDynLoadBal;
6512     comm->bPMELoadBalDLBLimits = FALSE;
6513     
6514     if (comm->nstSortCG)
6515     {
6516         if (fplog)
6517         {
6518             if (comm->nstSortCG == 1)
6519             {
6520                 fprintf(fplog,"Will sort the charge groups at every domain (re)decomposition\n");
6521             }
6522             else
6523             {
6524                 fprintf(fplog,"Will sort the charge groups every %d steps\n",
6525                         comm->nstSortCG);
6526             }
6527         }
6528         snew(comm->sort,1);
6529     }
6530     else
6531     {
6532         if (fplog)
6533         {
6534             fprintf(fplog,"Will not sort the charge groups\n");
6535         }
6536     }
6537
6538     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6539     
6540     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6541     if (comm->bInterCGBondeds)
6542     {
6543         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6544     }
6545     else
6546     {
6547         comm->bInterCGMultiBody = FALSE;
6548     }
6549     
6550     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6551     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6552
6553     if (ir->rlistlong == 0)
6554     {
6555         /* Set the cut-off to some very large value,
6556          * so we don't need if statements everywhere in the code.
6557          * We use sqrt, since the cut-off is squared in some places.
6558          */
6559         comm->cutoff   = GMX_CUTOFF_INF;
6560     }
6561     else
6562     {
6563         comm->cutoff   = ir->rlistlong;
6564     }
6565     comm->cutoff_mbody = 0;
6566     
6567     comm->cellsize_limit = 0;
6568     comm->bBondComm = FALSE;
6569
6570     if (comm->bInterCGBondeds)
6571     {
6572         if (comm_distance_min > 0)
6573         {
6574             comm->cutoff_mbody = comm_distance_min;
6575             if (Flags & MD_DDBONDCOMM)
6576             {
6577                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6578             }
6579             else
6580             {
6581                 comm->cutoff = max(comm->cutoff,comm->cutoff_mbody);
6582             }
6583             r_bonded_limit = comm->cutoff_mbody;
6584         }
6585         else if (ir->bPeriodicMols)
6586         {
6587             /* Can not easily determine the required cut-off */
6588             dd_warning(cr,fplog,"NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6589             comm->cutoff_mbody = comm->cutoff/2;
6590             r_bonded_limit = comm->cutoff_mbody;
6591         }
6592         else
6593         {
6594             if (MASTER(cr))
6595             {
6596                 dd_bonded_cg_distance(fplog,dd,mtop,ir,x,box,
6597                                       Flags & MD_DDBONDCHECK,&r_2b,&r_mb);
6598             }
6599             gmx_bcast(sizeof(r_2b),&r_2b,cr);
6600             gmx_bcast(sizeof(r_mb),&r_mb,cr);
6601
6602             /* We use an initial margin of 10% for the minimum cell size,
6603              * except when we are just below the non-bonded cut-off.
6604              */
6605             if (Flags & MD_DDBONDCOMM)
6606             {
6607                 if (max(r_2b,r_mb) > comm->cutoff)
6608                 {
6609                     r_bonded       = max(r_2b,r_mb);
6610                     r_bonded_limit = 1.1*r_bonded;
6611                     comm->bBondComm = TRUE;
6612                 }
6613                 else
6614                 {
6615                     r_bonded       = r_mb;
6616                     r_bonded_limit = min(1.1*r_bonded,comm->cutoff);
6617                 }
6618                 /* We determine cutoff_mbody later */
6619             }
6620             else
6621             {
6622                 /* No special bonded communication,
6623                  * simply increase the DD cut-off.
6624                  */
6625                 r_bonded_limit     = 1.1*max(r_2b,r_mb);
6626                 comm->cutoff_mbody = r_bonded_limit;
6627                 comm->cutoff       = max(comm->cutoff,comm->cutoff_mbody);
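                /* Illustrative example (not from the original code): with
                 * r_2b = 1.2 nm, r_mb = 0.9 nm and a 1.0 nm non-bonded cut-off,
                 * this branch sets cutoff_mbody = r_bonded_limit = 1.1*1.2 = 1.32 nm
                 * and raises the DD cut-off to 1.32 nm.
                 */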
6628             }
6629         }
6630         comm->cellsize_limit = max(comm->cellsize_limit,r_bonded_limit);
6631         if (fplog)
6632         {
6633             fprintf(fplog,
6634                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6635                     comm->cellsize_limit);
6636         }
6637     }
6638
6639     if (dd->bInterCGcons && rconstr <= 0)
6640     {
6641         /* There is a cell size limit due to the constraints (P-LINCS) */
6642         rconstr = constr_r_max(fplog,mtop,ir);
6643         if (fplog)
6644         {
6645             fprintf(fplog,
6646                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6647                     rconstr);
6648             if (rconstr > comm->cellsize_limit)
6649             {
6650                 fprintf(fplog,"This distance will limit the DD cell size, you can override this with -rcon\n");
6651             }
6652         }
6653     }
6654     else if (rconstr > 0 && fplog)
6655     {
6656         /* Here we do not check for dd->bInterCGcons,
6657          * because one can also set a cell size limit for virtual sites only
6658          * and at this point we don't know yet if there are intercg v-sites.
6659          */
6660         fprintf(fplog,
6661                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6662                 rconstr);
6663     }
6664     comm->cellsize_limit = max(comm->cellsize_limit,rconstr);
6665
6666     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6667
6668     if (nc[XX] > 0)
6669     {
6670         copy_ivec(nc,dd->nc);
6671         set_dd_dim(fplog,dd);
6672         set_ddbox_cr(cr,&dd->nc,ir,box,&comm->cgs_gl,x,ddbox);
6673
6674         if (cr->npmenodes == -1)
6675         {
6676             cr->npmenodes = 0;
6677         }
6678         acs = average_cellsize_min(dd,ddbox);
6679         if (acs < comm->cellsize_limit)
6680         {
6681             if (fplog)
6682             {
6683                 fprintf(fplog,"ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n",acs,comm->cellsize_limit);
6684             }
6685             gmx_fatal_collective(FARGS,cr,NULL,
6686                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6687                                  acs,comm->cellsize_limit);
6688         }
6689     }
6690     else
6691     {
6692         set_ddbox_cr(cr,NULL,ir,box,&comm->cgs_gl,x,ddbox);
6693
6694         /* We need to choose the optimal DD grid and possibly PME nodes */
6695         limit = dd_choose_grid(fplog,cr,dd,ir,mtop,box,ddbox,
6696                                comm->eDLB!=edlbNO,dlb_scale,
6697                                comm->cellsize_limit,comm->cutoff,
6698                                comm->bInterCGBondeds,comm->bInterCGMultiBody);
6699         
6700         if (dd->nc[XX] == 0)
6701         {
6702             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6703             sprintf(buf,"Change the number of nodes or mdrun option %s%s%s",
6704                     !bC ? "-rdd" : "-rcon",
6705                     comm->eDLB!=edlbNO ? " or -dds" : "",
6706                     bC ? " or your LINCS settings" : "");
6707
6708             gmx_fatal_collective(FARGS,cr,NULL,
6709                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6710                                  "%s\n"
6711                                  "Look in the log file for details on the domain decomposition",
6712                                  cr->nnodes-cr->npmenodes,limit,buf);
6713         }
6714         set_dd_dim(fplog,dd);
6715     }
6716
6717     if (fplog)
6718     {
6719         fprintf(fplog,
6720                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6721                 dd->nc[XX],dd->nc[YY],dd->nc[ZZ],cr->npmenodes);
6722     }
6723     
6724     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6725     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6726     {
6727         gmx_fatal_collective(FARGS,cr,NULL,
6728                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6729                              dd->nnodes,cr->nnodes - cr->npmenodes,cr->nnodes);
6730     }
6731     if (cr->npmenodes > dd->nnodes)
6732     {
6733         gmx_fatal_collective(FARGS,cr,NULL,
6734                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.",cr->npmenodes,dd->nnodes);
6735     }
6736     if (cr->npmenodes > 0)
6737     {
6738         comm->npmenodes = cr->npmenodes;
6739     }
6740     else
6741     {
6742         comm->npmenodes = dd->nnodes;
6743     }
6744
6745     if (EEL_PME(ir->coulombtype))
6746     {
6747         /* The following choices should match those
6748          * in comm_cost_est in domdec_setup.c.
6749          * Note that here the checks have to take into account
6750          * that the decomposition might occur in a different order than xyz
6751          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6752          * in which case they will not match those in comm_cost_est,
6753          * but since that is mainly for testing purposes that's fine.
6754          */
6755         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6756             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6757             getenv("GMX_PMEONEDD") == NULL)
6758         {
6759             comm->npmedecompdim = 2;
6760             comm->npmenodes_x   = dd->nc[XX];
6761             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
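            /* Illustrative example (not part of the original code): with
             * dim[0] = x, dim[1] = y, 12 separate PME nodes and dd->nc[XX] = 4,
             * this branch selects a 4 x 3 PME grid
             * (npmenodes_x = 4, npmenodes_y = 3).
             */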
6762         }
6763         else
6764         {
6765             /* In case nc is 1 in both x and y we could still choose to
6766              * decompose pme in y instead of x, but we use x for simplicity.
6767              */
6768             comm->npmedecompdim = 1;
6769             if (dd->dim[0] == YY)
6770             {
6771                 comm->npmenodes_x = 1;
6772                 comm->npmenodes_y = comm->npmenodes;
6773             }
6774             else
6775             {
6776                 comm->npmenodes_x = comm->npmenodes;
6777                 comm->npmenodes_y = 1;
6778             }
6779         }    
6780         if (fplog)
6781         {
6782             fprintf(fplog,"PME domain decomposition: %d x %d x %d\n",
6783                     comm->npmenodes_x,comm->npmenodes_y,1);
6784         }
6785     }
6786     else
6787     {
6788         comm->npmedecompdim = 0;
6789         comm->npmenodes_x   = 0;
6790         comm->npmenodes_y   = 0;
6791     }
6792     
6793     /* Technically we don't need both of these,
6794      * but it simplifies the code to not have to recalculate them.
6795      */
6796     *npme_x = comm->npmenodes_x;
6797     *npme_y = comm->npmenodes_y;
6798         
6799     snew(comm->slb_frac,DIM);
6800     if (comm->eDLB == edlbNO)
6801     {
6802         comm->slb_frac[XX] = get_slb_frac(fplog,"x",dd->nc[XX],sizex);
6803         comm->slb_frac[YY] = get_slb_frac(fplog,"y",dd->nc[YY],sizey);
6804         comm->slb_frac[ZZ] = get_slb_frac(fplog,"z",dd->nc[ZZ],sizez);
6805     }
6806
6807     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6808     {
6809         if (comm->bBondComm || comm->eDLB != edlbNO)
6810         {
6811             /* Set the bonded communication distance to halfway
6812              * between the minimum and the maximum,
6813              * since the extra communication cost is nearly zero.
6814              */
6815             acs = average_cellsize_min(dd,ddbox);
6816             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6817             if (comm->eDLB != edlbNO)
6818             {
6819                 /* Check if this does not limit the scaling */
6820                 comm->cutoff_mbody = min(comm->cutoff_mbody,dlb_scale*acs);
6821             }
6822             if (!comm->bBondComm)
6823             {
6824                 /* Without bBondComm do not go beyond the n.b. cut-off */
6825                 comm->cutoff_mbody = min(comm->cutoff_mbody,comm->cutoff);
6826                 if (comm->cellsize_limit >= comm->cutoff)
6827                 {
6828                     /* We don't loose a lot of efficieny
6829                     /* We don't lose much efficiency
6830                      * when increasing it to the n.b. cut-off.
6831                      * It can even be slightly faster, because we need
6832                      * fewer checks for the communication setup.
6833                     comm->cutoff_mbody = comm->cutoff;
6834                 }
6835             }
6836             /* Check if we did not end up below our original limit */
6837             comm->cutoff_mbody = max(comm->cutoff_mbody,r_bonded_limit);
6838
6839             if (comm->cutoff_mbody > comm->cellsize_limit)
6840             {
6841                 comm->cellsize_limit = comm->cutoff_mbody;
6842             }
6843         }
6844         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6845     }
6846
6847     if (debug)
6848     {
6849         fprintf(debug,"Bonded atom communication beyond the cut-off: %d\n"
6850                 "cellsize limit %f\n",
6851                 comm->bBondComm,comm->cellsize_limit);
6852     }
6853     
6854     if (MASTER(cr))
6855     {
6856         check_dd_restrictions(cr,dd,ir,fplog);
6857     }
6858
6859     comm->partition_step = INT_MIN;
6860     dd->ddp_count = 0;
6861
6862     clear_dd_cycle_counts(dd);
6863
6864     return dd;
6865 }
6866
6867 static void set_dlb_limits(gmx_domdec_t *dd)
6868
6869 {
6870     int d;
6871
6872     for(d=0; d<dd->ndim; d++)
6873     {
6874         dd->comm->cd[d].np = dd->comm->cd[d].np_dlb;
6875         dd->comm->cellsize_min[dd->dim[d]] =
6876             dd->comm->cellsize_min_dlb[dd->dim[d]];
6877     }
6878 }
6879
6880
6881 static void turn_on_dlb(FILE *fplog,t_commrec *cr,gmx_large_int_t step)
6882 {
6883     gmx_domdec_t *dd;
6884     gmx_domdec_comm_t *comm;
6885     real cellsize_min;
6886     int  d,nc,i;
6887     char buf[STRLEN];
6888     
6889     dd = cr->dd;
6890     comm = dd->comm;
6891     
6892     if (fplog)
6893     {
6894         fprintf(fplog,"At step %s the performance loss due to force load imbalance is %.1f %%\n",gmx_step_str(step,buf),dd_force_imb_perf_loss(dd)*100);
6895     }
6896
6897     cellsize_min = comm->cellsize_min[dd->dim[0]];
6898     for(d=1; d<dd->ndim; d++)
6899     {
6900         cellsize_min = min(cellsize_min,comm->cellsize_min[dd->dim[d]]);
6901     }
6902
6903     if (cellsize_min < comm->cellsize_limit*1.05)
6904     {
6905         dd_warning(cr,fplog,"NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6906
6907         /* Change DLB from "auto" to "no". */
6908         comm->eDLB = edlbNO;
6909
6910         return;
6911     }
6912
6913     dd_warning(cr,fplog,"NOTE: Turning on dynamic load balancing\n");
6914     comm->bDynLoadBal = TRUE;
6915     dd->bGridJump = TRUE;
6916
6917     set_dlb_limits(dd);
6918
6919     /* We can set the required cell size info here,
6920      * so we do not need to communicate this.
6921      * The grid is completely uniform.
6922      */
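    /* Illustrative example (not from the original code): with nc = 4 cells
     * along a dimension the loop below sets the relative boundaries
     * cell_f[0..4] to 0, 0.25, 0.5, 0.75 and 1.0.
     */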
6923     for(d=0; d<dd->ndim; d++)
6924     {
6925         if (comm->root[d])
6926         {
6927             comm->load[d].sum_m = comm->load[d].sum;
6928
6929             nc = dd->nc[dd->dim[d]];
6930             for(i=0; i<nc; i++)
6931             {
6932                 comm->root[d]->cell_f[i]    = i/(real)nc;
6933                 if (d > 0)
6934                 {
6935                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6936                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6937                 }
6938             }
6939             comm->root[d]->cell_f[nc] = 1.0;
6940         }
6941     }
6942 }
6943
6944 static char *init_bLocalCG(gmx_mtop_t *mtop)
6945 {
6946     int  ncg,cg;
6947     char *bLocalCG;
6948     
6949     ncg = ncg_mtop(mtop);
6950     snew(bLocalCG,ncg);
6951     for(cg=0; cg<ncg; cg++)
6952     {
6953         bLocalCG[cg] = FALSE;
6954     }
6955
6956     return bLocalCG;
6957 }
6958
6959 void dd_init_bondeds(FILE *fplog,
6960                      gmx_domdec_t *dd,gmx_mtop_t *mtop,
6961                      gmx_vsite_t *vsite,gmx_constr_t constr,
6962                      t_inputrec *ir,gmx_bool bBCheck,cginfo_mb_t *cginfo_mb)
6963 {
6964     gmx_domdec_comm_t *comm;
6965     gmx_bool bBondComm;
6966     int  d;
6967
6968     dd_make_reverse_top(fplog,dd,mtop,vsite,constr,ir,bBCheck);
6969
6970     comm = dd->comm;
6971
6972     if (comm->bBondComm)
6973     {
6974         /* Communicate atoms beyond the cut-off for bonded interactions */
6976
6977         comm->cglink = make_charge_group_links(mtop,dd,cginfo_mb);
6978
6979         comm->bLocalCG = init_bLocalCG(mtop);
6980     }
6981     else
6982     {
6983         /* Only communicate atoms based on cut-off */
6984         comm->cglink   = NULL;
6985         comm->bLocalCG = NULL;
6986     }
6987 }
6988
6989 static void print_dd_settings(FILE *fplog,gmx_domdec_t *dd,
6990                               t_inputrec *ir,
6991                               gmx_bool bDynLoadBal,real dlb_scale,
6992                               gmx_ddbox_t *ddbox)
6993 {
6994     gmx_domdec_comm_t *comm;
6995     int  d;
6996     ivec np;
6997     real limit,shrink;
6998     char buf[64];
6999
7000     if (fplog == NULL)
7001     {
7002         return;
7003     }
7004
7005     comm = dd->comm;
7006
7007     if (bDynLoadBal)
7008     {
7009         fprintf(fplog,"The maximum number of communication pulses is:");
7010         for(d=0; d<dd->ndim; d++)
7011         {
7012             fprintf(fplog," %c %d",dim2char(dd->dim[d]),comm->cd[d].np_dlb);
7013         }
7014         fprintf(fplog,"\n");
7015         fprintf(fplog,"The minimum size for domain decomposition cells is %.3f nm\n",comm->cellsize_limit);
7016         fprintf(fplog,"The requested allowed shrink of DD cells (option -dds) is: %.2f\n",dlb_scale);
7017         fprintf(fplog,"The allowed shrink of domain decomposition cells is:");
7018         for(d=0; d<DIM; d++)
7019         {
7020             if (dd->nc[d] > 1)
7021             {
7022                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7023                 {
7024                     shrink = 0;
7025                 }
7026                 else
7027                 {
7028                     shrink =
7029                         comm->cellsize_min_dlb[d]/
7030                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7031                 }
7032                 fprintf(fplog," %c %.2f",dim2char(d),shrink);
7033             }
7034         }
7035         fprintf(fplog,"\n");
7036     }
7037     else
7038     {
7039         set_dd_cell_sizes_slb(dd,ddbox,FALSE,np);
7040         fprintf(fplog,"The initial number of communication pulses is:");
7041         for(d=0; d<dd->ndim; d++)
7042         {
7043             fprintf(fplog," %c %d",dim2char(dd->dim[d]),np[dd->dim[d]]);
7044         }
7045         fprintf(fplog,"\n");
7046         fprintf(fplog,"The initial domain decomposition cell size is:");
7047         for(d=0; d<DIM; d++) {
7048             if (dd->nc[d] > 1)
7049             {
7050                 fprintf(fplog," %c %.2f nm",
7051                         dim2char(d),dd->comm->cellsize_min[d]);
7052             }
7053         }
7054         fprintf(fplog,"\n\n");
7055     }
7056     
7057     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7058     {
7059         fprintf(fplog,"The maximum allowed distance for charge groups involved in interactions is:\n");
7060         fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7061                 "non-bonded interactions","",comm->cutoff);
7062
7063         if (bDynLoadBal)
7064         {
7065             limit = dd->comm->cellsize_limit;
7066         }
7067         else
7068         {
7069             if (dynamic_dd_box(ddbox,ir))
7070             {
7071                 fprintf(fplog,"(the following are initial values, they could change due to box deformation)\n");
7072             }
7073             limit = dd->comm->cellsize_min[XX];
7074             for(d=1; d<DIM; d++)
7075             {
7076                 limit = min(limit,dd->comm->cellsize_min[d]);
7077             }
7078         }
7079
7080         if (comm->bInterCGBondeds)
7081         {
7082             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7083                     "two-body bonded interactions","(-rdd)",
7084                     max(comm->cutoff,comm->cutoff_mbody));
7085             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7086                     "multi-body bonded interactions","(-rdd)",
7087                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff,limit));
7088         }
7089         if (dd->vsite_comm)
7090         {
7091             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7092                     "virtual site constructions","(-rcon)",limit);
7093         }
7094         if (dd->constraint_comm)
7095         {
7096             sprintf(buf,"atoms separated by up to %d constraints",
7097                     1+ir->nProjOrder);
7098             fprintf(fplog,"%40s  %-7s %6.3f nm\n",
7099                     buf,"(-rcon)",limit);
7100         }
7101         fprintf(fplog,"\n");
7102     }
7103     
7104     fflush(fplog);
7105 }
7106
7107 static void set_cell_limits_dlb(gmx_domdec_t *dd,
7108                                 real dlb_scale,
7109                                 const t_inputrec *ir,
7110                                 const gmx_ddbox_t *ddbox)
7111 {
7112     gmx_domdec_comm_t *comm;
7113     int  d,dim,npulse,npulse_d_max,npulse_d;
7114     gmx_bool bNoCutOff;
7115
7116     comm = dd->comm;
7117
7118     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7119
7120     /* Determine the maximum number of comm. pulses in one dimension */
7121         
7122     comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7123         
7124     /* Determine the maximum required number of grid pulses */
7125     if (comm->cellsize_limit >= comm->cutoff)
7126     {
7127         /* Only a single pulse is required */
7128         npulse = 1;
7129     }
7130     else if (!bNoCutOff && comm->cellsize_limit > 0)
7131     {
7132         /* We round down slightly here to avoid overhead due to the latency
7133          * of extra communication calls when the cut-off
7134          * would be only slightly longer than the cell size.
7135          * Later cellsize_limit is redetermined,
7136          * so we cannot miss interactions due to this rounding.
7137          */
7138         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
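        /* Worked example (not part of the original code): a cut-off of 1.0 nm
         * with cellsize_limit = 0.98 nm gives 0.96 + 1.02 = 1.98, so a single
         * pulse is used although the cut-off slightly exceeds the cell size
         * limit; a 1.9 nm cut-off would give npulse = 2.
         */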
7139     }
7140     else
7141     {
7142         /* There is no cell size limit */
7143         npulse = max(dd->nc[XX]-1,max(dd->nc[YY]-1,dd->nc[ZZ]-1));
7144     }
7145
7146     if (!bNoCutOff && npulse > 1)
7147     {
7148         /* See if we can do with less pulses, based on dlb_scale */
7149         npulse_d_max = 0;
7150         for(d=0; d<dd->ndim; d++)
7151         {
7152             dim = dd->dim[d];
7153             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7154                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7155             npulse_d_max = max(npulse_d_max,npulse_d);
7156         }
7157         npulse = min(npulse,npulse_d_max);
7158     }
7159
7160     /* This env var can override npulse */
7161     d = dd_nst_env(debug,"GMX_DD_NPULSE",0);
7162     if (d > 0)
7163     {
7164         npulse = d;
7165     }
7166
7167     comm->maxpulse = 1;
7168     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7169     for(d=0; d<dd->ndim; d++)
7170     {
7171         comm->cd[d].np_dlb = min(npulse,dd->nc[dd->dim[d]]-1);
7172         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7173         snew(comm->cd[d].ind,comm->cd[d].np_nalloc);
7174         comm->maxpulse = max(comm->maxpulse,comm->cd[d].np_dlb);
7175         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7176         {
7177             comm->bVacDLBNoLimit = FALSE;
7178         }
7179     }
7180
7181     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7182     if (!comm->bVacDLBNoLimit)
7183     {
7184         comm->cellsize_limit = max(comm->cellsize_limit,
7185                                    comm->cutoff/comm->maxpulse);
7186     }
7187     comm->cellsize_limit = max(comm->cellsize_limit,comm->cutoff_mbody);
7188     /* Set the minimum cell size for each DD dimension */
7189     for(d=0; d<dd->ndim; d++)
7190     {
7191         if (comm->bVacDLBNoLimit ||
7192             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7193         {
7194             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7195         }
7196         else
7197         {
7198             comm->cellsize_min_dlb[dd->dim[d]] =
7199                 comm->cutoff/comm->cd[d].np_dlb;
7200         }
7201     }
7202     if (comm->cutoff_mbody <= 0)
7203     {
7204         comm->cutoff_mbody = min(comm->cutoff,comm->cellsize_limit);
7205     }
7206     if (comm->bDynLoadBal)
7207     {
7208         set_dlb_limits(dd);
7209     }
7210 }
7211
7212 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd,int ePBC)
7213 {
7214     /* Return TRUE when pbc must be taken into account for the bonded
7215      * interactions: if each molecule is a single charge group, or we use
7216      * domain decomposition along each periodic dimension, pbc is not needed.
7217      */
7218     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7219             !(dd->nc[XX]>1 &&
7220               dd->nc[YY]>1 &&
7221               (dd->nc[ZZ]>1 || ePBC==epbcXY)));
7222 }
7223
7224 void set_dd_parameters(FILE *fplog,gmx_domdec_t *dd,real dlb_scale,
7225                        t_inputrec *ir,t_forcerec *fr,
7226                        gmx_ddbox_t *ddbox)
7227 {
7228     gmx_domdec_comm_t *comm;
7229     int  natoms_tot;
7230     real vol_frac;
7231
7232     comm = dd->comm;
7233
7234     /* Initialize the thread data.
7235      * This cannot be done in init_domain_decomposition,
7236      * as the number of threads is determined later.
7237      */
7238     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7239     if (comm->nth > 1)
7240     {
7241         snew(comm->dth,comm->nth);
7242     }
7243
7244     if (EEL_PME(ir->coulombtype))
7245     {
7246         init_ddpme(dd,&comm->ddpme[0],0);
7247         if (comm->npmedecompdim >= 2)
7248         {
7249             init_ddpme(dd,&comm->ddpme[1],1);
7250         }
7251     }
7252     else
7253     {
7254         comm->npmenodes = 0;
7255         if (dd->pme_nodeid >= 0)
7256         {
7257             gmx_fatal_collective(FARGS,NULL,dd,
7258                                  "Cannot have separate PME nodes without PME electrostatics");
7259         }
7260     }
7261         
7262     if (debug)
7263     {
7264         fprintf(debug,"The DD cut-off is %f\n",comm->cutoff);
7265     }
7266     if (comm->eDLB != edlbNO)
7267     {
7268         set_cell_limits_dlb(dd,dlb_scale,ir,ddbox);
7269     }
7270     
7271     print_dd_settings(fplog,dd,ir,comm->bDynLoadBal,dlb_scale,ddbox);
7272     if (comm->eDLB == edlbAUTO)
7273     {
7274         if (fplog)
7275         {
7276             fprintf(fplog,"When dynamic load balancing gets turned on, these settings will change to:\n");
7277         }
7278         print_dd_settings(fplog,dd,ir,TRUE,dlb_scale,ddbox);
7279     }
7280
7281     if (ir->ePBC == epbcNONE)
7282     {
7283         vol_frac = 1 - 1/(double)dd->nnodes;
7284     }
7285     else
7286     {
7287         vol_frac =
7288             (1 + comm_box_frac(dd->nc,comm->cutoff,ddbox))/(double)dd->nnodes;
7289     }
7290     if (debug)
7291     {
7292         fprintf(debug,"Volume fraction for all DD zones: %f\n",vol_frac);
7293     }
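    /* Note added for clarity (not in the original code): vol_frac estimates
     * the fraction of the system that one node sees, home zone plus
     * communicated zones; below it sets the expected number of entries in
     * the global-to-local atom lookup.
     */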
7294     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7295    
7296     dd->ga2la = ga2la_init(natoms_tot,vol_frac*natoms_tot);
7297 }
7298
7299 static gmx_bool test_dd_cutoff(t_commrec *cr,
7300                                t_state *state,t_inputrec *ir,
7301                                real cutoff_req)
7302 {
7303     gmx_domdec_t *dd;
7304     gmx_ddbox_t ddbox;
7305     int d,dim,np;
7306     real inv_cell_size;
7307     int LocallyLimited;
7308
7309     dd = cr->dd;
7310
7311     set_ddbox(dd,FALSE,cr,ir,state->box,
7312               TRUE,&dd->comm->cgs_gl,state->x,&ddbox);
7313
7314     LocallyLimited = 0;
7315
7316     for(d=0; d<dd->ndim; d++)
7317     {
7318         dim = dd->dim[d];
7319
7320         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7321         if (dynamic_dd_box(&ddbox,ir))
7322         {
7323             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7324         }
7325
7326         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
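        /* Worked example (not part of the original code): ignoring the safety
         * margins, a requested 1.2 nm cut-off with 0.9 nm cells gives
         * np = 1 + (int)(1.33) = 2 pulses along this dimension.
         */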
7327
7328         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7329             dd->comm->cd[d].np_dlb > 0)
7330         {
7331             if (np > dd->comm->cd[d].np_dlb)
7332             {
7333                 return FALSE;
7334             }
7335
7336             /* If a current local cell size is smaller than the requested
7337              * cut-off, we could still fix it, but this gets very complicated.
7338              * Without fixing here, we might actually need more checks.
7339              */
7340             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7341             {
7342                 LocallyLimited = 1;
7343             }
7344         }
7345     }
7346
7347     if (dd->comm->eDLB != edlbNO)
7348     {
7349         /* If DLB is not active yet, we don't need to check the grid jumps.
7350          * Actually we shouldn't, because then the grid jump data is not set.
7351          */
7352         if (dd->comm->bDynLoadBal &&
7353             check_grid_jump(0,dd,cutoff_req,&ddbox,FALSE))
7354         {
7355             LocallyLimited = 1; 
7356         }
7357
7358         gmx_sumi(1,&LocallyLimited,cr);
7359
7360         if (LocallyLimited > 0)
7361         {
7362             return FALSE;
7363         }
7364     }
7365
7366     return TRUE;
7367 }
7368
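/* Checks whether the current DD setup can support a cut-off of cutoff_req
 * and, if so, stores the new cut-off; returns whether it was accepted.
 */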
7369 gmx_bool change_dd_cutoff(t_commrec *cr,t_state *state,t_inputrec *ir,
7370                           real cutoff_req)
7371 {
7372     gmx_bool bCutoffAllowed;
7373
7374     bCutoffAllowed = test_dd_cutoff(cr,state,ir,cutoff_req);
7375
7376     if (bCutoffAllowed)
7377     {
7378         cr->dd->comm->cutoff = cutoff_req;
7379     }
7380
7381     return bCutoffAllowed;
7382 }
7383
7384 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7385 {
7386     gmx_domdec_comm_t *comm;
7387
7388     comm = cr->dd->comm;
7389
7390     /* Turn on the DLB limiting (might have been on already) */
7391     comm->bPMELoadBalDLBLimits = TRUE;
7392
7393     /* Change the cut-off limit */
7394     comm->PMELoadBal_max_cutoff = comm->cutoff;
7395 }
7396
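/* Merges the charge groups received in this pulse (recv_i, recv_vr) into the
 * zone arrays: data from earlier pulses is shifted to make room, and the
 * global indices, positions, cginfo and cgindex ranges are updated.
 */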
7397 static void merge_cg_buffers(int ncell,
7398                              gmx_domdec_comm_dim_t *cd, int pulse,
7399                              int  *ncg_cell,
7400                              int  *index_gl, int  *recv_i,
7401                              rvec *cg_cm,    rvec *recv_vr,
7402                              int *cgindex,
7403                              cginfo_mb_t *cginfo_mb,int *cginfo)
7404 {
7405     gmx_domdec_ind_t *ind,*ind_p;
7406     int p,cell,c,cg,cg0,cg1,cg_gl,nat;
7407     int shift,shift_at;
7408     
7409     ind = &cd->ind[pulse];
7410     
7411     /* First correct the already stored data */
7412     shift = ind->nrecv[ncell];
7413     for(cell=ncell-1; cell>=0; cell--)
7414     {
7415         shift -= ind->nrecv[cell];
7416         if (shift > 0)
7417         {
7418             /* Move the cg's present from previous grid pulses */
7419             cg0 = ncg_cell[ncell+cell];
7420             cg1 = ncg_cell[ncell+cell+1];
7421             cgindex[cg1+shift] = cgindex[cg1];
7422             for(cg=cg1-1; cg>=cg0; cg--)
7423             {
7424                 index_gl[cg+shift] = index_gl[cg];
7425                 copy_rvec(cg_cm[cg],cg_cm[cg+shift]);
7426                 cgindex[cg+shift] = cgindex[cg];
7427                 cginfo[cg+shift] = cginfo[cg];
7428             }
7429             /* Correct the already stored send indices for the shift */
7430             for(p=1; p<=pulse; p++)
7431             {
7432                 ind_p = &cd->ind[p];
7433                 cg0 = 0;
7434                 for(c=0; c<cell; c++)
7435                 {
7436                     cg0 += ind_p->nsend[c];
7437                 }
7438                 cg1 = cg0 + ind_p->nsend[cell];
7439                 for(cg=cg0; cg<cg1; cg++)
7440                 {
7441                     ind_p->index[cg] += shift;
7442                 }
7443             }
7444         }
7445     }
7446
7447     /* Merge in the communicated buffers */
7448     shift = 0;
7449     shift_at = 0;
7450     cg0 = 0;
7451     for(cell=0; cell<ncell; cell++)
7452     {
7453         cg1 = ncg_cell[ncell+cell+1] + shift;
7454         if (shift_at > 0)
7455         {
7456             /* Correct the old cg indices */
7457             for(cg=ncg_cell[ncell+cell]; cg<cg1; cg++)
7458             {
7459                 cgindex[cg+1] += shift_at;
7460             }
7461         }
7462         for(cg=0; cg<ind->nrecv[cell]; cg++)
7463         {
7464             /* Copy this charge group from the buffer */
7465             index_gl[cg1] = recv_i[cg0];
7466             copy_rvec(recv_vr[cg0],cg_cm[cg1]);
7467             /* Add it to the cgindex */
7468             cg_gl = index_gl[cg1];
7469             cginfo[cg1] = ddcginfo(cginfo_mb,cg_gl);
7470             nat = GET_CGINFO_NATOMS(cginfo[cg1]);
7471             cgindex[cg1+1] = cgindex[cg1] + nat;
7472             cg0++;
7473             cg1++;
7474             shift_at += nat;
7475         }
7476         shift += ind->nrecv[cell];
7477         ncg_cell[ncell+cell+1] = cg1;
7478     }
7479 }
7480
7481 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7482                                int nzone,int cg0,const int *cgindex)
7483 {
7484     int cg,zone,p;
7485     
7486     /* Store the atom block boundaries for easy copying of communication buffers
7487      */
7488     cg = cg0;
7489     for(zone=0; zone<nzone; zone++)
7490     {
7491         for(p=0; p<cd->np; p++) {
7492             cd->ind[p].cell2at0[zone] = cgindex[cg];
7493             cg += cd->ind[p].nrecv[zone];
7494             cd->ind[p].cell2at1[zone] = cgindex[cg];
7495         }
7496     }
7497 }
7498
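/* Returns TRUE when the charge group with global index cg_gl is linked to at
 * least one charge group that is not marked as local in bLocalCG.
 */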
7499 static gmx_bool missing_link(t_blocka *link,int cg_gl,char *bLocalCG)
7500 {
7501     int  i;
7502     gmx_bool bMiss;
7503
7504     bMiss = FALSE;
7505     for(i=link->index[cg_gl]; i<link->index[cg_gl+1]; i++)
7506     {
7507         if (!bLocalCG[link->a[i]])
7508         {
7509             bMiss = TRUE;
7510         }
7511     }
7512
7513     return bMiss;
7514 }
7515
7516 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7517 typedef struct {
7518     real c[DIM][4]; /* the corners for the non-bonded communication */
7519     real cr0;       /* corner for rounding */
7520     real cr1[4];    /* corners for rounding */
7521     real bc[DIM];   /* corners for bonded communication */
7522     real bcr1;      /* corner for rounding for bonded communication */
7523 } dd_corners_t;
7524
7525 /* Determine the corners of the domain(s) we are communicating with */
7526 static void
7527 set_dd_corners(const gmx_domdec_t *dd,
7528                int dim0, int dim1, int dim2,
7529                gmx_bool bDistMB,
7530                dd_corners_t *c)
7531 {
7532     const gmx_domdec_comm_t *comm;
7533     const gmx_domdec_zones_t *zones;
7534     int i,j;
7535
7536     comm = dd->comm;
7537
7538     zones = &comm->zones;
7539
7540     /* Keep the compiler happy */
7541     c->cr0  = 0;
7542     c->bcr1 = 0;
7543
7544     /* The first dimension is equal for all cells */
7545     c->c[0][0] = comm->cell_x0[dim0];
7546     if (bDistMB)
7547     {
7548         c->bc[0] = c->c[0][0];
7549     }
7550     if (dd->ndim >= 2)
7551     {
7552         dim1 = dd->dim[1];
7553         /* This cell row is only seen from the first row */
7554         c->c[1][0] = comm->cell_x0[dim1];
7555         /* All rows can see this row */
7556         c->c[1][1] = comm->cell_x0[dim1];
7557         if (dd->bGridJump)
7558         {
7559             c->c[1][1] = max(comm->cell_x0[dim1],comm->zone_d1[1].mch0);
7560             if (bDistMB)
7561             {
7562                 /* For the multi-body distance we need the maximum */
7563                 c->bc[1] = max(comm->cell_x0[dim1],comm->zone_d1[1].p1_0);
7564             }
7565         }
7566         /* Set the upper-right corner for rounding */
7567         c->cr0 = comm->cell_x1[dim0];
7568         
7569         if (dd->ndim >= 3)
7570         {
7571             dim2 = dd->dim[2];
7572             for(j=0; j<4; j++)
7573             {
7574                 c->c[2][j] = comm->cell_x0[dim2];
7575             }
7576             if (dd->bGridJump)
7577             {
7578                 /* Use the maximum of the i-cells that see a j-cell */
7579                 for(i=0; i<zones->nizone; i++)
7580                 {
7581                     for(j=zones->izone[i].j0; j<zones->izone[i].j1; j++)
7582                     {
7583                         if (j >= 4)
7584                         {
7585                             c->c[2][j-4] =
7586                                 max(c->c[2][j-4],
7587                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7588                         }
7589                     }
7590                 }
7591                 if (bDistMB)
7592                 {
7593                     /* For the multi-body distance we need the maximum */
7594                     c->bc[2] = comm->cell_x0[dim2];
7595                     for(i=0; i<2; i++)
7596                     {
7597                         for(j=0; j<2; j++)
7598                         {
7599                             c->bc[2] = max(c->bc[2],comm->zone_d2[i][j].p1_0);
7600                         }
7601                     }
7602                 }
7603             }
7604             
7605             /* Set the upper-right corner for rounding */
7606             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7607              * Only cell (0,0,0) can see cell 7 (1,1,1)
7608              */
7609             c->cr1[0] = comm->cell_x1[dim1];
7610             c->cr1[3] = comm->cell_x1[dim1];
7611             if (dd->bGridJump)
7612             {
7613                 c->cr1[0] = max(comm->cell_x1[dim1],comm->zone_d1[1].mch1);
7614                 if (bDistMB)
7615                 {
7616                     /* For the multi-body distance we need the maximum */
7617                     c->bcr1 = max(comm->cell_x1[dim1],comm->zone_d1[1].p1_1);
7618                 }
7619             }
7620         }
7621     }
7622 }
7623
7624 /* Determine which cg's we need to send in this pulse from this zone */
7625 static void
7626 get_zone_pulse_cgs(gmx_domdec_t *dd,
7627                    int zonei, int zone,
7628                    int cg0, int cg1,
7629                    const int *index_gl,
7630                    const int *cgindex,
7631                    int dim, int dim_ind,
7632                    int dim0, int dim1, int dim2,
7633                    real r_comm2, real r_bcomm2,
7634                    matrix box,
7635                    ivec tric_dist,
7636                    rvec *normal,
7637                    real skew_fac2_d, real skew_fac_01,
7638                    rvec *v_d, rvec *v_0, rvec *v_1,
7639                    const dd_corners_t *c,
7640                    rvec sf2_round,
7641                    gmx_bool bDistBonded,
7642                    gmx_bool bBondComm,
7643                    gmx_bool bDist2B,
7644                    gmx_bool bDistMB,
7645                    rvec *cg_cm,
7646                    int *cginfo,
7647                    gmx_domdec_ind_t *ind,
7648                    int **ibuf, int *ibuf_nalloc,
7649                    vec_rvec_t *vbuf,
7650                    int *nsend_ptr,
7651                    int *nat_ptr,
7652                    int *nsend_z_ptr)
7653 {
7654     gmx_domdec_comm_t *comm;
7655     gmx_bool bScrew;
7656     gmx_bool bDistMB_pulse;
7657     int  cg,i;
7658     real r2,rb2,r,tric_sh;
7659     rvec rn,rb;
7660     int  dimd;
7661     int  nsend_z,nsend,nat;
7662
7663     comm = dd->comm;
7664
7665     bScrew = (dd->bScrewPBC && dim == XX);
7666
7667     bDistMB_pulse = (bDistMB && bDistBonded);
7668
7669     nsend_z = 0;
7670     nsend   = *nsend_ptr;
7671     nat     = *nat_ptr;
7672
7673     for(cg=cg0; cg<cg1; cg++)
7674     {
7675         r2  = 0;
7676         rb2 = 0;
7677         if (tric_dist[dim_ind] == 0)
7678         {
7679             /* Rectangular direction, easy */
7680             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7681             if (r > 0)
7682             {
7683                 r2 += r*r;
7684             }
7685             if (bDistMB_pulse)
7686             {
7687                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7688                 if (r > 0)
7689                 {
7690                     rb2 += r*r;
7691                 }
7692             }
7693             /* Rounding gives at most a 16% reduction
7694              * in communicated atoms
7695              */
7696             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7697             {
7698                 r = cg_cm[cg][dim0] - c->cr0;
7699                 /* This is the first dimension, so always r >= 0 */
7700                 r2 += r*r;
7701                 if (bDistMB_pulse)
7702                 {
7703                     rb2 += r*r;
7704                 }
7705             }
7706             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7707             {
7708                 r = cg_cm[cg][dim1] - c->cr1[zone];
7709                 if (r > 0)
7710                 {
7711                     r2 += r*r;
7712                 }
7713                 if (bDistMB_pulse)
7714                 {
7715                     r = cg_cm[cg][dim1] - c->bcr1;
7716                     if (r > 0)
7717                     {
7718                         rb2 += r*r;
7719                     }
7720                 }
7721             }
7722         }
7723         else
7724         {
7725             /* Triclinic direction, more complicated */
7726             clear_rvec(rn);
7727             clear_rvec(rb);
7728             /* Rounding, conservative as the skew_fac multiplication
7729              * will slightly underestimate the distance.
7730              */
7731             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7732             {
7733                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7734                 for(i=dim0+1; i<DIM; i++)
7735                 {
7736                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7737                 }
7738                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7739                 if (bDistMB_pulse)
7740                 {
7741                     rb[dim0] = rn[dim0];
7742                     rb2 = r2;
7743                 }
7744                 /* Take care that the cell planes along dim0 might not
7745                  * be orthogonal to those along dim1 and dim2.
7746                  */
7747                 for(i=1; i<=dim_ind; i++)
7748                 {
7749                     dimd = dd->dim[i];
7750                     if (normal[dim0][dimd] > 0)
7751                     {
7752                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7753                         if (bDistMB_pulse)
7754                         {
7755                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7756                         }
7757                     }
7758                 }
7759             }
7760             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7761             {
7762                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7763                 tric_sh = 0;
7764                 for(i=dim1+1; i<DIM; i++)
7765                 {
7766                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7767                 }
7768                 rn[dim1] += tric_sh;
7769                 if (rn[dim1] > 0)
7770                 {
7771                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7772                     /* Take care of coupling of the distances
7773                      * to the planes along dim0 and dim1 through dim2.
7774                      */
7775                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7776                     /* Take care that the cell planes along dim1
7777                      * might not be orthogonal to that along dim2.
7778                      */
7779                     if (normal[dim1][dim2] > 0)
7780                     {
7781                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7782                     }
7783                 }
7784                 if (bDistMB_pulse)
7785                 {
7786                     rb[dim1] +=
7787                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7788                     if (rb[dim1] > 0)
7789                     {
7790                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7791                         /* Take care of coupling of the distances
7792                          * to the planes along dim0 and dim1 through dim2.
7793                          */
7794                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7795                         /* Take care that the cell planes along dim1
7796                          * might not be orthogonal to that along dim2.
7797                          */
7798                         if (normal[dim1][dim2] > 0)
7799                         {
7800                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7801                         }
7802                     }
7803                 }
7804             }
7805             /* The distance along the communication direction */
7806             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7807             tric_sh = 0;
7808             for(i=dim+1; i<DIM; i++)
7809             {
7810                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7811             }
7812             rn[dim] += tric_sh;
7813             if (rn[dim] > 0)
7814             {
7815                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7816                 /* Take care of coupling of the distances
7817                  * to the planes along dim0 and dim1 through dim2.
7818                  */
7819                 if (dim_ind == 1 && zonei == 1)
7820                 {
7821                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7822                 }
7823             }
7824             if (bDistMB_pulse)
7825             {
7826                 clear_rvec(rb);
7827                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7828                 if (rb[dim] > 0)
7829                 {
7830                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7831                     /* Take care of coupling of the distances
7832                      * to the planes along dim0 and dim1 through dim2.
7833                      */
7834                     if (dim_ind == 1 && zonei == 1)
7835                     {
7836                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7837                     }
7838                 }
7839             }
7840         }
7841         
7842         if (r2 < r_comm2 ||
7843             (bDistBonded &&
7844              ((bDistMB && rb2 < r_bcomm2) ||
7845               (bDist2B && r2  < r_bcomm2)) &&
7846              (!bBondComm ||
7847               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7848                missing_link(comm->cglink,index_gl[cg],
7849                             comm->bLocalCG)))))
7850         {
7851             /* Make an index to the local charge groups */
7852             if (nsend+1 > ind->nalloc)
7853             {
7854                 ind->nalloc = over_alloc_large(nsend+1);
7855                 srenew(ind->index,ind->nalloc);
7856             }
7857             if (nsend+1 > *ibuf_nalloc)
7858             {
7859                 *ibuf_nalloc = over_alloc_large(nsend+1);
7860                 srenew(*ibuf,*ibuf_nalloc);
7861             }
7862             ind->index[nsend] = cg;
7863             (*ibuf)[nsend] = index_gl[cg];
7864             nsend_z++;
7865             vec_rvec_check_alloc(vbuf,nsend+1);
7866             
7867             if (dd->ci[dim] == 0)
7868             {
7869                 /* Correct cg_cm for pbc */
7870                 rvec_add(cg_cm[cg],box[dim],vbuf->v[nsend]);
7871                 if (bScrew)
7872                 {
7873                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7874                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7875                 }
7876             }
7877             else
7878             {
7879                 copy_rvec(cg_cm[cg],vbuf->v[nsend]);
7880             }
7881             nsend++;
7882             nat += cgindex[cg+1] - cgindex[cg];
7883         }
7884     }
7885
7886     *nsend_ptr   = nsend;
7887     *nat_ptr     = nat;
7888     *nsend_z_ptr = nsend_z;
7889 }
7890
7891 static void setup_dd_communication(gmx_domdec_t *dd,
7892                                    matrix box,gmx_ddbox_t *ddbox,
7893                                    t_forcerec *fr,t_state *state,rvec **f)
7894 {
7895     int dim_ind,dim,dim0,dim1,dim2,dimd,p,nat_tot;
7896     int nzone,nzone_send,zone,zonei,cg0,cg1;
7897     int c,i,j,cg,cg_gl,nrcg;
7898     int *zone_cg_range,pos_cg,*index_gl,*cgindex,*recv_i;
7899     gmx_domdec_comm_t *comm;
7900     gmx_domdec_zones_t *zones;
7901     gmx_domdec_comm_dim_t *cd;
7902     gmx_domdec_ind_t *ind;
7903     cginfo_mb_t *cginfo_mb;
7904     gmx_bool bBondComm,bDist2B,bDistMB,bDistBonded;
7905     real r_mb,r_comm2,r_scomm2,r_bcomm2,r_0,r_1,r2inc,inv_ncg;
7906     dd_corners_t corners;
7907     ivec tric_dist;
7908     rvec *cg_cm,*normal,*v_d,*v_0=NULL,*v_1=NULL,*recv_vr;
7909     real skew_fac2_d,skew_fac_01;
7910     rvec sf2_round;
7911     int  nsend,nat;
7912     int  th;
7913     
7914     if (debug)
7915     {
7916         fprintf(debug,"Setting up DD communication\n");
7917     }
7918     
7919     comm  = dd->comm;
7920
7921     switch (fr->cutoff_scheme)
7922     {
7923     case ecutsGROUP:
7924         cg_cm = fr->cg_cm;
7925         break;
7926     case ecutsVERLET:
7927         cg_cm = state->x;
7928         break;
7929     default:
7930         gmx_incons("unimplemented");
7931         cg_cm = NULL;
7932     }
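    /* Note added for clarity (not in the original code): with the group
     * cut-off scheme cg_cm holds the charge group centers; with the Verlet
     * scheme each charge group is a single atom, so the atom coordinates
     * are used directly.
     */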
7933
7934     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
7935     {
7936         dim = dd->dim[dim_ind];
7937
7938         /* Check if we need to use triclinic distances */
7939         tric_dist[dim_ind] = 0;
7940         for(i=0; i<=dim_ind; i++)
7941         {
7942             if (ddbox->tric_dir[dd->dim[i]])
7943             {
7944                 tric_dist[dim_ind] = 1;
7945             }
7946         }
7947     }
7948
7949     bBondComm = comm->bBondComm;
7950
7951     /* Do we need to determine extra distances for multi-body bondeds? */
7952     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7953     
7954     /* Do we need to determine extra distances for only two-body bondeds? */
7955     bDist2B = (bBondComm && !bDistMB);
7956
7957     r_comm2  = sqr(comm->cutoff);
7958     r_bcomm2 = sqr(comm->cutoff_mbody);
7959
7960     if (debug)
7961     {
7962         fprintf(debug,"bBondComm %d, r_bc %f\n",bBondComm,sqrt(r_bcomm2));
7963     }
7964
7965     zones = &comm->zones;
7966     
7967     dim0 = dd->dim[0];
7968     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
7969     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
7970
7971     set_dd_corners(dd,dim0,dim1,dim2,bDistMB,&corners);
7972     
7973     /* Triclinic stuff */
7974     normal = ddbox->normal;
7975     skew_fac_01 = 0;
7976     if (dd->ndim >= 2)
7977     {
7978         v_0 = ddbox->v[dim0];
7979         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
7980         {
7981             /* Determine the coupling coefficient for the distances
7982              * to the cell planes along dim0 and dim1 through dim2.
7983              * This is required for correct rounding.
7984              */
7985             skew_fac_01 =
7986                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
7987             if (debug)
7988             {
7989                 fprintf(debug,"\nskew_fac_01 %f\n",skew_fac_01);
7990             }
7991         }
7992     }
7993     if (dd->ndim >= 3)
7994     {
7995         v_1 = ddbox->v[dim1];
7996     }
7997     
7998     zone_cg_range = zones->cg_range;
7999     index_gl = dd->index_gl;
8000     cgindex  = dd->cgindex;
8001     cginfo_mb = fr->cginfo_mb;
8002     
8003     zone_cg_range[0]   = 0;
8004     zone_cg_range[1]   = dd->ncg_home;
8005     comm->zone_ncg1[0] = dd->ncg_home;
8006     pos_cg             = dd->ncg_home;
8007     
8008     nat_tot = dd->nat_home;
8009     nzone = 1;
8010     for(dim_ind=0; dim_ind<dd->ndim; dim_ind++)
8011     {
8012         dim = dd->dim[dim_ind];
8013         cd = &comm->cd[dim_ind];
8014         
8015         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8016         {
8017             /* No pbc in this dimension, the first node should not comm. */
8018             nzone_send = 0;
8019         }
8020         else
8021         {
8022             nzone_send = nzone;
8023         }
8024
8025         v_d = ddbox->v[dim];
8026         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8027
8028         cd->bInPlace = TRUE;
8029         for(p=0; p<cd->np; p++)
8030         {
8031             /* Only atoms communicated in the first pulse are used
8032              * for multi-body bonded interactions or for bBondComm.
8033              */
8034             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8035
8036             ind = &cd->ind[p];
8037             nsend = 0;
8038             nat = 0;
8039             for(zone=0; zone<nzone_send; zone++)
8040             {
8041                 if (tric_dist[dim_ind] && dim_ind > 0)
8042                 {
8043                     /* Determine slightly more optimized skew_fac's
8044                      * for rounding.
8045                      * This reduces the number of communicated atoms
8046                      * by about 10% for 3D DD of rhombic dodecahedra.
8047                      */
8048                     for(dimd=0; dimd<dim; dimd++)
8049                     {
8050                         sf2_round[dimd] = 1;
8051                         if (ddbox->tric_dir[dimd])
8052                         {
8053                             for(i=dd->dim[dimd]+1; i<DIM; i++)
8054                             {
8055                                 /* If we are shifted in dimension i
8056                                  * and the cell plane is tilted forward
8057                                  * in dimension i, skip this coupling.
8058                                  */
8059                                 if (!(zones->shift[nzone+zone][i] &&
8060                                       ddbox->v[dimd][i][dimd] >= 0))
8061                                 {
8062                                     sf2_round[dimd] +=
8063                                         sqr(ddbox->v[dimd][i][dimd]);
8064                                 }
8065                             }
8066                             sf2_round[dimd] = 1/sf2_round[dimd];
8067                         }
8068                     }
8069                 }
8070
8071                 zonei = zone_perm[dim_ind][zone];
8072                 if (p == 0)
8073                 {
8074                     /* Here we permute the zones to obtain a convenient order
8075                      * for neighbor searching
8076                      */
8077                     cg0 = zone_cg_range[zonei];
8078                     cg1 = zone_cg_range[zonei+1];
8079                 }
8080                 else
8081                 {
8082                     /* Look only at the cg's received in the previous grid pulse
8083                      */
8084                     cg1 = zone_cg_range[nzone+zone+1];
8085                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8086                 }
8087
8088 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8089                 for(th=0; th<comm->nth; th++)
8090                 {
8091                     gmx_domdec_ind_t *ind_p;
8092                     int **ibuf_p,*ibuf_nalloc_p;
8093                     vec_rvec_t *vbuf_p;
8094                     int *nsend_p,*nat_p;
8095                     int *nsend_zone_p;
8096                     int cg0_th,cg1_th;
8097
8098                     if (th == 0)
8099                     {
8100                         /* Thread 0 writes in the comm buffers */
8101                         ind_p         = ind;
8102                         ibuf_p        = &comm->buf_int;
8103                         ibuf_nalloc_p = &comm->nalloc_int;
8104                         vbuf_p        = &comm->vbuf;
8105                         nsend_p       = &nsend;
8106                         nat_p         = &nat;
8107                         nsend_zone_p  = &ind->nsend[zone];
8108                     }
8109                     else
8110                     {
8111                         /* Other threads write into temp buffers */
8112                         ind_p         = &comm->dth[th].ind;
8113                         ibuf_p        = &comm->dth[th].ibuf;
8114                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8115                         vbuf_p        = &comm->dth[th].vbuf;
8116                         nsend_p       = &comm->dth[th].nsend;
8117                         nat_p         = &comm->dth[th].nat;
8118                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8119
8120                         comm->dth[th].nsend      = 0;
8121                         comm->dth[th].nat        = 0;
8122                         comm->dth[th].nsend_zone = 0;
8123                     }
8124
8125                     if (comm->nth == 1)
8126                     {
8127                         cg0_th = cg0;
8128                         cg1_th = cg1;
8129                     }
8130                     else
8131                     {
8132                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8133                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8134                     }
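                    /* Illustrative example (not from the original code): for
                     * cg0 = 0, cg1 = 10 and comm->nth = 4 the threads get the
                     * ranges [0,2), [2,5), [5,7) and [7,10).
                     */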
8135                     
8136                     /* Get the cg's for this pulse in this zone */
8137                     get_zone_pulse_cgs(dd,zonei,zone,cg0_th,cg1_th,
8138                                        index_gl,cgindex,
8139                                        dim,dim_ind,dim0,dim1,dim2,
8140                                        r_comm2,r_bcomm2,
8141                                        box,tric_dist,
8142                                        normal,skew_fac2_d,skew_fac_01,
8143                                        v_d,v_0,v_1,&corners,sf2_round,
8144                                        bDistBonded,bBondComm,
8145                                        bDist2B,bDistMB,
8146                                        cg_cm,fr->cginfo,
8147                                        ind_p,
8148                                        ibuf_p,ibuf_nalloc_p,
8149                                        vbuf_p,
8150                                        nsend_p,nat_p,
8151                                        nsend_zone_p);
8152                 }
8153
8154                 /* Append data of threads>=1 to the communication buffers */
8155                 for(th=1; th<comm->nth; th++)
8156                 {
8157                     dd_comm_setup_work_t *dth;
8158                     int i,ns1;
8159
8160                     dth = &comm->dth[th];
8161
8162                     ns1 = nsend + dth->nsend_zone;
8163                     if (ns1 > ind->nalloc)
8164                     {
8165                         ind->nalloc = over_alloc_dd(ns1);
8166                         srenew(ind->index,ind->nalloc);
8167                     }
8168                     if (ns1 > comm->nalloc_int)
8169                     {
8170                         comm->nalloc_int = over_alloc_dd(ns1);
8171                         srenew(comm->buf_int,comm->nalloc_int);
8172                     }
8173                     if (ns1 > comm->vbuf.nalloc)
8174                     {
8175                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8176                         srenew(comm->vbuf.v,comm->vbuf.nalloc);
8177                     }
8178
8179                     for(i=0; i<dth->nsend_zone; i++)
8180                     {
8181                         ind->index[nsend] = dth->ind.index[i];
8182                         comm->buf_int[nsend] = dth->ibuf[i];
8183                         copy_rvec(dth->vbuf.v[i],
8184                                   comm->vbuf.v[nsend]);
8185                         nsend++;
8186                     }
8187                     nat              += dth->nat;
8188                     ind->nsend[zone] += dth->nsend_zone;
8189                 }
8190             }
8191             /* Clear the counts in case we do not have pbc */
8192             for(zone=nzone_send; zone<nzone; zone++)
8193             {
8194                 ind->nsend[zone] = 0;
8195             }
8196             ind->nsend[nzone]   = nsend;
8197             ind->nsend[nzone+1] = nat;
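            /* Layout of the count buffer exchanged below: entries
             * 0..nzone-1 hold the number of charge groups to send per
             * zone, entry nzone the total charge group count (nsend)
             * and entry nzone+1 the total atom count (nat).
             */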
8198             /* Communicate the number of cg's and atoms to receive */
8199             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8200                             ind->nsend, nzone+2,
8201                             ind->nrecv, nzone+2);
8202             
8203             /* The rvec buffer is also required for atom buffers of size nsend
8204              * in dd_move_x and dd_move_f.
8205              */
8206             vec_rvec_check_alloc(&comm->vbuf,ind->nsend[nzone+1]);
8207
8208             if (p > 0)
8209             {
8210                 /* We can only receive in place if all zones except the last are empty */
8211                 for(zone=0; zone<nzone-1; zone++)
8212                 {
8213                     if (ind->nrecv[zone] > 0)
8214                     {
8215                         cd->bInPlace = FALSE;
8216                     }
8217                 }
8218                 if (!cd->bInPlace)
8219                 {
8220                     /* The int buffer is only required here for the cg indices */
8221                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8222                     {
8223                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8224                         srenew(comm->buf_int2,comm->nalloc_int2);
8225                     }
8226                     /* The rvec buffer is also required for atom buffers
8227                      * of size nrecv in dd_move_x and dd_move_f.
8228                      */
8229                     i = max(cd->ind[0].nrecv[nzone+1],ind->nrecv[nzone+1]);
8230                     vec_rvec_check_alloc(&comm->vbuf2,i);
8231                 }
8232             }
8233             
8234             /* Make space for the global cg indices */
8235             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8236                 || dd->cg_nalloc == 0)
8237             {
8238                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8239                 srenew(index_gl,dd->cg_nalloc);
8240                 srenew(cgindex,dd->cg_nalloc+1);
8241             }
8242             /* Communicate the global cg indices */
8243             if (cd->bInPlace)
8244             {
8245                 recv_i = index_gl + pos_cg;
8246             }
8247             else
8248             {
8249                 recv_i = comm->buf_int2;
8250             }
8251             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8252                             comm->buf_int, nsend,
8253                             recv_i,        ind->nrecv[nzone]);
8254
8255             /* Make space for cg_cm */
8256             dd_check_alloc_ncg(fr,state,f,pos_cg + ind->nrecv[nzone]);
8257             if (fr->cutoff_scheme == ecutsGROUP)
8258             {
8259                 cg_cm = fr->cg_cm;
8260             }
8261             else
8262             {
8263                 cg_cm = state->x;
8264             }
8265             /* Communicate cg_cm */
8266             if (cd->bInPlace)
8267             {
8268                 recv_vr = cg_cm + pos_cg;
8269             }
8270             else
8271             {
8272                 recv_vr = comm->vbuf2.v;
8273             }
8274             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8275                              comm->vbuf.v, nsend,
8276                              recv_vr,      ind->nrecv[nzone]);
8277             
8278             /* Make the charge group index */
8279             if (cd->bInPlace)
8280             {
8281                 zone = (p == 0 ? 0 : nzone - 1);
8282                 while (zone < nzone)
8283                 {
8284                     for(cg=0; cg<ind->nrecv[zone]; cg++)
8285                     {
8286                         cg_gl = index_gl[pos_cg];
8287                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb,cg_gl);
8288                         nrcg = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8289                         cgindex[pos_cg+1] = cgindex[pos_cg] + nrcg;
8290                         if (bBondComm)
8291                         {
8292                             /* Update the charge group presence,
8293                              * so we can use it in the next pass of the loop.
8294                              */
8295                             comm->bLocalCG[cg_gl] = TRUE;
8296                         }
8297                         pos_cg++;
8298                     }
8299                     if (p == 0)
8300                     {
8301                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8302                     }
8303                     zone++;
8304                     zone_cg_range[nzone+zone] = pos_cg;
8305                 }
8306             }
8307             else
8308             {
8309                 /* This part of the code is never executed with bBondComm. */
8310                 merge_cg_buffers(nzone,cd,p,zone_cg_range,
8311                                  index_gl,recv_i,cg_cm,recv_vr,
8312                                  cgindex,fr->cginfo_mb,fr->cginfo);
8313                 pos_cg += ind->nrecv[nzone];
8314             }
8315             nat_tot += ind->nrecv[nzone+1];
8316         }
8317         if (!cd->bInPlace)
8318         {
8319             /* Store the atom block for easy copying of communication buffers */
8320             make_cell2at_index(cd,nzone,zone_cg_range[nzone],cgindex);
8321         }
8322         nzone += nzone;
8323     }
8324     dd->index_gl = index_gl;
8325     dd->cgindex  = cgindex;
8326     
8327     dd->ncg_tot = zone_cg_range[zones->n];
8328     dd->nat_tot = nat_tot;
8329     comm->nat[ddnatHOME] = dd->nat_home;
8330     for(i=ddnatZONE; i<ddnatNR; i++)
8331     {
8332         comm->nat[i] = dd->nat_tot;
8333     }
8334
8335     if (!bBondComm)
8336     {
8337         /* We don't need to update cginfo, since that was already done above.
8338          * So we pass NULL for the forcerec.
8339          */
8340         dd_set_cginfo(dd->index_gl,dd->ncg_home,dd->ncg_tot,
8341                       NULL,comm->bLocalCG);
8342     }
8343
8344     if (debug)
8345     {
8346         fprintf(debug,"Finished setting up DD communication, zones:");
8347         for(c=0; c<zones->n; c++)
8348         {
8349             fprintf(debug," %d",zones->cg_range[c+1]-zones->cg_range[c]);
8350         }
8351         fprintf(debug,"\n");
8352     }
8353 }
8354
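/* Set the charge group interaction ranges used for neighbor searching:
 * for each i-zone, cg1 marks the end of its own charge group range and
 * jcg0/jcg1 delimit the charge group range of the j-zones it interacts with.
 */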
8355 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8356 {
8357     int c;
8358     
8359     for(c=0; c<zones->nizone; c++)
8360     {
8361         zones->izone[c].cg1  = zones->cg_range[c+1];
8362         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8363         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8364     }
8365 }
8366
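/* Determine the spatial limits x0/x1 and the bounding boxes bb_x0/bb_x1
 * of the zones in the range [zone_start,zone_end), based on the local
 * cell limits, the cut-off distances and, with a staggered (dynamically
 * load balanced) grid, the limits of the neighboring domains.
 */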
8367 static void set_zones_size(gmx_domdec_t *dd,
8368                            matrix box,const gmx_ddbox_t *ddbox,
8369                            int zone_start,int zone_end)
8370 {
8371     gmx_domdec_comm_t *comm;
8372     gmx_domdec_zones_t *zones;
8373     gmx_bool bDistMB;
8374     int  z,zi,zj0,zj1,d,dim;
8375     real rcs,rcmbs;
8376     int  i,j;
8377     real size_j,add_tric;
8378     real vol;
8379
8380     comm = dd->comm;
8381
8382     zones = &comm->zones;
8383
8384     /* Do we need to determine extra distances for multi-body bondeds? */
8385     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8386
8387     for(z=zone_start; z<zone_end; z++)
8388     {
8389         /* Copy cell limits to zone limits.
8390          * Valid for non-DD dims and non-shifted dims.
8391          */
8392         copy_rvec(comm->cell_x0,zones->size[z].x0);
8393         copy_rvec(comm->cell_x1,zones->size[z].x1);
8394     }
8395
8396     for(d=0; d<dd->ndim; d++)
8397     {
8398         dim = dd->dim[d];
8399
8400         for(z=0; z<zones->n; z++)
8401         {
8402             /* With a staggered grid we have different sizes
8403              * for non-shifted dimensions.
8404              */
8405             if (dd->bGridJump && zones->shift[z][dim] == 0)
8406             {
8407                 if (d == 1)
8408                 {
8409                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8410                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8411                 }
8412                 else if (d == 2)
8413                 {
8414                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8415                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8416                 }
8417             }
8418         }
8419
8420         rcs   = comm->cutoff;
8421         rcmbs = comm->cutoff_mbody;
8422         if (ddbox->tric_dir[dim])
8423         {
8424             rcs   /= ddbox->skew_fac[dim];
8425             rcmbs /= ddbox->skew_fac[dim];
8426         }
8427
8428         /* Set the lower limit for the shifted zone dimensions */
8429         for(z=zone_start; z<zone_end; z++)
8430         {
8431             if (zones->shift[z][dim] > 0)
8432             {
8433                 dim = dd->dim[d];
8434                 if (!dd->bGridJump || d == 0)
8435                 {
8436                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8437                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8438                 }
8439                 else
8440                 {
8441                     /* Here we take the lower limit of the zone from
8442                      * the lowest domain of the zone below.
8443                      */
8444                     if (z < 4)
8445                     {
8446                         zones->size[z].x0[dim] =
8447                              comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8448                     }
8449                     else
8450                     {
8451                         if (d == 1)
8452                         {
8453                             zones->size[z].x0[dim] =
8454                                 zones->size[zone_perm[2][z-4]].x0[dim];
8455                         }
8456                         else
8457                         {
8458                             zones->size[z].x0[dim] =
8459                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8460                         }
8461                     }
8462                     /* A temporary limit; it is updated below */
8463                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8464
8465                     if (bDistMB)
8466                     {
8467                         for(zi=0; zi<zones->nizone; zi++)
8468                         {
8469                             if (zones->shift[zi][dim] == 0)
8470                             {
8471                                 /* This takes the whole zone into account.
8472                                  * With multiple pulses this will lead
8473                                  * to a larger zone than strictly necessary.
8474                                  */
8475                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8476                                                              zones->size[zi].x1[dim]+rcmbs);
8477                             }
8478                         }
8479                     }
8480                 }
8481             }
8482         }
8483
8484         /* Loop over the i-zones to set the upper limit of each
8485          * j-zone they see.
8486          */
8487         for(zi=0; zi<zones->nizone; zi++)
8488         {
8489             if (zones->shift[zi][dim] == 0)
8490             {
8491                 for(z=zones->izone[zi].j0; z<zones->izone[zi].j1; z++)
8492                 {
8493                     if (zones->shift[z][dim] > 0)
8494                     {
8495                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8496                                                      zones->size[zi].x1[dim]+rcs);
8497                     }
8498                 }
8499             }
8500         }
8501     }
8502
8503     for(z=zone_start; z<zone_end; z++)
8504     {
8505         /* Initialization only required to keep the compiler happy */
8506         rvec corner_min={0,0,0},corner_max={0,0,0},corner;
8507         int  nc,c;
8508
8509         /* To determine the bounding box for a zone we need to find
8510          * the extremes over its 4, 2 or 1 corners.
8511          */
8512         nc = 1 << (ddbox->npbcdim - 1);
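        /* npbcdim=3 gives nc=4 corners (vary y and z), npbcdim=2 gives
         * nc=2 (vary y only) and npbcdim=1 gives nc=1; the x-extent is
         * not varied here, it is added as an offset to bb_x0/bb_x1 below.
         */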
8513
8514         for(c=0; c<nc; c++)
8515         {
8516             /* Set up a zone corner at x=0, ignoring triclinic couplings */
8517             corner[XX] = 0;
8518             if ((c & 1) == 0)
8519             {
8520                 corner[YY] = zones->size[z].x0[YY];
8521             }
8522             else
8523             {
8524                 corner[YY] = zones->size[z].x1[YY];
8525             }
8526             if ((c & 2) == 0)
8527             {
8528                 corner[ZZ] = zones->size[z].x0[ZZ];
8529             }
8530             else
8531             {
8532                 corner[ZZ] = zones->size[z].x1[ZZ];
8533             }
8534             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8535             {
8536                 /* With 1D domain decomposition the cg's are not in
8537                  * the triclinic box, but in a box triclinic in x-y and rectangular in y-z.
8538                  * Shift y back, so it will later end up at 0.
8539                  */
8540                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8541             }
8542             /* Apply the triclinic couplings */
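            /* With a GROMACS triclinic box (a lower-triangular matrix),
             * a displacement along dimension i shifts every lower
             * dimension j<i by box[i][j]/box[i][i] per unit length,
             * which is what the loop below adds to the corner.
             */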
8543             for(i=YY; i<ddbox->npbcdim; i++)
8544             {
8545                 for(j=XX; j<i; j++)
8546                 {
8547                     corner[j] += corner[i]*box[i][j]/box[i][i];
8548                 }
8549             }
8550             if (c == 0)
8551             {
8552                 copy_rvec(corner,corner_min);
8553                 copy_rvec(corner,corner_max);
8554             }
8555             else
8556             {
8557                 for(i=0; i<DIM; i++)
8558                 {
8559                     corner_min[i] = min(corner_min[i],corner[i]);
8560                     corner_max[i] = max(corner_max[i],corner[i]);
8561                 }
8562             }
8563         }
8564         /* Copy the extreme corners without offset along x */
8565         for(i=0; i<DIM; i++)
8566         {
8567             zones->size[z].bb_x0[i] = corner_min[i];
8568             zones->size[z].bb_x1[i] = corner_max[i];
8569         }
8570         /* Add the offset along x */
8571         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8572         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8573     }
8574
8575     if (zone_start == 0)
8576     {
8577         vol = 1;
8578         for(dim=0; dim<DIM; dim++)
8579         {
8580             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8581         }
8582         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8583     }
8584
8585     if (debug)
8586     {
8587         for(z=zone_start; z<zone_end; z++)
8588         {
8589             fprintf(debug,"zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8590                     z,
8591                     zones->size[z].x0[XX],zones->size[z].x1[XX],
8592                     zones->size[z].x0[YY],zones->size[z].x1[YY],
8593                     zones->size[z].x0[ZZ],zones->size[z].x1[ZZ]);
8594             fprintf(debug,"zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8595                     z,
8596                     zones->size[z].bb_x0[XX],zones->size[z].bb_x1[XX],
8597                     zones->size[z].bb_x0[YY],zones->size[z].bb_x1[YY],
8598                     zones->size[z].bb_x0[ZZ],zones->size[z].bb_x1[ZZ]);
8599         }
8600     }
8601 }
8602
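/* qsort comparator for charge groups: the primary key is the ns grid
 * cell index (nsc), ties are broken on the global topology index
 * (ind_gl), so the resulting order is unique.
 */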
8603 static int comp_cgsort(const void *a,const void *b)
8604 {
8605     int comp;
8606     
8607     gmx_cgsort_t *cga,*cgb;
8608     cga = (gmx_cgsort_t *)a;
8609     cgb = (gmx_cgsort_t *)b;
8610     
8611     comp = cga->nsc - cgb->nsc;
8612     if (comp == 0)
8613     {
8614         comp = cga->ind_gl - cgb->ind_gl;
8615     }
8616     
8617     return comp;
8618 }
8619
8620 static void order_int_cg(int n,const gmx_cgsort_t *sort,
8621                          int *a,int *buf)
8622 {
8623     int i;
8624     
8625     /* Order the data */
8626     for(i=0; i<n; i++)
8627     {
8628         buf[i] = a[sort[i].ind];
8629     }
8630     
8631     /* Copy back to the original array */
8632     for(i=0; i<n; i++)
8633     {
8634         a[i] = buf[i];
8635     }
8636 }
8637
8638 static void order_vec_cg(int n,const gmx_cgsort_t *sort,
8639                          rvec *v,rvec *buf)
8640 {
8641     int i;
8642     
8643     /* Order the data */
8644     for(i=0; i<n; i++)
8645     {
8646         copy_rvec(v[sort[i].ind],buf[i]);
8647     }
8648     
8649     /* Copy back to the original array */
8650     for(i=0; i<n; i++)
8651     {
8652         copy_rvec(buf[i],v[i]);
8653     }
8654 }
8655
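/* Apply the charge group permutation in sort to the per-atom vector v,
 * using buf as scratch space; cgindex==NULL signals one atom per charge
 * group, in which case the simpler per-cg reordering is used.
 */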
8656 static void order_vec_atom(int ncg,const int *cgindex,const gmx_cgsort_t *sort,
8657                            rvec *v,rvec *buf)
8658 {
8659     int a,atot,cg,cg0,cg1,i;
8660     
8661     if (cgindex == NULL)
8662     {
8663         /* Avoid the useless loop over the atoms within a cg */
8664         order_vec_cg(ncg,sort,v,buf);
8665
8666         return;
8667     }
8668
8669     /* Order the data */
8670     a = 0;
8671     for(cg=0; cg<ncg; cg++)
8672     {
8673         cg0 = cgindex[sort[cg].ind];
8674         cg1 = cgindex[sort[cg].ind+1];
8675         for(i=cg0; i<cg1; i++)
8676         {
8677             copy_rvec(v[i],buf[a]);
8678             a++;
8679         }
8680     }
8681     atot = a;
8682     
8683     /* Copy back to the original array */
8684     for(a=0; a<atot; a++)
8685     {
8686         copy_rvec(buf[a],v[a]);
8687     }
8688 }
8689
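/* Merge sort: sort_new (the charge groups that moved) is qsorted first
 * and then merged with the already ordered sort2 (the stationary charge
 * groups) into sort1, which needs room for nsort2+nsort_new entries.
 */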
8690 static void ordered_sort(int nsort2,gmx_cgsort_t *sort2,
8691                          int nsort_new,gmx_cgsort_t *sort_new,
8692                          gmx_cgsort_t *sort1)
8693 {
8694     int i1,i2,i_new;
8695     
8696     /* The new indices are not very ordered, so we qsort them */
8697     qsort_threadsafe(sort_new,nsort_new,sizeof(sort_new[0]),comp_cgsort);
8698     
8699     /* sort2 is already ordered, so now we can merge the two arrays */
8700     i1 = 0;
8701     i2 = 0;
8702     i_new = 0;
8703     while(i2 < nsort2 || i_new < nsort_new)
8704     {
8705         if (i2 == nsort2)
8706         {
8707             sort1[i1++] = sort_new[i_new++];
8708         }
8709         else if (i_new == nsort_new)
8710         {
8711             sort1[i1++] = sort2[i2++];
8712         }
8713         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8714                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8715                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8716         {
8717             sort1[i1++] = sort2[i2++];
8718         }
8719         else
8720         {
8721             sort1[i1++] = sort_new[i_new++];
8722         }
8723     }
8724 }
8725
8726 static int dd_sort_order(gmx_domdec_t *dd,t_forcerec *fr,int ncg_home_old)
8727 {
8728     gmx_domdec_sort_t *sort;
8729     gmx_cgsort_t *cgsort,*sort_i;
8730     int  ncg_new,nsort2,nsort_new,i,*a,moved,*ibuf;
8731     int  sort_last,sort_skip;
8732
8733     sort = dd->comm->sort;
8734
8735     a = fr->ns.grid->cell_index;
8736
8737     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8738
8739     if (ncg_home_old >= 0)
8740     {
8741         /* The charge groups that remained in the same ns grid cell
8742          * are completely ordered. So we can sort efficiently by sorting only
8743          * the charge groups that moved and merging them with the stationary list.
8744          */
8745         ncg_new = 0;
8746         nsort2 = 0;
8747         nsort_new = 0;
8748         for(i=0; i<dd->ncg_home; i++)
8749         {
8750             /* Check if this cg did not move to another node */
8751             if (a[i] < moved)
8752             {
8753                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8754                 {
8755                     /* This cg is new on this node or moved to another ns grid cell */
8756                     if (nsort_new >= sort->sort_new_nalloc)
8757                     {
8758                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8759                         srenew(sort->sort_new,sort->sort_new_nalloc);
8760                     }
8761                     sort_i = &(sort->sort_new[nsort_new++]);
8762                 }
8763                 else
8764                 {
8765                     /* This cg did not move */
8766                     sort_i = &(sort->sort2[nsort2++]);
8767                 }
8768                 /* Sort on the ns grid cell indices
8769                  * and the global topology index.
8770                  * index_gl is irrelevant with cell ns,
8771                  * but we set it here anyhow to avoid a conditional.
8772                  */
8773                 sort_i->nsc    = a[i];
8774                 sort_i->ind_gl = dd->index_gl[i];
8775                 sort_i->ind    = i;
8776                 ncg_new++;
8777             }
8778         }
8779         if (debug)
8780         {
8781             fprintf(debug,"ordered sort cgs: stationary %d moved %d\n",
8782                     nsort2,nsort_new);
8783         }
8784         /* Sort efficiently */
8785         ordered_sort(nsort2,sort->sort2,nsort_new,sort->sort_new,
8786                      sort->sort);
8787     }
8788     else
8789     {
8790         cgsort = sort->sort;
8791         ncg_new = 0;
8792         for(i=0; i<dd->ncg_home; i++)
8793         {
8794             /* Sort on the ns grid cell indices
8795              * and the global topology index
8796              */
8797             cgsort[i].nsc    = a[i];
8798             cgsort[i].ind_gl = dd->index_gl[i];
8799             cgsort[i].ind    = i;
8800             if (cgsort[i].nsc < moved)
8801             {
8802                 ncg_new++;
8803             }
8804         }
8805         if (debug)
8806         {
8807             fprintf(debug,"qsort cgs: %d new home %d\n",dd->ncg_home,ncg_new);
8808         }
8809         /* Determine the order of the charge groups using qsort */
8810         qsort_threadsafe(cgsort,dd->ncg_home,sizeof(cgsort[0]),comp_cgsort);
8811     }
8812
8813     return ncg_new;
8814 }
8815
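/* Determine the home charge group order from the nbnxn grid atom order:
 * negative entries returned by nbnxn_get_atomorder are skipped, the
 * non-negative entries give the sort order and their count is returned
 * as the new home charge group count.
 */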
8816 static int dd_sort_order_nbnxn(gmx_domdec_t *dd,t_forcerec *fr)
8817 {
8818     gmx_cgsort_t *sort;
8819     int  ncg_new,i,*a,na;
8820
8821     sort = dd->comm->sort->sort;
8822
8823     nbnxn_get_atomorder(fr->nbv->nbs,&a,&na);
8824
8825     ncg_new = 0;
8826     for(i=0; i<na; i++)
8827     {
8828         if (a[i] >= 0)
8829         {
8830             sort[ncg_new].ind = a[i];
8831             ncg_new++;
8832         }
8833     }
8834
8835     return ncg_new;
8836 }
8837
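/* Sort the home charge groups and the local state in ns/nbnxn grid order.
 * ncg_home_old >= 0 enables the incremental sort for the group scheme;
 * charge groups that moved to another node are dropped here, which also
 * sets the new home charge group and atom counts.
 */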
8838 static void dd_sort_state(gmx_domdec_t *dd,int ePBC,
8839                           rvec *cgcm,t_forcerec *fr,t_state *state,
8840                           int ncg_home_old)
8841 {
8842     gmx_domdec_sort_t *sort;
8843     gmx_cgsort_t *cgsort,*sort_i;
8844     int  *cgindex;
8845     int  ncg_new,i,*ibuf,cgsize;
8846     rvec *vbuf;
8847     
8848     sort = dd->comm->sort;
8849     
8850     if (dd->ncg_home > sort->sort_nalloc)
8851     {
8852         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8853         srenew(sort->sort,sort->sort_nalloc);
8854         srenew(sort->sort2,sort->sort_nalloc);
8855     }
8856     cgsort = sort->sort;
8857
8858     switch (fr->cutoff_scheme)
8859     {
8860     case ecutsGROUP:
8861         ncg_new = dd_sort_order(dd,fr,ncg_home_old);
8862         break;
8863     case ecutsVERLET:
8864         ncg_new = dd_sort_order_nbnxn(dd,fr);
8865         break;
8866     default:
8867         gmx_incons("unimplemented");
8868         ncg_new = 0;
8869     }
8870
8871     /* We alloc with the old size, since cgindex is still old */
8872     vec_rvec_check_alloc(&dd->comm->vbuf,dd->cgindex[dd->ncg_home]);
8873     vbuf = dd->comm->vbuf.v;
8874     
8875     if (dd->comm->bCGs)
8876     {
8877         cgindex = dd->cgindex;
8878     }
8879     else
8880     {
8881         cgindex = NULL;
8882     }
8883
8884     /* Remove the charge groups which are no longer at home here */
8885     dd->ncg_home = ncg_new;
8886     if (debug)
8887     {
8888         fprintf(debug,"Set the new home charge group count to %d\n",
8889                 dd->ncg_home);
8890     }
8891     
8892     /* Reorder the state */
8893     for(i=0; i<estNR; i++)
8894     {
8895         if (EST_DISTR(i) && (state->flags & (1<<i)))
8896         {
8897             switch (i)
8898             {
8899             case estX:
8900                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->x,vbuf);
8901                 break;
8902             case estV:
8903                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->v,vbuf);
8904                 break;
8905             case estSDX:
8906                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->sd_X,vbuf);
8907                 break;
8908             case estCGP:
8909                 order_vec_atom(dd->ncg_home,cgindex,cgsort,state->cg_p,vbuf);
8910                 break;
8911             case estLD_RNG:
8912             case estLD_RNGI:
8913             case estDISRE_INITF:
8914             case estDISRE_RM3TAV:
8915             case estORIRE_INITF:
8916             case estORIRE_DTAV:
8917                 /* No ordering required */
8918                 break;
8919             default:
8920                 gmx_incons("Unknown state entry encountered in dd_sort_state");
8921                 break;
8922             }
8923         }
8924     }
8925     if (fr->cutoff_scheme == ecutsGROUP)
8926     {
8927         /* Reorder cgcm */
8928         order_vec_cg(dd->ncg_home,cgsort,cgcm,vbuf);
8929     }
8930     
8931     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8932     {
8933         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8934         srenew(sort->ibuf,sort->ibuf_nalloc);
8935     }
8936     ibuf = sort->ibuf;
8937     /* Reorder the global cg index */
8938     order_int_cg(dd->ncg_home,cgsort,dd->index_gl,ibuf);
8939     /* Reorder the cginfo */
8940     order_int_cg(dd->ncg_home,cgsort,fr->cginfo,ibuf);
8941     /* Rebuild the local cg index */
8942     if (dd->comm->bCGs)
8943     {
8944         ibuf[0] = 0;
8945         for(i=0; i<dd->ncg_home; i++)
8946         {
8947             cgsize = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8948             ibuf[i+1] = ibuf[i] + cgsize;
8949         }
8950         for(i=0; i<dd->ncg_home+1; i++)
8951         {
8952             dd->cgindex[i] = ibuf[i];
8953         }
8954     }
8955     else
8956     {
8957         for(i=0; i<dd->ncg_home+1; i++)
8958         {
8959             dd->cgindex[i] = i;
8960         }
8961     }
8962     /* Set the home atom number */
8963     dd->nat_home = dd->cgindex[dd->ncg_home];
8964
8965     if (fr->cutoff_scheme == ecutsVERLET)
8966     {
8967         /* The atoms are now exactly in grid order; update the nbnxn grid atom order */
8968         nbnxn_set_atomorder(fr->nbv->nbs);
8969     }
8970     else
8971     {
8972         /* Copy the sorted ns cell indices back to the ns grid struct */
8973         for(i=0; i<dd->ncg_home; i++)
8974         {
8975             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
8976         }
8977         fr->ns.grid->nr = dd->ncg_home;
8978     }
8979 }
8980
8981 static void add_dd_statistics(gmx_domdec_t *dd)
8982 {
8983     gmx_domdec_comm_t *comm;
8984     int ddnat;
8985     
8986     comm = dd->comm;
8987     
8988     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
8989     {
8990         comm->sum_nat[ddnat-ddnatZONE] +=
8991             comm->nat[ddnat] - comm->nat[ddnat-1];
8992     }
8993     comm->ndecomp++;
8994 }
8995
8996 void reset_dd_statistics_counters(gmx_domdec_t *dd)
8997 {
8998     gmx_domdec_comm_t *comm;
8999     int ddnat;
9000     
9001     comm = dd->comm;
9002
9003     /* Reset all the statistics and counters for total run counting */
9004     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
9005     {
9006         comm->sum_nat[ddnat-ddnatZONE] = 0;
9007     }
9008     comm->ndecomp = 0;
9009     comm->nload = 0;
9010     comm->load_step = 0;
9011     comm->load_sum = 0;
9012     comm->load_max = 0;
9013     clear_ivec(comm->load_lim);
9014     comm->load_mdf = 0;
9015     comm->load_pme = 0;
9016 }
9017
9018 void print_dd_statistics(t_commrec *cr,t_inputrec *ir,FILE *fplog)
9019 {
9020     gmx_domdec_comm_t *comm;
9021     int ddnat;
9022     double av;
9023    
9024     comm = cr->dd->comm;
9025     
9026     gmx_sumd(ddnatNR-ddnatZONE,comm->sum_nat,cr);
9027     
9028     if (fplog == NULL)
9029     {
9030         return;
9031     }
9032     
9033     fprintf(fplog,"\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9034             
9035     for(ddnat=ddnatZONE; ddnat<ddnatNR; ddnat++)
9036     {
9037         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9038         switch(ddnat)
9039         {
9040         case ddnatZONE:
9041             fprintf(fplog,
9042                     " av. #atoms communicated per step for force:  %d x %.1f\n",
9043                     2,av);
9044             break;
9045         case ddnatVSITE:
9046             if (cr->dd->vsite_comm)
9047             {
9048                 fprintf(fplog,
9049                         " av. #atoms communicated per step for vsites: %d x %.1f\n",
9050                         (EEL_PME(ir->coulombtype) || ir->coulombtype==eelEWALD) ? 3 : 2,
9051                         av);
9052             }
9053             break;
9054         case ddnatCON:
9055             if (cr->dd->constraint_comm)
9056             {
9057                 fprintf(fplog,
9058                         " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9059                         1 + ir->nLincsIter,av);
9060             }
9061             break;
9062         default:
9063             gmx_incons(" Unknown type for DD statistics");
9064         }
9065     }
9066     fprintf(fplog,"\n");
9067     
9068     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9069     {
9070         print_dd_load_av(fplog,cr->dd);
9071     }
9072 }
9073
9074 void dd_partition_system(FILE            *fplog,
9075                          gmx_large_int_t      step,
9076                          t_commrec       *cr,
9077                          gmx_bool            bMasterState,
9078                          int             nstglobalcomm,
9079                          t_state         *state_global,
9080                          gmx_mtop_t      *top_global,
9081                          t_inputrec      *ir,
9082                          t_state         *state_local,
9083                          rvec            **f,
9084                          t_mdatoms       *mdatoms,
9085                          gmx_localtop_t  *top_local,
9086                          t_forcerec      *fr,
9087                          gmx_vsite_t     *vsite,
9088                          gmx_shellfc_t   shellfc,
9089                          gmx_constr_t    constr,
9090                          t_nrnb          *nrnb,
9091                          gmx_wallcycle_t wcycle,
9092                          gmx_bool            bVerbose)
9093 {
9094     gmx_domdec_t *dd;
9095     gmx_domdec_comm_t *comm;
9096     gmx_ddbox_t ddbox={0};
9097     t_block *cgs_gl;
9098     gmx_large_int_t step_pcoupl;
9099     rvec cell_ns_x0,cell_ns_x1;
9100     int  i,j,n,cg0=0,ncg_home_old=-1,ncg_moved,nat_f_novirsum;
9101     gmx_bool bBoxChanged,bNStGlobalComm,bDoDLB,bCheckDLB,bTurnOnDLB,bLogLoad;
9102     gmx_bool bRedist,bSortCG,bResortAll;
9103     ivec ncells_old={0,0,0},ncells_new={0,0,0},np;
9104     real grid_density;
9105     char sbuf[22];
9106         
9107     dd = cr->dd;
9108     comm = dd->comm;
9109
9110     bBoxChanged = (bMasterState || DEFORM(*ir));
9111     if (ir->epc != epcNO)
9112     {
9113         /* With nstpcouple > 1, pressure coupling happens
9114          * one step after calculating the pressure.
9115          * Box scaling happens at the end of the MD step,
9116          * after the DD partitioning.
9117          * We therefore have to do DLB in the first partitioning
9118          * after an MD step where P-coupling occurred.
9119          * We need to determine the last step in which p-coupling occurred.
9120          * MRS -- need to validate this for vv?
9121          */
9122         n = ir->nstpcouple;
9123         if (n == 1)
9124         {
9125             step_pcoupl = step - 1;
9126         }
9127         else
9128         {
9129             step_pcoupl = ((step - 1)/n)*n + 1;
9130         }
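        /* Example of the formula above: with n = 4 and step = 10 we get
         * step_pcoupl = ((10-1)/4)*4 + 1 = 9, the last step of the form
         * k*n+1 at or before the current step.
         */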
9131         if (step_pcoupl >= comm->partition_step)
9132         {
9133             bBoxChanged = TRUE;
9134         }
9135     }
9136
9137     bNStGlobalComm = (step % nstglobalcomm == 0);
9138
9139     if (!comm->bDynLoadBal)
9140     {
9141         bDoDLB = FALSE;
9142     }
9143     else
9144     {
9145         /* Should we do dynamic load balancing this step?
9146          * Since it requires (possibly expensive) global communication,
9147          * we might want to do DLB less frequently.
9148          */
9149         if (bBoxChanged || ir->epc != epcNO)
9150         {
9151             bDoDLB = bBoxChanged;
9152         }
9153         else
9154         {
9155             bDoDLB = bNStGlobalComm;
9156         }
9157     }
9158
9159     /* Check if we have recorded loads on the nodes */
9160     if (comm->bRecordLoad && dd_load_count(comm))
9161     {
9162         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9163         {
9164             /* Check if we should use DLB at the second partitioning
9165              * and every 100 partitionings,
9166              * so the extra communication cost is negligible.
9167              */
9168             n = max(100,nstglobalcomm);
9169             bCheckDLB = (comm->n_load_collect == 0 ||
9170                          comm->n_load_have % n == n-1);
9171         }
9172         else
9173         {
9174             bCheckDLB = FALSE;
9175         }
9176         
9177         /* Print load every nstlog, first and last step to the log file */
9178         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9179                     comm->n_load_collect == 0 ||
9180                     (ir->nsteps >= 0 &&
9181                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9182
9183         /* Avoid extra communication due to verbose screen output
9184          * when nstglobalcomm is set.
9185          */
9186         if (bDoDLB || bLogLoad || bCheckDLB ||
9187             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9188         {
9189             get_load_distribution(dd,wcycle);
9190             if (DDMASTER(dd))
9191             {
9192                 if (bLogLoad)
9193                 {
9194                     dd_print_load(fplog,dd,step-1);
9195                 }
9196                 if (bVerbose)
9197                 {
9198                     dd_print_load_verbose(dd);
9199                 }
9200             }
9201             comm->n_load_collect++;
9202
9203             if (bCheckDLB) {
9204                 /* Since the timings are node dependent, the master decides */
9205                 if (DDMASTER(dd))
9206                 {
9207                     bTurnOnDLB =
9208                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9209                     if (debug)
9210                     {
9211                         fprintf(debug,"step %s, imb loss %f\n",
9212                                 gmx_step_str(step,sbuf),
9213                                 dd_force_imb_perf_loss(dd));
9214                     }
9215                 }
9216                 dd_bcast(dd,sizeof(bTurnOnDLB),&bTurnOnDLB);
9217                 if (bTurnOnDLB)
9218                 {
9219                     turn_on_dlb(fplog,cr,step);
9220                     bDoDLB = TRUE;
9221                 }
9222             }
9223         }
9224         comm->n_load_have++;
9225     }
9226
9227     cgs_gl = &comm->cgs_gl;
9228
9229     bRedist = FALSE;
9230     if (bMasterState)
9231     {
9232         /* Clear the old state */
9233         clear_dd_indices(dd,0,0);
9234
9235         set_ddbox(dd,bMasterState,cr,ir,state_global->box,
9236                   TRUE,cgs_gl,state_global->x,&ddbox);
9237     
9238         get_cg_distribution(fplog,step,dd,cgs_gl,
9239                             state_global->box,&ddbox,state_global->x);
9240         
9241         dd_distribute_state(dd,cgs_gl,
9242                             state_global,state_local,f);
9243         
9244         dd_make_local_cgs(dd,&top_local->cgs);
9245         
9246         /* Ensure that we have space for the new distribution */
9247         dd_check_alloc_ncg(fr,state_local,f,dd->ncg_home);
9248
9249         if (fr->cutoff_scheme == ecutsGROUP)
9250         {
9251             calc_cgcm(fplog,0,dd->ncg_home,
9252                       &top_local->cgs,state_local->x,fr->cg_cm);
9253         }
9254         
9255         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9256         
9257         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9258
9259         cg0 = 0;
9260     }
9261     else if (state_local->ddp_count != dd->ddp_count)
9262     {
9263         if (state_local->ddp_count > dd->ddp_count)
9264         {
9265             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)",state_local->ddp_count,dd->ddp_count);
9266         }
9267         
9268         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9269         {
9270             gmx_fatal(FARGS,"Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)",state_local->ddp_count_cg_gl,state_local->ddp_count);
9271         }
9272         
9273         /* Clear the old state */
9274         clear_dd_indices(dd,0,0);
9275         
9276         /* Build the new indices */
9277         rebuild_cgindex(dd,cgs_gl->index,state_local);
9278         make_dd_indices(dd,cgs_gl->index,0);
9279
9280         if (fr->cutoff_scheme == ecutsGROUP)
9281         {
9282             /* Redetermine the cg COMs */
9283             calc_cgcm(fplog,0,dd->ncg_home,
9284                       &top_local->cgs,state_local->x,fr->cg_cm);
9285         }
9286         
9287         inc_nrnb(nrnb,eNR_CGCM,dd->nat_home);
9288
9289         dd_set_cginfo(dd->index_gl,0,dd->ncg_home,fr,comm->bLocalCG);
9290
9291         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9292                   TRUE,&top_local->cgs,state_local->x,&ddbox);
9293
9294         bRedist = comm->bDynLoadBal;
9295     }
9296     else
9297     {
9298         /* We have the full state, only redistribute the cgs */
9299
9300         /* Clear the non-home indices */
9301         clear_dd_indices(dd,dd->ncg_home,dd->nat_home);
9302
9303         /* Avoid global communication for dim's without pbc and -gcom */
9304         if (!bNStGlobalComm)
9305         {
9306             copy_rvec(comm->box0    ,ddbox.box0    );
9307             copy_rvec(comm->box_size,ddbox.box_size);
9308         }
9309         set_ddbox(dd,bMasterState,cr,ir,state_local->box,
9310                   bNStGlobalComm,&top_local->cgs,state_local->x,&ddbox);
9311
9312         bBoxChanged = TRUE;
9313         bRedist = TRUE;
9314     }
9315     /* For dim's without pbc and -gcom */
9316     copy_rvec(ddbox.box0    ,comm->box0    );
9317     copy_rvec(ddbox.box_size,comm->box_size);
9318     
9319     set_dd_cell_sizes(dd,&ddbox,dynamic_dd_box(&ddbox,ir),bMasterState,bDoDLB,
9320                       step,wcycle);
9321     
9322     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9323     {
9324         write_dd_grid_pdb("dd_grid",step,dd,state_local->box,&ddbox);
9325     }
9326     
9327     /* Check if we should sort the charge groups */
9328     if (comm->nstSortCG > 0)
9329     {
9330         bSortCG = (bMasterState ||
9331                    (bRedist && (step % comm->nstSortCG == 0)));
9332     }
9333     else
9334     {
9335         bSortCG = FALSE;
9336     }
9337
9338     ncg_home_old = dd->ncg_home;
9339
9340     ncg_moved = 0;
9341     if (bRedist)
9342     {
9343         wallcycle_sub_start(wcycle,ewcsDD_REDIST);
9344
9345         dd_redistribute_cg(fplog,step,dd,ddbox.tric_dir,
9346                            state_local,f,fr,mdatoms,
9347                            !bSortCG,nrnb,&cg0,&ncg_moved);
9348
9349         wallcycle_sub_stop(wcycle,ewcsDD_REDIST);
9350     }
9351     
9352     get_nsgrid_boundaries(ddbox.nboundeddim,state_local->box,
9353                           dd,&ddbox,
9354                           &comm->cell_x0,&comm->cell_x1,
9355                           dd->ncg_home,fr->cg_cm,
9356                           cell_ns_x0,cell_ns_x1,&grid_density);
9357
9358     if (bBoxChanged)
9359     {
9360         comm_dd_ns_cell_sizes(dd,&ddbox,cell_ns_x0,cell_ns_x1,step);
9361     }
9362
9363     switch (fr->cutoff_scheme)
9364     {
9365     case ecutsGROUP:
9366         copy_ivec(fr->ns.grid->n,ncells_old);
9367         grid_first(fplog,fr->ns.grid,dd,&ddbox,fr->ePBC,
9368                    state_local->box,cell_ns_x0,cell_ns_x1,
9369                    fr->rlistlong,grid_density);
9370         break;
9371     case ecutsVERLET:
9372         nbnxn_get_ncells(fr->nbv->nbs,&ncells_old[XX],&ncells_old[YY]);
9373         break;
9374     default:
9375         gmx_incons("unimplemented");
9376     }
9377     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9378     copy_ivec(ddbox.tric_dir,comm->tric_dir);
9379
9380     if (bSortCG)
9381     {
9382         wallcycle_sub_start(wcycle,ewcsDD_GRID);
9383
9384         /* Sort the state on charge group position.
9385          * This enables exact restarts from this step.
9386          * It also improves performance by about 15% with larger numbers
9387          * of atoms per node.
9388          */
9389         
9390         /* Fill the ns grid with the home cell,
9391          * so we can sort with the indices.
9392          */
9393         set_zones_ncg_home(dd);
9394
9395         switch (fr->cutoff_scheme)
9396         {
9397         case ecutsVERLET:
9398             set_zones_size(dd,state_local->box,&ddbox,0,1);
9399
9400             nbnxn_put_on_grid(fr->nbv->nbs,fr->ePBC,state_local->box,
9401                               0,
9402                               comm->zones.size[0].bb_x0,
9403                               comm->zones.size[0].bb_x1,
9404                               0,dd->ncg_home,
9405                               comm->zones.dens_zone0,
9406                               fr->cginfo,
9407                               state_local->x,
9408                               ncg_moved,bRedist ? comm->moved : NULL,
9409                               fr->nbv->grp[eintLocal].kernel_type,
9410                               fr->nbv->grp[eintLocal].nbat);
9411
9412             nbnxn_get_ncells(fr->nbv->nbs,&ncells_new[XX],&ncells_new[YY]);
9413             break;
9414         case ecutsGROUP:
9415             fill_grid(fplog,&comm->zones,fr->ns.grid,dd->ncg_home,
9416                       0,dd->ncg_home,fr->cg_cm);
9417             
9418             copy_ivec(fr->ns.grid->n,ncells_new);
9419             break;
9420         default:
9421             gmx_incons("unimplemented");
9422         }
9423
9424         bResortAll = bMasterState;
9425    
9426         /* Check if we can use the old order and ns grid cell indices
9427          * of the charge groups to sort the charge groups efficiently.
9428          */
9429         if (ncells_new[XX] != ncells_old[XX] ||
9430             ncells_new[YY] != ncells_old[YY] ||
9431             ncells_new[ZZ] != ncells_old[ZZ])
9432         {
9433             bResortAll = TRUE;
9434         }
9435
9436         if (debug)
9437         {
9438             fprintf(debug,"Step %s, sorting the %d home charge groups\n",
9439                     gmx_step_str(step,sbuf),dd->ncg_home);
9440         }
9441         dd_sort_state(dd,ir->ePBC,fr->cg_cm,fr,state_local,
9442                       bResortAll ? -1 : ncg_home_old);
9443         /* Rebuild all the indices */
9444         cg0 = 0;
9445         ga2la_clear(dd->ga2la);
9446
9447         wallcycle_sub_stop(wcycle,ewcsDD_GRID);
9448     }
9449
9450     wallcycle_sub_start(wcycle,ewcsDD_SETUPCOMM);
9451     
9452     /* Set up the communication and communicate the coordinates */
9453     setup_dd_communication(dd,state_local->box,&ddbox,fr,state_local,f);
9454     
9455     /* Set the indices */
9456     make_dd_indices(dd,cgs_gl->index,cg0);
9457
9458     /* Set the charge group boundaries for neighbor searching */
9459     set_cg_boundaries(&comm->zones);
9460
9461     if (fr->cutoff_scheme == ecutsVERLET)
9462     {
9463         set_zones_size(dd,state_local->box,&ddbox,
9464                        bSortCG ? 1 : 0,comm->zones.n);
9465     }
9466
9467     wallcycle_sub_stop(wcycle,ewcsDD_SETUPCOMM);
9468
9469     /*
9470     write_dd_pdb("dd_home",step,"dump",top_global,cr,
9471                  -1,state_local->x,state_local->box);
9472     */
9473
9474     wallcycle_sub_start(wcycle,ewcsDD_MAKETOP);
9475     
9476     /* Extract a local topology from the global topology */
9477     for(i=0; i<dd->ndim; i++)
9478     {
9479         np[dd->dim[i]] = comm->cd[i].np;
9480     }
9481     dd_make_local_top(fplog,dd,&comm->zones,dd->npbcdim,state_local->box,
9482                       comm->cellsize_min,np,
9483                       fr,
9484                       fr->cutoff_scheme==ecutsGROUP ? fr->cg_cm : state_local->x,
9485                       vsite,top_global,top_local);
9486
9487     wallcycle_sub_stop(wcycle,ewcsDD_MAKETOP);
9488
9489     wallcycle_sub_start(wcycle,ewcsDD_MAKECONSTR);
9490     
9491     /* Set up the special atom communication */
9492     n = comm->nat[ddnatZONE];
9493     for(i=ddnatZONE+1; i<ddnatNR; i++)
9494     {
9495         switch(i)
9496         {
9497         case ddnatVSITE:
9498             if (vsite && vsite->n_intercg_vsite)
9499             {
9500                 n = dd_make_local_vsites(dd,n,top_local->idef.il);
9501             }
9502             break;
9503         case ddnatCON:
9504             if (dd->bInterCGcons || dd->bInterCGsettles)
9505             {
9506                 /* Only for inter-cg constraints we need special code */
9507                 n = dd_make_local_constraints(dd,n,top_global,fr->cginfo,
9508                                               constr,ir->nProjOrder,
9509                                               top_local->idef.il);
9510             }
9511             break;
9512         default:
9513             gmx_incons("Unknown special atom type setup");
9514         }
9515         comm->nat[i] = n;
9516     }
9517
9518     wallcycle_sub_stop(wcycle,ewcsDD_MAKECONSTR);
9519
9520     wallcycle_sub_start(wcycle,ewcsDD_TOPOTHER);
9521
9522     /* Make space for the extra coordinates for virtual site
9523      * or constraint communication.
9524      */
9525     state_local->natoms = comm->nat[ddnatNR-1];
9526     if (state_local->natoms > state_local->nalloc)
9527     {
9528         dd_realloc_state(state_local,f,state_local->natoms);
9529     }
9530
9531     if (fr->bF_NoVirSum)
9532     {
9533         if (vsite && vsite->n_intercg_vsite)
9534         {
9535             nat_f_novirsum = comm->nat[ddnatVSITE];
9536         }
9537         else
9538         {
9539             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9540             {
9541                 nat_f_novirsum = dd->nat_tot;
9542             }
9543             else
9544             {
9545                 nat_f_novirsum = dd->nat_home;
9546             }
9547         }
9548     }
9549     else
9550     {
9551         nat_f_novirsum = 0;
9552     }
9553
9554     /* Set the number of atoms required for the force calculation.
9555      * Forces need to be constrained when using a twin-range setup
9556      * or with energy minimization. For simple simulations we could
9557      * avoid some allocation, zeroing and copying, but this is
9558      * probably not worth the complications and checking.
9559      */
9560     forcerec_set_ranges(fr,dd->ncg_home,dd->ncg_tot,
9561                         dd->nat_tot,comm->nat[ddnatCON],nat_f_novirsum);
9562
9563     /* We make all the mdatoms up to nat_tot_con.
9564      * We could save some work by only setting invmass
9565      * between nat_tot and nat_tot_con.
9566      */
9567     /* This call also sets the new number of home particles to dd->nat_home */
9568     atoms2md(top_global,ir,
9569              comm->nat[ddnatCON],dd->gatindex,0,dd->nat_home,mdatoms);
9570
9571     /* Now we have the charges we can sort the FE interactions */
9572     dd_sort_local_top(dd,mdatoms,top_local);
9573
9574     if (vsite != NULL)
9575     {
9576         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9577         split_vsites_over_threads(top_local->idef.il,mdatoms,FALSE,vsite);
9578     }
9579
9580     if (shellfc)
9581     {
9582         /* Make the local shell stuff; currently no communication is done */
9583         make_local_shells(cr,mdatoms,shellfc);
9584     }
9585     
9586     if (ir->implicit_solvent)
9587     {
9588         make_local_gb(cr,fr->born,ir->gb_algorithm);
9589     }
9590
9591     init_bonded_thread_force_reduction(fr,&top_local->idef);
9592
9593     if (!(cr->duty & DUTY_PME))
9594     {
9595         /* Send the charges to our PME-only node */
9596         gmx_pme_send_q(cr,mdatoms->nChargePerturbed,
9597                        mdatoms->chargeA,mdatoms->chargeB,
9598                        dd_pme_maxshift_x(dd),dd_pme_maxshift_y(dd));
9599     }
9600     
9601     if (constr)
9602     {
9603         set_constraints(constr,top_local,ir,mdatoms,cr);
9604     }
9605     
9606     if (ir->ePull != epullNO)
9607     {
9608         /* Update the local pull groups */
9609         dd_make_local_pull_groups(dd,ir->pull,mdatoms);
9610     }
9611     
9612     if (ir->bRot)
9613     {
9614         /* Update the local rotation groups */
9615         dd_make_local_rotation_groups(dd,ir->rot);
9616     }
9617
9618
9619     add_dd_statistics(dd);
9620     
9621     /* Make sure we only count the cycles for this DD partitioning */
9622     clear_dd_cycle_counts(dd);
9623     
9624     /* Because the order of the atoms might have changed since
9625      * the last vsite construction, we need to communicate the constructing
9626      * atom coordinates again (for spreading the forces this MD step).
9627      */
9628     dd_move_x_vsites(dd,state_local->box,state_local->x);
9629
9630     wallcycle_sub_stop(wcycle,ewcsDD_TOPOTHER);
9631     
9632     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9633     {
9634         dd_move_x(dd,state_local->box,state_local->x);
9635         write_dd_pdb("dd_dump",step,"dump",top_global,cr,
9636                      -1,state_local->x,state_local->box);
9637     }
9638
9639     /* Store the partitioning step */
9640     comm->partition_step = step;
9641     
9642     /* Increase the DD partitioning counter */
9643     dd->ddp_count++;
9644     /* The state currently matches this DD partitioning count, store it */
9645     state_local->ddp_count = dd->ddp_count;
9646     if (bMasterState)
9647     {
9648         /* The DD master node knows the complete cg distribution;
9649          * store the count so we can possibly skip the cg info communication.
9650          */
9651         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9652     }
9653
9654     if (comm->DD_debug > 0)
9655     {
9656         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9657         check_index_consistency(dd,top_global->natoms,ncg_mtop(top_global),
9658                                 "after partitioning");
9659     }
9660 }