src/gromacs/mdlib/domdec.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  * This file is part of Gromacs        Copyright (c) 1991-2008
   5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation; either version 2
  10  * of the License, or (at your option) any later version.
  11  *
  12  * To help us fund GROMACS development, we humbly ask that you cite
  13  * the research papers on the package. Check out http://www.gromacs.org
  14  *
  15  * And Hey:
  16  * Gnomes, ROck Monsters And Chili Sauce
  17  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 #include <config.h>
  21 #endif
  22
  23 #include <stdio.h>
  24 #include <time.h>
  25 #include <math.h>
  26 #include <string.h>
  27 #include <stdlib.h>
  28 #include "typedefs.h"
  29 #include "smalloc.h"
  30 #include "gmx_fatal.h"
  31 #include "gmx_fatal_collective.h"
  32 #include "vec.h"
  33 #include "domdec.h"
  34 #include "domdec_network.h"
  35 #include "nrnb.h"
  36 #include "pbc.h"
  37 #include "chargegroup.h"
  38 #include "constr.h"
  39 #include "mdatoms.h"
  40 #include "names.h"
  41 #include "pdbio.h"
  42 #include "futil.h"
  43 #include "force.h"
  44 #include "pme.h"
  45 #include "pull.h"
  46 #include "pull_rotation.h"
  47 #include "gmx_wallcycle.h"
  48 #include "mdrun.h"
  49 #include "nsgrid.h"
  50 #include "shellfc.h"
  51 #include "mtop_util.h"
  52 #include "gmxfio.h"
  53 #include "gmx_ga2la.h"
  54 #include "gmx_sort.h"
  55 #include "macros.h"
  56 #include "nbnxn_search.h"
  57 #include "bondf.h"
  58 #include "gmx_omp_nthreads.h"
  59
  60 #include "gromacs/utility/gmxmpi.h"
  61
  62 #define DDRANK(dd, rank)    (rank)
  63 #define DDMASTERRANK(dd)   (dd->masterrank)
  64
/* Bookkeeping for the global state: the cell boundaries, the division
 * of the global charge groups over the nodes and buffers used for
 * communication and for scattering/gathering the state.
 */
typedef struct gmx_domdec_master
{
    /* The cell boundaries */
    real **cell_x;
    /* The global charge group division */
    int   *ncg;    /* Number of home charge groups for each node */
    int   *index;  /* Index of nnodes+1 into cg */
    int   *cg;     /* Global charge group index */
    int   *nat;    /* Number of home atoms for each node. */
    int   *ibuf;   /* Buffer for communication */
    rvec  *vbuf;   /* Buffer for state scattering and gathering */
} gmx_domdec_master_t;
  77
/* Communication indices: the send and receive counts, the charge groups
 * to send and the atom ranges used when received data can not be stored
 * in place. gmx_domdec_comm_dim_t keeps an array of these of size np.
 */
typedef struct
{
    /* The numbers of charge groups to send and receive for each cell
     * that requires communication, the last entry contains the total
     * number of atoms that needs to be communicated.
     */
    int  nsend[DD_MAXIZONE+2];
    int  nrecv[DD_MAXIZONE+2];
    /* The charge groups to send */
    int *index;
    int  nalloc; /* presumably the allocated size of index - TODO confirm */
    /* The atom range for non-in-place communication */
    int  cell2at0[DD_MAXIZONE];
    int  cell2at1[DD_MAXIZONE];
} gmx_domdec_ind_t;
  93
/* The communication setup for one decomposition dimension:
 * the number of grid pulses and the indices for each pulse.
 */
typedef struct
{
    int               np;       /* Number of grid pulses in this dimension */
    int               np_dlb;   /* For dlb, for use with edlbAUTO          */
    gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
    int               np_nalloc; /* presumably the allocated size of ind - TODO confirm */
    gmx_bool          bInPlace; /* Can we communicate in place?            */
} gmx_domdec_comm_dim_t;
 102
/* Cell boundary data for one dimension and row, with state variables
 * and temporary variables as marked per field. gmx_domdec_comm_t lists
 * its array of these under "Cell sizes for dynamic load balancing".
 */
typedef struct
{
    gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
    real     *cell_f;      /* State var.: cell boundaries, box relative      */
    real     *old_cell_f;  /* Temp. var.: old cell size                      */
    real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
    real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
    real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
    real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
    gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
    real     *buf_ncd;     /* Temp. var.                                     */
} gmx_domdec_root_t;
 115
 116 #define DD_NLOAD_MAX 9
 117
/* Load measurement record used for load balancing.
 * Here floats are accurate enough, since these variables
 * only influence the load balancing, not the actual MD results.
 * NOTE(review): the meaning of the individual fields is not documented
 * in this part of the file; see the load collection code.
 */
typedef struct
{
    int    nload; /* presumably the number of entries in load - TODO confirm */
    float *load;
    float  sum;
    float  max;
    float  sum_m;
    float  cvol_min;
    float  mdf;
    float  pme;
    int    flags;
} gmx_domdec_load_t;
 133
/* One entry of the sort buffers in gmx_domdec_sort_t.
 * NOTE(review): judging by the names, nsc is the sort key, ind_gl a global
 * index and ind a local index - confirm against the sorting code.
 */
typedef struct
{
    int  nsc;
    int  ind_gl;
    int  ind;
} gmx_cgsort_t;
 140
/* Buffers of gmx_cgsort_t entries plus an integer buffer; gmx_domdec_comm_t
 * holds one under "Should we sort the cgs".
 * NOTE(review): the *_nalloc fields presumably hold the allocated sizes of
 * the corresponding buffers - confirm against the sorting code.
 */
typedef struct
{
    gmx_cgsort_t *sort;
    gmx_cgsort_t *sort2;
    int           sort_nalloc;
    gmx_cgsort_t *sort_new;
    int           sort_new_nalloc;
    int          *ibuf;
    int           ibuf_nalloc;
} gmx_domdec_sort_t;
 151
/* A growable array of rvec, set up with vec_rvec_init and
 * enlarged on demand with vec_rvec_check_alloc.
 */
typedef struct
{
    rvec *v;      /* The elements                     */
    int   nalloc; /* The number of elements allocated */
} vec_rvec_t;
 157
 158 /* This enum determines the order of the coordinates.
 159  * ddnatHOME and ddnatZONE should be first and second,
 160  * the others can be ordered as wanted.
 161  */
 162 enum {
 163     ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 164 };
 165
/* The values for the DLB option (eDLB in gmx_domdec_comm_t),
 * edlb_names holds the matching name for each value.
 */
enum {
    edlbAUTO, edlbNO, edlbYES, edlbNR
};
const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 170
/* PME slab data for one dimension, gmx_domdec_comm_t stores two of these */
typedef struct
{
    int      dim;       /* The dimension                                          */
    gmx_bool dim_match; /* Tells if DD and PME dims match                         */
    int      nslab;     /* The number of PME slabs in this dimension              */
    real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
    int     *pp_min;    /* The minimum pp node location, size nslab               */
    int     *pp_max;    /* The maximum pp node location,size nslab                */
    int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
} gmx_ddpme_t;
 181
/* The limits of a zone, gmx_domdec_comm_t stores these for
 * DD dimensions 1 and 2 (zone_d1 and zone_d2).
 */
typedef struct
{
    real min0;    /* The minimum bottom of this zone                        */
    real max1;    /* The maximum top of this zone                           */
    real min1;    /* The minimum top of this zone                           */
    real mch0;    /* The maximum bottom communication height for this zone  */
    real mch1;    /* The maximum top communication height for this zone     */
    real p1_0;    /* The bottom value of the first cell in this zone        */
    real p1_1;    /* The top value of the first cell in this zone           */
} gmx_ddzone_t;
 192
/* Work data for the communication setup, gmx_domdec_comm_t holds an array
 * of these (dth) as temporary storage for thread parallel communication setup.
 * NOTE(review): the meaning of the counters nsend, nat and nsend_zone is
 * not visible in this part of the file.
 */
typedef struct
{
    gmx_domdec_ind_t ind;
    int             *ibuf;
    int              ibuf_nalloc;
    vec_rvec_t       vbuf;
    int              nsend;
    int              nat;
    int              nsend_zone;
} dd_comm_setup_work_t;
 203
/* The communication data of the domain decomposition: the PP/PME node setup,
 * cut-offs and cell sizes, zone and pulse communication setup, buffers,
 * dynamic load balancing data, load measurements and statistics.
 */
typedef struct gmx_domdec_comm
{
    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
     * unless stated otherwise.
     */

    /* The number of decomposition dimensions for PME, 0: no PME */
    int         npmedecompdim;
    /* The number of nodes doing PME (PP/PME or only PME) */
    int         npmenodes;
    int         npmenodes_x;
    int         npmenodes_y;
    /* The communication setup including the PME only nodes */
    gmx_bool    bCartesianPP_PME;
    ivec        ntot;
    int         cartpmedim;
    int        *pmenodes;          /* size npmenodes                         */
    int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
                                    * but with bCartesianPP_PME              */
    gmx_ddpme_t ddpme[2];

    /* The DD particle-particle nodes only */
    gmx_bool bCartesianPP;
    int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */

    /* The global charge groups */
    t_block cgs_gl;

    /* Should we sort the cgs */
    int                nstSortCG;
    gmx_domdec_sort_t *sort;

    /* Are there charge groups? */
    gmx_bool bCGs;

    /* Are there bonded and multi-body interactions between charge groups? */
    gmx_bool bInterCGBondeds;
    gmx_bool bInterCGMultiBody;

    /* Data for the optional bonded interaction atom communication range */
    gmx_bool  bBondComm;
    t_blocka *cglink;
    char     *bLocalCG;

    /* The DLB option */
    int      eDLB;
    /* Are we actually using DLB? */
    gmx_bool bDynLoadBal;

    /* Cell sizes for static load balancing, first index cartesian */
    real **slb_frac;

    /* The width of the communicated boundaries */
    real     cutoff_mbody;
    real     cutoff;
    /* The minimum cell size (including triclinic correction) */
    rvec     cellsize_min;
    /* For dlb, for use with edlbAUTO */
    rvec     cellsize_min_dlb;
    /* The lower limit for the DD cell size with DLB */
    real     cellsize_limit;
    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
    gmx_bool bVacDLBNoLimit;

    /* With PME load balancing we set limits on DLB */
    gmx_bool bPMELoadBalDLBLimits;
    /* DLB needs to take into account that we want to allow this maximum
     * cut-off (for PME load balancing), this could limit cell boundaries.
     */
    real PMELoadBal_max_cutoff;

    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
    ivec tric_dir;
    /* box0 and box_size are required with dim's without pbc and -gcom */
    rvec box0;
    rvec box_size;

    /* The cell boundaries */
    rvec cell_x0;
    rvec cell_x1;

    /* The old location of the cell boundaries, to check cg displacements */
    rvec old_cell_x0;
    rvec old_cell_x1;

    /* The communication setup and charge group boundaries for the zones */
    gmx_domdec_zones_t zones;

    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
     * cell boundaries of neighboring cells for dynamic load balancing.
     */
    gmx_ddzone_t zone_d1[2];
    gmx_ddzone_t zone_d2[2][2];

    /* The coordinate/force communication setup and indices */
    gmx_domdec_comm_dim_t cd[DIM];
    /* The maximum number of cells to communicate with in one dimension */
    int                   maxpulse;

    /* Which cg distribution is stored on the master node */
    int master_cg_ddp_count;

    /* The number of cg's received from the direct neighbors */
    int  zone_ncg1[DD_MAXZONE];

    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
    int  nat[ddnatNR];

    /* Array for signalling if atoms have moved to another domain */
    int  *moved;
    int   moved_nalloc;

    /* Communication buffer for general use */
    int  *buf_int;
    int   nalloc_int;

    /* Communication buffer for general use */
    vec_rvec_t vbuf;

    /* Temporary storage for thread parallel communication setup */
    int                   nth;
    dd_comm_setup_work_t *dth;

    /* Communication buffers only used with multiple grid pulses */
    int       *buf_int2;
    int        nalloc_int2;
    vec_rvec_t vbuf2;

    /* Communication buffers for local redistribution */
    int  **cggl_flag;
    int    cggl_flag_nalloc[DIM*2];
    rvec **cgcm_state;
    int    cgcm_state_nalloc[DIM*2];

    /* Cell sizes for dynamic load balancing */
    gmx_domdec_root_t **root;
    real               *cell_f_row;
    real                cell_f0[DIM];
    real                cell_f1[DIM];
    real                cell_f_max0[DIM];
    real                cell_f_min1[DIM];

    /* Stuff for load communication */
    gmx_bool           bRecordLoad;
    gmx_domdec_load_t *load;
#ifdef GMX_MPI
    MPI_Comm          *mpi_comm_load;
#endif

    /* Maximum DLB scaling per load balancing step in percent */
    int dlb_scale_lim;

    /* Cycle counters */
    float  cycl[ddCyclNr];
    int    cycl_n[ddCyclNr];
    float  cycl_max[ddCyclNr];
    /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise) */
    int    eFlop;
    double flop;
    int    flop_n;
    /* How often did we have load measurements */
    int    n_load_have;
    /* How often have we collected the load measurements */
    int    n_load_collect;

    /* Statistics */
    double sum_nat[ddnatNR-ddnatZONE];
    int    ndecomp;
    int    nload;
    double load_step;
    double load_sum;
    double load_max;
    ivec   load_lim;
    double load_mdf;
    double load_pme;

    /* The last partition step */
    gmx_large_int_t partition_step;

    /* Debugging */
    int  nstDDDump;
    int  nstDDDumpGrid;
    int  DD_debug;
} gmx_domdec_comm_t;
 388
 389 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 390 #define DD_CGIBS 2
 391
 392 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 393 #define DD_FLAG_NRCG  65535
 394 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 395 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 396
 397 /* Zone permutation required to obtain consecutive charge groups
 398  * for neighbor searching.
 399  */
 400 static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 401
 402 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 403  * components see only j zones with that component 0.
 404  */
 405
 406 /* The DD zone order */
 407 static const ivec dd_zo[DD_MAXZONE] =
 408 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 409
 410 /* The 3D setup */
 411 #define dd_z3n  8
 412 #define dd_zp3n 4
 413 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 414
 415 /* The 2D setup */
 416 #define dd_z2n  4
 417 #define dd_zp2n 2
 418 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 419
 420 /* The 1D setup */
 421 #define dd_z1n  2
 422 #define dd_zp1n 1
 423 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 424
 425 /* Factors used to avoid problems due to rounding issues */
 426 #define DD_CELL_MARGIN       1.0001
 427 #define DD_CELL_MARGIN2      1.00005
 428 /* Factor to account for pressure scaling during nstlist steps */
 429 #define DD_PRES_SCALE_MARGIN 1.02
 430
 431 /* Allowed performance loss before we DLB or warn */
 432 #define DD_PERF_LOSS 0.05
 433
 434 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 435
 436 /* Use separate MPI send and receive commands
 437  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 438  * This saves memory (and some copying for small nnodes).
 439  * For high parallelization scatter and gather calls are used.
 440  */
 441 #define GMX_DD_NNODES_SENDRECV 4
 442
 443
 444 /*
 445    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 446
 447    static void index2xyz(ivec nc,int ind,ivec xyz)
 448    {
 449    xyz[XX] = ind % nc[XX];
 450    xyz[YY] = (ind / nc[XX]) % nc[YY];
 451    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 452    }
 453  */
 454
 455 /* This order is required to minimize the coordinate communication in PME
 456  * which uses decomposition in the x direction.
 457  */
 458 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 459
 460 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 461 {
 462     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 463     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 464     xyz[ZZ] = ind % nc[ZZ];
 465 }
 466
 467 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 468 {
 469     int ddindex;
 470     int ddnodeid = -1;
 471
 472     ddindex = dd_index(dd->nc, c);
 473     if (dd->comm->bCartesianPP_PME)
 474     {
 475         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 476     }
 477     else if (dd->comm->bCartesianPP)
 478     {
 479 #ifdef GMX_MPI
 480         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 481 #endif
 482     }
 483     else
 484     {
 485         ddnodeid = ddindex;
 486     }
 487
 488     return ddnodeid;
 489 }
 490
 491 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 492 {
 493     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 494 }
 495
 496 int ddglatnr(gmx_domdec_t *dd, int i)
 497 {
 498     int atnr;
 499
 500     if (dd == NULL)
 501     {
 502         atnr = i + 1;
 503     }
 504     else
 505     {
 506         if (i >= dd->comm->nat[ddnatNR-1])
 507         {
 508             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 509         }
 510         atnr = dd->gatindex[i] + 1;
 511     }
 512
 513     return atnr;
 514 }
 515
 516 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 517 {
 518     return &dd->comm->cgs_gl;
 519 }
 520
 521 static void vec_rvec_init(vec_rvec_t *v)
 522 {
 523     v->nalloc = 0;
 524     v->v      = NULL;
 525 }
 526
 527 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 528 {
 529     if (n > v->nalloc)
 530     {
 531         v->nalloc = over_alloc_dd(n);
 532         srenew(v->v, v->nalloc);
 533     }
 534 }
 535
 536 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 537 {
 538     int i;
 539
 540     if (state->ddp_count != dd->ddp_count)
 541     {
 542         gmx_incons("The state does not the domain decomposition state");
 543     }
 544
 545     state->ncg_gl = dd->ncg_home;
 546     if (state->ncg_gl > state->cg_gl_nalloc)
 547     {
 548         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 549         srenew(state->cg_gl, state->cg_gl_nalloc);
 550     }
 551     for (i = 0; i < state->ncg_gl; i++)
 552     {
 553         state->cg_gl[i] = dd->index_gl[i];
 554     }
 555
 556     state->ddp_count_cg_gl = dd->ddp_count;
 557 }
 558
 559 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 560 {
 561     return &dd->comm->zones;
 562 }
 563
 564 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 565                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 566 {
 567     gmx_domdec_zones_t *zones;
 568     int                 izone, d, dim;
 569
 570     zones = &dd->comm->zones;
 571
 572     izone = 0;
 573     while (icg >= zones->izone[izone].cg1)
 574     {
 575         izone++;
 576     }
 577
 578     if (izone == 0)
 579     {
 580         *jcg0 = icg;
 581     }
 582     else if (izone < zones->nizone)
 583     {
 584         *jcg0 = zones->izone[izone].jcg0;
 585     }
 586     else
 587     {
 588         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 589                   icg, izone, zones->nizone);
 590     }
 591
 592     *jcg1 = zones->izone[izone].jcg1;
 593
 594     for (d = 0; d < dd->ndim; d++)
 595     {
 596         dim         = dd->dim[d];
 597         shift0[dim] = zones->izone[izone].shift0[dim];
 598         shift1[dim] = zones->izone[izone].shift1[dim];
 599         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 600         {
 601             /* A conservative approach, this can be optimized */
 602             shift0[dim] -= 1;
 603             shift1[dim] += 1;
 604         }
 605     }
 606 }
 607
 608 int dd_natoms_vsite(gmx_domdec_t *dd)
 609 {
 610     return dd->comm->nat[ddnatVSITE];
 611 }
 612
 613 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 614 {
 615     *at_start = dd->comm->nat[ddnatCON-1];
 616     *at_end   = dd->comm->nat[ddnatCON];
 617 }
 618
 619 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 620 {
 621     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 622     int                   *index, *cgindex;
 623     gmx_domdec_comm_t     *comm;
 624     gmx_domdec_comm_dim_t *cd;
 625     gmx_domdec_ind_t      *ind;
 626     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 627     gmx_bool               bPBC, bScrew;
 628
 629     comm = dd->comm;
 630
 631     cgindex = dd->cgindex;
 632
 633     buf = comm->vbuf.v;
 634
 635     nzone   = 1;
 636     nat_tot = dd->nat_home;
 637     for (d = 0; d < dd->ndim; d++)
 638     {
 639         bPBC   = (dd->ci[dd->dim[d]] == 0);
 640         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 641         if (bPBC)
 642         {
 643             copy_rvec(box[dd->dim[d]], shift);
 644         }
 645         cd = &comm->cd[d];
 646         for (p = 0; p < cd->np; p++)
 647         {
 648             ind   = &cd->ind[p];
 649             index = ind->index;
 650             n     = 0;
 651             if (!bPBC)
 652             {
 653                 for (i = 0; i < ind->nsend[nzone]; i++)
 654                 {
 655                     at0 = cgindex[index[i]];
 656                     at1 = cgindex[index[i]+1];
 657                     for (j = at0; j < at1; j++)
 658                     {
 659                         copy_rvec(x[j], buf[n]);
 660                         n++;
 661                     }
 662                 }
 663             }
 664             else if (!bScrew)
 665             {
 666                 for (i = 0; i < ind->nsend[nzone]; i++)
 667                 {
 668                     at0 = cgindex[index[i]];
 669                     at1 = cgindex[index[i]+1];
 670                     for (j = at0; j < at1; j++)
 671                     {
 672                         /* We need to shift the coordinates */
 673                         rvec_add(x[j], shift, buf[n]);
 674                         n++;
 675                     }
 676                 }
 677             }
 678             else
 679             {
 680                 for (i = 0; i < ind->nsend[nzone]; i++)
 681                 {
 682                     at0 = cgindex[index[i]];
 683                     at1 = cgindex[index[i]+1];
 684                     for (j = at0; j < at1; j++)
 685                     {
 686                         /* Shift x */
 687                         buf[n][XX] = x[j][XX] + shift[XX];
 688                         /* Rotate y and z.
 689                          * This operation requires a special shift force
 690                          * treatment, which is performed in calc_vir.
 691                          */
 692                         buf[n][YY] = box[YY][YY] - x[j][YY];
 693                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 694                         n++;
 695                     }
 696                 }
 697             }
 698
 699             if (cd->bInPlace)
 700             {
 701                 rbuf = x + nat_tot;
 702             }
 703             else
 704             {
 705                 rbuf = comm->vbuf2.v;
 706             }
 707             /* Send and receive the coordinates */
 708             dd_sendrecv_rvec(dd, d, dddirBackward,
 709                              buf,  ind->nsend[nzone+1],
 710                              rbuf, ind->nrecv[nzone+1]);
 711             if (!cd->bInPlace)
 712             {
 713                 j = 0;
 714                 for (zone = 0; zone < nzone; zone++)
 715                 {
 716                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 717                     {
 718                         copy_rvec(rbuf[j], x[i]);
 719                         j++;
 720                     }
 721                 }
 722             }
 723             nat_tot += ind->nrecv[nzone+1];
 724         }
 725         nzone += nzone;
 726     }
 727 }
 728
 729 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 730 {
 731     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 732     int                   *index, *cgindex;
 733     gmx_domdec_comm_t     *comm;
 734     gmx_domdec_comm_dim_t *cd;
 735     gmx_domdec_ind_t      *ind;
 736     rvec                  *buf, *sbuf;
 737     ivec                   vis;
 738     int                    is;
 739     gmx_bool               bPBC, bScrew;
 740
 741     comm = dd->comm;
 742
 743     cgindex = dd->cgindex;
 744
 745     buf = comm->vbuf.v;
 746
 747     n       = 0;
 748     nzone   = comm->zones.n/2;
 749     nat_tot = dd->nat_tot;
 750     for (d = dd->ndim-1; d >= 0; d--)
 751     {
 752         bPBC   = (dd->ci[dd->dim[d]] == 0);
 753         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 754         if (fshift == NULL && !bScrew)
 755         {
 756             bPBC = FALSE;
 757         }
 758         /* Determine which shift vector we need */
 759         clear_ivec(vis);
 760         vis[dd->dim[d]] = 1;
 761         is              = IVEC2IS(vis);
 762
 763         cd = &comm->cd[d];
 764         for (p = cd->np-1; p >= 0; p--)
 765         {
 766             ind      = &cd->ind[p];
 767             nat_tot -= ind->nrecv[nzone+1];
 768             if (cd->bInPlace)
 769             {
 770                 sbuf = f + nat_tot;
 771             }
 772             else
 773             {
 774                 sbuf = comm->vbuf2.v;
 775                 j    = 0;
 776                 for (zone = 0; zone < nzone; zone++)
 777                 {
 778                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 779                     {
 780                         copy_rvec(f[i], sbuf[j]);
 781                         j++;
 782                     }
 783                 }
 784             }
 785             /* Communicate the forces */
 786             dd_sendrecv_rvec(dd, d, dddirForward,
 787                              sbuf, ind->nrecv[nzone+1],
 788                              buf,  ind->nsend[nzone+1]);
 789             index = ind->index;
 790             /* Add the received forces */
 791             n = 0;
 792             if (!bPBC)
 793             {
 794                 for (i = 0; i < ind->nsend[nzone]; i++)
 795                 {
 796                     at0 = cgindex[index[i]];
 797                     at1 = cgindex[index[i]+1];
 798                     for (j = at0; j < at1; j++)
 799                     {
 800                         rvec_inc(f[j], buf[n]);
 801                         n++;
 802                     }
 803                 }
 804             }
 805             else if (!bScrew)
 806             {
 807                 for (i = 0; i < ind->nsend[nzone]; i++)
 808                 {
 809                     at0 = cgindex[index[i]];
 810                     at1 = cgindex[index[i]+1];
 811                     for (j = at0; j < at1; j++)
 812                     {
 813                         rvec_inc(f[j], buf[n]);
 814                         /* Add this force to the shift force */
 815                         rvec_inc(fshift[is], buf[n]);
 816                         n++;
 817                     }
 818                 }
 819             }
 820             else
 821             {
 822                 for (i = 0; i < ind->nsend[nzone]; i++)
 823                 {
 824                     at0 = cgindex[index[i]];
 825                     at1 = cgindex[index[i]+1];
 826                     for (j = at0; j < at1; j++)
 827                     {
 828                         /* Rotate the force */
 829                         f[j][XX] += buf[n][XX];
 830                         f[j][YY] -= buf[n][YY];
 831                         f[j][ZZ] -= buf[n][ZZ];
 832                         if (fshift)
 833                         {
 834                             /* Add this force to the shift force */
 835                             rvec_inc(fshift[is], buf[n]);
 836                         }
 837                         n++;
 838                     }
 839                 }
 840             }
 841         }
 842         nzone /= 2;
 843     }
 844 }
 845
 846 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 847 {
 848     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 849     int                   *index, *cgindex;
 850     gmx_domdec_comm_t     *comm;
 851     gmx_domdec_comm_dim_t *cd;
 852     gmx_domdec_ind_t      *ind;
 853     real                  *buf, *rbuf;
 854
 855     comm = dd->comm;
 856
 857     cgindex = dd->cgindex;
 858
 859     buf = &comm->vbuf.v[0][0];
 860
 861     nzone   = 1;
 862     nat_tot = dd->nat_home;
 863     for (d = 0; d < dd->ndim; d++)
 864     {
 865         cd = &comm->cd[d];
 866         for (p = 0; p < cd->np; p++)
 867         {
 868             ind   = &cd->ind[p];
 869             index = ind->index;
 870             n     = 0;
 871             for (i = 0; i < ind->nsend[nzone]; i++)
 872             {
 873                 at0 = cgindex[index[i]];
 874                 at1 = cgindex[index[i]+1];
 875                 for (j = at0; j < at1; j++)
 876                 {
 877                     buf[n] = v[j];
 878                     n++;
 879                 }
 880             }
 881
 882             if (cd->bInPlace)
 883             {
 884                 rbuf = v + nat_tot;
 885             }
 886             else
 887             {
 888                 rbuf = &comm->vbuf2.v[0][0];
 889             }
 890             /* Send and receive the coordinates */
 891             dd_sendrecv_real(dd, d, dddirBackward,
 892                              buf,  ind->nsend[nzone+1],
 893                              rbuf, ind->nrecv[nzone+1]);
 894             if (!cd->bInPlace)
 895             {
 896                 j = 0;
 897                 for (zone = 0; zone < nzone; zone++)
 898                 {
 899                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 900                     {
 901                         v[i] = rbuf[j];
 902                         j++;
 903                     }
 904                 }
 905             }
 906             nat_tot += ind->nrecv[nzone+1];
 907         }
 908         nzone += nzone;
 909     }
 910 }
 911
 912 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 913 {
 914     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 915     int                   *index, *cgindex;
 916     gmx_domdec_comm_t     *comm;
 917     gmx_domdec_comm_dim_t *cd;
 918     gmx_domdec_ind_t      *ind;
 919     real                  *buf, *sbuf;
 920
 921     comm = dd->comm;
 922
 923     cgindex = dd->cgindex;
 924
 925     buf = &comm->vbuf.v[0][0];
 926
 927     n       = 0;
 928     nzone   = comm->zones.n/2;
 929     nat_tot = dd->nat_tot;
 930     for (d = dd->ndim-1; d >= 0; d--)
 931     {
 932         cd = &comm->cd[d];
 933         for (p = cd->np-1; p >= 0; p--)
 934         {
 935             ind      = &cd->ind[p];
 936             nat_tot -= ind->nrecv[nzone+1];
 937             if (cd->bInPlace)
 938             {
 939                 sbuf = v + nat_tot;
 940             }
 941             else
 942             {
 943                 sbuf = &comm->vbuf2.v[0][0];
 944                 j    = 0;
 945                 for (zone = 0; zone < nzone; zone++)
 946                 {
 947                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 948                     {
 949                         sbuf[j] = v[i];
 950                         j++;
 951                     }
 952                 }
 953             }
 954             /* Communicate the forces */
 955             dd_sendrecv_real(dd, d, dddirForward,
 956                              sbuf, ind->nrecv[nzone+1],
 957                              buf,  ind->nsend[nzone+1]);
 958             index = ind->index;
 959             /* Add the received forces */
 960             n = 0;
 961             for (i = 0; i < ind->nsend[nzone]; i++)
 962             {
 963                 at0 = cgindex[index[i]];
 964                 at1 = cgindex[index[i]+1];
 965                 for (j = at0; j < at1; j++)
 966                 {
 967                     v[j] += buf[n];
 968                     n++;
 969                 }
 970             }
 971         }
 972         nzone /= 2;
 973     }
 974 }
 975
 976 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 977 {
 978     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 979             d, i, j,
 980             zone->min0, zone->max1,
 981             zone->mch0, zone->mch0,
 982             zone->p1_0, zone->p1_1);
 983 }
 984
 985
 986 #define DDZONECOMM_MAXZONE  5
 987 #define DDZONECOMM_BUFSIZE  3
 988
 989 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 990                                int ddimind, int direction,
 991                                gmx_ddzone_t *buf_s, int n_s,
 992                                gmx_ddzone_t *buf_r, int n_r)
 993 {
 994 #define ZBS  DDZONECOMM_BUFSIZE
 995     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
 996     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
 997     int  i;
 998
 999     for (i = 0; i < n_s; i++)
1000     {
1001         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1002         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1003         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1004         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1005         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1006         vbuf_s[i*ZBS+1][2] = 0;
1007         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1008         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1009         vbuf_s[i*ZBS+2][2] = 0;
1010     }
1011
1012     dd_sendrecv_rvec(dd, ddimind, direction,
1013                      vbuf_s, n_s*ZBS,
1014                      vbuf_r, n_r*ZBS);
1015
1016     for (i = 0; i < n_r; i++)
1017     {
1018         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1019         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1020         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1021         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1022         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1023         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1024         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1025     }
1026
1027 #undef ZBS
1028 }
1029
1030 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
1031                           rvec cell_ns_x0, rvec cell_ns_x1)
1032 {
1033     int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
1034     gmx_ddzone_t      *zp;
1035     gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
1036     gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
1037     gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
1038     rvec               extr_s[2], extr_r[2];
1039     rvec               dh;
1040     real               dist_d, c = 0, det;
1041     gmx_domdec_comm_t *comm;
1042     gmx_bool           bPBC, bUse;
1043
1044     comm = dd->comm;
1045
1046     for (d = 1; d < dd->ndim; d++)
1047     {
1048         dim      = dd->dim[d];
1049         zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1050         zp->min0 = cell_ns_x0[dim];
1051         zp->max1 = cell_ns_x1[dim];
1052         zp->min1 = cell_ns_x1[dim];
1053         zp->mch0 = cell_ns_x0[dim];
1054         zp->mch1 = cell_ns_x1[dim];
1055         zp->p1_0 = cell_ns_x0[dim];
1056         zp->p1_1 = cell_ns_x1[dim];
1057     }
1058
1059     for (d = dd->ndim-2; d >= 0; d--)
1060     {
1061         dim  = dd->dim[d];
1062         bPBC = (dim < ddbox->npbcdim);
1063
1064         /* Use an rvec to store two reals */
1065         extr_s[d][0] = comm->cell_f0[d+1];
1066         extr_s[d][1] = comm->cell_f1[d+1];
1067         extr_s[d][2] = comm->cell_f1[d+1];
1068
1069         pos = 0;
1070         /* Store the extremes in the backward sending buffer,
1071          * so the get updated separately from the forward communication.
1072          */
1073         for (d1 = d; d1 < dd->ndim-1; d1++)
1074         {
1075             /* We invert the order to be able to use the same loop for buf_e */
1076             buf_s[pos].min0 = extr_s[d1][1];
1077             buf_s[pos].max1 = extr_s[d1][0];
1078             buf_s[pos].min1 = extr_s[d1][2];
1079             buf_s[pos].mch0 = 0;
1080             buf_s[pos].mch1 = 0;
1081             /* Store the cell corner of the dimension we communicate along */
1082             buf_s[pos].p1_0 = comm->cell_x0[dim];
1083             buf_s[pos].p1_1 = 0;
1084             pos++;
1085         }
1086
1087         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1088         pos++;
1089
1090         if (dd->ndim == 3 && d == 0)
1091         {
1092             buf_s[pos] = comm->zone_d2[0][1];
1093             pos++;
1094             buf_s[pos] = comm->zone_d1[0];
1095             pos++;
1096         }
1097
1098         /* We only need to communicate the extremes
1099          * in the forward direction
1100          */
1101         npulse = comm->cd[d].np;
1102         if (bPBC)
1103         {
1104             /* Take the minimum to avoid double communication */
1105             npulse_min = min(npulse, dd->nc[dim]-1-npulse);
1106         }
1107         else
1108         {
1109             /* Without PBC we should really not communicate over
1110              * the boundaries, but implementing that complicates
1111              * the communication setup and therefore we simply
1112              * do all communication, but ignore some data.
1113              */
1114             npulse_min = npulse;
1115         }
1116         for (p = 0; p < npulse_min; p++)
1117         {
1118             /* Communicate the extremes forward */
1119             bUse = (bPBC || dd->ci[dim] > 0);
1120
1121             dd_sendrecv_rvec(dd, d, dddirForward,
1122                              extr_s+d, dd->ndim-d-1,
1123                              extr_r+d, dd->ndim-d-1);
1124
1125             if (bUse)
1126             {
1127                 for (d1 = d; d1 < dd->ndim-1; d1++)
1128                 {
1129                     extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
1130                     extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
1131                     extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
1132                 }
1133             }
1134         }
1135
1136         buf_size = pos;
1137         for (p = 0; p < npulse; p++)
1138         {
1139             /* Communicate all the zone information backward */
1140             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1141
1142             dd_sendrecv_ddzone(dd, d, dddirBackward,
1143                                buf_s, buf_size,
1144                                buf_r, buf_size);
1145
1146             clear_rvec(dh);
1147             if (p > 0)
1148             {
1149                 for (d1 = d+1; d1 < dd->ndim; d1++)
1150                 {
1151                     /* Determine the decrease of maximum required
1152                      * communication height along d1 due to the distance along d,
1153                      * this avoids a lot of useless atom communication.
1154                      */
1155                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1156
1157                     if (ddbox->tric_dir[dim])
1158                     {
1159                         /* c is the off-diagonal coupling between the cell planes
1160                          * along directions d and d1.
1161                          */
1162                         c = ddbox->v[dim][dd->dim[d1]][dim];
1163                     }
1164                     else
1165                     {
1166                         c = 0;
1167                     }
1168                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1169                     if (det > 0)
1170                     {
1171                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1172                     }
1173                     else
1174                     {
1175                         /* A negative value signals out of range */
1176                         dh[d1] = -1;
1177                     }
1178                 }
1179             }
1180
1181             /* Accumulate the extremes over all pulses */
1182             for (i = 0; i < buf_size; i++)
1183             {
1184                 if (p == 0)
1185                 {
1186                     buf_e[i] = buf_r[i];
1187                 }
1188                 else
1189                 {
1190                     if (bUse)
1191                     {
1192                         buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
1193                         buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
1194                         buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
1195                     }
1196
1197                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1198                     {
1199                         d1 = 1;
1200                     }
1201                     else
1202                     {
1203                         d1 = d + 1;
1204                     }
1205                     if (bUse && dh[d1] >= 0)
1206                     {
1207                         buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
1208                         buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
1209                     }
1210                 }
1211                 /* Copy the received buffer to the send buffer,
1212                  * to pass the data through with the next pulse.
1213                  */
1214                 buf_s[i] = buf_r[i];
1215             }
1216             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1217                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1218             {
1219                 /* Store the extremes */
1220                 pos = 0;
1221
1222                 for (d1 = d; d1 < dd->ndim-1; d1++)
1223                 {
1224                     extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
1225                     extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
1226                     extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
1227                     pos++;
1228                 }
1229
1230                 if (d == 1 || (d == 0 && dd->ndim == 3))
1231                 {
1232                     for (i = d; i < 2; i++)
1233                     {
1234                         comm->zone_d2[1-d][i] = buf_e[pos];
1235                         pos++;
1236                     }
1237                 }
1238                 if (d == 0)
1239                 {
1240                     comm->zone_d1[1] = buf_e[pos];
1241                     pos++;
1242                 }
1243             }
1244         }
1245     }
1246
1247     if (dd->ndim >= 2)
1248     {
1249         dim = dd->dim[1];
1250         for (i = 0; i < 2; i++)
1251         {
1252             if (debug)
1253             {
1254                 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1255             }
1256             cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1257             cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1258         }
1259     }
1260     if (dd->ndim >= 3)
1261     {
1262         dim = dd->dim[2];
1263         for (i = 0; i < 2; i++)
1264         {
1265             for (j = 0; j < 2; j++)
1266             {
1267                 if (debug)
1268                 {
1269                     print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1270                 }
1271                 cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1272                 cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1273             }
1274         }
1275     }
1276     for (d = 1; d < dd->ndim; d++)
1277     {
1278         comm->cell_f_max0[d] = extr_s[d-1][0];
1279         comm->cell_f_min1[d] = extr_s[d-1][1];
1280         if (debug)
1281         {
1282             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1283                     d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1284         }
1285     }
1286 }
1287
1288 static void dd_collect_cg(gmx_domdec_t *dd,
1289                           t_state      *state_local)
1290 {
1291     gmx_domdec_master_t *ma = NULL;
1292     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1293     t_block             *cgs_gl;
1294
1295     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1296     {
1297         /* The master has the correct distribution */
1298         return;
1299     }
1300
1301     if (state_local->ddp_count == dd->ddp_count)
1302     {
1303         ncg_home = dd->ncg_home;
1304         cg       = dd->index_gl;
1305         nat_home = dd->nat_home;
1306     }
1307     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1308     {
1309         cgs_gl = &dd->comm->cgs_gl;
1310
1311         ncg_home = state_local->ncg_gl;
1312         cg       = state_local->cg_gl;
1313         nat_home = 0;
1314         for (i = 0; i < ncg_home; i++)
1315         {
1316             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1317         }
1318     }
1319     else
1320     {
1321         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1322     }
1323
1324     buf2[0] = dd->ncg_home;
1325     buf2[1] = dd->nat_home;
1326     if (DDMASTER(dd))
1327     {
1328         ma   = dd->ma;
1329         ibuf = ma->ibuf;
1330     }
1331     else
1332     {
1333         ibuf = NULL;
1334     }
1335     /* Collect the charge group and atom counts on the master */
1336     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1337
1338     if (DDMASTER(dd))
1339     {
1340         ma->index[0] = 0;
1341         for (i = 0; i < dd->nnodes; i++)
1342         {
1343             ma->ncg[i]     = ma->ibuf[2*i];
1344             ma->nat[i]     = ma->ibuf[2*i+1];
1345             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1346
1347         }
1348         /* Make byte counts and indices */
1349         for (i = 0; i < dd->nnodes; i++)
1350         {
1351             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1352             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1353         }
1354         if (debug)
1355         {
1356             fprintf(debug, "Initial charge group distribution: ");
1357             for (i = 0; i < dd->nnodes; i++)
1358             {
1359                 fprintf(debug, " %d", ma->ncg[i]);
1360             }
1361             fprintf(debug, "\n");
1362         }
1363     }
1364
1365     /* Collect the charge group indices on the master */
1366     dd_gatherv(dd,
1367                dd->ncg_home*sizeof(int), dd->index_gl,
1368                DDMASTER(dd) ? ma->ibuf : NULL,
1369                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1370                DDMASTER(dd) ? ma->cg : NULL);
1371
1372     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1373 }
1374
1375 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1376                                     rvec *lv, rvec *v)
1377 {
1378     gmx_domdec_master_t *ma;
1379     int                  n, i, c, a, nalloc = 0;
1380     rvec                *buf = NULL;
1381     t_block             *cgs_gl;
1382
1383     ma = dd->ma;
1384
1385     if (!DDMASTER(dd))
1386     {
1387 #ifdef GMX_MPI
1388         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1389                  dd->rank, dd->mpi_comm_all);
1390 #endif
1391     }
1392     else
1393     {
1394         /* Copy the master coordinates to the global array */
1395         cgs_gl = &dd->comm->cgs_gl;
1396
1397         n = DDMASTERRANK(dd);
1398         a = 0;
1399         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1400         {
1401             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1402             {
1403                 copy_rvec(lv[a++], v[c]);
1404             }
1405         }
1406
1407         for (n = 0; n < dd->nnodes; n++)
1408         {
1409             if (n != dd->rank)
1410             {
1411                 if (ma->nat[n] > nalloc)
1412                 {
1413                     nalloc = over_alloc_dd(ma->nat[n]);
1414                     srenew(buf, nalloc);
1415                 }
1416 #ifdef GMX_MPI
1417                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1418                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1419 #endif
1420                 a = 0;
1421                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1422                 {
1423                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1424                     {
1425                         copy_rvec(buf[a++], v[c]);
1426                     }
1427                 }
1428             }
1429         }
1430         sfree(buf);
1431     }
1432 }
1433
1434 static void get_commbuffer_counts(gmx_domdec_t *dd,
1435                                   int **counts, int **disps)
1436 {
1437     gmx_domdec_master_t *ma;
1438     int                  n;
1439
1440     ma = dd->ma;
1441
1442     /* Make the rvec count and displacment arrays */
1443     *counts  = ma->ibuf;
1444     *disps   = ma->ibuf + dd->nnodes;
1445     for (n = 0; n < dd->nnodes; n++)
1446     {
1447         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1448         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1449     }
1450 }
1451
1452 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1453                                    rvec *lv, rvec *v)
1454 {
1455     gmx_domdec_master_t *ma;
1456     int                 *rcounts = NULL, *disps = NULL;
1457     int                  n, i, c, a;
1458     rvec                *buf = NULL;
1459     t_block             *cgs_gl;
1460
1461     ma = dd->ma;
1462
1463     if (DDMASTER(dd))
1464     {
1465         get_commbuffer_counts(dd, &rcounts, &disps);
1466
1467         buf = ma->vbuf;
1468     }
1469
1470     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1471
1472     if (DDMASTER(dd))
1473     {
1474         cgs_gl = &dd->comm->cgs_gl;
1475
1476         a = 0;
1477         for (n = 0; n < dd->nnodes; n++)
1478         {
1479             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1480             {
1481                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1482                 {
1483                     copy_rvec(buf[a++], v[c]);
1484                 }
1485             }
1486         }
1487     }
1488 }
1489
1490 void dd_collect_vec(gmx_domdec_t *dd,
1491                     t_state *state_local, rvec *lv, rvec *v)
1492 {
1493     gmx_domdec_master_t *ma;
1494     int                  n, i, c, a, nalloc = 0;
1495     rvec                *buf = NULL;
1496
1497     dd_collect_cg(dd, state_local);
1498
1499     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1500     {
1501         dd_collect_vec_sendrecv(dd, lv, v);
1502     }
1503     else
1504     {
1505         dd_collect_vec_gatherv(dd, lv, v);
1506     }
1507 }
1508
1509
1510 void dd_collect_state(gmx_domdec_t *dd,
1511                       t_state *state_local, t_state *state)
1512 {
1513     int est, i, j, nh;
1514
1515     nh = state->nhchainlength;
1516
1517     if (DDMASTER(dd))
1518     {
1519         for (i = 0; i < efptNR; i++)
1520         {
1521             state->lambda[i] = state_local->lambda[i];
1522         }
1523         state->fep_state = state_local->fep_state;
1524         state->veta      = state_local->veta;
1525         state->vol0      = state_local->vol0;
1526         copy_mat(state_local->box, state->box);
1527         copy_mat(state_local->boxv, state->boxv);
1528         copy_mat(state_local->svir_prev, state->svir_prev);
1529         copy_mat(state_local->fvir_prev, state->fvir_prev);
1530         copy_mat(state_local->pres_prev, state->pres_prev);
1531
1532
1533         for (i = 0; i < state_local->ngtc; i++)
1534         {
1535             for (j = 0; j < nh; j++)
1536             {
1537                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1538                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1539             }
1540             state->therm_integral[i] = state_local->therm_integral[i];
1541         }
1542         for (i = 0; i < state_local->nnhpres; i++)
1543         {
1544             for (j = 0; j < nh; j++)
1545             {
1546                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1547                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1548             }
1549         }
1550     }
1551     for (est = 0; est < estNR; est++)
1552     {
1553         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1554         {
1555             switch (est)
1556             {
1557                 case estX:
1558                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1559                     break;
1560                 case estV:
1561                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1562                     break;
1563                 case estSDX:
1564                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1565                     break;
1566                 case estCGP:
1567                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1568                     break;
1569                 case estLD_RNG:
1570                     if (state->nrngi == 1)
1571                     {
1572                         if (DDMASTER(dd))
1573                         {
1574                             for (i = 0; i < state_local->nrng; i++)
1575                             {
1576                                 state->ld_rng[i] = state_local->ld_rng[i];
1577                             }
1578                         }
1579                     }
1580                     else
1581                     {
1582                         dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
1583                                   state_local->ld_rng, state->ld_rng);
1584                     }
1585                     break;
1586                 case estLD_RNGI:
1587                     if (state->nrngi == 1)
1588                     {
1589                         if (DDMASTER(dd))
1590                         {
1591                             state->ld_rngi[0] = state_local->ld_rngi[0];
1592                         }
1593                     }
1594                     else
1595                     {
1596                         dd_gather(dd, sizeof(state->ld_rngi[0]),
1597                                   state_local->ld_rngi, state->ld_rngi);
1598                     }
1599                     break;
1600                 case estDISRE_INITF:
1601                 case estDISRE_RM3TAV:
1602                 case estORIRE_INITF:
1603                 case estORIRE_DTAV:
1604                     break;
1605                 default:
1606                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1607             }
1608         }
1609     }
1610 }
1611
1612 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1613 {
1614     int est;
1615
1616     if (debug)
1617     {
1618         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1619     }
1620
1621     state->nalloc = over_alloc_dd(nalloc);
1622
1623     for (est = 0; est < estNR; est++)
1624     {
1625         if (EST_DISTR(est) && (state->flags & (1<<est)))
1626         {
1627             switch (est)
1628             {
1629                 case estX:
1630                     srenew(state->x, state->nalloc);
1631                     break;
1632                 case estV:
1633                     srenew(state->v, state->nalloc);
1634                     break;
1635                 case estSDX:
1636                     srenew(state->sd_X, state->nalloc);
1637                     break;
1638                 case estCGP:
1639                     srenew(state->cg_p, state->nalloc);
1640                     break;
1641                 case estLD_RNG:
1642                 case estLD_RNGI:
1643                 case estDISRE_INITF:
1644                 case estDISRE_RM3TAV:
1645                 case estORIRE_INITF:
1646                 case estORIRE_DTAV:
1647                     /* No reallocation required */
1648                     break;
1649                 default:
1650                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1651             }
1652         }
1653     }
1654
1655     if (f != NULL)
1656     {
1657         srenew(*f, state->nalloc);
1658     }
1659 }
1660
1661 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1662                                int nalloc)
1663 {
1664     if (nalloc > fr->cg_nalloc)
1665     {
1666         if (debug)
1667         {
1668             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1669         }
1670         fr->cg_nalloc = over_alloc_dd(nalloc);
1671         srenew(fr->cginfo, fr->cg_nalloc);
1672         if (fr->cutoff_scheme == ecutsGROUP)
1673         {
1674             srenew(fr->cg_cm, fr->cg_nalloc);
1675         }
1676     }
1677     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1678     {
1679         /* We don't use charge groups, we use x in state to set up
1680          * the atom communication.
1681          */
1682         dd_realloc_state(state, f, nalloc);
1683     }
1684 }
1685
1686 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1687                                        rvec *v, rvec *lv)
1688 {
1689     gmx_domdec_master_t *ma;
1690     int                  n, i, c, a, nalloc = 0;
1691     rvec                *buf = NULL;
1692
1693     if (DDMASTER(dd))
1694     {
1695         ma  = dd->ma;
1696
1697         for (n = 0; n < dd->nnodes; n++)
1698         {
1699             if (n != dd->rank)
1700             {
1701                 if (ma->nat[n] > nalloc)
1702                 {
1703                     nalloc = over_alloc_dd(ma->nat[n]);
1704                     srenew(buf, nalloc);
1705                 }
1706                 /* Use lv as a temporary buffer */
1707                 a = 0;
1708                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1709                 {
1710                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1711                     {
1712                         copy_rvec(v[c], buf[a++]);
1713                     }
1714                 }
1715                 if (a != ma->nat[n])
1716                 {
1717                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1718                               a, ma->nat[n]);
1719                 }
1720
1721 #ifdef GMX_MPI
1722                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1723                          DDRANK(dd, n), n, dd->mpi_comm_all);
1724 #endif
1725             }
1726         }
1727         sfree(buf);
1728         n = DDMASTERRANK(dd);
1729         a = 0;
1730         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1731         {
1732             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1733             {
1734                 copy_rvec(v[c], lv[a++]);
1735             }
1736         }
1737     }
1738     else
1739     {
1740 #ifdef GMX_MPI
1741         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1742                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1743 #endif
1744     }
1745 }
1746
1747 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1748                                        rvec *v, rvec *lv)
1749 {
1750     gmx_domdec_master_t *ma;
1751     int                 *scounts = NULL, *disps = NULL;
1752     int                  n, i, c, a, nalloc = 0;
1753     rvec                *buf = NULL;
1754
1755     if (DDMASTER(dd))
1756     {
1757         ma  = dd->ma;
1758
1759         get_commbuffer_counts(dd, &scounts, &disps);
1760
1761         buf = ma->vbuf;
1762         a   = 0;
1763         for (n = 0; n < dd->nnodes; n++)
1764         {
1765             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1766             {
1767                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1768                 {
1769                     copy_rvec(v[c], buf[a++]);
1770                 }
1771             }
1772         }
1773     }
1774
1775     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1776 }
1777
1778 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1779 {
1780     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1781     {
1782         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1783     }
1784     else
1785     {
1786         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1787     }
1788 }
1789
1790 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1791                                 t_state *state, t_state *state_local,
1792                                 rvec **f)
1793 {
1794     int  i, j, nh;
1795
1796     nh = state->nhchainlength;
1797
1798     if (DDMASTER(dd))
1799     {
1800         for (i = 0; i < efptNR; i++)
1801         {
1802             state_local->lambda[i] = state->lambda[i];
1803         }
1804         state_local->fep_state = state->fep_state;
1805         state_local->veta      = state->veta;
1806         state_local->vol0      = state->vol0;
1807         copy_mat(state->box, state_local->box);
1808         copy_mat(state->box_rel, state_local->box_rel);
1809         copy_mat(state->boxv, state_local->boxv);
1810         copy_mat(state->svir_prev, state_local->svir_prev);
1811         copy_mat(state->fvir_prev, state_local->fvir_prev);
1812         for (i = 0; i < state_local->ngtc; i++)
1813         {
1814             for (j = 0; j < nh; j++)
1815             {
1816                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1817                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1818             }
1819             state_local->therm_integral[i] = state->therm_integral[i];
1820         }
1821         for (i = 0; i < state_local->nnhpres; i++)
1822         {
1823             for (j = 0; j < nh; j++)
1824             {
1825                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1826                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1827             }
1828         }
1829     }
1830     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1831     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1832     dd_bcast(dd, sizeof(real), &state_local->veta);
1833     dd_bcast(dd, sizeof(real), &state_local->vol0);
1834     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1835     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1836     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1837     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1838     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1839     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1840     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1841     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1842     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1843     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1844
1845     if (dd->nat_home > state_local->nalloc)
1846     {
1847         dd_realloc_state(state_local, f, dd->nat_home);
1848     }
1849     for (i = 0; i < estNR; i++)
1850     {
1851         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1852         {
1853             switch (i)
1854             {
1855                 case estX:
1856                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1857                     break;
1858                 case estV:
1859                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1860                     break;
1861                 case estSDX:
1862                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1863                     break;
1864                 case estCGP:
1865                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1866                     break;
1867                 case estLD_RNG:
1868                     if (state->nrngi == 1)
1869                     {
1870                         dd_bcastc(dd,
1871                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
1872                                   state->ld_rng, state_local->ld_rng);
1873                     }
1874                     else
1875                     {
1876                         dd_scatter(dd,
1877                                    state_local->nrng*sizeof(state_local->ld_rng[0]),
1878                                    state->ld_rng, state_local->ld_rng);
1879                     }
1880                     break;
1881                 case estLD_RNGI:
1882                     if (state->nrngi == 1)
1883                     {
1884                         dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
1885                                   state->ld_rngi, state_local->ld_rngi);
1886                     }
1887                     else
1888                     {
1889                         dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
1890                                    state->ld_rngi, state_local->ld_rngi);
1891                     }
1892                     break;
1893                 case estDISRE_INITF:
1894                 case estDISRE_RM3TAV:
1895                 case estORIRE_INITF:
1896                 case estORIRE_DTAV:
1897                     /* Not implemented yet */
1898                     break;
1899                 default:
1900                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1901             }
1902         }
1903     }
1904 }
1905
1906 static char dim2char(int dim)
1907 {
1908     char c = '?';
1909
1910     switch (dim)
1911     {
1912         case XX: c = 'X'; break;
1913         case YY: c = 'Y'; break;
1914         case ZZ: c = 'Z'; break;
1915         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1916     }
1917
1918     return c;
1919 }
1920
1921 static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step,
1922                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1923 {
1924     rvec   grid_s[2], *grid_r = NULL, cx, r;
1925     char   fname[STRLEN], format[STRLEN], buf[22];
1926     FILE  *out;
1927     int    a, i, d, z, y, x;
1928     matrix tric;
1929     real   vol;
1930
1931     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1932     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1933
1934     if (DDMASTER(dd))
1935     {
1936         snew(grid_r, 2*dd->nnodes);
1937     }
1938
1939     dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
1940
1941     if (DDMASTER(dd))
1942     {
1943         for (d = 0; d < DIM; d++)
1944         {
1945             for (i = 0; i < DIM; i++)
1946             {
1947                 if (d == i)
1948                 {
1949                     tric[d][i] = 1;
1950                 }
1951                 else
1952                 {
1953                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1954                     {
1955                         tric[d][i] = box[i][d]/box[i][i];
1956                     }
1957                     else
1958                     {
1959                         tric[d][i] = 0;
1960                     }
1961                 }
1962             }
1963         }
1964         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1965         sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
1966         out = gmx_fio_fopen(fname, "w");
1967         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1968         a = 1;
1969         for (i = 0; i < dd->nnodes; i++)
1970         {
1971             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1972             for (d = 0; d < DIM; d++)
1973             {
1974                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1975             }
1976             for (z = 0; z < 2; z++)
1977             {
1978                 for (y = 0; y < 2; y++)
1979                 {
1980                     for (x = 0; x < 2; x++)
1981                     {
1982                         cx[XX] = grid_r[i*2+x][XX];
1983                         cx[YY] = grid_r[i*2+y][YY];
1984                         cx[ZZ] = grid_r[i*2+z][ZZ];
1985                         mvmul(tric, cx, r);
1986                         fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
1987                                 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
1988                     }
1989                 }
1990             }
1991             for (d = 0; d < DIM; d++)
1992             {
1993                 for (x = 0; x < 4; x++)
1994                 {
1995                     switch (d)
1996                     {
1997                         case 0: y = 1 + i*8 + 2*x; break;
1998                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1999                         case 2: y = 1 + i*8 + x; break;
2000                     }
2001                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
2002                 }
2003             }
2004         }
2005         gmx_fio_fclose(out);
2006         sfree(grid_r);
2007     }
2008 }
2009
2010 void write_dd_pdb(const char *fn, gmx_large_int_t step, const char *title,
2011                   gmx_mtop_t *mtop, t_commrec *cr,
2012                   int natoms, rvec x[], matrix box)
2013 {
2014     char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
2015     FILE         *out;
2016     int           i, ii, resnr, c;
2017     char         *atomname, *resname;
2018     real          b;
2019     gmx_domdec_t *dd;
2020
2021     dd = cr->dd;
2022     if (natoms == -1)
2023     {
2024         natoms = dd->comm->nat[ddnatVSITE];
2025     }
2026
2027     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
2028
2029     sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
2030     sprintf(format4, "%s%s\n", get_pdbformat4(), "%6.2f%6.2f");
2031
2032     out = gmx_fio_fopen(fname, "w");
2033
2034     fprintf(out, "TITLE     %s\n", title);
2035     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2036     for (i = 0; i < natoms; i++)
2037     {
2038         ii = dd->gatindex[i];
2039         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
2040         if (i < dd->comm->nat[ddnatZONE])
2041         {
2042             c = 0;
2043             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2044             {
2045                 c++;
2046             }
2047             b = c;
2048         }
2049         else if (i < dd->comm->nat[ddnatVSITE])
2050         {
2051             b = dd->comm->zones.n;
2052         }
2053         else
2054         {
2055             b = dd->comm->zones.n + 1;
2056         }
2057         fprintf(out, strlen(atomname) < 4 ? format : format4,
2058                 "ATOM", (ii+1)%100000,
2059                 atomname, resname, ' ', resnr%10000, ' ',
2060                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
2061     }
2062     fprintf(out, "TER\n");
2063
2064     gmx_fio_fclose(out);
2065 }
2066
2067 real dd_cutoff_mbody(gmx_domdec_t *dd)
2068 {
2069     gmx_domdec_comm_t *comm;
2070     int                di;
2071     real               r;
2072
2073     comm = dd->comm;
2074
2075     r = -1;
2076     if (comm->bInterCGBondeds)
2077     {
2078         if (comm->cutoff_mbody > 0)
2079         {
2080             r = comm->cutoff_mbody;
2081         }
2082         else
2083         {
2084             /* cutoff_mbody=0 means we do not have DLB */
2085             r = comm->cellsize_min[dd->dim[0]];
2086             for (di = 1; di < dd->ndim; di++)
2087             {
2088                 r = min(r, comm->cellsize_min[dd->dim[di]]);
2089             }
2090             if (comm->bBondComm)
2091             {
2092                 r = max(r, comm->cutoff_mbody);
2093             }
2094             else
2095             {
2096                 r = min(r, comm->cutoff);
2097             }
2098         }
2099     }
2100
2101     return r;
2102 }
2103
2104 real dd_cutoff_twobody(gmx_domdec_t *dd)
2105 {
2106     real r_mb;
2107
2108     r_mb = dd_cutoff_mbody(dd);
2109
2110     return max(dd->comm->cutoff, r_mb);
2111 }
2112
2113
2114 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
2115 {
2116     int nc, ntot;
2117
2118     nc   = dd->nc[dd->comm->cartpmedim];
2119     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2120     copy_ivec(coord, coord_pme);
2121     coord_pme[dd->comm->cartpmedim] =
2122         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2123 }
2124
/* Maps DD index ddindex, out of ndd, onto one of npme PME indices.
 *
 * Here we assign a PME node to communicate with this DD node
 * by assuming that the major index of both is x.
 * Half of npme is added before the integer division by ndd
 * to obtain an even distribution.
 */
static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
{
    const int half_npme = npme/2;
    const int scaled    = ddindex*npme + half_npme;

    return scaled/ndd;
}
2133
2134 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
2135 {
2136     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
2137 }
2138
2139 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
2140 {
2141     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
2142 }
2143
2144 static int *dd_pmenodes(t_commrec *cr)
2145 {
2146     int *pmenodes;
2147     int  n, i, p0, p1;
2148
2149     snew(pmenodes, cr->npmenodes);
2150     n = 0;
2151     for (i = 0; i < cr->dd->nnodes; i++)
2152     {
2153         p0 = cr_ddindex2pmeindex(cr, i);
2154         p1 = cr_ddindex2pmeindex(cr, i+1);
2155         if (i+1 == cr->dd->nnodes || p1 > p0)
2156         {
2157             if (debug)
2158             {
2159                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
2160             }
2161             pmenodes[n] = i + 1 + n;
2162             n++;
2163         }
2164     }
2165
2166     return pmenodes;
2167 }
2168
2169 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
2170 {
2171     gmx_domdec_t *dd;
2172     ivec          coords, coords_pme, nc;
2173     int           slab;
2174
2175     dd = cr->dd;
2176     /*
2177        if (dd->comm->bCartesian) {
2178        gmx_ddindex2xyz(dd->nc,ddindex,coords);
2179        dd_coords2pmecoords(dd,coords,coords_pme);
2180        copy_ivec(dd->ntot,nc);
2181        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2182        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2183
2184        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2185        } else {
2186        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2187        }
2188      */
2189     coords[XX] = x;
2190     coords[YY] = y;
2191     coords[ZZ] = z;
2192     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
2193
2194     return slab;
2195 }
2196
2197 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
2198 {
2199     gmx_domdec_comm_t *comm;
2200     ivec               coords;
2201     int                ddindex, nodeid = -1;
2202
2203     comm = cr->dd->comm;
2204
2205     coords[XX] = x;
2206     coords[YY] = y;
2207     coords[ZZ] = z;
2208     if (comm->bCartesianPP_PME)
2209     {
2210 #ifdef GMX_MPI
2211         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
2212 #endif
2213     }
2214     else
2215     {
2216         ddindex = dd_index(cr->dd->nc, coords);
2217         if (comm->bCartesianPP)
2218         {
2219             nodeid = comm->ddindex2simnodeid[ddindex];
2220         }
2221         else
2222         {
2223             if (comm->pmenodes)
2224             {
2225                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
2226             }
2227             else
2228             {
2229                 nodeid = ddindex;
2230             }
2231         }
2232     }
2233
2234     return nodeid;
2235 }
2236
2237 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
2238 {
2239     gmx_domdec_t      *dd;
2240     gmx_domdec_comm_t *comm;
2241     ivec               coord, coord_pme;
2242     int                i;
2243     int                pmenode = -1;
2244
2245     dd   = cr->dd;
2246     comm = dd->comm;
2247
2248     /* This assumes a uniform x domain decomposition grid cell size */
2249     if (comm->bCartesianPP_PME)
2250     {
2251 #ifdef GMX_MPI
2252         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2253         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2254         {
2255             /* This is a PP node */
2256             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2257             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2258         }
2259 #endif
2260     }
2261     else if (comm->bCartesianPP)
2262     {
2263         if (sim_nodeid < dd->nnodes)
2264         {
2265             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2266         }
2267     }
2268     else
2269     {
2270         /* This assumes DD cells with identical x coordinates
2271          * are numbered sequentially.
2272          */
2273         if (dd->comm->pmenodes == NULL)
2274         {
2275             if (sim_nodeid < dd->nnodes)
2276             {
2277                 /* The DD index equals the nodeid */
2278                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2279             }
2280         }
2281         else
2282         {
2283             i = 0;
2284             while (sim_nodeid > dd->comm->pmenodes[i])
2285             {
2286                 i++;
2287             }
2288             if (sim_nodeid < dd->comm->pmenodes[i])
2289             {
2290                 pmenode = dd->comm->pmenodes[i];
2291             }
2292         }
2293     }
2294
2295     return pmenode;
2296 }
2297
2298 gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
2299 {
2300     gmx_bool bPMEOnlyNode;
2301
2302     if (DOMAINDECOMP(cr))
2303     {
2304         bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
2305     }
2306     else
2307     {
2308         bPMEOnlyNode = FALSE;
2309     }
2310
2311     return bPMEOnlyNode;
2312 }
2313
2314 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2315                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2316 {
2317     gmx_domdec_t *dd;
2318     int           x, y, z;
2319     ivec          coord, coord_pme;
2320
2321     dd = cr->dd;
2322
2323     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2324
2325     *nmy_ddnodes = 0;
2326     for (x = 0; x < dd->nc[XX]; x++)
2327     {
2328         for (y = 0; y < dd->nc[YY]; y++)
2329         {
2330             for (z = 0; z < dd->nc[ZZ]; z++)
2331             {
2332                 if (dd->comm->bCartesianPP_PME)
2333                 {
2334                     coord[XX] = x;
2335                     coord[YY] = y;
2336                     coord[ZZ] = z;
2337                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2338                     if (dd->ci[XX] == coord_pme[XX] &&
2339                         dd->ci[YY] == coord_pme[YY] &&
2340                         dd->ci[ZZ] == coord_pme[ZZ])
2341                     {
2342                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2343                     }
2344                 }
2345                 else
2346                 {
2347                     /* The slab corresponds to the nodeid in the PME group */
2348                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2349                     {
2350                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2351                     }
2352                 }
2353             }
2354         }
2355     }
2356
2357     /* The last PP-only node is the peer node */
2358     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2359
2360     if (debug)
2361     {
2362         fprintf(debug, "Receive coordinates from PP nodes:");
2363         for (x = 0; x < *nmy_ddnodes; x++)
2364         {
2365             fprintf(debug, " %d", (*my_ddnodes)[x]);
2366         }
2367         fprintf(debug, "\n");
2368     }
2369 }
2370
2371 static gmx_bool receive_vir_ener(t_commrec *cr)
2372 {
2373     gmx_domdec_comm_t *comm;
2374     int                pmenode, coords[DIM], rank;
2375     gmx_bool           bReceive;
2376
2377     bReceive = TRUE;
2378     if (cr->npmenodes < cr->dd->nnodes)
2379     {
2380         comm = cr->dd->comm;
2381         if (comm->bCartesianPP_PME)
2382         {
2383             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2384 #ifdef GMX_MPI
2385             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2386             coords[comm->cartpmedim]++;
2387             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2388             {
2389                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2390                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2391                 {
2392                     /* This is not the last PP node for pmenode */
2393                     bReceive = FALSE;
2394                 }
2395             }
2396 #endif
2397         }
2398         else
2399         {
2400             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2401             if (cr->sim_nodeid+1 < cr->nnodes &&
2402                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2403             {
2404                 /* This is not the last PP node for pmenode */
2405                 bReceive = FALSE;
2406             }
2407         }
2408     }
2409
2410     return bReceive;
2411 }
2412
2413 static void set_zones_ncg_home(gmx_domdec_t *dd)
2414 {
2415     gmx_domdec_zones_t *zones;
2416     int                 i;
2417
2418     zones = &dd->comm->zones;
2419
2420     zones->cg_range[0] = 0;
2421     for (i = 1; i < zones->n+1; i++)
2422     {
2423         zones->cg_range[i] = dd->ncg_home;
2424     }
2425 }
2426
2427 static void rebuild_cgindex(gmx_domdec_t *dd,
2428                             const int *gcgs_index, t_state *state)
2429 {
2430     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2431
2432     ind        = state->cg_gl;
2433     dd_cg_gl   = dd->index_gl;
2434     cgindex    = dd->cgindex;
2435     nat        = 0;
2436     cgindex[0] = nat;
2437     for (i = 0; i < state->ncg_gl; i++)
2438     {
2439         cgindex[i]  = nat;
2440         cg_gl       = ind[i];
2441         dd_cg_gl[i] = cg_gl;
2442         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2443     }
2444     cgindex[i] = nat;
2445
2446     dd->ncg_home = state->ncg_gl;
2447     dd->nat_home = nat;
2448
2449     set_zones_ncg_home(dd);
2450 }
2451
2452 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2453 {
2454     while (cg >= cginfo_mb->cg_end)
2455     {
2456         cginfo_mb++;
2457     }
2458
2459     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2460 }
2461
2462 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2463                           t_forcerec *fr, char *bLocalCG)
2464 {
2465     cginfo_mb_t *cginfo_mb;
2466     int         *cginfo;
2467     int          cg;
2468
2469     if (fr != NULL)
2470     {
2471         cginfo_mb = fr->cginfo_mb;
2472         cginfo    = fr->cginfo;
2473
2474         for (cg = cg0; cg < cg1; cg++)
2475         {
2476             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2477         }
2478     }
2479
2480     if (bLocalCG != NULL)
2481     {
2482         for (cg = cg0; cg < cg1; cg++)
2483         {
2484             bLocalCG[index_gl[cg]] = TRUE;
2485         }
2486     }
2487 }
2488
2489 static void make_dd_indices(gmx_domdec_t *dd,
2490                             const int *gcgs_index, int cg_start)
2491 {
2492     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2493     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2494     gmx_ga2la_t *ga2la;
2495     char        *bLocalCG;
2496     gmx_bool     bCGs;
2497
2498     bLocalCG = dd->comm->bLocalCG;
2499
2500     if (dd->nat_tot > dd->gatindex_nalloc)
2501     {
2502         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2503         srenew(dd->gatindex, dd->gatindex_nalloc);
2504     }
2505
2506     nzone      = dd->comm->zones.n;
2507     zone2cg    = dd->comm->zones.cg_range;
2508     zone_ncg1  = dd->comm->zone_ncg1;
2509     index_gl   = dd->index_gl;
2510     gatindex   = dd->gatindex;
2511     bCGs       = dd->comm->bCGs;
2512
2513     if (zone2cg[1] != dd->ncg_home)
2514     {
2515         gmx_incons("dd->ncg_zone is not up to date");
2516     }
2517
2518     /* Make the local to global and global to local atom index */
2519     a = dd->cgindex[cg_start];
2520     for (zone = 0; zone < nzone; zone++)
2521     {
2522         if (zone == 0)
2523         {
2524             cg0 = cg_start;
2525         }
2526         else
2527         {
2528             cg0 = zone2cg[zone];
2529         }
2530         cg1    = zone2cg[zone+1];
2531         cg1_p1 = cg0 + zone_ncg1[zone];
2532
2533         for (cg = cg0; cg < cg1; cg++)
2534         {
2535             zone1 = zone;
2536             if (cg >= cg1_p1)
2537             {
2538                 /* Signal that this cg is from more than one pulse away */
2539                 zone1 += nzone;
2540             }
2541             cg_gl = index_gl[cg];
2542             if (bCGs)
2543             {
2544                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2545                 {
2546                     gatindex[a] = a_gl;
2547                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2548                     a++;
2549                 }
2550             }
2551             else
2552             {
2553                 gatindex[a] = cg_gl;
2554                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2555                 a++;
2556             }
2557         }
2558     }
2559 }
2560
2561 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2562                           const char *where)
2563 {
2564     int ncg, i, ngl, nerr;
2565
2566     nerr = 0;
2567     if (bLocalCG == NULL)
2568     {
2569         return nerr;
2570     }
2571     for (i = 0; i < dd->ncg_tot; i++)
2572     {
2573         if (!bLocalCG[dd->index_gl[i]])
2574         {
2575             fprintf(stderr,
2576                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2577             nerr++;
2578         }
2579     }
2580     ngl = 0;
2581     for (i = 0; i < ncg_sys; i++)
2582     {
2583         if (bLocalCG[i])
2584         {
2585             ngl++;
2586         }
2587     }
2588     if (ngl != dd->ncg_tot)
2589     {
2590         fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2591         nerr++;
2592     }
2593
2594     return nerr;
2595 }
2596
2597 static void check_index_consistency(gmx_domdec_t *dd,
2598                                     int natoms_sys, int ncg_sys,
2599                                     const char *where)
2600 {
2601     int   nerr, ngl, i, a, cell;
2602     int  *have;
2603
2604     nerr = 0;
2605
2606     if (dd->comm->DD_debug > 1)
2607     {
2608         snew(have, natoms_sys);
2609         for (a = 0; a < dd->nat_tot; a++)
2610         {
2611             if (have[dd->gatindex[a]] > 0)
2612             {
2613                 fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2614             }
2615             else
2616             {
2617                 have[dd->gatindex[a]] = a + 1;
2618             }
2619         }
2620         sfree(have);
2621     }
2622
2623     snew(have, dd->nat_tot);
2624
2625     ngl  = 0;
2626     for (i = 0; i < natoms_sys; i++)
2627     {
2628         if (ga2la_get(dd->ga2la, i, &a, &cell))
2629         {
2630             if (a >= dd->nat_tot)
2631             {
2632                 fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2633                 nerr++;
2634             }
2635             else
2636             {
2637                 have[a] = 1;
2638                 if (dd->gatindex[a] != i)
2639                 {
2640                     fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2641                     nerr++;
2642                 }
2643             }
2644             ngl++;
2645         }
2646     }
2647     if (ngl != dd->nat_tot)
2648     {
2649         fprintf(stderr,
2650                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2651                 dd->rank, where, ngl, dd->nat_tot);
2652     }
2653     for (a = 0; a < dd->nat_tot; a++)
2654     {
2655         if (have[a] == 0)
2656         {
2657             fprintf(stderr,
2658                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2659                     dd->rank, where, a+1, dd->gatindex[a]+1);
2660         }
2661     }
2662     sfree(have);
2663
2664     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2665
2666     if (nerr > 0)
2667     {
2668         gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies",
2669                   dd->rank, where, nerr);
2670     }
2671 }
2672
2673 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2674 {
2675     int   i;
2676     char *bLocalCG;
2677
2678     if (a_start == 0)
2679     {
2680         /* Clear the whole list without searching */
2681         ga2la_clear(dd->ga2la);
2682     }
2683     else
2684     {
2685         for (i = a_start; i < dd->nat_tot; i++)
2686         {
2687             ga2la_del(dd->ga2la, dd->gatindex[i]);
2688         }
2689     }
2690
2691     bLocalCG = dd->comm->bLocalCG;
2692     if (bLocalCG)
2693     {
2694         for (i = cg_start; i < dd->ncg_tot; i++)
2695         {
2696             bLocalCG[dd->index_gl[i]] = FALSE;
2697         }
2698     }
2699
2700     dd_clear_local_vsite_indices(dd);
2701
2702     if (dd->constraints)
2703     {
2704         dd_clear_local_constraint_indices(dd);
2705     }
2706 }
2707
2708 /* This function should be used for moving the domain boudaries during DLB,
2709  * for obtaining the minimum cell size. It checks the initially set limit
2710  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2711  * and, possibly, a longer cut-off limit set for PME load balancing.
2712  */
2713 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2714 {
2715     real cellsize_min;
2716
2717     cellsize_min = comm->cellsize_min[dim];
2718
2719     if (!comm->bVacDLBNoLimit)
2720     {
2721         /* The cut-off might have changed, e.g. by PME load balacning,
2722          * from the value used to set comm->cellsize_min, so check it.
2723          */
2724         cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2725
2726         if (comm->bPMELoadBalDLBLimits)
2727         {
2728             /* Check for the cut-off limit set by the PME load balancing */
2729             cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2730         }
2731     }
2732
2733     return cellsize_min;
2734 }
2735
2736 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2737                             int dim_ind)
2738 {
2739     real grid_jump_limit;
2740
2741     /* The distance between the boundaries of cells at distance
2742      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2743      * and by the fact that cells should not be shifted by more than
2744      * half their size, such that cg's only shift by one cell
2745      * at redecomposition.
2746      */
2747     grid_jump_limit = comm->cellsize_limit;
2748     if (!comm->bVacDLBNoLimit)
2749     {
2750         if (comm->bPMELoadBalDLBLimits)
2751         {
2752             cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
2753         }
2754         grid_jump_limit = max(grid_jump_limit,
2755                               cutoff/comm->cd[dim_ind].np);
2756     }
2757
2758     return grid_jump_limit;
2759 }
2760
2761 static gmx_bool check_grid_jump(gmx_large_int_t step,
2762                                 gmx_domdec_t   *dd,
2763                                 real            cutoff,
2764                                 gmx_ddbox_t    *ddbox,
2765                                 gmx_bool        bFatal)
2766 {
2767     gmx_domdec_comm_t *comm;
2768     int                d, dim;
2769     real               limit, bfac;
2770     gmx_bool           bInvalid;
2771
2772     bInvalid = FALSE;
2773
2774     comm = dd->comm;
2775
2776     for (d = 1; d < dd->ndim; d++)
2777     {
2778         dim   = dd->dim[d];
2779         limit = grid_jump_limit(comm, cutoff, d);
2780         bfac  = ddbox->box_size[dim];
2781         if (ddbox->tric_dir[dim])
2782         {
2783             bfac *= ddbox->skew_fac[dim];
2784         }
2785         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2786                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2787         {
2788             bInvalid = TRUE;
2789
2790             if (bFatal)
2791             {
2792                 char buf[22];
2793
2794                 /* This error should never be triggered under normal
2795                  * circumstances, but you never know ...
2796                  */
2797                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2798                           gmx_step_str(step, buf),
2799                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2800             }
2801         }
2802     }
2803
2804     return bInvalid;
2805 }
2806
2807 static int dd_load_count(gmx_domdec_comm_t *comm)
2808 {
2809     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2810 }
2811
2812 static float dd_force_load(gmx_domdec_comm_t *comm)
2813 {
2814     float load;
2815
2816     if (comm->eFlop)
2817     {
2818         load = comm->flop;
2819         if (comm->eFlop > 1)
2820         {
2821             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2822         }
2823     }
2824     else
2825     {
2826         load = comm->cycl[ddCyclF];
2827         if (comm->cycl_n[ddCyclF] > 1)
2828         {
2829             /* Subtract the maximum of the last n cycle counts
2830              * to get rid of possible high counts due to other soures,
2831              * for instance system activity, that would otherwise
2832              * affect the dynamic load balancing.
2833              */
2834             load -= comm->cycl_max[ddCyclF];
2835         }
2836     }
2837
2838     return load;
2839 }
2840
2841 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2842 {
2843     gmx_domdec_comm_t *comm;
2844     int                i;
2845
2846     comm = dd->comm;
2847
2848     snew(*dim_f, dd->nc[dim]+1);
2849     (*dim_f)[0] = 0;
2850     for (i = 1; i < dd->nc[dim]; i++)
2851     {
2852         if (comm->slb_frac[dim])
2853         {
2854             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2855         }
2856         else
2857         {
2858             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2859         }
2860     }
2861     (*dim_f)[dd->nc[dim]] = 1;
2862 }
2863
2864 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2865 {
2866     int  pmeindex, slab, nso, i;
2867     ivec xyz;
2868
2869     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2870     {
2871         ddpme->dim = YY;
2872     }
2873     else
2874     {
2875         ddpme->dim = dimind;
2876     }
2877     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2878
2879     ddpme->nslab = (ddpme->dim == 0 ?
2880                     dd->comm->npmenodes_x :
2881                     dd->comm->npmenodes_y);
2882
2883     if (ddpme->nslab <= 1)
2884     {
2885         return;
2886     }
2887
2888     nso = dd->comm->npmenodes/ddpme->nslab;
2889     /* Determine for each PME slab the PP location range for dimension dim */
2890     snew(ddpme->pp_min, ddpme->nslab);
2891     snew(ddpme->pp_max, ddpme->nslab);
2892     for (slab = 0; slab < ddpme->nslab; slab++)
2893     {
2894         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2895         ddpme->pp_max[slab] = 0;
2896     }
2897     for (i = 0; i < dd->nnodes; i++)
2898     {
2899         ddindex2xyz(dd->nc, i, xyz);
2900         /* For y only use our y/z slab.
2901          * This assumes that the PME x grid size matches the DD grid size.
2902          */
2903         if (dimind == 0 || xyz[XX] == dd->ci[XX])
2904         {
2905             pmeindex = ddindex2pmeindex(dd, i);
2906             if (dimind == 0)
2907             {
2908                 slab = pmeindex/nso;
2909             }
2910             else
2911             {
2912                 slab = pmeindex % ddpme->nslab;
2913             }
2914             ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
2915             ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
2916         }
2917     }
2918
2919     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2920 }
2921
2922 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2923 {
2924     if (dd->comm->ddpme[0].dim == XX)
2925     {
2926         return dd->comm->ddpme[0].maxshift;
2927     }
2928     else
2929     {
2930         return 0;
2931     }
2932 }
2933
2934 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2935 {
2936     if (dd->comm->ddpme[0].dim == YY)
2937     {
2938         return dd->comm->ddpme[0].maxshift;
2939     }
2940     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2941     {
2942         return dd->comm->ddpme[1].maxshift;
2943     }
2944     else
2945     {
2946         return 0;
2947     }
2948 }
2949
/* Determines ddpme->maxshift, the PME slab communication range for the
 * dimension ddpme->dim, and stores it in ddpme.
 *
 * dd       - domain decomposition data; dd->nc and dd->comm->cutoff are read
 * ddpme    - PME slab setup; dim, nslab, dim_match, pp_min and pp_max are
 *            read, maxshift is written
 * bUniform - when set and the number of slabs equals the number of cells
 *            along the dimension, the minimal range of 1 is used
 * ddbox    - box description; box_size[ddpme->dim] is read
 * cell_f   - relative cell boundaries, indexed with the cell indices stored
 *            in pp_min/pp_max; only read in the general (last) branch
 *
 * The result is also printed to the debug file when debugging is active.
 */
static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
                             gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
{
    gmx_domdec_comm_t *comm;
    int                nc, ns, s;
    int               *xmin, *xmax;
    real               range, pme_boundary;
    int                sh;

    comm = dd->comm;
    nc   = dd->nc[ddpme->dim];
    ns   = ddpme->nslab;

    if (!ddpme->dim_match)
    {
        /* PP decomposition is not along dim: the worst situation */
        sh = ns/2;
    }
    else if (ns <= 3 || (bUniform && ns == nc))
    {
        /* The optimal situation */
        sh = 1;
    }
    else
    {
        /* We need to check for all pme nodes which nodes they
         * could possibly need to communicate with.
         */
        xmin = ddpme->pp_min;
        xmax = ddpme->pp_max;
        /* Allow for atoms to be maximally 2/3 times the cut-off
         * out of their DD cell. This is a reasonable balance
         * between performance and support for most charge-group/cut-off
         * combinations.
         */
        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
        /* Avoid extra communication when we are exactly at a boundary */
        range *= 0.999;

        /* sh only grows: it ends up as the largest shift needed by any slab */
        sh = 1;
        for (s = 0; s < ns; s++)
        {
            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
            pme_boundary = (real)s/ns;
            /* Lower side: increase sh while the upper boundary of the cells
             * of slab s-(sh+1), extended by range, still reaches past the
             * lower boundary of slab s. A negative slab index is wrapped
             * around by adding ns and shifting the boundary by -1.
             */
            while (sh+1 < ns &&
                   ((s-(sh+1) >= 0 &&
                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
                    (s-(sh+1) <  0 &&
                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
            {
                sh++;
            }
            pme_boundary = (real)(s+1)/ns;
            /* Upper side: the mirror image of the check above, using the
             * lower boundary of the cells of slab s+(sh+1); a slab index
             * >= ns is wrapped around by subtracting ns and shifting
             * the boundary by +1.
             */
            while (sh+1 < ns &&
                   ((s+(sh+1) <  ns &&
                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
                    (s+(sh+1) >= ns &&
                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
            {
                sh++;
            }
        }
    }

    ddpme->maxshift = sh;

    if (debug)
    {
        fprintf(debug, "PME slab communication range for dim %d is %d\n",
                ddpme->dim, ddpme->maxshift);
    }
}
3022
3023 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3024 {
3025     int d, dim;
3026
3027     for (d = 0; d < dd->ndim; d++)
3028     {
3029         dim = dd->dim[d];
3030         if (dim < ddbox->nboundeddim &&
3031             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3032             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3033         {
3034             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3035                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3036                       dd->nc[dim], dd->comm->cellsize_limit);
3037         }
3038     }
3039 }
3040
3041 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
3042                                   gmx_bool bMaster, ivec npulse)
3043 {
3044     gmx_domdec_comm_t *comm;
3045     int                d, j;
3046     rvec               cellsize_min;
3047     real              *cell_x, cell_dx, cellsize;
3048
3049     comm = dd->comm;
3050
3051     for (d = 0; d < DIM; d++)
3052     {
3053         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3054         npulse[d]       = 1;
3055         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3056         {
3057             /* Uniform grid */
3058             cell_dx = ddbox->box_size[d]/dd->nc[d];
3059             if (bMaster)
3060             {
3061                 for (j = 0; j < dd->nc[d]+1; j++)
3062                 {
3063                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3064                 }
3065             }
3066             else
3067             {
3068                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3069                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3070             }
3071             cellsize = cell_dx*ddbox->skew_fac[d];
3072             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3073             {
3074                 npulse[d]++;
3075             }
3076             cellsize_min[d] = cellsize;
3077         }
3078         else
3079         {
3080             /* Statically load balanced grid */
3081             /* Also when we are not doing a master distribution we determine
3082              * all cell borders in a loop to obtain identical values
3083              * to the master distribution case and to determine npulse.
3084              */
3085             if (bMaster)
3086             {
3087                 cell_x = dd->ma->cell_x[d];
3088             }
3089             else
3090             {
3091                 snew(cell_x, dd->nc[d]+1);
3092             }
3093             cell_x[0] = ddbox->box0[d];
3094             for (j = 0; j < dd->nc[d]; j++)
3095             {
3096                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
3097                 cell_x[j+1] = cell_x[j] + cell_dx;
3098                 cellsize    = cell_dx*ddbox->skew_fac[d];
3099                 while (cellsize*npulse[d] < comm->cutoff &&
3100                        npulse[d] < dd->nc[d]-1)
3101                 {
3102                     npulse[d]++;
3103                 }
3104                 cellsize_min[d] = min(cellsize_min[d], cellsize);
3105             }
3106             if (!bMaster)
3107             {
3108                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3109                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3110                 sfree(cell_x);
3111             }
3112         }
3113         /* The following limitation is to avoid that a cell would receive
3114          * some of its own home charge groups back over the periodic boundary.
3115          * Double charge groups cause trouble with the global indices.
3116          */
3117         if (d < ddbox->npbcdim &&
3118             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3119         {
3120             gmx_fatal_collective(FARGS, NULL, dd,
3121                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3122                                  dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
3123                                  comm->cutoff,
3124                                  dd->nc[d], dd->nc[d],
3125                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
3126         }
3127     }
3128
3129     if (!comm->bDynLoadBal)
3130     {
3131         copy_rvec(cellsize_min, comm->cellsize_min);
3132     }
3133
3134     for (d = 0; d < comm->npmedecompdim; d++)
3135     {
3136         set_pme_maxshift(dd, &comm->ddpme[d],
3137                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
3138                          comm->ddpme[d].slb_dim_f);
3139     }
3140 }
3141
3142
/* Turns the requested cell sizes in root->buf_ncd into cell boundaries
 * root->cell_f for the cells range[0] .. range[1]-1, while enforcing
 * the cell size and boundary limits.
 *
 * dd               - domain decomposition data
 * d                - DD dimension index; staggering is only handled for d > 0
 * dim              - Cartesian dimension belonging to d
 * root             - row root data: buf_ncd (cell sizes), cell_f, old_cell_f,
 *                    bCellMin, bound_min/bound_max, cell_f_max0/cell_f_min1
 *                    are used; bLimited is written
 * ddbox            - box description, used for the PBC test and error output
 * bUniform         - when TRUE the halfway and staggering limit checks
 *                    are skipped
 * step             - MD step, only used in the error message
 * cellsize_limit_f - minimum allowed cell size as a box fraction
 * range            - [lower, upper) cell index range to process; the
 *                    boundaries root->cell_f[range[0]] and
 *                    root->cell_f[range[1]] are taken as fixed
 *
 * The function calls itself recursively on sub-ranges when a boundary
 * has to be pinned to one of its staggering limits.
 */
static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
                                                  int d, int dim, gmx_domdec_root_t *root,
                                                  gmx_ddbox_t *ddbox,
                                                  gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[])
{
    gmx_domdec_comm_t *comm;
    int                ncd, i, j, nmin, nmin_old;
    gmx_bool           bLimLo, bLimHi;
    real              *cell_size;
    real               fac, halfway, cellsize_limit_f_i, region_size;
    gmx_bool           bPBC, bLastHi = FALSE;
    int                nrange[] = {range[0], range[1]};

    /* The total relative size that the cells in range have to fill */
    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];

    comm = dd->comm;

    ncd = dd->nc[dim];

    bPBC = (dim < ddbox->npbcdim);

    cell_size = root->buf_ncd;

    if (debug)
    {
        fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
    }

    /* First we need to check if the scaling does not make cells
     * smaller than the smallest allowed size.
     * We need to do this iteratively, since if a cell is too small,
     * it needs to be enlarged, which makes all the other cells smaller,
     * which could in turn make another cell smaller than allowed.
     */
    for (i = range[0]; i < range[1]; i++)
    {
        root->bCellMin[i] = FALSE;
    }
    nmin = 0;
    do
    {
        nmin_old = nmin;
        /* We need the total for normalization */
        fac = 0;
        for (i = range[0]; i < range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                fac += cell_size[i];
            }
        }
        fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
        /* Determine the cell boundaries */
        for (i = range[0]; i < range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                cell_size[i] *= fac;
                /* Without PBC the first and last cell of the dimension
                 * have no minimum size.
                 */
                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
                {
                    cellsize_limit_f_i = 0;
                }
                else
                {
                    cellsize_limit_f_i = cellsize_limit_f;
                }
                if (cell_size[i] < cellsize_limit_f_i)
                {
                    root->bCellMin[i] = TRUE;
                    cell_size[i]      = cellsize_limit_f_i;
                    nmin++;
                }
            }
            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
        }
    }
    while (nmin > nmin_old);

    /* Recompute the size of the last cell in range from its boundaries */
    i            = range[1]-1;
    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
    /* For this check we should not use DD_CELL_MARGIN,
     * but a slightly smaller factor,
     * since rounding could get us below the limit.
     */
    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
    {
        char buf[22];
        gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
                  gmx_step_str(step, buf),
                  dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
                  ncd, comm->cellsize_min[dim]);
    }

    /* Limited when a cell hit the minimum size or when only
     * a part of the row is processed.
     */
    root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);

    if (!bUniform)
    {
        /* Check if the boundary did not displace more than halfway
         * each of the cells it bounds, as this could cause problems,
         * especially when the differences between cell sizes are large.
         * If changes are applied, they will not make cells smaller
         * than the cut-off, as we check all the boundaries which
         * might be affected by a change and if the old state was ok,
         * the cells will at most be shrunk back to their old size.
         */
        for (i = range[0]+1; i < range[1]; i++)
        {
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
            if (root->cell_f[i] < halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the next boundaries */
                for (j = i+1; j < range[1]; j++)
                {
                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
                    {
                        root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
                    }
                }
            }
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
            if (root->cell_f[i] > halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the previous boundaries */
                for (j = i-1; j >= range[0]+1; j--)
                {
                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
                    {
                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
                    }
                }
            }
        }
    }

    /* nrange is defined as [lower, upper) range for new call to enforce_limits */
    /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
     * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
     * for a and b nrange is used */
    if (d > 0)
    {
        /* Take care of the staggering of the cell boundaries */
        if (bUniform)
        {
            for (i = range[0]; i < range[1]; i++)
            {
                root->cell_f_max0[i] = root->cell_f[i];
                root->cell_f_min1[i] = root->cell_f[i+1];
            }
        }
        else
        {
            for (i = range[0]+1; i < range[1]; i++)
            {
                bLimLo = (root->cell_f[i] < root->bound_min[i]);
                bLimHi = (root->cell_f[i] > root->bound_max[i]);
                if (bLimLo && bLimHi)
                {
                    /* Both limits violated, try the best we can */
                    /* For this case we split the original range (range) in two parts and care about the other limitations in the next iteration. */
                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
                    nrange[0]       = range[0];
                    nrange[1]       = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    nrange[0] = i;
                    nrange[1] = range[1];
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    return;
                }
                else if (bLimLo)
                {
                    /* root->cell_f[i] = root->bound_min[i]; */
                    nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with a higher index */
                    bLastHi   = FALSE;
                }
                else if (bLimHi && !bLastHi)
                {
                    bLastHi = TRUE;
                    if (nrange[1] < range[1])   /* found a LimLo before */
                    {
                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                        nrange[0] = nrange[1];
                    }
                    root->cell_f[i] = root->bound_max[i];
                    nrange[1]       = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                    nrange[0] = i;
                    nrange[1] = range[1];
                }
            }
            if (nrange[1] < range[1])   /* found last a LimLo */
            {
                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                nrange[0] = nrange[1];
                nrange[1] = range[1];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
            else if (nrange[0] > range[0]) /* found at least one LimHi */
            {
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
        }
    }
}
3352
3353
/* Computes, on the root of a DD row, the new relative cell boundaries
 * root->cell_f for dimension index d and packs the data that is sent to
 * the other row members after the boundaries.
 *
 * dd          - domain decomposition data
 * d           - DD dimension index
 * dim         - Cartesian dimension belonging to d
 * root        - row root data; cell_f is updated, old_cell_f, buf_ncd,
 *               bound_min and bound_max are used as work space
 * ddbox       - box description
 * bDynamicBox - when TRUE (and d > 0) the staggering distance is enlarged
 *               by DD_PRES_SCALE_MARGIN
 * bUniform    - when TRUE all cells get the same size 1/ncd
 * step        - MD step, passed on for error and warning messages
 *
 * Layout of root->cell_f on return: ncd+1 boundaries, followed by the
 * cell_f0/cell_f1 pairs of the lower dimensions d1 < d, followed by
 * the PME maxshift value(s) stored as real.
 */
static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
                                       int d, int dim, gmx_domdec_root_t *root,
                                       gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
                                       gmx_bool bUniform, gmx_large_int_t step)
{
    gmx_domdec_comm_t *comm;
    int                ncd, d1, i, j, pos;
    real              *cell_size;
    real               load_aver, load_i, imbalance, change, change_max, sc;
    real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
    real               change_limit;
    real               relax = 0.5;
    gmx_bool           bPBC;
    int                range[] = { 0, 0 };

    comm = dd->comm;

    /* Convert the maximum change from the input percentage to a fraction */
    change_limit = comm->dlb_scale_lim*0.01;

    ncd = dd->nc[dim];

    bPBC = (dim < ddbox->npbcdim);

    cell_size = root->buf_ncd;

    /* Store the original boundaries */
    for (i = 0; i < ncd+1; i++)
    {
        root->old_cell_f[i] = root->cell_f[i];
    }
    /* NOTE(review): when bUniform is FALSE and dd_load_count() returns 0,
     * neither branch below runs and cell_size keeps the values already
     * present in root->buf_ncd - confirm this is intended.
     */
    if (bUniform)
    {
        for (i = 0; i < ncd; i++)
        {
            cell_size[i] = 1.0/ncd;
        }
    }
    else if (dd_load_count(comm))
    {
        load_aver  = comm->load[d].sum_m/ncd;
        change_max = 0;
        /* First pass: find the largest absolute change requested */
        for (i = 0; i < ncd; i++)
        {
            /* Determine the relative imbalance of cell i */
            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
            /* Determine the change of the cell size using underrelaxation */
            change     = -relax*imbalance;
            change_max = max(change_max, max(change, -change));
        }
        /* Limit the amount of scaling.
         * We need to use the same rescaling for all cells in one row,
         * otherwise the load balancing might not converge.
         */
        sc = relax;
        if (change_max > change_limit)
        {
            sc *= change_limit/change_max;
        }
        /* Second pass: apply the, possibly reduced, scaling to the old sizes */
        for (i = 0; i < ncd; i++)
        {
            /* Determine the relative imbalance of cell i */
            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
            /* Determine the change of the cell size using underrelaxation */
            change       = -sc*imbalance;
            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
        }
    }

    /* Convert the size and distance limits to box fractions */
    cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
    cellsize_limit_f *= DD_CELL_MARGIN;
    dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
    if (ddbox->tric_dir[dim])
    {
        cellsize_limit_f /= ddbox->skew_fac[dim];
        dist_min_f       /= ddbox->skew_fac[dim];
    }
    if (bDynamicBox && d > 0)
    {
        dist_min_f *= DD_PRES_SCALE_MARGIN;
    }
    if (d > 0 && !bUniform)
    {
        /* Make sure that the grid is not shifted too much */
        for (i = 1; i < ncd; i++)
        {
            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
            {
                gmx_incons("Inconsistent DD boundary staggering limits!");
            }
            /* The lower bound is moved halfway towards the current boundary
             * when there is room between the two.
             */
            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
            space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
            if (space > 0)
            {
                root->bound_min[i] += 0.5*space;
            }
            /* Same for the upper bound; here space is negative when
             * there is room.
             */
            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
            space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
            if (space < 0)
            {
                root->bound_max[i] += 0.5*space;
            }
            if (debug)
            {
                fprintf(debug,
                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
                        d, i,
                        root->cell_f_max0[i-1] + dist_min_f,
                        root->bound_min[i], root->cell_f[i], root->bound_max[i],
                        root->cell_f_min1[i] - dist_min_f);
            }
        }
    }
    /* Process the complete row [0, ncd) with fixed outer boundaries 0 and 1 */
    range[1]          = ncd;
    root->cell_f[0]   = 0;
    root->cell_f[ncd] = 1;
    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);


    /* After the checks above, the cells should obey the cut-off
     * restrictions, but it does not hurt to check.
     */
    for (i = 0; i < ncd; i++)
    {
        if (debug)
        {
            fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
                    dim, i, root->cell_f[i], root->cell_f[i+1]);
        }

        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
            root->cell_f[i+1] - root->cell_f[i] <
            cellsize_limit_f/DD_CELL_MARGIN)
        {
            char buf[22];
            fprintf(stderr,
                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
                    gmx_step_str(step, buf), dim2char(dim), i,
                    (root->cell_f[i+1] - root->cell_f[i])
                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
        }
    }

    pos = ncd + 1;
    /* Store the cell boundaries of the lower dimensions at the end */
    for (d1 = 0; d1 < d; d1++)
    {
        root->cell_f[pos++] = comm->cell_f0[d1];
        root->cell_f[pos++] = comm->cell_f1[d1];
    }

    if (d < comm->npmedecompdim)
    {
        /* The master determines the maximum shift for
         * the coordinate communication between separate PME nodes.
         */
        set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
    }
    /* Append the maxshift value(s) to the buffer; they are stored as real */
    root->cell_f[pos++] = comm->ddpme[0].maxshift;
    if (d >= 1)
    {
        root->cell_f[pos++] = comm->ddpme[1].maxshift;
    }
}
3521
3522 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3523                                              gmx_ddbox_t *ddbox, int dimind)
3524 {
3525     gmx_domdec_comm_t *comm;
3526     int                dim;
3527
3528     comm = dd->comm;
3529
3530     /* Set the cell dimensions */
3531     dim                = dd->dim[dimind];
3532     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3533     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3534     if (dim >= ddbox->nboundeddim)
3535     {
3536         comm->cell_x0[dim] += ddbox->box0[dim];
3537         comm->cell_x1[dim] += ddbox->box0[dim];
3538     }
3539 }
3540
3541 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3542                                          int d, int dim, real *cell_f_row,
3543                                          gmx_ddbox_t *ddbox)
3544 {
3545     gmx_domdec_comm_t *comm;
3546     int                d1, dim1, pos;
3547
3548     comm = dd->comm;
3549
3550 #ifdef GMX_MPI
3551     /* Each node would only need to know two fractions,
3552      * but it is probably cheaper to broadcast the whole array.
3553      */
3554     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3555               0, comm->mpi_comm_load[d]);
3556 #endif
3557     /* Copy the fractions for this dimension from the buffer */
3558     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3559     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3560     /* The whole array was communicated, so set the buffer position */
3561     pos = dd->nc[dim] + 1;
3562     for (d1 = 0; d1 <= d; d1++)
3563     {
3564         if (d1 < d)
3565         {
3566             /* Copy the cell fractions of the lower dimensions */
3567             comm->cell_f0[d1] = cell_f_row[pos++];
3568             comm->cell_f1[d1] = cell_f_row[pos++];
3569         }
3570         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3571     }
3572     /* Convert the communicated shift from float to int */
3573     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3574     if (d >= 1)
3575     {
3576         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3577     }
3578 }
3579
3580 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3581                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3582                                          gmx_bool bUniform, gmx_large_int_t step)
3583 {
3584     gmx_domdec_comm_t *comm;
3585     int                d, dim, d1;
3586     gmx_bool           bRowMember, bRowRoot;
3587     real              *cell_f_row;
3588
3589     comm = dd->comm;
3590
3591     for (d = 0; d < dd->ndim; d++)
3592     {
3593         dim        = dd->dim[d];
3594         bRowMember = TRUE;
3595         bRowRoot   = TRUE;
3596         for (d1 = d; d1 < dd->ndim; d1++)
3597         {
3598             if (dd->ci[dd->dim[d1]] > 0)
3599             {
3600                 if (d1 > d)
3601                 {
3602                     bRowMember = FALSE;
3603                 }
3604                 bRowRoot = FALSE;
3605             }
3606         }
3607         if (bRowMember)
3608         {
3609             if (bRowRoot)
3610             {
3611                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3612                                            ddbox, bDynamicBox, bUniform, step);
3613                 cell_f_row = comm->root[d]->cell_f;
3614             }
3615             else
3616             {
3617                 cell_f_row = comm->cell_f_row;
3618             }
3619             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3620         }
3621     }
3622 }
3623
3624 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3625 {
3626     int d;
3627
3628     /* This function assumes the box is static and should therefore
3629      * not be called when the box has changed since the last
3630      * call to dd_partition_system.
3631      */
3632     for (d = 0; d < dd->ndim; d++)
3633     {
3634         relative_to_absolute_cell_bounds(dd, ddbox, d);
3635     }
3636 }
3637
3638
3639
3640 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3641                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3642                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3643                                   gmx_wallcycle_t wcycle)
3644 {
3645     gmx_domdec_comm_t *comm;
3646     int                dim;
3647
3648     comm = dd->comm;
3649
3650     if (bDoDLB)
3651     {
3652         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3653         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3654         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3655     }
3656     else if (bDynamicBox)
3657     {
3658         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3659     }
3660
3661     /* Set the dimensions for which no DD is used */
3662     for (dim = 0; dim < DIM; dim++)
3663     {
3664         if (dd->nc[dim] == 1)
3665         {
3666             comm->cell_x0[dim] = 0;
3667             comm->cell_x1[dim] = ddbox->box_size[dim];
3668             if (dim >= ddbox->nboundeddim)
3669             {
3670                 comm->cell_x0[dim] += ddbox->box0[dim];
3671                 comm->cell_x1[dim] += ddbox->box0[dim];
3672             }
3673         }
3674     }
3675 }
3676
3677 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3678 {
3679     int                    d, np, i;
3680     gmx_domdec_comm_dim_t *cd;
3681
3682     for (d = 0; d < dd->ndim; d++)
3683     {
3684         cd = &dd->comm->cd[d];
3685         np = npulse[dd->dim[d]];
3686         if (np > cd->np_nalloc)
3687         {
3688             if (debug)
3689             {
3690                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3691                         dim2char(dd->dim[d]), np);
3692             }
3693             if (DDMASTER(dd) && cd->np_nalloc > 0)
3694             {
3695                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3696             }
3697             srenew(cd->ind, np);
3698             for (i = cd->np_nalloc; i < np; i++)
3699             {
3700                 cd->ind[i].index  = NULL;
3701                 cd->ind[i].nalloc = 0;
3702             }
3703             cd->np_nalloc = np;
3704         }
3705         cd->np = np;
3706     }
3707 }
3708
3709
3710 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3711                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3712                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3713                               gmx_wallcycle_t wcycle)
3714 {
3715     gmx_domdec_comm_t *comm;
3716     int                d;
3717     ivec               npulse;
3718
3719     comm = dd->comm;
3720
3721     /* Copy the old cell boundaries for the cg displacement check */
3722     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3723     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3724
3725     if (comm->bDynLoadBal)
3726     {
3727         if (DDMASTER(dd))
3728         {
3729             check_box_size(dd, ddbox);
3730         }
3731         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3732     }
3733     else
3734     {
3735         set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse);
3736         realloc_comm_ind(dd, npulse);
3737     }
3738
3739     if (debug)
3740     {
3741         for (d = 0; d < DIM; d++)
3742         {
3743             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3744                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3745         }
3746     }
3747 }
3748
3749 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3750                                   gmx_ddbox_t *ddbox,
3751                                   rvec cell_ns_x0, rvec cell_ns_x1,
3752                                   gmx_large_int_t step)
3753 {
3754     gmx_domdec_comm_t *comm;
3755     int                dim_ind, dim;
3756
3757     comm = dd->comm;
3758
3759     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3760     {
3761         dim = dd->dim[dim_ind];
3762
3763         /* Without PBC we don't have restrictions on the outer cells */
3764         if (!(dim >= ddbox->npbcdim &&
3765               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3766             comm->bDynLoadBal &&
3767             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3768             comm->cellsize_min[dim])
3769         {
3770             char buf[22];
3771             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3772                       gmx_step_str(step, buf), dim2char(dim),
3773                       comm->cell_x1[dim] - comm->cell_x0[dim],
3774                       ddbox->skew_fac[dim],
3775                       dd->comm->cellsize_min[dim],
3776                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3777         }
3778     }
3779
3780     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3781     {
3782         /* Communicate the boundaries and update cell_ns_x0/1 */
3783         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3784         if (dd->bGridJump && dd->ndim > 1)
3785         {
3786             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3787         }
3788     }
3789 }
3790
3791 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3792 {
3793     if (YY < npbcdim)
3794     {
3795         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3796     }
3797     else
3798     {
3799         tcm[YY][XX] = 0;
3800     }
3801     if (ZZ < npbcdim)
3802     {
3803         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3804         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3805     }
3806     else
3807     {
3808         tcm[ZZ][XX] = 0;
3809         tcm[ZZ][YY] = 0;
3810     }
3811 }
3812
3813 static void check_screw_box(matrix box)
3814 {
3815     /* Mathematical limitation */
3816     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3817     {
3818         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3819     }
3820
3821     /* Limitation due to the asymmetry of the eighth shell method */
3822     if (box[ZZ][YY] != 0)
3823     {
3824         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3825     }
3826 }
3827
3828 static void distribute_cg(FILE *fplog, gmx_large_int_t step,
3829                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3830                           gmx_domdec_t *dd)
3831 {
3832     gmx_domdec_master_t *ma;
3833     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3834     int                  i, icg, j, k, k0, k1, d, npbcdim;
3835     matrix               tcm;
3836     rvec                 box_size, cg_cm;
3837     ivec                 ind;
3838     real                 nrcg, inv_ncg, pos_d;
3839     atom_id             *cgindex;
3840     gmx_bool             bUnbounded, bScrew;
3841
3842     ma = dd->ma;
3843
3844     if (tmp_ind == NULL)
3845     {
3846         snew(tmp_nalloc, dd->nnodes);
3847         snew(tmp_ind, dd->nnodes);
3848         for (i = 0; i < dd->nnodes; i++)
3849         {
3850             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3851             snew(tmp_ind[i], tmp_nalloc[i]);
3852         }
3853     }
3854
3855     /* Clear the count */
3856     for (i = 0; i < dd->nnodes; i++)
3857     {
3858         ma->ncg[i] = 0;
3859         ma->nat[i] = 0;
3860     }
3861
3862     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3863
3864     cgindex = cgs->index;
3865
3866     /* Compute the center of geometry for all charge groups */
3867     for (icg = 0; icg < cgs->nr; icg++)
3868     {
3869         k0      = cgindex[icg];
3870         k1      = cgindex[icg+1];
3871         nrcg    = k1 - k0;
3872         if (nrcg == 1)
3873         {
3874             copy_rvec(pos[k0], cg_cm);
3875         }
3876         else
3877         {
3878             inv_ncg = 1.0/nrcg;
3879
3880             clear_rvec(cg_cm);
3881             for (k = k0; (k < k1); k++)
3882             {
3883                 rvec_inc(cg_cm, pos[k]);
3884             }
3885             for (d = 0; (d < DIM); d++)
3886             {
3887                 cg_cm[d] *= inv_ncg;
3888             }
3889         }
3890         /* Put the charge group in the box and determine the cell index */
3891         for (d = DIM-1; d >= 0; d--)
3892         {
3893             pos_d = cg_cm[d];
3894             if (d < dd->npbcdim)
3895             {
3896                 bScrew = (dd->bScrewPBC && d == XX);
3897                 if (tric_dir[d] && dd->nc[d] > 1)
3898                 {
3899                     /* Use triclinic coordintates for this dimension */
3900                     for (j = d+1; j < DIM; j++)
3901                     {
3902                         pos_d += cg_cm[j]*tcm[j][d];
3903                     }
3904                 }
3905                 while (pos_d >= box[d][d])
3906                 {
3907                     pos_d -= box[d][d];
3908                     rvec_dec(cg_cm, box[d]);
3909                     if (bScrew)
3910                     {
3911                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3912                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3913                     }
3914                     for (k = k0; (k < k1); k++)
3915                     {
3916                         rvec_dec(pos[k], box[d]);
3917                         if (bScrew)
3918                         {
3919                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3920                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3921                         }
3922                     }
3923                 }
3924                 while (pos_d < 0)
3925                 {
3926                     pos_d += box[d][d];
3927                     rvec_inc(cg_cm, box[d]);
3928                     if (bScrew)
3929                     {
3930                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3931                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3932                     }
3933                     for (k = k0; (k < k1); k++)
3934                     {
3935                         rvec_inc(pos[k], box[d]);
3936                         if (bScrew)
3937                         {
3938                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3939                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3940                         }
3941                     }
3942                 }
3943             }
3944             /* This could be done more efficiently */
3945             ind[d] = 0;
3946             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3947             {
3948                 ind[d]++;
3949             }
3950         }
3951         i = dd_index(dd->nc, ind);
3952         if (ma->ncg[i] == tmp_nalloc[i])
3953         {
3954             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3955             srenew(tmp_ind[i], tmp_nalloc[i]);
3956         }
3957         tmp_ind[i][ma->ncg[i]] = icg;
3958         ma->ncg[i]++;
3959         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3960     }
3961
3962     k1 = 0;
3963     for (i = 0; i < dd->nnodes; i++)
3964     {
3965         ma->index[i] = k1;
3966         for (k = 0; k < ma->ncg[i]; k++)
3967         {
3968             ma->cg[k1++] = tmp_ind[i][k];
3969         }
3970     }
3971     ma->index[dd->nnodes] = k1;
3972
3973     for (i = 0; i < dd->nnodes; i++)
3974     {
3975         sfree(tmp_ind[i]);
3976     }
3977     sfree(tmp_ind);
3978     sfree(tmp_nalloc);
3979
3980     if (fplog)
3981     {
3982         char buf[22];
3983         fprintf(fplog, "Charge group distribution at step %s:",
3984                 gmx_step_str(step, buf));
3985         for (i = 0; i < dd->nnodes; i++)
3986         {
3987             fprintf(fplog, " %d", ma->ncg[i]);
3988         }
3989         fprintf(fplog, "\n");
3990     }
3991 }
3992
3993 static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd,
3994                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
3995                                 rvec pos[])
3996 {
3997     gmx_domdec_master_t *ma = NULL;
3998     ivec                 npulse;
3999     int                  i, cg_gl;
4000     int                 *ibuf, buf2[2] = { 0, 0 };
4001     gmx_bool             bMaster = DDMASTER(dd);
4002     if (bMaster)
4003     {
4004         ma = dd->ma;
4005
4006         if (dd->bScrewPBC)
4007         {
4008             check_screw_box(box);
4009         }
4010
4011         set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse);
4012
4013         distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
4014         for (i = 0; i < dd->nnodes; i++)
4015         {
4016             ma->ibuf[2*i]   = ma->ncg[i];
4017             ma->ibuf[2*i+1] = ma->nat[i];
4018         }
4019         ibuf = ma->ibuf;
4020     }
4021     else
4022     {
4023         ibuf = NULL;
4024     }
4025     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
4026
4027     dd->ncg_home = buf2[0];
4028     dd->nat_home = buf2[1];
4029     dd->ncg_tot  = dd->ncg_home;
4030     dd->nat_tot  = dd->nat_home;
4031     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4032     {
4033         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4034         srenew(dd->index_gl, dd->cg_nalloc);
4035         srenew(dd->cgindex, dd->cg_nalloc+1);
4036     }
4037     if (bMaster)
4038     {
4039         for (i = 0; i < dd->nnodes; i++)
4040         {
4041             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
4042             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4043         }
4044     }
4045
4046     dd_scatterv(dd,
4047                 DDMASTER(dd) ? ma->ibuf : NULL,
4048                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
4049                 DDMASTER(dd) ? ma->cg : NULL,
4050                 dd->ncg_home*sizeof(int), dd->index_gl);
4051
4052     /* Determine the home charge group sizes */
4053     dd->cgindex[0] = 0;
4054     for (i = 0; i < dd->ncg_home; i++)
4055     {
4056         cg_gl            = dd->index_gl[i];
4057         dd->cgindex[i+1] =
4058             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4059     }
4060
4061     if (debug)
4062     {
4063         fprintf(debug, "Home charge groups:\n");
4064         for (i = 0; i < dd->ncg_home; i++)
4065         {
4066             fprintf(debug, " %d", dd->index_gl[i]);
4067             if (i % 10 == 9)
4068             {
4069                 fprintf(debug, "\n");
4070             }
4071         }
4072         fprintf(debug, "\n");
4073     }
4074 }
4075
4076 static int compact_and_copy_vec_at(int ncg, int *move,
4077                                    int *cgindex,
4078                                    int nvec, int vec,
4079                                    rvec *src, gmx_domdec_comm_t *comm,
4080                                    gmx_bool bCompact)
4081 {
4082     int m, icg, i, i0, i1, nrcg;
4083     int home_pos;
4084     int pos_vec[DIM*2];
4085
4086     home_pos = 0;
4087
4088     for (m = 0; m < DIM*2; m++)
4089     {
4090         pos_vec[m] = 0;
4091     }
4092
4093     i0 = 0;
4094     for (icg = 0; icg < ncg; icg++)
4095     {
4096         i1 = cgindex[icg+1];
4097         m  = move[icg];
4098         if (m == -1)
4099         {
4100             if (bCompact)
4101             {
4102                 /* Compact the home array in place */
4103                 for (i = i0; i < i1; i++)
4104                 {
4105                     copy_rvec(src[i], src[home_pos++]);
4106                 }
4107             }
4108         }
4109         else
4110         {
4111             /* Copy to the communication buffer */
4112             nrcg        = i1 - i0;
4113             pos_vec[m] += 1 + vec*nrcg;
4114             for (i = i0; i < i1; i++)
4115             {
4116                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
4117             }
4118             pos_vec[m] += (nvec - vec - 1)*nrcg;
4119         }
4120         if (!bCompact)
4121         {
4122             home_pos += i1 - i0;
4123         }
4124         i0 = i1;
4125     }
4126
4127     return home_pos;
4128 }
4129
4130 static int compact_and_copy_vec_cg(int ncg, int *move,
4131                                    int *cgindex,
4132                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
4133                                    gmx_bool bCompact)
4134 {
4135     int m, icg, i0, i1, nrcg;
4136     int home_pos;
4137     int pos_vec[DIM*2];
4138
4139     home_pos = 0;
4140
4141     for (m = 0; m < DIM*2; m++)
4142     {
4143         pos_vec[m] = 0;
4144     }
4145
4146     i0 = 0;
4147     for (icg = 0; icg < ncg; icg++)
4148     {
4149         i1 = cgindex[icg+1];
4150         m  = move[icg];
4151         if (m == -1)
4152         {
4153             if (bCompact)
4154             {
4155                 /* Compact the home array in place */
4156                 copy_rvec(src[icg], src[home_pos++]);
4157             }
4158         }
4159         else
4160         {
4161             nrcg = i1 - i0;
4162             /* Copy to the communication buffer */
4163             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
4164             pos_vec[m] += 1 + nrcg*nvec;
4165         }
4166         i0 = i1;
4167     }
4168     if (!bCompact)
4169     {
4170         home_pos = ncg;
4171     }
4172
4173     return home_pos;
4174 }
4175
4176 static int compact_ind(int ncg, int *move,
4177                        int *index_gl, int *cgindex,
4178                        int *gatindex,
4179                        gmx_ga2la_t ga2la, char *bLocalCG,
4180                        int *cginfo)
4181 {
4182     int cg, nat, a0, a1, a, a_gl;
4183     int home_pos;
4184
4185     home_pos = 0;
4186     nat      = 0;
4187     for (cg = 0; cg < ncg; cg++)
4188     {
4189         a0 = cgindex[cg];
4190         a1 = cgindex[cg+1];
4191         if (move[cg] == -1)
4192         {
4193             /* Compact the home arrays in place.
4194              * Anything that can be done here avoids access to global arrays.
4195              */
4196             cgindex[home_pos] = nat;
4197             for (a = a0; a < a1; a++)
4198             {
4199                 a_gl          = gatindex[a];
4200                 gatindex[nat] = a_gl;
4201                 /* The cell number stays 0, so we don't need to set it */
4202                 ga2la_change_la(ga2la, a_gl, nat);
4203                 nat++;
4204             }
4205             index_gl[home_pos] = index_gl[cg];
4206             cginfo[home_pos]   = cginfo[cg];
4207             /* The charge group remains local, so bLocalCG does not change */
4208             home_pos++;
4209         }
4210         else
4211         {
4212             /* Clear the global indices */
4213             for (a = a0; a < a1; a++)
4214             {
4215                 ga2la_del(ga2la, gatindex[a]);
4216             }
4217             if (bLocalCG)
4218             {
4219                 bLocalCG[index_gl[cg]] = FALSE;
4220             }
4221         }
4222     }
4223     cgindex[home_pos] = nat;
4224
4225     return home_pos;
4226 }
4227
4228 static void clear_and_mark_ind(int ncg, int *move,
4229                                int *index_gl, int *cgindex, int *gatindex,
4230                                gmx_ga2la_t ga2la, char *bLocalCG,
4231                                int *cell_index)
4232 {
4233     int cg, a0, a1, a;
4234
4235     for (cg = 0; cg < ncg; cg++)
4236     {
4237         if (move[cg] >= 0)
4238         {
4239             a0 = cgindex[cg];
4240             a1 = cgindex[cg+1];
4241             /* Clear the global indices */
4242             for (a = a0; a < a1; a++)
4243             {
4244                 ga2la_del(ga2la, gatindex[a]);
4245             }
4246             if (bLocalCG)
4247             {
4248                 bLocalCG[index_gl[cg]] = FALSE;
4249             }
4250             /* Signal that this cg has moved using the ns cell index.
4251              * Here we set it to -1. fill_grid will change it
4252              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4253              */
4254             cell_index[cg] = -1;
4255         }
4256     }
4257 }
4258
4259 static void print_cg_move(FILE *fplog,
4260                           gmx_domdec_t *dd,
4261                           gmx_large_int_t step, int cg, int dim, int dir,
4262                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4263                           rvec cm_old, rvec cm_new, real pos_d)
4264 {
4265     gmx_domdec_comm_t *comm;
4266     char               buf[22];
4267
4268     comm = dd->comm;
4269
4270     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4271     if (bHaveLimitdAndCMOld)
4272     {
4273         fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4274                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4275     }
4276     else
4277     {
4278         fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4279                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4280     }
4281     fprintf(fplog, "distance out of cell %f\n",
4282             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4283     if (bHaveLimitdAndCMOld)
4284     {
4285         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4286                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4287     }
4288     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4289             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4290     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4291             dim2char(dim),
4292             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4293     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4294             dim2char(dim),
4295             comm->cell_x0[dim], comm->cell_x1[dim]);
4296 }
4297
4298 static void cg_move_error(FILE *fplog,
4299                           gmx_domdec_t *dd,
4300                           gmx_large_int_t step, int cg, int dim, int dir,
4301                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4302                           rvec cm_old, rvec cm_new, real pos_d)
4303 {
4304     if (fplog)
4305     {
4306         print_cg_move(fplog, dd, step, cg, dim, dir,
4307                       bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4308     }
4309     print_cg_move(stderr, dd, step, cg, dim, dir,
4310                   bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4311     gmx_fatal(FARGS,
4312               "A charge group moved too far between two domain decomposition steps\n"
4313               "This usually means that your system is not well equilibrated");
4314 }
4315
4316 static void rotate_state_atom(t_state *state, int a)
4317 {
4318     int est;
4319
4320     for (est = 0; est < estNR; est++)
4321     {
4322         if (EST_DISTR(est) && (state->flags & (1<<est)))
4323         {
4324             switch (est)
4325             {
4326                 case estX:
4327                     /* Rotate the complete state; for a rectangular box only */
4328                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4329                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4330                     break;
4331                 case estV:
4332                     state->v[a][YY] = -state->v[a][YY];
4333                     state->v[a][ZZ] = -state->v[a][ZZ];
4334                     break;
4335                 case estSDX:
4336                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4337                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4338                     break;
4339                 case estCGP:
4340                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4341                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4342                     break;
4343                 case estDISRE_INITF:
4344                 case estDISRE_RM3TAV:
4345                 case estORIRE_INITF:
4346                 case estORIRE_DTAV:
4347                     /* These are distances, so not affected by rotation */
4348                     break;
4349                 default:
4350                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4351             }
4352         }
4353     }
4354 }
4355
4356 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4357 {
4358     if (natoms > comm->moved_nalloc)
4359     {
4360         /* Contents should be preserved here */
4361         comm->moved_nalloc = over_alloc_dd(natoms);
4362         srenew(comm->moved, comm->moved_nalloc);
4363     }
4364
4365     return comm->moved;
4366 }
4367
4368 static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
4369                          gmx_domdec_t *dd,
4370                          t_state *state,
4371                          ivec tric_dir, matrix tcm,
4372                          rvec cell_x0, rvec cell_x1,
4373                          rvec limitd, rvec limit0, rvec limit1,
4374                          const int *cgindex,
4375                          int cg_start, int cg_end,
4376                          rvec *cg_cm,
4377                          int *move)
4378 {
4379     int      npbcdim;
4380     int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4381     int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4382     int      flag;
4383     gmx_bool bScrew;
4384     ivec     dev;
4385     real     inv_ncg, pos_d;
4386     rvec     cm_new;
4387
4388     npbcdim = dd->npbcdim;
4389
4390     for (cg = cg_start; cg < cg_end; cg++)
4391     {
4392         k0   = cgindex[cg];
4393         k1   = cgindex[cg+1];
4394         nrcg = k1 - k0;
4395         if (nrcg == 1)
4396         {
4397             copy_rvec(state->x[k0], cm_new);
4398         }
4399         else
4400         {
4401             inv_ncg = 1.0/nrcg;
4402
4403             clear_rvec(cm_new);
4404             for (k = k0; (k < k1); k++)
4405             {
4406                 rvec_inc(cm_new, state->x[k]);
4407             }
4408             for (d = 0; (d < DIM); d++)
4409             {
4410                 cm_new[d] = inv_ncg*cm_new[d];
4411             }
4412         }
4413
4414         clear_ivec(dev);
4415         /* Do pbc and check DD cell boundary crossings */
4416         for (d = DIM-1; d >= 0; d--)
4417         {
4418             if (dd->nc[d] > 1)
4419             {
4420                 bScrew = (dd->bScrewPBC && d == XX);
4421                 /* Determine the location of this cg in lattice coordinates */
4422                 pos_d = cm_new[d];
4423                 if (tric_dir[d])
4424                 {
4425                     for (d2 = d+1; d2 < DIM; d2++)
4426                     {
4427                         pos_d += cm_new[d2]*tcm[d2][d];
4428                     }
4429                 }
4430                 /* Put the charge group in the triclinic unit-cell */
4431                 if (pos_d >= cell_x1[d])
4432                 {
4433                     if (pos_d >= limit1[d])
4434                     {
4435                         cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
4436                                       cg_cm[cg], cm_new, pos_d);
4437                     }
4438                     dev[d] = 1;
4439                     if (dd->ci[d] == dd->nc[d] - 1)
4440                     {
4441                         rvec_dec(cm_new, state->box[d]);
4442                         if (bScrew)
4443                         {
4444                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4445                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4446                         }
4447                         for (k = k0; (k < k1); k++)
4448                         {
4449                             rvec_dec(state->x[k], state->box[d]);
4450                             if (bScrew)
4451                             {
4452                                 rotate_state_atom(state, k);
4453                             }
4454                         }
4455                     }
4456                 }
4457                 else if (pos_d < cell_x0[d])
4458                 {
4459                     if (pos_d < limit0[d])
4460                     {
4461                         cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
4462                                       cg_cm[cg], cm_new, pos_d);
4463                     }
4464                     dev[d] = -1;
4465                     if (dd->ci[d] == 0)
4466                     {
4467                         rvec_inc(cm_new, state->box[d]);
4468                         if (bScrew)
4469                         {
4470                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4471                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4472                         }
4473                         for (k = k0; (k < k1); k++)
4474                         {
4475                             rvec_inc(state->x[k], state->box[d]);
4476                             if (bScrew)
4477                             {
4478                                 rotate_state_atom(state, k);
4479                             }
4480                         }
4481                     }
4482                 }
4483             }
4484             else if (d < npbcdim)
4485             {
4486                 /* Put the charge group in the rectangular unit-cell */
4487                 while (cm_new[d] >= state->box[d][d])
4488                 {
4489                     rvec_dec(cm_new, state->box[d]);
4490                     for (k = k0; (k < k1); k++)
4491                     {
4492                         rvec_dec(state->x[k], state->box[d]);
4493                     }
4494                 }
4495                 while (cm_new[d] < 0)
4496                 {
4497                     rvec_inc(cm_new, state->box[d]);
4498                     for (k = k0; (k < k1); k++)
4499                     {
4500                         rvec_inc(state->x[k], state->box[d]);
4501                     }
4502                 }
4503             }
4504         }
4505
4506         copy_rvec(cm_new, cg_cm[cg]);
4507
4508         /* Determine where this cg should go */
4509         flag = 0;
4510         mc   = -1;
4511         for (d = 0; d < dd->ndim; d++)
4512         {
4513             dim = dd->dim[d];
4514             if (dev[dim] == 1)
4515             {
4516                 flag |= DD_FLAG_FW(d);
4517                 if (mc == -1)
4518                 {
4519                     mc = d*2;
4520                 }
4521             }
4522             else if (dev[dim] == -1)
4523             {
4524                 flag |= DD_FLAG_BW(d);
4525                 if (mc == -1)
4526                 {
4527                     if (dd->nc[dim] > 2)
4528                     {
4529                         mc = d*2 + 1;
4530                     }
4531                     else
4532                     {
4533                         mc = d*2;
4534                     }
4535                 }
4536             }
4537         }
4538         /* Temporarily store the flag in move */
4539         move[cg] = mc + flag;
4540     }
4541 }
4542
/* Redistribute the home charge groups (cgs) of this domain over the
 * neighboring domains after the atoms have moved.
 *
 * Outline of what the code below does:
 *  - calc_cg_move() fills move[] for every home cg; a negative entry means
 *    the cg stays, otherwise the entry holds a send-buffer index (mc)
 *    combined with forward/backward flags per decomposition dimension.
 *  - For every moving cg the global index and "size | flags" are stored in
 *    comm->cggl_flag[mc], and the cg center plus the per-atom state vectors
 *    are packed into comm->cgcm_state[mc] by the compact_and_copy_vec_*()
 *    helpers.
 *  - The local index arrays are either compacted (bCompact) or the moved
 *    entries are marked (clear_and_mark_ind).
 *  - For each decomposition dimension, counts, cg indices/flags and vectors
 *    are exchanged with the neighbor(s). A received cg is either appended
 *    to the home arrays or copied to the send buffer of a later dimension.
 *
 * fplog, step    passed on to calc_cg_move() and cg_move_error()
 * dd             domain decomposition data; index_gl, cgindex, ncg_home and
 *                nat_home are updated here
 * tric_dir       per dimension: when non-zero the position along that
 *                dimension is corrected with the off-diagonal tcm elements
 * state, f       local state; x and, when flagged, v, sd_X and cg_p are
 *                sent and received; f is only handed to the reallocation
 *                helpers
 * fr             supplies cutoff_scheme, cg_cm, cginfo, cginfo_mb and,
 *                without compaction in the group scheme, ns.grid->cell_index
 * md             not referenced in this function body
 * bCompact       selects compaction of the home arrays over marking
 * nrnb           eNR_CGCM and eNR_RESETX counters are incremented
 * ncg_stay_home  out: value of home_pos_cg before anything is received
 * ncg_moved      out: number of home cgs put into the send buffers
 */
4543 static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step,
4544                                gmx_domdec_t *dd, ivec tric_dir,
4545                                t_state *state, rvec **f,
4546                                t_forcerec *fr, t_mdatoms *md,
4547                                gmx_bool bCompact,
4548                                t_nrnb *nrnb,
4549                                int *ncg_stay_home,
4550                                int *ncg_moved)
4551 {
4552     int               *move;
4553     int                npbcdim;
4554     int                ncg[DIM*2], nat[DIM*2];
4555     int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4556     int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4557     int                sbuf[2], rbuf[2];
4558     int                home_pos_cg, home_pos_at, buf_pos;
4559     int                flag;
4560     gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4561     gmx_bool           bScrew;
4562     ivec               dev;
4563     real               inv_ncg, pos_d;
4564     matrix             tcm;
4565     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
4566     atom_id           *cgindex;
4567     cginfo_mb_t       *cginfo_mb;
4568     gmx_domdec_comm_t *comm;
4569     int               *moved;
4570     int                nthread, thread;
     /* NOTE(review): k0, k1, d4, cell_d, bScrew, dev, inv_ncg and cm_new are
      * declared above but not used in this function; nat_recv is only
      * accumulated and never read.
      */
4571
4572     if (dd->bScrewPBC)
4573     {
4574         check_screw_box(state->box);
4575     }
4576
4577     comm  = dd->comm;
4578     if (fr->cutoff_scheme == ecutsGROUP)
4579     {
4580         cg_cm = fr->cg_cm;
4581     }
4582
     /* Find out which of the optional per-atom state vectors (v, sd_X, cg_p)
      * are flagged in state->flags; those are communicated together with x.
      */
4583     for (i = 0; i < estNR; i++)
4584     {
4585         if (EST_DISTR(i))
4586         {
4587             switch (i)
4588             {
4589                 case estX: /* Always present */ break;
4590                 case estV:   bV   = (state->flags & (1<<i)); break;
4591                 case estSDX: bSDX = (state->flags & (1<<i)); break;
4592                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4593                 case estLD_RNG:
4594                 case estLD_RNGI:
4595                 case estDISRE_INITF:
4596                 case estDISRE_RM3TAV:
4597                 case estORIRE_INITF:
4598                 case estORIRE_DTAV:
4599                     /* No processing required */
4600                     break;
4601                 default:
4602                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4603             }
4604         }
4605     }
4606
     /* move[] aliases comm->buf_int. The same buffer is reused, and possibly
      * reallocated, as the integer receive buffer in the communication loop
      * below, so move must not be used once that loop has started.
      */
4607     if (dd->ncg_tot > comm->nalloc_int)
4608     {
4609         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4610         srenew(comm->buf_int, comm->nalloc_int);
4611     }
4612     move = comm->buf_int;
4613
4614     /* Clear the count */
4615     for (c = 0; c < dd->ndim*2; c++)
4616     {
4617         ncg[c] = 0;
4618         nat[c] = 0;
4619     }
4620
4621     npbcdim = dd->npbcdim;
4622
     /* Per dimension: the cell boundaries used for the move decision
      * (cell_x0/cell_x1) and the limits handed to calc_cg_move()
      * (limit0/limit1, the old cell boundaries widened by cellsize_min).
      * In dimensions without pbc the outer boundary of the first and last
      * cell is opened up to -/+GMX_FLOAT_MAX.
      */
4623     for (d = 0; (d < DIM); d++)
4624     {
4625         limitd[d] = dd->comm->cellsize_min[d];
4626         if (d >= npbcdim && dd->ci[d] == 0)
4627         {
4628             cell_x0[d] = -GMX_FLOAT_MAX;
4629         }
4630         else
4631         {
4632             cell_x0[d] = comm->cell_x0[d];
4633         }
4634         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4635         {
4636             cell_x1[d] = GMX_FLOAT_MAX;
4637         }
4638         else
4639         {
4640             cell_x1[d] = comm->cell_x1[d];
4641         }
4642         if (d < npbcdim)
4643         {
4644             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4645             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4646         }
4647         else
4648         {
4649             /* We check after communication if a charge group moved
4650              * more than one cell. Set the pre-comm check limit to float_max.
4651              */
4652             limit0[d] = -GMX_FLOAT_MAX;
4653             limit1[d] =  GMX_FLOAT_MAX;
4654         }
4655     }
4656
4657     make_tric_corr_matrix(npbcdim, state->box, tcm);
4658
4659     cgindex = dd->cgindex;
4660
4661     nthread = gmx_omp_nthreads_get(emntDomdec);
4662
4663     /* Compute the center of geometry for all home charge groups
4664      * and put them in the box and determine where they should go.
4665      */
4666 #pragma omp parallel for num_threads(nthread) schedule(static)
4667     for (thread = 0; thread < nthread; thread++)
4668     {
4669         calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4670                      cell_x0, cell_x1, limitd, limit0, limit1,
4671                      cgindex,
4672                      ( thread   *dd->ncg_home)/nthread,
4673                      ((thread+1)*dd->ncg_home)/nthread,
4674                      fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4675                      move);
4676     }
4677
     /* Split the temporary move[] value of each moving cg into the
      * send-buffer index mc and the direction flags, leave only mc in move[],
      * and append global index and "size | flags" to the send list of mc.
      */
4678     for (cg = 0; cg < dd->ncg_home; cg++)
4679     {
4680         if (move[cg] >= 0)
4681         {
4682             mc       = move[cg];
4683             flag     = mc & ~DD_FLAG_NRCG;
4684             mc       = mc & DD_FLAG_NRCG;
4685             move[cg] = mc;
4686
4687             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4688             {
4689                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4690                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4691             }
4692             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4693             /* We store the cg size in the lower 16 bits
4694              * and the place where the charge group should go
4695              * in the next 6 bits. This saves some communication volume.
4696              */
4697             nrcg = cgindex[cg+1] - cgindex[cg];
4698             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4699             ncg[mc] += 1;
4700             nat[mc] += nrcg;
4701         }
4702     }
4703
4704     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4705     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4706
4707     *ncg_moved = 0;
4708     for (i = 0; i < dd->ndim*2; i++)
4709     {
4710         *ncg_moved += ncg[i];
4711     }
4712
     /* nvec: number of rvecs communicated per atom (x plus each flagged
      * state vector).
      */
4713     nvec = 1;
4714     if (bV)
4715     {
4716         nvec++;
4717     }
4718     if (bSDX)
4719     {
4720         nvec++;
4721     }
4722     if (bCGP)
4723     {
4724         nvec++;
4725     }
4726
4727     /* Make sure the communication buffers are large enough */
4728     for (mc = 0; mc < dd->ndim*2; mc++)
4729     {
4730         nvr = ncg[mc] + nat[mc]*nvec;
4731         if (nvr > comm->cgcm_state_nalloc[mc])
4732         {
4733             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4734             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4735         }
4736     }
4737
4738     switch (fr->cutoff_scheme)
4739     {
4740         case ecutsGROUP:
4741             /* Recalculating cg_cm might be cheaper than communicating,
4742              * but that could give rise to rounding issues.
4743              */
4744             home_pos_cg =
4745                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4746                                         nvec, cg_cm, comm, bCompact);
4747             break;
4748         case ecutsVERLET:
4749             /* Without charge groups we send the moved atom coordinates
4750              * over twice. This is so the code below can be used without
4751              * many conditionals both with and without charge groups.
4752              */
4753             home_pos_cg =
4754                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4755                                         nvec, state->x, comm, FALSE);
             /* NOTE(review): the call above is made with compaction off,
              * so presumably its return value still counts the moved cgs
              * and is corrected here by hand - confirm against
              * compact_and_copy_vec_cg().
              */
4756             if (bCompact)
4757             {
4758                 home_pos_cg -= *ncg_moved;
4759             }
4760             break;
4761         default:
4762             gmx_incons("unimplemented");
4763             home_pos_cg = 0;
4764     }
4765
     /* Pack x and the flagged state vectors of the moving atoms; vec is the
      * slot index of the vector within the nvec vectors stored per cg.
      */
4766     vec         = 0;
4767     home_pos_at =
4768         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4769                                 nvec, vec++, state->x, comm, bCompact);
4770     if (bV)
4771     {
4772         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4773                                 nvec, vec++, state->v, comm, bCompact);
4774     }
4775     if (bSDX)
4776     {
4777         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4778                                 nvec, vec++, state->sd_X, comm, bCompact);
4779     }
4780     if (bCGP)
4781     {
4782         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4783                                 nvec, vec++, state->cg_p, comm, bCompact);
4784     }
4785
4786     if (bCompact)
4787     {
4788         compact_ind(dd->ncg_home, move,
4789                     dd->index_gl, dd->cgindex, dd->gatindex,
4790                     dd->ga2la, comm->bLocalCG,
4791                     fr->cginfo);
4792     }
4793     else
4794     {
4795         if (fr->cutoff_scheme == ecutsVERLET)
4796         {
4797             moved = get_moved(comm, dd->ncg_home);
4798
4799             for (k = 0; k < dd->ncg_home; k++)
4800             {
4801                 moved[k] = 0;
4802             }
4803         }
4804         else
4805         {
4806             moved = fr->ns.grid->cell_index;
4807         }
4808
4809         clear_and_mark_ind(dd->ncg_home, move,
4810                            dd->index_gl, dd->cgindex, dd->gatindex,
4811                            dd->ga2la, comm->bLocalCG,
4812                            moved);
4813     }
4814
4815     cginfo_mb = fr->cginfo_mb;
4816
     /* Received cgs are stored in the home arrays from index home_pos_cg
      * (atoms from home_pos_at) onwards.
      */
4817     *ncg_stay_home = home_pos_cg;
4818     for (d = 0; d < dd->ndim; d++)
4819     {
4820         dim      = dd->dim[d];
4821         ncg_recv = 0;
4822         nat_recv = 0;
4823         nvr      = 0;
         /* With exactly two cells along dim only direction 0 is used;
          * backward moves were assigned to send buffer d*2 in that case.
          */
4824         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4825         {
4826             cdd = d*2 + dir;
4827             /* Communicate the cg and atom counts */
4828             sbuf[0] = ncg[cdd];
4829             sbuf[1] = nat[cdd];
4830             if (debug)
4831             {
4832                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4833                         d, dir, sbuf[0], sbuf[1]);
4834             }
4835             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4836
4837             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4838             {
4839                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4840                 srenew(comm->buf_int, comm->nalloc_int);
4841             }
4842
4843             /* Communicate the charge group indices, sizes and flags */
4844             dd_sendrecv_int(dd, d, dir,
4845                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4846                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4847
             /* Number of rvecs to send (nvs) and to receive (i):
              * one cg center per cg plus nvec vectors per atom.
              */
4848             nvs = ncg[cdd] + nat[cdd]*nvec;
4849             i   = rbuf[0]  + rbuf[1] *nvec;
4850             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4851
4852             /* Communicate cgcm and state */
4853             dd_sendrecv_rvec(dd, d, dir,
4854                              comm->cgcm_state[cdd], nvs,
4855                              comm->vbuf.v+nvr, i);
4856             ncg_recv += rbuf[0];
4857             nat_recv += rbuf[1];
4858             nvr      += i;
4859         }
4860
4861         /* Process the received charge groups */
4862         buf_pos = 0;
4863         for (cg = 0; cg < ncg_recv; cg++)
4864         {
4865             flag = comm->buf_int[cg*DD_CGIBS+1];
4866
4867             if (dim >= npbcdim && dd->nc[dim] > 2)
4868             {
4869                 /* No pbc in this dim and more than one domain boundary.
4870                  * We do a separate check if a charge group didn't move too far.
4871                  */
4872                 if (((flag & DD_FLAG_FW(d)) &&
4873                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4874                     ((flag & DD_FLAG_BW(d)) &&
4875                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4876                 {
4877                     cg_move_error(fplog, dd, step, cg, dim,
4878                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4879                                   FALSE, 0,
4880                                   comm->vbuf.v[buf_pos],
4881                                   comm->vbuf.v[buf_pos],
4882                                   comm->vbuf.v[buf_pos][dim]);
4883                 }
4884             }
4885
             /* mc stays -1 when this cg has arrived in its home cell;
              * otherwise it becomes the send-buffer index for a later
              * decomposition dimension.
              */
4886             mc = -1;
4887             if (d < dd->ndim-1)
4888             {
4889                 /* Check which direction this cg should go */
4890                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4891                 {
4892                     if (dd->bGridJump)
4893                     {
4894                         /* The cell boundaries for dimension d2 are not equal
4895                          * for each cell row of the lower dimension(s),
4896                          * therefore we might need to redetermine where
4897                          * this cg should go.
4898                          */
4899                         dim2 = dd->dim[d2];
4900                         /* If this cg crosses the box boundary in dimension d2
4901                          * we can use the communicated flag, so we do not
4902                          * have to worry about pbc.
4903                          */
4904                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4905                                (flag & DD_FLAG_FW(d2))) ||
4906                               (dd->ci[dim2] == 0 &&
4907                                (flag & DD_FLAG_BW(d2)))))
4908                         {
4909                             /* Clear the two flags for this dimension */
4910                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4911                             /* Determine the location of this cg
4912                              * in lattice coordinates
4913                              */
4914                             pos_d = comm->vbuf.v[buf_pos][dim2];
4915                             if (tric_dir[dim2])
4916                             {
4917                                 for (d3 = dim2+1; d3 < DIM; d3++)
4918                                 {
4919                                     pos_d +=
4920                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4921                                 }
4922                             }
4923                             /* Check if we are not at the box edge.
4924                              * pbc is only handled in the first step above,
4925                              * but this check could move over pbc while
4926                              * the first step did not due to different rounding.
4927                              */
4928                             if (pos_d >= cell_x1[dim2] &&
4929                                 dd->ci[dim2] != dd->nc[dim2]-1)
4930                             {
4931                                 flag |= DD_FLAG_FW(d2);
4932                             }
4933                             else if (pos_d < cell_x0[dim2] &&
4934                                      dd->ci[dim2] != 0)
4935                             {
4936                                 flag |= DD_FLAG_BW(d2);
4937                             }
4938                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4939                         }
4940                     }
4941                     /* Set to which neighboring cell this cg should go */
4942                     if (flag & DD_FLAG_FW(d2))
4943                     {
4944                         mc = d2*2;
4945                     }
4946                     else if (flag & DD_FLAG_BW(d2))
4947                     {
4948                         if (dd->nc[dd->dim[d2]] > 2)
4949                         {
4950                             mc = d2*2+1;
4951                         }
4952                         else
4953                         {
4954                             mc = d2*2;
4955                         }
4956                     }
4957                 }
4958             }
4959
4960             nrcg = flag & DD_FLAG_NRCG;
4961             if (mc == -1)
4962             {
                 /* This cg is home here: append it to the home arrays */
4963                 if (home_pos_cg+1 > dd->cg_nalloc)
4964                 {
4965                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4966                     srenew(dd->index_gl, dd->cg_nalloc);
4967                     srenew(dd->cgindex, dd->cg_nalloc+1);
4968                 }
4969                 /* Set the global charge group index and size */
4970                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
4971                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4972                 /* Copy the state from the buffer */
4973                 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
4974                 if (fr->cutoff_scheme == ecutsGROUP)
4975                 {
                     /* Re-read fr->cg_cm after the allocation check above */
4976                     cg_cm = fr->cg_cm;
4977                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4978                 }
                 /* Skip the cg center entry; the atom vectors follow it */
4979                 buf_pos++;
4980
4981                 /* Set the cginfo */
4982                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4983                                                    dd->index_gl[home_pos_cg]);
4984                 if (comm->bLocalCG)
4985                 {
4986                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4987                 }
4988
4989                 if (home_pos_at+nrcg > state->nalloc)
4990                 {
4991                     dd_realloc_state(state, f, home_pos_at+nrcg);
4992                 }
4993                 for (i = 0; i < nrcg; i++)
4994                 {
4995                     copy_rvec(comm->vbuf.v[buf_pos++],
4996                               state->x[home_pos_at+i]);
4997                 }
4998                 if (bV)
4999                 {
5000                     for (i = 0; i < nrcg; i++)
5001                     {
5002                         copy_rvec(comm->vbuf.v[buf_pos++],
5003                                   state->v[home_pos_at+i]);
5004                     }
5005                 }
5006                 if (bSDX)
5007                 {
5008                     for (i = 0; i < nrcg; i++)
5009                     {
5010                         copy_rvec(comm->vbuf.v[buf_pos++],
5011                                   state->sd_X[home_pos_at+i]);
5012                     }
5013                 }
5014                 if (bCGP)
5015                 {
5016                     for (i = 0; i < nrcg; i++)
5017                     {
5018                         copy_rvec(comm->vbuf.v[buf_pos++],
5019                                   state->cg_p[home_pos_at+i]);
5020                     }
5021                 }
5022                 home_pos_cg += 1;
5023                 home_pos_at += nrcg;
5024             }
5025             else
5026             {
                 /* This cg has to travel on: append its index/flag pair
                  * and its 1 + nrcg*nvec vectors to send buffer mc.
                  */
5027                 /* Reallocate the buffers if necessary  */
5028                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
5029                 {
5030                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
5031                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
5032                 }
5033                 nvr = ncg[mc] + nat[mc]*nvec;
5034                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
5035                 {
5036                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
5037                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
5038                 }
5039                 /* Copy from the receive to the send buffers */
5040                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
5041                        comm->buf_int + cg*DD_CGIBS,
5042                        DD_CGIBS*sizeof(int));
5043                 memcpy(comm->cgcm_state[mc][nvr],
5044                        comm->vbuf.v[buf_pos],
5045                        (1+nrcg*nvec)*sizeof(rvec));
5046                 buf_pos += 1 + nrcg*nvec;
5047                 ncg[mc] += 1;
5048                 nat[mc] += nrcg;
5049             }
5050         }
5051     }
5052
5053     /* With sorting (!bCompact) the indices are now only partially up to date
5054      * and ncg_home and nat_home are not the real count, since there are
5055      * "holes" in the arrays for the charge groups that moved to neighbors.
5056      */
5057     if (fr->cutoff_scheme == ecutsVERLET)
5058     {
5059         moved = get_moved(comm, home_pos_cg);
5060
         /* Clear the moved entries of the newly received cgs */
5061         for (i = dd->ncg_home; i < home_pos_cg; i++)
5062         {
5063             moved[i] = 0;
5064         }
5065     }
5066     dd->ncg_home = home_pos_cg;
5067     dd->nat_home = home_pos_at;
5068
5069     if (debug)
5070     {
5071         fprintf(debug,
5072                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5073                 *ncg_moved, dd->ncg_home-*ncg_moved);
5074
5075     }
5076 }
5077
5078 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
5079 {
5080     dd->comm->cycl[ddCycl] += cycles;
5081     dd->comm->cycl_n[ddCycl]++;
5082     if (cycles > dd->comm->cycl_max[ddCycl])
5083     {
5084         dd->comm->cycl_max[ddCycl] = cycles;
5085     }
5086 }
5087
5088 static double force_flop_count(t_nrnb *nrnb)
5089 {
5090     int         i;
5091     double      sum;
5092     const char *name;
5093
5094     sum = 0;
5095     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
5096     {
5097         /* To get closer to the real timings, we half the count
5098          * for the normal loops and again half it for water loops.
5099          */
5100         name = nrnb_str(i);
5101         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5102         {
5103             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5104         }
5105         else
5106         {
5107             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5108         }
5109     }
5110     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
5111     {
5112         name = nrnb_str(i);
5113         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5114         {
5115             sum += nrnb->n[i]*cost_nrnb(i);
5116         }
5117     }
5118     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
5119     {
5120         sum += nrnb->n[i]*cost_nrnb(i);
5121     }
5122
5123     return sum;
5124 }
5125
5126 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
5127 {
5128     if (dd->comm->eFlop)
5129     {
5130         dd->comm->flop -= force_flop_count(nrnb);
5131     }
5132 }
5133 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
5134 {
5135     if (dd->comm->eFlop)
5136     {
5137         dd->comm->flop += force_flop_count(nrnb);
5138         dd->comm->flop_n++;
5139     }
5140 }
5141
5142 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5143 {
5144     int i;
5145
5146     for (i = 0; i < ddCyclNr; i++)
5147     {
5148         dd->comm->cycl[i]     = 0;
5149         dd->comm->cycl_n[i]   = 0;
5150         dd->comm->cycl_max[i] = 0;
5151     }
5152     dd->comm->flop   = 0;
5153     dd->comm->flop_n = 0;
5154 }
5155
/* Collect the load measurements of all domains and reduce them per row of
 * cells, one decomposition dimension at a time, from the last dimension
 * down to the first.
 *
 * For each dimension d a rank that participates packs its data into sbuf:
 * for the last dimension its own force load, for the other dimensions the
 * already reduced values of comm->load[d+1]. The rows are gathered with
 * MPI_Gather() on comm->mpi_comm_load[d] (root rank 0), and the rank whose
 * cell index equals dd->master_ci[dim] reduces the gathered entries into
 * comm->load[d]. Finally the DD master adds the totals of comm->load[0] to
 * the running sums in comm (nload, load_step, load_sum, load_max, load_lim,
 * load_mdf, load_pme).
 *
 * The order in which values are written to sbuf must match the order in
 * which the root reads them from load->load[] further down.
 *
 * The whole routine is timed under the ewcDDCOMMLOAD wallcycle counter.
 */
5156 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
5157 {
5158     gmx_domdec_comm_t *comm;
5159     gmx_domdec_load_t *load;
5160     gmx_domdec_root_t *root = NULL;
5161     int                d, dim, cid, i, pos;
5162     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
5163     gmx_bool           bSepPME;
     /* NOTE(review): cid is declared but not used in this function.
      * At most 9 floats are written to sbuf below; DD_NLOAD_MAX is assumed
      * to be at least that - confirm.
      */
5164
5165     if (debug)
5166     {
5167         fprintf(debug, "get_load_distribution start\n");
5168     }
5169
5170     wallcycle_start(wcycle, ewcDDCOMMLOAD);
5171
5172     comm = dd->comm;
5173
     /* PME load entries are only exchanged when dd->pme_nodeid >= 0 */
5174     bSepPME = (dd->pme_nodeid >= 0);
5175
5176     for (d = dd->ndim-1; d >= 0; d--)
5177     {
5178         dim = dd->dim[d];
5179         /* Check if we participate in the communication in this dimension */
5180         if (d == dd->ndim-1 ||
5181             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5182         {
5183             load = &comm->load[d];
5184             if (dd->bGridJump)
5185             {
5186                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5187             }
5188             pos = 0;
5189             if (d == dd->ndim-1)
5190             {
                 /* Last dimension: send our own force load as both the
                  * "sum" and the "max" entry. With bGridJump it is sent a
                  * third time as "sum_m", followed by the cell fraction;
                  * no flags entry is sent at this level.
                  */
5191                 sbuf[pos++] = dd_force_load(comm);
5192                 sbuf[pos++] = sbuf[0];
5193                 if (dd->bGridJump)
5194                 {
5195                     sbuf[pos++] = sbuf[0];
5196                     sbuf[pos++] = cell_frac;
5197                     if (d > 0)
5198                     {
5199                         sbuf[pos++] = comm->cell_f_max0[d];
5200                         sbuf[pos++] = comm->cell_f_min1[d];
5201                     }
5202                 }
5203                 if (bSepPME)
5204                 {
5205                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5206                     sbuf[pos++] = comm->cycl[ddCyclPME];
5207                 }
5208             }
5209             else
5210             {
                 /* Other dimensions: pass on the values reduced for
                  * dimension d+1; flags travels as a float here and is
                  * rounded back to int by the root.
                  */
5211                 sbuf[pos++] = comm->load[d+1].sum;
5212                 sbuf[pos++] = comm->load[d+1].max;
5213                 if (dd->bGridJump)
5214                 {
5215                     sbuf[pos++] = comm->load[d+1].sum_m;
5216                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5217                     sbuf[pos++] = comm->load[d+1].flags;
5218                     if (d > 0)
5219                     {
5220                         sbuf[pos++] = comm->cell_f_max0[d];
5221                         sbuf[pos++] = comm->cell_f_min1[d];
5222                     }
5223                 }
5224                 if (bSepPME)
5225                 {
5226                     sbuf[pos++] = comm->load[d+1].mdf;
5227                     sbuf[pos++] = comm->load[d+1].pme;
5228                 }
5229             }
5230             load->nload = pos;
5231             /* Communicate a row in DD direction d.
5232              * The communicators are setup such that the root always has rank 0.
5233              */
5234 #ifdef GMX_MPI
5235             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5236                        load->load, load->nload*sizeof(float), MPI_BYTE,
5237                        0, comm->mpi_comm_load[d]);
5238 #endif
5239             if (dd->ci[dim] == dd->master_ci[dim])
5240             {
5241                 /* We are the root, process this row */
                 /* NOTE(review): root is only assigned when bDynLoadBal is
                  * set, but it is dereferenced below whenever bGridJump is
                  * set; presumably bGridJump implies bDynLoadBal - confirm.
                  */
5242                 if (comm->bDynLoadBal)
5243                 {
5244                     root = comm->root[d];
5245                 }
5246                 load->sum      = 0;
5247                 load->max      = 0;
5248                 load->sum_m    = 0;
5249                 load->cvol_min = 1;
5250                 load->flags    = 0;
5251                 load->mdf      = 0;
5252                 load->pme      = 0;
5253                 pos            = 0;
                 /* Walk over the entries of the dd->nc[dim] cells in the
                  * row, reading them in the order they were packed above.
                  */
5254                 for (i = 0; i < dd->nc[dim]; i++)
5255                 {
5256                     load->sum += load->load[pos++];
5257                     load->max  = max(load->max, load->load[pos]);
5258                     pos++;
5259                     if (dd->bGridJump)
5260                     {
5261                         if (root->bLimited)
5262                         {
5263                             /* This direction could not be load balanced properly,
5264                              * therefore we need to use the maximum instead of the average load.
5265                              */
5266                             load->sum_m = max(load->sum_m, load->load[pos]);
5267                         }
5268                         else
5269                         {
5270                             load->sum_m += load->load[pos];
5271                         }
5272                         pos++;
5273                         load->cvol_min = min(load->cvol_min, load->load[pos]);
5274                         pos++;
5275                         if (d < dd->ndim-1)
5276                         {
                             /* Round the float back to the int flags value.
                              * NOTE(review): this assignment overwrites
                              * flags for every cell i, so the value of the
                              * last cell in the row is the one kept.
                              */
5277                             load->flags = (int)(load->load[pos++] + 0.5);
5278                         }
5279                         if (d > 0)
5280                         {
5281                             root->cell_f_max0[i] = load->load[pos++];
5282                             root->cell_f_min1[i] = load->load[pos++];
5283                         }
5284                     }
5285                     if (bSepPME)
5286                     {
5287                         load->mdf = max(load->mdf, load->load[pos]);
5288                         pos++;
5289                         load->pme = max(load->pme, load->load[pos]);
5290                         pos++;
5291                     }
5292                 }
5293                 if (comm->bDynLoadBal && root->bLimited)
5294                 {
                     /* sum_m holds a maximum in this case; scale it by the
                      * number of cells and flag dimension d as limited.
                      */
5295                     load->sum_m *= dd->nc[dim];
5296                     load->flags |= (1<<d);
5297                 }
5298             }
5299         }
5300     }
5301
     /* The master accumulates the fully reduced values of dimension 0 */
5302     if (DDMASTER(dd))
5303     {
5304         comm->nload      += dd_load_count(comm);
5305         comm->load_step  += comm->cycl[ddCyclStep];
5306         comm->load_sum   += comm->load[0].sum;
5307         comm->load_max   += comm->load[0].max;
5308         if (comm->bDynLoadBal)
5309         {
5310             for (d = 0; d < dd->ndim; d++)
5311             {
5312                 if (comm->load[0].flags & (1<<d))
5313                 {
5314                     comm->load_lim[d]++;
5315                 }
5316             }
5317         }
5318         if (bSepPME)
5319         {
5320             comm->load_mdf += comm->load[0].mdf;
5321             comm->load_pme += comm->load[0].pme;
5322         }
5323     }
5324
5325     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5326
5327     if (debug)
5328     {
5329         fprintf(debug, "get_load_distribution finished\n");
5330     }
5331 }
5332
5333 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5334 {
5335     /* Return the relative performance loss on the total run time
5336      * due to the force calculation load imbalance.
5337      */
5338     if (dd->comm->nload > 0)
5339     {
5340         return
5341             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5342             (dd->comm->load_step*dd->nnodes);
5343     }
5344     else
5345     {
5346         return 0;
5347     }
5348 }
5349
5350 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5351 {
5352     char               buf[STRLEN];
5353     int                npp, npme, nnodes, d, limp;
5354     float              imbal, pme_f_ratio, lossf, lossp = 0;
5355     gmx_bool           bLim;
5356     gmx_domdec_comm_t *comm;
5357
5358     comm = dd->comm;
5359     if (DDMASTER(dd) && comm->nload > 0)
5360     {
5361         npp    = dd->nnodes;
5362         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5363         nnodes = npp + npme;
5364         imbal  = comm->load_max*npp/comm->load_sum - 1;
5365         lossf  = dd_force_imb_perf_loss(dd);
5366         sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5367         fprintf(fplog, "%s", buf);
5368         fprintf(stderr, "\n");
5369         fprintf(stderr, "%s", buf);
5370         sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5371         fprintf(fplog, "%s", buf);
5372         fprintf(stderr, "%s", buf);
5373         bLim = FALSE;
5374         if (comm->bDynLoadBal)
5375         {
5376             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5377             for (d = 0; d < dd->ndim; d++)
5378             {
5379                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5380                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5381                 if (limp >= 50)
5382                 {
5383                     bLim = TRUE;
5384                 }
5385             }
5386             sprintf(buf+strlen(buf), "\n");
5387             fprintf(fplog, "%s", buf);
5388             fprintf(stderr, "%s", buf);
5389         }
5390         if (npme > 0)
5391         {
5392             pme_f_ratio = comm->load_pme/comm->load_mdf;
5393             lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
5394             if (lossp <= 0)
5395             {
5396                 lossp *= (float)npme/(float)nnodes;
5397             }
5398             else
5399             {
5400                 lossp *= (float)npp/(float)nnodes;
5401             }
5402             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5403             fprintf(fplog, "%s", buf);
5404             fprintf(stderr, "%s", buf);
5405             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5406             fprintf(fplog, "%s", buf);
5407             fprintf(stderr, "%s", buf);
5408         }
5409         fprintf(fplog, "\n");
5410         fprintf(stderr, "\n");
5411
5412         if (lossf >= DD_PERF_LOSS)
5413         {
5414             sprintf(buf,
5415                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5416                     "      in the domain decomposition.\n", lossf*100);
5417             if (!comm->bDynLoadBal)
5418             {
5419                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5420             }
5421             else if (bLim)
5422             {
5423                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5424             }
5425             fprintf(fplog, "%s\n", buf);
5426             fprintf(stderr, "%s\n", buf);
5427         }
5428         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5429         {
5430             sprintf(buf,
5431                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5432                     "      had %s work to do than the PP nodes.\n"
5433                     "      You might want to %s the number of PME nodes\n"
5434                     "      or %s the cut-off and the grid spacing.\n",
5435                     fabs(lossp*100),
5436                     (lossp < 0) ? "less"     : "more",
5437                     (lossp < 0) ? "decrease" : "increase",
5438                     (lossp < 0) ? "decrease" : "increase");
5439             fprintf(fplog, "%s\n", buf);
5440             fprintf(stderr, "%s\n", buf);
5441         }
5442     }
5443 }
5444
5445 static float dd_vol_min(gmx_domdec_t *dd)
5446 {
5447     return dd->comm->load[0].cvol_min*dd->nnodes;
5448 }
5449
5450 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5451 {
5452     return dd->comm->load[0].flags;
5453 }
5454
5455 static float dd_f_imbal(gmx_domdec_t *dd)
5456 {
5457     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5458 }
5459
5460 float dd_pme_f_ratio(gmx_domdec_t *dd)
5461 {
5462     if (dd->comm->cycl_n[ddCyclPME] > 0)
5463     {
5464         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5465     }
5466     else
5467     {
5468         return -1.0;
5469     }
5470 }
5471
5472 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step)
5473 {
5474     int  flags, d;
5475     char buf[22];
5476
5477     flags = dd_load_flags(dd);
5478     if (flags)
5479     {
5480         fprintf(fplog,
5481                 "DD  load balancing is limited by minimum cell size in dimension");
5482         for (d = 0; d < dd->ndim; d++)
5483         {
5484             if (flags & (1<<d))
5485             {
5486                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5487             }
5488         }
5489         fprintf(fplog, "\n");
5490     }
5491     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5492     if (dd->comm->bDynLoadBal)
5493     {
5494         fprintf(fplog, "  vol min/aver %5.3f%c",
5495                 dd_vol_min(dd), flags ? '!' : ' ');
5496     }
5497     fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5498     if (dd->comm->cycl_n[ddCyclPME])
5499     {
5500         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5501     }
5502     fprintf(fplog, "\n\n");
5503 }
5504
5505 static void dd_print_load_verbose(gmx_domdec_t *dd)
5506 {
5507     if (dd->comm->bDynLoadBal)
5508     {
5509         fprintf(stderr, "vol %4.2f%c ",
5510                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5511     }
5512     fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5513     if (dd->comm->cycl_n[ddCyclPME])
5514     {
5515         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5516     }
5517 }
5518
5519 #ifdef GMX_MPI
5520 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5521 {
5522     MPI_Comm           c_row;
5523     int                dim, i, rank;
5524     ivec               loc_c;
5525     gmx_domdec_root_t *root;
5526     gmx_bool           bPartOfGroup = FALSE;
5527
5528     dim = dd->dim[dim_ind];
5529     copy_ivec(loc, loc_c);
5530     for (i = 0; i < dd->nc[dim]; i++)
5531     {
5532         loc_c[dim] = i;
5533         rank       = dd_index(dd->nc, loc_c);
5534         if (rank == dd->rank)
5535         {
5536             /* This process is part of the group */
5537             bPartOfGroup = TRUE;
5538         }
5539     }
5540     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5541                    &c_row);
5542     if (bPartOfGroup)
5543     {
5544         dd->comm->mpi_comm_load[dim_ind] = c_row;
5545         if (dd->comm->eDLB != edlbNO)
5546         {
5547             if (dd->ci[dim] == dd->master_ci[dim])
5548             {
5549                 /* This is the root process of this row */
5550                 snew(dd->comm->root[dim_ind], 1);
5551                 root = dd->comm->root[dim_ind];
5552                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5553                 snew(root->old_cell_f, dd->nc[dim]+1);
5554                 snew(root->bCellMin, dd->nc[dim]);
5555                 if (dim_ind > 0)
5556                 {
5557                     snew(root->cell_f_max0, dd->nc[dim]);
5558                     snew(root->cell_f_min1, dd->nc[dim]);
5559                     snew(root->bound_min, dd->nc[dim]);
5560                     snew(root->bound_max, dd->nc[dim]);
5561                 }
5562                 snew(root->buf_ncd, dd->nc[dim]);
5563             }
5564             else
5565             {
5566                 /* This is not a root process, we only need to receive cell_f */
5567                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5568             }
5569         }
5570         if (dd->ci[dim] == dd->master_ci[dim])
5571         {
5572             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5573         }
5574     }
5575 }
5576 #endif
5577
5578 static void make_load_communicators(gmx_domdec_t *dd)
5579 {
5580 #ifdef GMX_MPI
5581     int  dim0, dim1, i, j;
5582     ivec loc;
5583
5584     if (debug)
5585     {
5586         fprintf(debug, "Making load communicators\n");
5587     }
5588
5589     snew(dd->comm->load, dd->ndim);
5590     snew(dd->comm->mpi_comm_load, dd->ndim);
5591
5592     clear_ivec(loc);
5593     make_load_communicator(dd, 0, loc);
5594     if (dd->ndim > 1)
5595     {
5596         dim0 = dd->dim[0];
5597         for (i = 0; i < dd->nc[dim0]; i++)
5598         {
5599             loc[dim0] = i;
5600             make_load_communicator(dd, 1, loc);
5601         }
5602     }
5603     if (dd->ndim > 2)
5604     {
5605         dim0 = dd->dim[0];
5606         for (i = 0; i < dd->nc[dim0]; i++)
5607         {
5608             loc[dim0] = i;
5609             dim1      = dd->dim[1];
5610             for (j = 0; j < dd->nc[dim1]; j++)
5611             {
5612                 loc[dim1] = j;
5613                 make_load_communicator(dd, 2, loc);
5614             }
5615         }
5616     }
5617
5618     if (debug)
5619     {
5620         fprintf(debug, "Finished making load communicators\n");
5621     }
5622 #endif
5623 }
5624
5625 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5626 {
5627     gmx_bool                bZYX;
5628     int                     d, dim, i, j, m;
5629     ivec                    tmp, s;
5630     int                     nzone, nzonep;
5631     ivec                    dd_zp[DD_MAXIZONE];
5632     gmx_domdec_zones_t     *zones;
5633     gmx_domdec_ns_ranges_t *izone;
5634
5635     for (d = 0; d < dd->ndim; d++)
5636     {
5637         dim = dd->dim[d];
5638         copy_ivec(dd->ci, tmp);
5639         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5640         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5641         copy_ivec(dd->ci, tmp);
5642         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5643         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5644         if (debug)
5645         {
5646             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5647                     dd->rank, dim,
5648                     dd->neighbor[d][0],
5649                     dd->neighbor[d][1]);
5650         }
5651     }
5652
5653     if (fplog)
5654     {
5655         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5656                 dd->ndim,
5657                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5658                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5659     }
5660     switch (dd->ndim)
5661     {
5662         case 3:
5663             nzone  = dd_z3n;
5664             nzonep = dd_zp3n;
5665             for (i = 0; i < nzonep; i++)
5666             {
5667                 copy_ivec(dd_zp3[i], dd_zp[i]);
5668             }
5669             break;
5670         case 2:
5671             nzone  = dd_z2n;
5672             nzonep = dd_zp2n;
5673             for (i = 0; i < nzonep; i++)
5674             {
5675                 copy_ivec(dd_zp2[i], dd_zp[i]);
5676             }
5677             break;
5678         case 1:
5679             nzone  = dd_z1n;
5680             nzonep = dd_zp1n;
5681             for (i = 0; i < nzonep; i++)
5682             {
5683                 copy_ivec(dd_zp1[i], dd_zp[i]);
5684             }
5685             break;
5686         default:
5687             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5688             nzone  = 0;
5689             nzonep = 0;
5690     }
5691
5692     zones = &dd->comm->zones;
5693
5694     for (i = 0; i < nzone; i++)
5695     {
5696         m = 0;
5697         clear_ivec(zones->shift[i]);
5698         for (d = 0; d < dd->ndim; d++)
5699         {
5700             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5701         }
5702     }
5703
5704     zones->n = nzone;
5705     for (i = 0; i < nzone; i++)
5706     {
5707         for (d = 0; d < DIM; d++)
5708         {
5709             s[d] = dd->ci[d] - zones->shift[i][d];
5710             if (s[d] < 0)
5711             {
5712                 s[d] += dd->nc[d];
5713             }
5714             else if (s[d] >= dd->nc[d])
5715             {
5716                 s[d] -= dd->nc[d];
5717             }
5718         }
5719     }
5720     zones->nizone = nzonep;
5721     for (i = 0; i < zones->nizone; i++)
5722     {
5723         if (dd_zp[i][0] != i)
5724         {
5725             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5726         }
5727         izone     = &zones->izone[i];
5728         izone->j0 = dd_zp[i][1];
5729         izone->j1 = dd_zp[i][2];
5730         for (dim = 0; dim < DIM; dim++)
5731         {
5732             if (dd->nc[dim] == 1)
5733             {
5734                 /* All shifts should be allowed */
5735                 izone->shift0[dim] = -1;
5736                 izone->shift1[dim] = 1;
5737             }
5738             else
5739             {
5740                 /*
5741                    izone->shift0[d] = 0;
5742                    izone->shift1[d] = 0;
5743                    for(j=izone->j0; j<izone->j1; j++) {
5744                    if (dd->shift[j][d] > dd->shift[i][d])
5745                    izone->shift0[d] = -1;
5746                    if (dd->shift[j][d] < dd->shift[i][d])
5747                    izone->shift1[d] = 1;
5748                    }
5749                  */
5750
5751                 int shift_diff;
5752
5753                 /* Assume the shift are not more than 1 cell */
5754                 izone->shift0[dim] = 1;
5755                 izone->shift1[dim] = -1;
5756                 for (j = izone->j0; j < izone->j1; j++)
5757                 {
5758                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5759                     if (shift_diff < izone->shift0[dim])
5760                     {
5761                         izone->shift0[dim] = shift_diff;
5762                     }
5763                     if (shift_diff > izone->shift1[dim])
5764                     {
5765                         izone->shift1[dim] = shift_diff;
5766                     }
5767                 }
5768             }
5769         }
5770     }
5771
5772     if (dd->comm->eDLB != edlbNO)
5773     {
5774         snew(dd->comm->root, dd->ndim);
5775     }
5776
5777     if (dd->comm->bRecordLoad)
5778     {
5779         make_load_communicators(dd);
5780     }
5781 }
5782
5783 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int reorder)
5784 {
5785     gmx_domdec_t      *dd;
5786     gmx_domdec_comm_t *comm;
5787     int                i, rank, *buf;
5788     ivec               periods;
5789 #ifdef GMX_MPI
5790     MPI_Comm           comm_cart;
5791 #endif
5792
5793     dd   = cr->dd;
5794     comm = dd->comm;
5795
5796 #ifdef GMX_MPI
5797     if (comm->bCartesianPP)
5798     {
5799         /* Set up cartesian communication for the particle-particle part */
5800         if (fplog)
5801         {
5802             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5803                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5804         }
5805
5806         for (i = 0; i < DIM; i++)
5807         {
5808             periods[i] = TRUE;
5809         }
5810         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5811                         &comm_cart);
5812         /* We overwrite the old communicator with the new cartesian one */
5813         cr->mpi_comm_mygroup = comm_cart;
5814     }
5815
5816     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5817     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5818
5819     if (comm->bCartesianPP_PME)
5820     {
5821         /* Since we want to use the original cartesian setup for sim,
5822          * and not the one after split, we need to make an index.
5823          */
5824         snew(comm->ddindex2ddnodeid, dd->nnodes);
5825         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5826         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5827         /* Get the rank of the DD master,
5828          * above we made sure that the master node is a PP node.
5829          */
5830         if (MASTER(cr))
5831         {
5832             rank = dd->rank;
5833         }
5834         else
5835         {
5836             rank = 0;
5837         }
5838         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5839     }
5840     else if (comm->bCartesianPP)
5841     {
5842         if (cr->npmenodes == 0)
5843         {
5844             /* The PP communicator is also
5845              * the communicator for this simulation
5846              */
5847             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5848         }
5849         cr->nodeid = dd->rank;
5850
5851         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5852
5853         /* We need to make an index to go from the coordinates
5854          * to the nodeid of this simulation.
5855          */
5856         snew(comm->ddindex2simnodeid, dd->nnodes);
5857         snew(buf, dd->nnodes);
5858         if (cr->duty & DUTY_PP)
5859         {
5860             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5861         }
5862         /* Communicate the ddindex to simulation nodeid index */
5863         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5864                       cr->mpi_comm_mysim);
5865         sfree(buf);
5866
5867         /* Determine the master coordinates and rank.
5868          * The DD master should be the same node as the master of this sim.
5869          */
5870         for (i = 0; i < dd->nnodes; i++)
5871         {
5872             if (comm->ddindex2simnodeid[i] == 0)
5873             {
5874                 ddindex2xyz(dd->nc, i, dd->master_ci);
5875                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5876             }
5877         }
5878         if (debug)
5879         {
5880             fprintf(debug, "The master rank is %d\n", dd->masterrank);
5881         }
5882     }
5883     else
5884     {
5885         /* No Cartesian communicators */
5886         /* We use the rank in dd->comm->all as DD index */
5887         ddindex2xyz(dd->nc, dd->rank, dd->ci);
5888         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5889         dd->masterrank = 0;
5890         clear_ivec(dd->master_ci);
5891     }
5892 #endif
5893
5894     if (fplog)
5895     {
5896         fprintf(fplog,
5897                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5898                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5899     }
5900     if (debug)
5901     {
5902         fprintf(debug,
5903                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5904                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5905     }
5906 }
5907
5908 static void receive_ddindex2simnodeid(t_commrec *cr)
5909 {
5910     gmx_domdec_t      *dd;
5911
5912     gmx_domdec_comm_t *comm;
5913     int               *buf;
5914
5915     dd   = cr->dd;
5916     comm = dd->comm;
5917
5918 #ifdef GMX_MPI
5919     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5920     {
5921         snew(comm->ddindex2simnodeid, dd->nnodes);
5922         snew(buf, dd->nnodes);
5923         if (cr->duty & DUTY_PP)
5924         {
5925             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5926         }
5927 #ifdef GMX_MPI
5928         /* Communicate the ddindex to simulation nodeid index */
5929         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5930                       cr->mpi_comm_mysim);
5931 #endif
5932         sfree(buf);
5933     }
5934 #endif
5935 }
5936
5937 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5938                                                      int ncg, int natoms)
5939 {
5940     gmx_domdec_master_t *ma;
5941     int                  i;
5942
5943     snew(ma, 1);
5944
5945     snew(ma->ncg, dd->nnodes);
5946     snew(ma->index, dd->nnodes+1);
5947     snew(ma->cg, ncg);
5948     snew(ma->nat, dd->nnodes);
5949     snew(ma->ibuf, dd->nnodes*2);
5950     snew(ma->cell_x, DIM);
5951     for (i = 0; i < DIM; i++)
5952     {
5953         snew(ma->cell_x[i], dd->nc[i]+1);
5954     }
5955
5956     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5957     {
5958         ma->vbuf = NULL;
5959     }
5960     else
5961     {
5962         snew(ma->vbuf, natoms);
5963     }
5964
5965     return ma;
5966 }
5967
5968 static void split_communicator(FILE *fplog, t_commrec *cr, int dd_node_order,
5969                                int reorder)
5970 {
5971     gmx_domdec_t      *dd;
5972     gmx_domdec_comm_t *comm;
5973     int                i, rank;
5974     gmx_bool           bDiv[DIM];
5975     ivec               periods;
5976 #ifdef GMX_MPI
5977     MPI_Comm           comm_cart;
5978 #endif
5979
5980     dd   = cr->dd;
5981     comm = dd->comm;
5982
5983     if (comm->bCartesianPP)
5984     {
5985         for (i = 1; i < DIM; i++)
5986         {
5987             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5988         }
5989         if (bDiv[YY] || bDiv[ZZ])
5990         {
5991             comm->bCartesianPP_PME = TRUE;
5992             /* If we have 2D PME decomposition, which is always in x+y,
5993              * we stack the PME only nodes in z.
5994              * Otherwise we choose the direction that provides the thinnest slab
5995              * of PME only nodes as this will have the least effect
5996              * on the PP communication.
5997              * But for the PME communication the opposite might be better.
5998              */
5999             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
6000                              !bDiv[YY] ||
6001                              dd->nc[YY] > dd->nc[ZZ]))
6002             {
6003                 comm->cartpmedim = ZZ;
6004             }
6005             else
6006             {
6007                 comm->cartpmedim = YY;
6008             }
6009             comm->ntot[comm->cartpmedim]
6010                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
6011         }
6012         else if (fplog)
6013         {
6014             fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
6015             fprintf(fplog,
6016                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
6017         }
6018     }
6019
6020 #ifdef GMX_MPI
6021     if (comm->bCartesianPP_PME)
6022     {
6023         if (fplog)
6024         {
6025             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
6026         }
6027
6028         for (i = 0; i < DIM; i++)
6029         {
6030             periods[i] = TRUE;
6031         }
6032         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
6033                         &comm_cart);
6034
6035         MPI_Comm_rank(comm_cart, &rank);
6036         if (MASTERNODE(cr) && rank != 0)
6037         {
6038             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
6039         }
6040
6041         /* With this assigment we loose the link to the original communicator
6042          * which will usually be MPI_COMM_WORLD, unless have multisim.
6043          */
6044         cr->mpi_comm_mysim = comm_cart;
6045         cr->sim_nodeid     = rank;
6046
6047         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
6048
6049         if (fplog)
6050         {
6051             fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
6052                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6053         }
6054
6055         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
6056         {
6057             cr->duty = DUTY_PP;
6058         }
6059         if (cr->npmenodes == 0 ||
6060             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
6061         {
6062             cr->duty = DUTY_PME;
6063         }
6064
6065         /* Split the sim communicator into PP and PME only nodes */
6066         MPI_Comm_split(cr->mpi_comm_mysim,
6067                        cr->duty,
6068                        dd_index(comm->ntot, dd->ci),
6069                        &cr->mpi_comm_mygroup);
6070     }
6071     else
6072     {
6073         switch (dd_node_order)
6074         {
6075             case ddnoPP_PME:
6076                 if (fplog)
6077                 {
6078                     fprintf(fplog, "Order of the nodes: PP first, PME last\n");
6079                 }
6080                 break;
6081             case ddnoINTERLEAVE:
6082                 /* Interleave the PP-only and PME-only nodes,
6083                  * as on clusters with dual-core machines this will double
6084                  * the communication bandwidth of the PME processes
6085                  * and thus speed up the PP <-> PME and inter PME communication.
6086                  */
6087                 if (fplog)
6088                 {
6089                     fprintf(fplog, "Interleaving PP and PME nodes\n");
6090                 }
6091                 comm->pmenodes = dd_pmenodes(cr);
6092                 break;
6093             case ddnoCARTESIAN:
6094                 break;
6095             default:
6096                 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
6097         }
6098
6099         if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6100         {
6101             cr->duty = DUTY_PME;
6102         }
6103         else
6104         {
6105             cr->duty = DUTY_PP;
6106         }
6107
6108         /* Split the sim communicator into PP and PME only nodes */
6109         MPI_Comm_split(cr->mpi_comm_mysim,
6110                        cr->duty,
6111                        cr->nodeid,
6112                        &cr->mpi_comm_mygroup);
6113         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6114     }
6115 #endif
6116
6117     if (fplog)
6118     {
6119         fprintf(fplog, "This is a %s only node\n\n",
6120                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6121     }
6122 }
6123
6124 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6125 {
6126     gmx_domdec_t      *dd;
6127     gmx_domdec_comm_t *comm;
6128     int                CartReorder;
6129
6130     dd   = cr->dd;
6131     comm = dd->comm;
6132
6133     copy_ivec(dd->nc, comm->ntot);
6134
6135     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6136     comm->bCartesianPP_PME = FALSE;
6137
6138     /* Reorder the nodes by default. This might change the MPI ranks.
6139      * Real reordering is only supported on very few architectures,
6140      * Blue Gene is one of them.
6141      */
6142     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6143
6144     if (cr->npmenodes > 0)
6145     {
6146         /* Split the communicator into a PP and PME part */
6147         split_communicator(fplog, cr, dd_node_order, CartReorder);
6148         if (comm->bCartesianPP_PME)
6149         {
6150             /* We (possibly) reordered the nodes in split_communicator,
6151              * so it is no longer required in make_pp_communicator.
6152              */
6153             CartReorder = FALSE;
6154         }
6155     }
6156     else
6157     {
6158         /* All nodes do PP and PME */
6159 #ifdef GMX_MPI
6160         /* We do not require separate communicators */
6161         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6162 #endif
6163     }
6164
6165     if (cr->duty & DUTY_PP)
6166     {
6167         /* Copy or make a new PP communicator */
6168         make_pp_communicator(fplog, cr, CartReorder);
6169     }
6170     else
6171     {
6172         receive_ddindex2simnodeid(cr);
6173     }
6174
6175     if (!(cr->duty & DUTY_PME))
6176     {
6177         /* Set up the commnuication to our PME node */
6178         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6179         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6180         if (debug)
6181         {
6182             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6183                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6184         }
6185     }
6186     else
6187     {
6188         dd->pme_nodeid = -1;
6189     }
6190
6191     if (DDMASTER(dd))
6192     {
6193         dd->ma = init_gmx_domdec_master_t(dd,
6194                                           comm->cgs_gl.nr,
6195                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6196     }
6197 }
6198
6199 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6200 {
6201     real  *slb_frac, tot;
6202     int    i, n;
6203     double dbl;
6204
6205     slb_frac = NULL;
6206     if (nc > 1 && size_string != NULL)
6207     {
6208         if (fplog)
6209         {
6210             fprintf(fplog, "Using static load balancing for the %s direction\n",
6211                     dir);
6212         }
6213         snew(slb_frac, nc);
6214         tot = 0;
6215         for (i = 0; i < nc; i++)
6216         {
6217             dbl = 0;
6218             sscanf(size_string, "%lf%n", &dbl, &n);
6219             if (dbl == 0)
6220             {
6221                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6222             }
6223             slb_frac[i]  = dbl;
6224             size_string += n;
6225             tot         += slb_frac[i];
6226         }
6227         /* Normalize */
6228         if (fplog)
6229         {
6230             fprintf(fplog, "Relative cell sizes:");
6231         }
6232         for (i = 0; i < nc; i++)
6233         {
6234             slb_frac[i] /= tot;
6235             if (fplog)
6236             {
6237                 fprintf(fplog, " %5.3f", slb_frac[i]);
6238             }
6239         }
6240         if (fplog)
6241         {
6242             fprintf(fplog, "\n");
6243         }
6244     }
6245
6246     return slb_frac;
6247 }
6248
6249 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6250 {
6251     int                  n, nmol, ftype;
6252     gmx_mtop_ilistloop_t iloop;
6253     t_ilist             *il;
6254
6255     n     = 0;
6256     iloop = gmx_mtop_ilistloop_init(mtop);
6257     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6258     {
6259         for (ftype = 0; ftype < F_NRE; ftype++)
6260         {
6261             if ((interaction_function[ftype].flags & IF_BOND) &&
6262                 NRAL(ftype) >  2)
6263             {
6264                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6265             }
6266         }
6267     }
6268
6269     return n;
6270 }
6271
/* Read an integer setting from the environment.
 *
 * fplog   - log file, may be NULL
 * env_var - name of the environment variable
 * def     - value returned when the variable is not set
 *
 * When the variable is set but does not start with an integer, 1 is
 * used. The value found is reported in fplog when it is non-NULL.
 */
static int dd_nst_env(FILE *fplog, const char *env_var, int def)
{
    int   value   = def;
    char *env_str = getenv(env_var);

    if (env_str == NULL)
    {
        /* Variable not set: keep the default */
        return value;
    }

    if (sscanf(env_str, "%d", &value) <= 0)
    {
        /* Set, but without a readable integer value */
        value = 1;
    }
    if (fplog != NULL)
    {
        fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
                env_var, env_str, value);
    }

    return value;
}
6294
6295 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6296 {
6297     if (MASTER(cr))
6298     {
6299         fprintf(stderr, "\n%s\n", warn_string);
6300     }
6301     if (fplog)
6302     {
6303         fprintf(fplog, "\n%s\n", warn_string);
6304     }
6305 }
6306
6307 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6308                                   t_inputrec *ir, FILE *fplog)
6309 {
6310     if (ir->ePBC == epbcSCREW &&
6311         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6312     {
6313         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6314     }
6315
6316     if (ir->ns_type == ensSIMPLE)
6317     {
6318         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6319     }
6320
6321     if (ir->nstlist == 0)
6322     {
6323         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6324     }
6325
6326     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6327     {
6328         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6329     }
6330 }
6331
6332 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6333 {
6334     int  di, d;
6335     real r;
6336
6337     r = ddbox->box_size[XX];
6338     for (di = 0; di < dd->ndim; di++)
6339     {
6340         d = dd->dim[di];
6341         /* Check using the initial average cell size */
6342         r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6343     }
6344
6345     return r;
6346 }
6347
6348 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6349                              const char *dlb_opt, gmx_bool bRecordLoad,
6350                              unsigned long Flags, t_inputrec *ir)
6351 {
6352     gmx_domdec_t *dd;
6353     int           eDLB = -1;
6354     char          buf[STRLEN];
6355
6356     switch (dlb_opt[0])
6357     {
6358         case 'a': eDLB = edlbAUTO; break;
6359         case 'n': eDLB = edlbNO;   break;
6360         case 'y': eDLB = edlbYES;  break;
6361         default: gmx_incons("Unknown dlb_opt");
6362     }
6363
6364     if (Flags & MD_RERUN)
6365     {
6366         return edlbNO;
6367     }
6368
6369     if (!EI_DYNAMICS(ir->eI))
6370     {
6371         if (eDLB == edlbYES)
6372         {
6373             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6374             dd_warning(cr, fplog, buf);
6375         }
6376
6377         return edlbNO;
6378     }
6379
6380     if (!bRecordLoad)
6381     {
6382         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6383
6384         return edlbNO;
6385     }
6386
6387     if (Flags & MD_REPRODUCIBLE)
6388     {
6389         switch (eDLB)
6390         {
6391             case edlbNO:
6392                 break;
6393             case edlbAUTO:
6394                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6395                 eDLB = edlbNO;
6396                 break;
6397             case edlbYES:
6398                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6399                 break;
6400             default:
6401                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
6402                 break;
6403         }
6404     }
6405
6406     return eDLB;
6407 }
6408
6409 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6410 {
6411     int dim;
6412
6413     dd->ndim = 0;
6414     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6415     {
6416         /* Decomposition order z,y,x */
6417         if (fplog)
6418         {
6419             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6420         }
6421         for (dim = DIM-1; dim >= 0; dim--)
6422         {
6423             if (dd->nc[dim] > 1)
6424             {
6425                 dd->dim[dd->ndim++] = dim;
6426             }
6427         }
6428     }
6429     else
6430     {
6431         /* Decomposition order x,y,z */
6432         for (dim = 0; dim < DIM; dim++)
6433         {
6434             if (dd->nc[dim] > 1)
6435             {
6436                 dd->dim[dd->ndim++] = dim;
6437             }
6438         }
6439     }
6440 }
6441
6442 static gmx_domdec_comm_t *init_dd_comm()
6443 {
6444     gmx_domdec_comm_t *comm;
6445     int                i;
6446
6447     snew(comm, 1);
6448     snew(comm->cggl_flag, DIM*2);
6449     snew(comm->cgcm_state, DIM*2);
6450     for (i = 0; i < DIM*2; i++)
6451     {
6452         comm->cggl_flag_nalloc[i]  = 0;
6453         comm->cgcm_state_nalloc[i] = 0;
6454     }
6455
6456     comm->nalloc_int = 0;
6457     comm->buf_int    = NULL;
6458
6459     vec_rvec_init(&comm->vbuf);
6460
6461     comm->n_load_have    = 0;
6462     comm->n_load_collect = 0;
6463
6464     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6465     {
6466         comm->sum_nat[i] = 0;
6467     }
6468     comm->ndecomp   = 0;
6469     comm->nload     = 0;
6470     comm->load_step = 0;
6471     comm->load_sum  = 0;
6472     comm->load_max  = 0;
6473     clear_ivec(comm->load_lim);
6474     comm->load_mdf  = 0;
6475     comm->load_pme  = 0;
6476
6477     return comm;
6478 }
6479
6480 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6481                                         unsigned long Flags,
6482                                         ivec nc,
6483                                         real comm_distance_min, real rconstr,
6484                                         const char *dlb_opt, real dlb_scale,
6485                                         const char *sizex, const char *sizey, const char *sizez,
6486                                         gmx_mtop_t *mtop, t_inputrec *ir,
6487                                         matrix box, rvec *x,
6488                                         gmx_ddbox_t *ddbox,
6489                                         int *npme_x, int *npme_y)
6490 {
6491     gmx_domdec_t      *dd;
6492     gmx_domdec_comm_t *comm;
6493     int                recload;
6494     int                d, i, j;
6495     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6496     gmx_bool           bC;
6497     char               buf[STRLEN];
6498
6499     if (fplog)
6500     {
6501         fprintf(fplog,
6502                 "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
6503     }
6504
6505     snew(dd, 1);
6506
6507     dd->comm = init_dd_comm();
6508     comm     = dd->comm;
6509     snew(comm->cggl_flag, DIM*2);
6510     snew(comm->cgcm_state, DIM*2);
6511
6512     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6513     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6514
6515     dd->bSendRecv2      = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0);
6516     comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10);
6517     comm->eFlop         = dd_nst_env(fplog, "GMX_DLB_FLOP", 0);
6518     recload             = dd_nst_env(fplog, "GMX_DD_LOAD", 1);
6519     comm->nstSortCG     = dd_nst_env(fplog, "GMX_DD_SORT", 1);
6520     comm->nstDDDump     = dd_nst_env(fplog, "GMX_DD_DUMP", 0);
6521     comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0);
6522     comm->DD_debug      = dd_nst_env(fplog, "GMX_DD_DEBUG", 0);
6523
6524     dd->pme_recv_f_alloc = 0;
6525     dd->pme_recv_f_buf   = NULL;
6526
6527     if (dd->bSendRecv2 && fplog)
6528     {
6529         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6530     }
6531     if (comm->eFlop)
6532     {
6533         if (fplog)
6534         {
6535             fprintf(fplog, "Will load balance based on FLOP count\n");
6536         }
6537         if (comm->eFlop > 1)
6538         {
6539             srand(1+cr->nodeid);
6540         }
6541         comm->bRecordLoad = TRUE;
6542     }
6543     else
6544     {
6545         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6546
6547     }
6548
6549     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6550
6551     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6552     if (fplog)
6553     {
6554         fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
6555     }
6556     dd->bGridJump              = comm->bDynLoadBal;
6557     comm->bPMELoadBalDLBLimits = FALSE;
6558
6559     if (comm->nstSortCG)
6560     {
6561         if (fplog)
6562         {
6563             if (comm->nstSortCG == 1)
6564             {
6565                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6566             }
6567             else
6568             {
6569                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6570                         comm->nstSortCG);
6571             }
6572         }
6573         snew(comm->sort, 1);
6574     }
6575     else
6576     {
6577         if (fplog)
6578         {
6579             fprintf(fplog, "Will not sort the charge groups\n");
6580         }
6581     }
6582
6583     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6584
6585     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6586     if (comm->bInterCGBondeds)
6587     {
6588         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6589     }
6590     else
6591     {
6592         comm->bInterCGMultiBody = FALSE;
6593     }
6594
6595     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6596     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6597
6598     if (ir->rlistlong == 0)
6599     {
6600         /* Set the cut-off to some very large value,
6601          * so we don't need if statements everywhere in the code.
6602          * We use sqrt, since the cut-off is squared in some places.
6603          */
6604         comm->cutoff   = GMX_CUTOFF_INF;
6605     }
6606     else
6607     {
6608         comm->cutoff   = ir->rlistlong;
6609     }
6610     comm->cutoff_mbody = 0;
6611
6612     comm->cellsize_limit = 0;
6613     comm->bBondComm      = FALSE;
6614
6615     if (comm->bInterCGBondeds)
6616     {
6617         if (comm_distance_min > 0)
6618         {
6619             comm->cutoff_mbody = comm_distance_min;
6620             if (Flags & MD_DDBONDCOMM)
6621             {
6622                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6623             }
6624             else
6625             {
6626                 comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
6627             }
6628             r_bonded_limit = comm->cutoff_mbody;
6629         }
6630         else if (ir->bPeriodicMols)
6631         {
6632             /* Can not easily determine the required cut-off */
6633             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6634             comm->cutoff_mbody = comm->cutoff/2;
6635             r_bonded_limit     = comm->cutoff_mbody;
6636         }
6637         else
6638         {
6639             if (MASTER(cr))
6640             {
6641                 dd_bonded_cg_distance(fplog, dd, mtop, ir, x, box,
6642                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6643             }
6644             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6645             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6646
6647             /* We use an initial margin of 10% for the minimum cell size,
6648              * except when we are just below the non-bonded cut-off.
6649              */
6650             if (Flags & MD_DDBONDCOMM)
6651             {
6652                 if (max(r_2b, r_mb) > comm->cutoff)
6653                 {
6654                     r_bonded        = max(r_2b, r_mb);
6655                     r_bonded_limit  = 1.1*r_bonded;
6656                     comm->bBondComm = TRUE;
6657                 }
6658                 else
6659                 {
6660                     r_bonded       = r_mb;
6661                     r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
6662                 }
6663                 /* We determine cutoff_mbody later */
6664             }
6665             else
6666             {
6667                 /* No special bonded communication,
6668                  * simply increase the DD cut-off.
6669                  */
6670                 r_bonded_limit     = 1.1*max(r_2b, r_mb);
6671                 comm->cutoff_mbody = r_bonded_limit;
6672                 comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
6673             }
6674         }
6675         comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
6676         if (fplog)
6677         {
6678             fprintf(fplog,
6679                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6680                     comm->cellsize_limit);
6681         }
6682     }
6683
6684     if (dd->bInterCGcons && rconstr <= 0)
6685     {
6686         /* There is a cell size limit due to the constraints (P-LINCS) */
6687         rconstr = constr_r_max(fplog, mtop, ir);
6688         if (fplog)
6689         {
6690             fprintf(fplog,
6691                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6692                     rconstr);
6693             if (rconstr > comm->cellsize_limit)
6694             {
6695                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6696             }
6697         }
6698     }
6699     else if (rconstr > 0 && fplog)
6700     {
6701         /* Here we do not check for dd->bInterCGcons,
6702          * because one can also set a cell size limit for virtual sites only
6703          * and at this point we don't know yet if there are intercg v-sites.
6704          */
6705         fprintf(fplog,
6706                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6707                 rconstr);
6708     }
6709     comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
6710
6711     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6712
6713     if (nc[XX] > 0)
6714     {
6715         copy_ivec(nc, dd->nc);
6716         set_dd_dim(fplog, dd);
6717         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6718
6719         if (cr->npmenodes == -1)
6720         {
6721             cr->npmenodes = 0;
6722         }
6723         acs = average_cellsize_min(dd, ddbox);
6724         if (acs < comm->cellsize_limit)
6725         {
6726             if (fplog)
6727             {
6728                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6729             }
6730             gmx_fatal_collective(FARGS, cr, NULL,
6731                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6732                                  acs, comm->cellsize_limit);
6733         }
6734     }
6735     else
6736     {
6737         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6738
6739         /* We need to choose the optimal DD grid and possibly PME nodes */
6740         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6741                                comm->eDLB != edlbNO, dlb_scale,
6742                                comm->cellsize_limit, comm->cutoff,
6743                                comm->bInterCGBondeds, comm->bInterCGMultiBody);
6744
6745         if (dd->nc[XX] == 0)
6746         {
6747             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6748             sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
6749                     !bC ? "-rdd" : "-rcon",
6750                     comm->eDLB != edlbNO ? " or -dds" : "",
6751                     bC ? " or your LINCS settings" : "");
6752
6753             gmx_fatal_collective(FARGS, cr, NULL,
6754                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6755                                  "%s\n"
6756                                  "Look in the log file for details on the domain decomposition",
6757                                  cr->nnodes-cr->npmenodes, limit, buf);
6758         }
6759         set_dd_dim(fplog, dd);
6760     }
6761
6762     if (fplog)
6763     {
6764         fprintf(fplog,
6765                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6766                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6767     }
6768
6769     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6770     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6771     {
6772         gmx_fatal_collective(FARGS, cr, NULL,
6773                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6774                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6775     }
6776     if (cr->npmenodes > dd->nnodes)
6777     {
6778         gmx_fatal_collective(FARGS, cr, NULL,
6779                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6780     }
6781     if (cr->npmenodes > 0)
6782     {
6783         comm->npmenodes = cr->npmenodes;
6784     }
6785     else
6786     {
6787         comm->npmenodes = dd->nnodes;
6788     }
6789
6790     if (EEL_PME(ir->coulombtype))
6791     {
6792         /* The following choices should match those
6793          * in comm_cost_est in domdec_setup.c.
6794          * Note that here the checks have to take into account
6795          * that the decomposition might occur in a different order than xyz
6796          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6797          * in which case they will not match those in comm_cost_est,
6798          * but since that is mainly for testing purposes that's fine.
6799          */
6800         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6801             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6802             getenv("GMX_PMEONEDD") == NULL)
6803         {
6804             comm->npmedecompdim = 2;
6805             comm->npmenodes_x   = dd->nc[XX];
6806             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6807         }
6808         else
6809         {
6810             /* In case nc is 1 in both x and y we could still choose to
6811              * decompose pme in y instead of x, but we use x for simplicity.
6812              */
6813             comm->npmedecompdim = 1;
6814             if (dd->dim[0] == YY)
6815             {
6816                 comm->npmenodes_x = 1;
6817                 comm->npmenodes_y = comm->npmenodes;
6818             }
6819             else
6820             {
6821                 comm->npmenodes_x = comm->npmenodes;
6822                 comm->npmenodes_y = 1;
6823             }
6824         }
6825         if (fplog)
6826         {
6827             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6828                     comm->npmenodes_x, comm->npmenodes_y, 1);
6829         }
6830     }
6831     else
6832     {
6833         comm->npmedecompdim = 0;
6834         comm->npmenodes_x   = 0;
6835         comm->npmenodes_y   = 0;
6836     }
6837
6838     /* Technically we don't need both of these,
6839      * but it simplifies code not having to recalculate it.
6840      */
6841     *npme_x = comm->npmenodes_x;
6842     *npme_y = comm->npmenodes_y;
6843
6844     snew(comm->slb_frac, DIM);
6845     if (comm->eDLB == edlbNO)
6846     {
6847         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6848         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6849         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6850     }
6851
6852     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6853     {
6854         if (comm->bBondComm || comm->eDLB != edlbNO)
6855         {
6856             /* Set the bonded communication distance to halfway
6857              * the minimum and the maximum,
6858              * since the extra communication cost is nearly zero.
6859              */
6860             acs                = average_cellsize_min(dd, ddbox);
6861             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6862             if (comm->eDLB != edlbNO)
6863             {
6864                 /* Check if this does not limit the scaling */
6865                 comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
6866             }
6867             if (!comm->bBondComm)
6868             {
6869                 /* Without bBondComm do not go beyond the n.b. cut-off */
6870                 comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
6871                 if (comm->cellsize_limit >= comm->cutoff)
6872                 {
6873                     /* We don't loose a lot of efficieny
6874                      * when increasing it to the n.b. cut-off.
6875                      * It can even be slightly faster, because we need
6876                      * less checks for the communication setup.
6877                      */
6878                     comm->cutoff_mbody = comm->cutoff;
6879                 }
6880             }
6881             /* Check if we did not end up below our original limit */
6882             comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
6883
6884             if (comm->cutoff_mbody > comm->cellsize_limit)
6885             {
6886                 comm->cellsize_limit = comm->cutoff_mbody;
6887             }
6888         }
6889         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6890     }
6891
6892     if (debug)
6893     {
6894         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6895                 "cellsize limit %f\n",
6896                 comm->bBondComm, comm->cellsize_limit);
6897     }
6898
6899     if (MASTER(cr))
6900     {
6901         check_dd_restrictions(cr, dd, ir, fplog);
6902     }
6903
6904     comm->partition_step = INT_MIN;
6905     dd->ddp_count        = 0;
6906
6907     clear_dd_cycle_counts(dd);
6908
6909     return dd;
6910 }
6911
6912 static void set_dlb_limits(gmx_domdec_t *dd)
6913
6914 {
6915     int d;
6916
6917     for (d = 0; d < dd->ndim; d++)
6918     {
6919         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
6920         dd->comm->cellsize_min[dd->dim[d]] =
6921             dd->comm->cellsize_min_dlb[dd->dim[d]];
6922     }
6923 }
6924
6925
6926 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
6927 {
6928     gmx_domdec_t      *dd;
6929     gmx_domdec_comm_t *comm;
6930     real               cellsize_min;
6931     int                d, nc, i;
6932     char               buf[STRLEN];
6933
6934     dd   = cr->dd;
6935     comm = dd->comm;
6936
6937     if (fplog)
6938     {
6939         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
6940     }
6941
6942     cellsize_min = comm->cellsize_min[dd->dim[0]];
6943     for (d = 1; d < dd->ndim; d++)
6944     {
6945         cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6946     }
6947
6948     if (cellsize_min < comm->cellsize_limit*1.05)
6949     {
6950         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6951
6952         /* Change DLB from "auto" to "no". */
6953         comm->eDLB = edlbNO;
6954
6955         return;
6956     }
6957
6958     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
6959     comm->bDynLoadBal = TRUE;
6960     dd->bGridJump     = TRUE;
6961
6962     set_dlb_limits(dd);
6963
6964     /* We can set the required cell size info here,
6965      * so we do not need to communicate this.
6966      * The grid is completely uniform.
6967      */
6968     for (d = 0; d < dd->ndim; d++)
6969     {
6970         if (comm->root[d])
6971         {
6972             comm->load[d].sum_m = comm->load[d].sum;
6973
6974             nc = dd->nc[dd->dim[d]];
6975             for (i = 0; i < nc; i++)
6976             {
6977                 comm->root[d]->cell_f[i]    = i/(real)nc;
6978                 if (d > 0)
6979                 {
6980                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6981                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6982                 }
6983             }
6984             comm->root[d]->cell_f[nc] = 1.0;
6985         }
6986     }
6987 }
6988
6989 static char *init_bLocalCG(gmx_mtop_t *mtop)
6990 {
6991     int   ncg, cg;
6992     char *bLocalCG;
6993
6994     ncg = ncg_mtop(mtop);
6995     snew(bLocalCG, ncg);
6996     for (cg = 0; cg < ncg; cg++)
6997     {
6998         bLocalCG[cg] = FALSE;
6999     }
7000
7001     return bLocalCG;
7002 }
7003
7004 void dd_init_bondeds(FILE *fplog,
7005                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
7006                      gmx_vsite_t *vsite, gmx_constr_t constr,
7007                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
7008 {
7009     gmx_domdec_comm_t *comm;
7010     gmx_bool           bBondComm;
7011     int                d;
7012
7013     dd_make_reverse_top(fplog, dd, mtop, vsite, constr, ir, bBCheck);
7014
7015     comm = dd->comm;
7016
7017     if (comm->bBondComm)
7018     {
7019         /* Communicate atoms beyond the cut-off for bonded interactions */
7020         comm = dd->comm;
7021
7022         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
7023
7024         comm->bLocalCG = init_bLocalCG(mtop);
7025     }
7026     else
7027     {
7028         /* Only communicate atoms based on cut-off */
7029         comm->cglink   = NULL;
7030         comm->bLocalCG = NULL;
7031     }
7032 }
7033
7034 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
7035                               t_inputrec *ir,
7036                               gmx_bool bDynLoadBal, real dlb_scale,
7037                               gmx_ddbox_t *ddbox)
7038 {
7039     gmx_domdec_comm_t *comm;
7040     int                d;
7041     ivec               np;
7042     real               limit, shrink;
7043     char               buf[64];
7044
7045     if (fplog == NULL)
7046     {
7047         return;
7048     }
7049
7050     comm = dd->comm;
7051
7052     if (bDynLoadBal)
7053     {
7054         fprintf(fplog, "The maximum number of communication pulses is:");
7055         for (d = 0; d < dd->ndim; d++)
7056         {
7057             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
7058         }
7059         fprintf(fplog, "\n");
7060         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
7061         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
7062         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
7063         for (d = 0; d < DIM; d++)
7064         {
7065             if (dd->nc[d] > 1)
7066             {
7067                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7068                 {
7069                     shrink = 0;
7070                 }
7071                 else
7072                 {
7073                     shrink =
7074                         comm->cellsize_min_dlb[d]/
7075                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7076                 }
7077                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
7078             }
7079         }
7080         fprintf(fplog, "\n");
7081     }
7082     else
7083     {
7084         set_dd_cell_sizes_slb(dd, ddbox, FALSE, np);
7085         fprintf(fplog, "The initial number of communication pulses is:");
7086         for (d = 0; d < dd->ndim; d++)
7087         {
7088             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
7089         }
7090         fprintf(fplog, "\n");
7091         fprintf(fplog, "The initial domain decomposition cell size is:");
7092         for (d = 0; d < DIM; d++)
7093         {
7094             if (dd->nc[d] > 1)
7095             {
7096                 fprintf(fplog, " %c %.2f nm",
7097                         dim2char(d), dd->comm->cellsize_min[d]);
7098             }
7099         }
7100         fprintf(fplog, "\n\n");
7101     }
7102
7103     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7104     {
7105         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7106         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7107                 "non-bonded interactions", "", comm->cutoff);
7108
7109         if (bDynLoadBal)
7110         {
7111             limit = dd->comm->cellsize_limit;
7112         }
7113         else
7114         {
7115             if (dynamic_dd_box(ddbox, ir))
7116             {
7117                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7118             }
7119             limit = dd->comm->cellsize_min[XX];
7120             for (d = 1; d < DIM; d++)
7121             {
7122                 limit = min(limit, dd->comm->cellsize_min[d]);
7123             }
7124         }
7125
7126         if (comm->bInterCGBondeds)
7127         {
7128             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7129                     "two-body bonded interactions", "(-rdd)",
7130                     max(comm->cutoff, comm->cutoff_mbody));
7131             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7132                     "multi-body bonded interactions", "(-rdd)",
7133                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
7134         }
7135         if (dd->vsite_comm)
7136         {
7137             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7138                     "virtual site constructions", "(-rcon)", limit);
7139         }
7140         if (dd->constraint_comm)
7141         {
7142             sprintf(buf, "atoms separated by up to %d constraints",
7143                     1+ir->nProjOrder);
7144             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7145                     buf, "(-rcon)", limit);
7146         }
7147         fprintf(fplog, "\n");
7148     }
7149
7150     fflush(fplog);
7151 }
7152
7153 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7154                                 real               dlb_scale,
7155                                 const t_inputrec  *ir,
7156                                 const gmx_ddbox_t *ddbox)
7157 {
7158     gmx_domdec_comm_t *comm;
7159     int                d, dim, npulse, npulse_d_max, npulse_d;
7160     gmx_bool           bNoCutOff;
7161
7162     comm = dd->comm;
7163
7164     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7165
7166     /* Determine the maximum number of comm. pulses in one dimension */
7167
7168     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7169
7170     /* Determine the maximum required number of grid pulses */
7171     if (comm->cellsize_limit >= comm->cutoff)
7172     {
7173         /* Only a single pulse is required */
7174         npulse = 1;
7175     }
7176     else if (!bNoCutOff && comm->cellsize_limit > 0)
7177     {
7178         /* We round down slightly here to avoid overhead due to the latency
7179          * of extra communication calls when the cut-off
7180          * would be only slightly longer than the cell size.
7181          * Later cellsize_limit is redetermined,
7182          * so we can not miss interactions due to this rounding.
7183          */
7184         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7185     }
7186     else
7187     {
7188         /* There is no cell size limit */
7189         npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7190     }
7191
7192     if (!bNoCutOff && npulse > 1)
7193     {
7194         /* See if we can do with less pulses, based on dlb_scale */
7195         npulse_d_max = 0;
7196         for (d = 0; d < dd->ndim; d++)
7197         {
7198             dim      = dd->dim[d];
7199             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7200                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7201             npulse_d_max = max(npulse_d_max, npulse_d);
7202         }
7203         npulse = min(npulse, npulse_d_max);
7204     }
7205
7206     /* This env var can override npulse */
7207     d = dd_nst_env(debug, "GMX_DD_NPULSE", 0);
7208     if (d > 0)
7209     {
7210         npulse = d;
7211     }
7212
7213     comm->maxpulse       = 1;
7214     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7215     for (d = 0; d < dd->ndim; d++)
7216     {
7217         comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
7218         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7219         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7220         comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
7221         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7222         {
7223             comm->bVacDLBNoLimit = FALSE;
7224         }
7225     }
7226
7227     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7228     if (!comm->bVacDLBNoLimit)
7229     {
7230         comm->cellsize_limit = max(comm->cellsize_limit,
7231                                    comm->cutoff/comm->maxpulse);
7232     }
7233     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7234     /* Set the minimum cell size for each DD dimension */
7235     for (d = 0; d < dd->ndim; d++)
7236     {
7237         if (comm->bVacDLBNoLimit ||
7238             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7239         {
7240             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7241         }
7242         else
7243         {
7244             comm->cellsize_min_dlb[dd->dim[d]] =
7245                 comm->cutoff/comm->cd[d].np_dlb;
7246         }
7247     }
7248     if (comm->cutoff_mbody <= 0)
7249     {
7250         comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
7251     }
7252     if (comm->bDynLoadBal)
7253     {
7254         set_dlb_limits(dd);
7255     }
7256 }
7257
7258 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7259 {
7260     /* If each molecule is a single charge group
7261      * or we use domain decomposition for each periodic dimension,
7262      * we do not need to take pbc into account for the bonded interactions.
7263      */
7264     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7265             !(dd->nc[XX] > 1 &&
7266               dd->nc[YY] > 1 &&
7267               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7268 }
7269
7270 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7271                        t_inputrec *ir, t_forcerec *fr,
7272                        gmx_ddbox_t *ddbox)
7273 {
7274     gmx_domdec_comm_t *comm;
7275     int                natoms_tot;
7276     real               vol_frac;
7277
7278     comm = dd->comm;
7279
7280     /* Initialize the thread data.
7281      * This can not be done in init_domain_decomposition,
7282      * as the numbers of threads is determined later.
7283      */
7284     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7285     if (comm->nth > 1)
7286     {
7287         snew(comm->dth, comm->nth);
7288     }
7289
7290     if (EEL_PME(ir->coulombtype))
7291     {
7292         init_ddpme(dd, &comm->ddpme[0], 0);
7293         if (comm->npmedecompdim >= 2)
7294         {
7295             init_ddpme(dd, &comm->ddpme[1], 1);
7296         }
7297     }
7298     else
7299     {
7300         comm->npmenodes = 0;
7301         if (dd->pme_nodeid >= 0)
7302         {
7303             gmx_fatal_collective(FARGS, NULL, dd,
7304                                  "Can not have separate PME nodes without PME electrostatics");
7305         }
7306     }
7307
7308     if (debug)
7309     {
7310         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7311     }
7312     if (comm->eDLB != edlbNO)
7313     {
7314         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7315     }
7316
7317     print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
7318     if (comm->eDLB == edlbAUTO)
7319     {
7320         if (fplog)
7321         {
7322             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7323         }
7324         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7325     }
7326
7327     if (ir->ePBC == epbcNONE)
7328     {
7329         vol_frac = 1 - 1/(double)dd->nnodes;
7330     }
7331     else
7332     {
7333         vol_frac =
7334             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7335     }
7336     if (debug)
7337     {
7338         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7339     }
7340     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7341
7342     dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
7343 }
7344
7345 static gmx_bool test_dd_cutoff(t_commrec *cr,
7346                                t_state *state, t_inputrec *ir,
7347                                real cutoff_req)
7348 {
7349     gmx_domdec_t *dd;
7350     gmx_ddbox_t   ddbox;
7351     int           d, dim, np;
7352     real          inv_cell_size;
7353     int           LocallyLimited;
7354
7355     dd = cr->dd;
7356
7357     set_ddbox(dd, FALSE, cr, ir, state->box,
7358               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7359
7360     LocallyLimited = 0;
7361
7362     for (d = 0; d < dd->ndim; d++)
7363     {
7364         dim = dd->dim[d];
7365
7366         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7367         if (dynamic_dd_box(&ddbox, ir))
7368         {
7369             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7370         }
7371
7372         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7373
7374         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7375             dd->comm->cd[d].np_dlb > 0)
7376         {
7377             if (np > dd->comm->cd[d].np_dlb)
7378             {
7379                 return FALSE;
7380             }
7381
7382             /* If a current local cell size is smaller than the requested
7383              * cut-off, we could still fix it, but this gets very complicated.
7384              * Without fixing here, we might actually need more checks.
7385              */
7386             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7387             {
7388                 LocallyLimited = 1;
7389             }
7390         }
7391     }
7392
7393     if (dd->comm->eDLB != edlbNO)
7394     {
7395         /* If DLB is not active yet, we don't need to check the grid jumps.
7396          * Actually we shouldn't, because then the grid jump data is not set.
7397          */
7398         if (dd->comm->bDynLoadBal &&
7399             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7400         {
7401             LocallyLimited = 1;
7402         }
7403
7404         gmx_sumi(1, &LocallyLimited, cr);
7405
7406         if (LocallyLimited > 0)
7407         {
7408             return FALSE;
7409         }
7410     }
7411
7412     return TRUE;
7413 }
7414
7415 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7416                           real cutoff_req)
7417 {
7418     gmx_bool bCutoffAllowed;
7419
7420     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7421
7422     if (bCutoffAllowed)
7423     {
7424         cr->dd->comm->cutoff = cutoff_req;
7425     }
7426
7427     return bCutoffAllowed;
7428 }
7429
7430 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7431 {
7432     gmx_domdec_comm_t *comm;
7433
7434     comm = cr->dd->comm;
7435
7436     /* Turn on the DLB limiting (might have been on already) */
7437     comm->bPMELoadBalDLBLimits = TRUE;
7438
7439     /* Change the cut-off limit */
7440     comm->PMELoadBal_max_cutoff = comm->cutoff;
7441 }
7442
/* Merge the charge groups received in pulse 'pulse' along one DD dimension
 * into the local arrays, keeping the charge groups of each cell contiguous.
 *
 * ncell     - number of cells (zones) that received data in this pulse
 * cd        - communication setup of this dimension, cd->ind[pulse] is used
 * ncg_cell  - charge group boundaries per cell; the entries ncell+cell and
 *             ncell+cell+1 delimit the charge groups received so far
 *             for 'cell', entry ncell+cell+1 is updated here
 * index_gl  - global charge group indices of the local charge groups (in/out)
 * recv_i    - received global charge group indices, ordered by cell
 * cg_cm     - local charge group positions (in/out)
 * recv_vr   - received positions, in the same order as recv_i
 * cgindex   - local charge group to atom index (in/out)
 * cginfo_mb - source for the charge group info, passed on to ddcginfo
 * cginfo    - local charge group info (in/out)
 *
 * The data already stored for later cells is first moved up in place to make
 * room, after which the received charge groups are inserted cell by cell.
 */
static void merge_cg_buffers(int ncell,
                             gmx_domdec_comm_dim_t *cd, int pulse,
                             int  *ncg_cell,
                             int  *index_gl, int  *recv_i,
                             rvec *cg_cm,    rvec *recv_vr,
                             int *cgindex,
                             cginfo_mb_t *cginfo_mb, int *cginfo)
{
    gmx_domdec_ind_t *ind, *ind_p;
    int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
    int               shift, shift_at;

    ind = &cd->ind[pulse];

    /* First correct the already stored data */
    /* NOTE(review): nrecv[ncell] presumably holds the total receive count
     * over all cells, which makes shift the number of charge groups that
     * get inserted before the current cell - confirm against the caller.
     */
    shift = ind->nrecv[ncell];
    /* Loop from the last to the first cell, so that moving entries up
     * does not overwrite data that still needs to be moved.
     */
    for (cell = ncell-1; cell >= 0; cell--)
    {
        shift -= ind->nrecv[cell];
        if (shift > 0)
        {
            /* Move the cg's present from previous grid pulses */
            cg0                = ncg_cell[ncell+cell];
            cg1                = ncg_cell[ncell+cell+1];
            cgindex[cg1+shift] = cgindex[cg1];
            /* Copy backwards, as the source and destination can overlap */
            for (cg = cg1-1; cg >= cg0; cg--)
            {
                index_gl[cg+shift] = index_gl[cg];
                copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
                cgindex[cg+shift] = cgindex[cg];
                cginfo[cg+shift]  = cginfo[cg];
            }
            /* Correct the already stored send indices for the shift */
            for (p = 1; p <= pulse; p++)
            {
                ind_p = &cd->ind[p];
                /* Find the start of the send entries of this cell */
                cg0   = 0;
                for (c = 0; c < cell; c++)
                {
                    cg0 += ind_p->nsend[c];
                }
                cg1 = cg0 + ind_p->nsend[cell];
                for (cg = cg0; cg < cg1; cg++)
                {
                    ind_p->index[cg] += shift;
                }
            }
        }
    }

    /* Merge in the communicated buffers */
    shift    = 0; /* number of charge groups inserted for the previous cells */
    shift_at = 0; /* number of atoms inserted so far */
    cg0      = 0; /* read position in the receive buffers */
    for (cell = 0; cell < ncell; cell++)
    {
        /* The first free entry after the already present cg's of this cell */
        cg1 = ncg_cell[ncell+cell+1] + shift;
        if (shift_at > 0)
        {
            /* Correct the old cg indices */
            for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
            {
                cgindex[cg+1] += shift_at;
            }
        }
        for (cg = 0; cg < ind->nrecv[cell]; cg++)
        {
            /* Copy this charge group from the buffer */
            index_gl[cg1] = recv_i[cg0];
            copy_rvec(recv_vr[cg0], cg_cm[cg1]);
            /* Add it to the cgindex */
            cg_gl          = index_gl[cg1];
            cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
            nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
            cgindex[cg1+1] = cgindex[cg1] + nat;
            cg0++;
            cg1++;
            shift_at += nat;
        }
        shift                 += ind->nrecv[cell];
        /* Store the new end of this cell */
        ncg_cell[ncell+cell+1] = cg1;
    }
}
7526
7527 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7528                                int nzone, int cg0, const int *cgindex)
7529 {
7530     int cg, zone, p;
7531
7532     /* Store the atom block boundaries for easy copying of communication buffers
7533      */
7534     cg = cg0;
7535     for (zone = 0; zone < nzone; zone++)
7536     {
7537         for (p = 0; p < cd->np; p++)
7538         {
7539             cd->ind[p].cell2at0[zone] = cgindex[cg];
7540             cg += cd->ind[p].nrecv[zone];
7541             cd->ind[p].cell2at1[zone] = cgindex[cg];
7542         }
7543     }
7544 }
7545
7546 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7547 {
7548     int      i;
7549     gmx_bool bMiss;
7550
7551     bMiss = FALSE;
7552     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7553     {
7554         if (!bLocalCG[link->a[i]])
7555         {
7556             bMiss = TRUE;
7557         }
7558     }
7559
7560     return bMiss;
7561 }
7562
/* Domain corners for communication, a maximum of 4 i-zones see a j domain.
 * The corners are compared with charge group positions to determine
 * which charge groups need to be communicated.
 */
typedef struct {
    real c[DIM][4]; /* the corners for the non-bonded communication, indexed by DD dimension index and zone */
    real cr0;       /* corner for rounding, along the first DD dimension */
    real cr1[4];    /* corners for rounding, along the second DD dimension, indexed by zone */
    real bc[DIM];   /* corners for bonded communication, indexed by DD dimension index */
    real bcr1;      /* corner for rounding for bonded communication */
} dd_corners_t;
7571
/* Determine the corners of the domain(s) we are communicating with.
 *
 * Fills c from the cell boundaries in dd->comm (cell_x0 and cell_x1) and,
 * when dd->bGridJump is set, from the extremes stored in
 * dd->comm->zone_d1 and dd->comm->zone_d2.
 *
 * dd      - the domain decomposition data, only read
 * dim0    - the first decomposition dimension
 * dim1    - overwritten with dd->dim[1] before use when dd->ndim >= 2
 * dim2    - overwritten with dd->dim[2] before use when dd->ndim >= 3
 * bDistMB - also set the corners for the multi-body bonded distance
 * c       - the corners to set
 *
 * Note that of cr1 only the elements 0 and 3 are assigned here and that
 * bc[1] and bc[2] are only assigned when both bDistMB and dd->bGridJump
 * are set.
 */
static void
set_dd_corners(const gmx_domdec_t *dd,
               int dim0, int dim1, int dim2,
               gmx_bool bDistMB,
               dd_corners_t *c)
{
    const gmx_domdec_comm_t  *comm;
    const gmx_domdec_zones_t *zones;
    int i, j;

    comm = dd->comm;

    zones = &comm->zones;

    /* Keep the compiler happy */
    c->cr0  = 0;
    c->bcr1 = 0;

    /* The first dimension is equal for all cells */
    c->c[0][0] = comm->cell_x0[dim0];
    if (bDistMB)
    {
        c->bc[0] = c->c[0][0];
    }
    if (dd->ndim >= 2)
    {
        dim1 = dd->dim[1];
        /* This cell row is only seen from the first row */
        c->c[1][0] = comm->cell_x0[dim1];
        /* All rows can see this row */
        c->c[1][1] = comm->cell_x0[dim1];
        if (dd->bGridJump)
        {
            /* With grid jumps the lower boundary of a neighboring row
             * can be higher than our own.
             */
            c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
            if (bDistMB)
            {
                /* For the multi-body distance we need the maximum */
                c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
            }
        }
        /* Set the upper-right corner for rounding */
        c->cr0 = comm->cell_x1[dim0];

        if (dd->ndim >= 3)
        {
            dim2 = dd->dim[2];
            /* Start with our own lower boundary for all four j-zones */
            for (j = 0; j < 4; j++)
            {
                c->c[2][j] = comm->cell_x0[dim2];
            }
            if (dd->bGridJump)
            {
                /* Use the maximum of the i-cells that see a j-cell */
                for (i = 0; i < zones->nizone; i++)
                {
                    for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
                    {
                        /* Only j-zones 4 and up are stored, at index j-4 */
                        if (j >= 4)
                        {
                            c->c[2][j-4] =
                                max(c->c[2][j-4],
                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
                        }
                    }
                }
                if (bDistMB)
                {
                    /* For the multi-body distance we need the maximum */
                    c->bc[2] = comm->cell_x0[dim2];
                    for (i = 0; i < 2; i++)
                    {
                        for (j = 0; j < 2; j++)
                        {
                            c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
                        }
                    }
                }
            }

            /* Set the upper-right corner for rounding */
            /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
             * Only cell (0,0,0) can see cell 7 (1,1,1)
             */
            c->cr1[0] = comm->cell_x1[dim1];
            c->cr1[3] = comm->cell_x1[dim1];
            if (dd->bGridJump)
            {
                c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
                if (bDistMB)
                {
                    /* For the multi-body distance we need the maximum */
                    c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
                }
            }
        }
    }
}
7670
/* Determine which cg's we need to send in this pulse from this zone.
 *
 * For each charge group in the range cg0 to cg1 (exclusive) the squared
 * distance r2 to the corners in c for the non-bonded communication and,
 * when requested, the squared distance rb2 to the corners for the bonded
 * communication are computed. A charge group is selected when r2 is
 * smaller than r_comm2 or when it fulfills the bonded criterion
 * with r_bcomm2.
 *
 * For each selected charge group:
 *  - the local index is stored in ind->index,
 *  - the global index is stored in *ibuf,
 *  - the position is stored in vbuf, shifted by box[dim] (and mirrored in
 *    y and z with screw pbc along x) when dd->ci[dim] is 0.
 * The buffers are reallocated when they are too small.
 *
 * zonei, zone       - the zone index after and before permutation
 *                     (NOTE(review): as used by the caller, confirm there)
 * dim, dim_ind      - the communication dimension and its DD dimension index
 * dim0, dim1, dim2  - the DD dimensions
 * tric_dist         - per DD dimension index, whether triclinic distances
 *                     should be used
 * normal, v_d, v_0, v_1, skew_fac2_d, skew_fac_01, sf2_round
 *                   - triclinic box data, used for triclinic distances only
 * bDistBonded       - check the bonded distance criterion in this pulse
 * bBondComm         - with bonded distances, only select charge groups with
 *                     inter charge-group bondeds and a missing link
 * bDist2B, bDistMB  - use r2 respectively rb2 for the bonded criterion
 * nsend_ptr         - in/out: the number of charge groups stored so far
 * nat_ptr           - in/out: the number of atoms in the stored charge groups
 * nsend_z_ptr       - out: the number of charge groups selected by this call
 */
static void
get_zone_pulse_cgs(gmx_domdec_t *dd,
                   int zonei, int zone,
                   int cg0, int cg1,
                   const int *index_gl,
                   const int *cgindex,
                   int dim, int dim_ind,
                   int dim0, int dim1, int dim2,
                   real r_comm2, real r_bcomm2,
                   matrix box,
                   ivec tric_dist,
                   rvec *normal,
                   real skew_fac2_d, real skew_fac_01,
                   rvec *v_d, rvec *v_0, rvec *v_1,
                   const dd_corners_t *c,
                   rvec sf2_round,
                   gmx_bool bDistBonded,
                   gmx_bool bBondComm,
                   gmx_bool bDist2B,
                   gmx_bool bDistMB,
                   rvec *cg_cm,
                   int *cginfo,
                   gmx_domdec_ind_t *ind,
                   int **ibuf, int *ibuf_nalloc,
                   vec_rvec_t *vbuf,
                   int *nsend_ptr,
                   int *nat_ptr,
                   int *nsend_z_ptr)
{
    gmx_domdec_comm_t *comm;
    gmx_bool           bScrew;
    gmx_bool           bDistMB_pulse;
    int                cg, i;
    real               r2, rb2, r, tric_sh;
    rvec               rn, rb;
    int                dimd;
    int                nsend_z, nsend, nat;

    comm = dd->comm;

    /* Screw pbc only affects communication along x */
    bScrew = (dd->bScrewPBC && dim == XX);

    /* The multi-body distance rb2 is only needed when the bonded
     * criterion is checked in this pulse.
     */
    bDistMB_pulse = (bDistMB && bDistBonded);

    nsend_z = 0;
    nsend   = *nsend_ptr;
    nat     = *nat_ptr;

    for (cg = cg0; cg < cg1; cg++)
    {
        r2  = 0;
        rb2 = 0;
        if (tric_dist[dim_ind] == 0)
        {
            /* Rectangular direction, easy */
            r = cg_cm[cg][dim] - c->c[dim_ind][zone];
            if (r > 0)
            {
                r2 += r*r;
            }
            if (bDistMB_pulse)
            {
                r = cg_cm[cg][dim] - c->bc[dim_ind];
                if (r > 0)
                {
                    rb2 += r*r;
                }
            }
            /* Rounding gives at most a 16% reduction
             * in communicated atoms
             */
            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
            {
                r = cg_cm[cg][dim0] - c->cr0;
                /* This is the first dimension, so always r >= 0 */
                r2 += r*r;
                if (bDistMB_pulse)
                {
                    rb2 += r*r;
                }
            }
            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
            {
                r = cg_cm[cg][dim1] - c->cr1[zone];
                if (r > 0)
                {
                    r2 += r*r;
                }
                if (bDistMB_pulse)
                {
                    r = cg_cm[cg][dim1] - c->bcr1;
                    if (r > 0)
                    {
                        rb2 += r*r;
                    }
                }
            }
        }
        else
        {
            /* Triclinic direction, more complicated */
            clear_rvec(rn);
            clear_rvec(rb);
            /* Rounding, conservative as the skew_fac multiplication
             * will slightly underestimate the distance.
             */
            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
            {
                rn[dim0] = cg_cm[cg][dim0] - c->cr0;
                for (i = dim0+1; i < DIM; i++)
                {
                    rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
                }
                r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
                if (bDistMB_pulse)
                {
                    rb[dim0] = rn[dim0];
                    rb2      = r2;
                }
                /* Take care that the cell planes along dim0 might not
                 * be orthogonal to those along dim1 and dim2.
                 */
                for (i = 1; i <= dim_ind; i++)
                {
                    dimd = dd->dim[i];
                    if (normal[dim0][dimd] > 0)
                    {
                        rn[dimd] -= rn[dim0]*normal[dim0][dimd];
                        if (bDistMB_pulse)
                        {
                            rb[dimd] -= rb[dim0]*normal[dim0][dimd];
                        }
                    }
                }
            }
            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
            {
                rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
                tric_sh   = 0;
                for (i = dim1+1; i < DIM; i++)
                {
                    tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
                }
                rn[dim1] += tric_sh;
                if (rn[dim1] > 0)
                {
                    r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
                    /* Take care of coupling of the distances
                     * to the planes along dim0 and dim1 through dim2.
                     */
                    r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
                    /* Take care that the cell planes along dim1
                     * might not be orthogonal to that along dim2.
                     */
                    if (normal[dim1][dim2] > 0)
                    {
                        rn[dim2] -= rn[dim1]*normal[dim1][dim2];
                    }
                }
                if (bDistMB_pulse)
                {
                    rb[dim1] +=
                        cg_cm[cg][dim1] - c->bcr1 + tric_sh;
                    if (rb[dim1] > 0)
                    {
                        rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
                        /* Take care of coupling of the distances
                         * to the planes along dim0 and dim1 through dim2.
                         */
                        rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
                        /* Take care that the cell planes along dim1
                         * might not be orthogonal to that along dim2.
                         */
                        if (normal[dim1][dim2] > 0)
                        {
                            rb[dim2] -= rb[dim1]*normal[dim1][dim2];
                        }
                    }
                }
            }
            /* The distance along the communication direction */
            rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
            tric_sh  = 0;
            for (i = dim+1; i < DIM; i++)
            {
                tric_sh -= cg_cm[cg][i]*v_d[i][dim];
            }
            rn[dim] += tric_sh;
            if (rn[dim] > 0)
            {
                r2 += rn[dim]*rn[dim]*skew_fac2_d;
                /* Take care of coupling of the distances
                 * to the planes along dim0 and dim1 through dim2.
                 */
                if (dim_ind == 1 && zonei == 1)
                {
                    r2 -= rn[dim0]*rn[dim]*skew_fac_01;
                }
            }
            if (bDistMB_pulse)
            {
                /* NOTE(review): clear_rvec presumably zeroes rb, which
                 * discards the rb components set by the rounding blocks
                 * above, unlike rn which keeps them. The coupling term
                 * with rb[dim0] below would then not contribute for
                 * dim != dim0 - confirm that this is intended.
                 */
                clear_rvec(rb);
                rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
                if (rb[dim] > 0)
                {
                    rb2 += rb[dim]*rb[dim]*skew_fac2_d;
                    /* Take care of coupling of the distances
                     * to the planes along dim0 and dim1 through dim2.
                     */
                    if (dim_ind == 1 && zonei == 1)
                    {
                        rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
                    }
                }
            }
        }

        /* Select on the non-bonded distance or, when requested, on
         * the bonded distance, the latter optionally restricted to
         * charge groups with a link to a non-local charge group.
         */
        if (r2 < r_comm2 ||
            (bDistBonded &&
             ((bDistMB && rb2 < r_bcomm2) ||
              (bDist2B && r2  < r_bcomm2)) &&
             (!bBondComm ||
              (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
               missing_link(comm->cglink, index_gl[cg],
                            comm->bLocalCG)))))
        {
            /* Make an index to the local charge groups */
            if (nsend+1 > ind->nalloc)
            {
                ind->nalloc = over_alloc_large(nsend+1);
                srenew(ind->index, ind->nalloc);
            }
            if (nsend+1 > *ibuf_nalloc)
            {
                *ibuf_nalloc = over_alloc_large(nsend+1);
                srenew(*ibuf, *ibuf_nalloc);
            }
            ind->index[nsend] = cg;
            (*ibuf)[nsend]    = index_gl[cg];
            nsend_z++;
            vec_rvec_check_alloc(vbuf, nsend+1);

            if (dd->ci[dim] == 0)
            {
                /* Correct cg_cm for pbc */
                rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
                if (bScrew)
                {
                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
                }
            }
            else
            {
                copy_rvec(cg_cm[cg], vbuf->v[nsend]);
            }
            nsend++;
            /* Count the atoms of this charge group */
            nat += cgindex[cg+1] - cgindex[cg];
        }
    }

    /* Return the updated totals and the count for this zone */
    *nsend_ptr   = nsend;
    *nat_ptr     = nat;
    *nsend_z_ptr = nsend_z;
}
7937
7938 static void setup_dd_communication(gmx_domdec_t *dd,
7939                                    matrix box, gmx_ddbox_t *ddbox,
7940                                    t_forcerec *fr, t_state *state, rvec **f)
7941 {
7942     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7943     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
7944     int                    c, i, j, cg, cg_gl, nrcg;
7945     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7946     gmx_domdec_comm_t     *comm;
7947     gmx_domdec_zones_t    *zones;
7948     gmx_domdec_comm_dim_t *cd;
7949     gmx_domdec_ind_t      *ind;
7950     cginfo_mb_t           *cginfo_mb;
7951     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
7952     real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
7953     dd_corners_t           corners;
7954     ivec                   tric_dist;
7955     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
7956     real                   skew_fac2_d, skew_fac_01;
7957     rvec                   sf2_round;
7958     int                    nsend, nat;
7959     int                    th;
7960
7961     if (debug)
7962     {
7963         fprintf(debug, "Setting up DD communication\n");
7964     }
7965
7966     comm  = dd->comm;
7967
7968     switch (fr->cutoff_scheme)
7969     {
7970         case ecutsGROUP:
7971             cg_cm = fr->cg_cm;
7972             break;
7973         case ecutsVERLET:
7974             cg_cm = state->x;
7975             break;
7976         default:
7977             gmx_incons("unimplemented");
7978             cg_cm = NULL;
7979     }
7980
7981     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
7982     {
7983         dim = dd->dim[dim_ind];
7984
7985         /* Check if we need to use triclinic distances */
7986         tric_dist[dim_ind] = 0;
7987         for (i = 0; i <= dim_ind; i++)
7988         {
7989             if (ddbox->tric_dir[dd->dim[i]])
7990             {
7991                 tric_dist[dim_ind] = 1;
7992             }
7993         }
7994     }
7995
7996     bBondComm = comm->bBondComm;
7997
7998     /* Do we need to determine extra distances for multi-body bondeds? */
7999     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8000
8001     /* Do we need to determine extra distances for only two-body bondeds? */
8002     bDist2B = (bBondComm && !bDistMB);
8003
8004     r_comm2  = sqr(comm->cutoff);
8005     r_bcomm2 = sqr(comm->cutoff_mbody);
8006
8007     if (debug)
8008     {
8009         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
8010     }
8011
8012     zones = &comm->zones;
8013
8014     dim0 = dd->dim[0];
8015     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8016     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8017
8018     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8019
8020     /* Triclinic stuff */
8021     normal      = ddbox->normal;
8022     skew_fac_01 = 0;
8023     if (dd->ndim >= 2)
8024     {
8025         v_0 = ddbox->v[dim0];
8026         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8027         {
8028             /* Determine the coupling coefficient for the distances
8029              * to the cell planes along dim0 and dim1 through dim2.
8030              * This is required for correct rounding.
8031              */
8032             skew_fac_01 =
8033                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8034             if (debug)
8035             {
8036                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8037             }
8038         }
8039     }
8040     if (dd->ndim >= 3)
8041     {
8042         v_1 = ddbox->v[dim1];
8043     }
8044
8045     zone_cg_range = zones->cg_range;
8046     index_gl      = dd->index_gl;
8047     cgindex       = dd->cgindex;
8048     cginfo_mb     = fr->cginfo_mb;
8049
8050     zone_cg_range[0]   = 0;
8051     zone_cg_range[1]   = dd->ncg_home;
8052     comm->zone_ncg1[0] = dd->ncg_home;
8053     pos_cg             = dd->ncg_home;
8054
8055     nat_tot = dd->nat_home;
8056     nzone   = 1;
8057     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8058     {
8059         dim = dd->dim[dim_ind];
8060         cd  = &comm->cd[dim_ind];
8061
8062         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8063         {
8064             /* No pbc in this dimension, the first node should not comm. */
8065             nzone_send = 0;
8066         }
8067         else
8068         {
8069             nzone_send = nzone;
8070         }
8071
8072         v_d         = ddbox->v[dim];
8073         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8074
8075         cd->bInPlace = TRUE;
8076         for (p = 0; p < cd->np; p++)
8077         {
8078             /* Only atoms communicated in the first pulse are used
8079              * for multi-body bonded interactions or for bBondComm.
8080              */
8081             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8082
8083             ind   = &cd->ind[p];
8084             nsend = 0;
8085             nat   = 0;
8086             for (zone = 0; zone < nzone_send; zone++)
8087             {
8088                 if (tric_dist[dim_ind] && dim_ind > 0)
8089                 {
8090                     /* Determine slightly more optimized skew_fac's
8091                      * for rounding.
8092                      * This reduces the number of communicated atoms
8093                      * by about 10% for 3D DD of rhombic dodecahedra.
8094                      */
8095                     for (dimd = 0; dimd < dim; dimd++)
8096                     {
8097                         sf2_round[dimd] = 1;
8098                         if (ddbox->tric_dir[dimd])
8099                         {
8100                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8101                             {
8102                                 /* If we are shifted in dimension i
8103                                  * and the cell plane is tilted forward
8104                                  * in dimension i, skip this coupling.
8105                                  */
8106                                 if (!(zones->shift[nzone+zone][i] &&
8107                                       ddbox->v[dimd][i][dimd] >= 0))
8108                                 {
8109                                     sf2_round[dimd] +=
8110                                         sqr(ddbox->v[dimd][i][dimd]);
8111                                 }
8112                             }
8113                             sf2_round[dimd] = 1/sf2_round[dimd];
8114                         }
8115                     }
8116                 }
8117
8118                 zonei = zone_perm[dim_ind][zone];
8119                 if (p == 0)
8120                 {
8121                     /* Here we permutate the zones to obtain a convenient order
8122                      * for neighbor searching
8123                      */
8124                     cg0 = zone_cg_range[zonei];
8125                     cg1 = zone_cg_range[zonei+1];
8126                 }
8127                 else
8128                 {
8129                     /* Look only at the cg's received in the previous grid pulse
8130                      */
8131                     cg1 = zone_cg_range[nzone+zone+1];
8132                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8133                 }
8134
8135 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8136                 for (th = 0; th < comm->nth; th++)
8137                 {
8138                     gmx_domdec_ind_t *ind_p;
8139                     int             **ibuf_p, *ibuf_nalloc_p;
8140                     vec_rvec_t       *vbuf_p;
8141                     int              *nsend_p, *nat_p;
8142                     int              *nsend_zone_p;
8143                     int               cg0_th, cg1_th;
8144
8145                     if (th == 0)
8146                     {
8147                         /* Thread 0 writes in the comm buffers */
8148                         ind_p         = ind;
8149                         ibuf_p        = &comm->buf_int;
8150                         ibuf_nalloc_p = &comm->nalloc_int;
8151                         vbuf_p        = &comm->vbuf;
8152                         nsend_p       = &nsend;
8153                         nat_p         = &nat;
8154                         nsend_zone_p  = &ind->nsend[zone];
8155                     }
8156                     else
8157                     {
8158                         /* Other threads write into temp buffers */
8159                         ind_p         = &comm->dth[th].ind;
8160                         ibuf_p        = &comm->dth[th].ibuf;
8161                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8162                         vbuf_p        = &comm->dth[th].vbuf;
8163                         nsend_p       = &comm->dth[th].nsend;
8164                         nat_p         = &comm->dth[th].nat;
8165                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8166
8167                         comm->dth[th].nsend      = 0;
8168                         comm->dth[th].nat        = 0;
8169                         comm->dth[th].nsend_zone = 0;
8170                     }
8171
8172                     if (comm->nth == 1)
8173                     {
8174                         cg0_th = cg0;
8175                         cg1_th = cg1;
8176                     }
8177                     else
8178                     {
8179                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8180                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8181                     }
8182
8183                     /* Get the cg's for this pulse in this zone */
8184                     get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8185                                        index_gl, cgindex,
8186                                        dim, dim_ind, dim0, dim1, dim2,
8187                                        r_comm2, r_bcomm2,
8188                                        box, tric_dist,
8189                                        normal, skew_fac2_d, skew_fac_01,
8190                                        v_d, v_0, v_1, &corners, sf2_round,
8191                                        bDistBonded, bBondComm,
8192                                        bDist2B, bDistMB,
8193                                        cg_cm, fr->cginfo,
8194                                        ind_p,
8195                                        ibuf_p, ibuf_nalloc_p,
8196                                        vbuf_p,
8197                                        nsend_p, nat_p,
8198                                        nsend_zone_p);
8199                 }
8200
8201                 /* Append data of threads>=1 to the communication buffers */
8202                 for (th = 1; th < comm->nth; th++)
8203                 {
8204                     dd_comm_setup_work_t *dth;
8205                     int                   i, ns1;
8206
8207                     dth = &comm->dth[th];
8208
8209                     ns1 = nsend + dth->nsend_zone;
8210                     if (ns1 > ind->nalloc)
8211                     {
8212                         ind->nalloc = over_alloc_dd(ns1);
8213                         srenew(ind->index, ind->nalloc);
8214                     }
8215                     if (ns1 > comm->nalloc_int)
8216                     {
8217                         comm->nalloc_int = over_alloc_dd(ns1);
8218                         srenew(comm->buf_int, comm->nalloc_int);
8219                     }
8220                     if (ns1 > comm->vbuf.nalloc)
8221                     {
8222                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8223                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8224                     }
8225
8226                     for (i = 0; i < dth->nsend_zone; i++)
8227                     {
8228                         ind->index[nsend]    = dth->ind.index[i];
8229                         comm->buf_int[nsend] = dth->ibuf[i];
8230                         copy_rvec(dth->vbuf.v[i],
8231                                   comm->vbuf.v[nsend]);
8232                         nsend++;
8233                     }
8234                     nat              += dth->nat;
8235                     ind->nsend[zone] += dth->nsend_zone;
8236                 }
8237             }
8238             /* Clear the counts in case we do not have pbc */
8239             for (zone = nzone_send; zone < nzone; zone++)
8240             {
8241                 ind->nsend[zone] = 0;
8242             }
8243             ind->nsend[nzone]   = nsend;
8244             ind->nsend[nzone+1] = nat;
8245             /* Communicate the number of cg's and atoms to receive */
8246             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8247                             ind->nsend, nzone+2,
8248                             ind->nrecv, nzone+2);
8249
8250             /* The rvec buffer is also required for atom buffers of size nsend
8251              * in dd_move_x and dd_move_f.
8252              */
8253             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8254
8255             if (p > 0)
8256             {
8257                 /* We can receive in place if only the last zone is not empty */
8258                 for (zone = 0; zone < nzone-1; zone++)
8259                 {
8260                     if (ind->nrecv[zone] > 0)
8261                     {
8262                         cd->bInPlace = FALSE;
8263                     }
8264                 }
8265                 if (!cd->bInPlace)
8266                 {
8267                     /* The int buffer is only required here for the cg indices */
8268                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8269                     {
8270                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8271                         srenew(comm->buf_int2, comm->nalloc_int2);
8272                     }
8273                     /* The rvec buffer is also required for atom buffers
8274                      * of size nrecv in dd_move_x and dd_move_f.
8275                      */
8276                     i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8277                     vec_rvec_check_alloc(&comm->vbuf2, i);
8278                 }
8279             }
8280
8281             /* Make space for the global cg indices */
8282             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8283                 || dd->cg_nalloc == 0)
8284             {
8285                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8286                 srenew(index_gl, dd->cg_nalloc);
8287                 srenew(cgindex, dd->cg_nalloc+1);
8288             }
8289             /* Communicate the global cg indices */
8290             if (cd->bInPlace)
8291             {
8292                 recv_i = index_gl + pos_cg;
8293             }
8294             else
8295             {
8296                 recv_i = comm->buf_int2;
8297             }
8298             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8299                             comm->buf_int, nsend,
8300                             recv_i,        ind->nrecv[nzone]);
8301
8302             /* Make space for cg_cm */
8303             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8304             if (fr->cutoff_scheme == ecutsGROUP)
8305             {
8306                 cg_cm = fr->cg_cm;
8307             }
8308             else
8309             {
8310                 cg_cm = state->x;
8311             }
8312             /* Communicate cg_cm */
8313             if (cd->bInPlace)
8314             {
8315                 recv_vr = cg_cm + pos_cg;
8316             }
8317             else
8318             {
8319                 recv_vr = comm->vbuf2.v;
8320             }
8321             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8322                              comm->vbuf.v, nsend,
8323                              recv_vr,      ind->nrecv[nzone]);
8324
8325             /* Make the charge group index */
8326             if (cd->bInPlace)
8327             {
8328                 zone = (p == 0 ? 0 : nzone - 1);
8329                 while (zone < nzone)
8330                 {
8331                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8332                     {
8333                         cg_gl              = index_gl[pos_cg];
8334                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8335                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8336                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8337                         if (bBondComm)
8338                         {
8339                             /* Update the charge group presence,
8340                              * so we can use it in the next pass of the loop.
8341                              */
8342                             comm->bLocalCG[cg_gl] = TRUE;
8343                         }
8344                         pos_cg++;
8345                     }
8346                     if (p == 0)
8347                     {
8348                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8349                     }
8350                     zone++;
8351                     zone_cg_range[nzone+zone] = pos_cg;
8352                 }
8353             }
8354             else
8355             {
8356                 /* This part of the code is never executed with bBondComm. */
8357                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8358                                  index_gl, recv_i, cg_cm, recv_vr,
8359                                  cgindex, fr->cginfo_mb, fr->cginfo);
8360                 pos_cg += ind->nrecv[nzone];
8361             }
8362             nat_tot += ind->nrecv[nzone+1];
8363         }
8364         if (!cd->bInPlace)
8365         {
8366             /* Store the atom block for easy copying of communication buffers */
8367             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8368         }
8369         nzone += nzone;
8370     }
8371     dd->index_gl = index_gl;
8372     dd->cgindex  = cgindex;
8373
8374     dd->ncg_tot          = zone_cg_range[zones->n];
8375     dd->nat_tot          = nat_tot;
8376     comm->nat[ddnatHOME] = dd->nat_home;
8377     for (i = ddnatZONE; i < ddnatNR; i++)
8378     {
8379         comm->nat[i] = dd->nat_tot;
8380     }
8381
8382     if (!bBondComm)
8383     {
8384         /* We don't need to update cginfo, since that was alrady done above.
8385          * So we pass NULL for the forcerec.
8386          */
8387         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8388                       NULL, comm->bLocalCG);
8389     }
8390
8391     if (debug)
8392     {
8393         fprintf(debug, "Finished setting up DD communication, zones:");
8394         for (c = 0; c < zones->n; c++)
8395         {
8396             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8397         }
8398         fprintf(debug, "\n");
8399     }
8400 }
8401
8402 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8403 {
8404     int c;
8405
8406     for (c = 0; c < zones->nizone; c++)
8407     {
8408         zones->izone[c].cg1  = zones->cg_range[c+1];
8409         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8410         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8411     }
8412 }
8413
8414 static void set_zones_size(gmx_domdec_t *dd,
8415                            matrix box, const gmx_ddbox_t *ddbox,
8416                            int zone_start, int zone_end)
8417 {
8418     gmx_domdec_comm_t  *comm;
8419     gmx_domdec_zones_t *zones;
8420     gmx_bool            bDistMB;
8421     int                 z, zi, zj0, zj1, d, dim;
8422     real                rcs, rcmbs;
8423     int                 i, j;
8424     real                size_j, add_tric;
8425     real                vol;
8426
8427     comm = dd->comm;
8428
8429     zones = &comm->zones;
8430
8431     /* Do we need to determine extra distances for multi-body bondeds? */
8432     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8433
8434     for (z = zone_start; z < zone_end; z++)
8435     {
8436         /* Copy cell limits to zone limits.
8437          * Valid for non-DD dims and non-shifted dims.
8438          */
8439         copy_rvec(comm->cell_x0, zones->size[z].x0);
8440         copy_rvec(comm->cell_x1, zones->size[z].x1);
8441     }
8442
8443     for (d = 0; d < dd->ndim; d++)
8444     {
8445         dim = dd->dim[d];
8446
8447         for (z = 0; z < zones->n; z++)
8448         {
8449             /* With a staggered grid we have different sizes
8450              * for non-shifted dimensions.
8451              */
8452             if (dd->bGridJump && zones->shift[z][dim] == 0)
8453             {
8454                 if (d == 1)
8455                 {
8456                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8457                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8458                 }
8459                 else if (d == 2)
8460                 {
8461                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8462                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8463                 }
8464             }
8465         }
8466
8467         rcs   = comm->cutoff;
8468         rcmbs = comm->cutoff_mbody;
8469         if (ddbox->tric_dir[dim])
8470         {
8471             rcs   /= ddbox->skew_fac[dim];
8472             rcmbs /= ddbox->skew_fac[dim];
8473         }
8474
8475         /* Set the lower limit for the shifted zone dimensions */
8476         for (z = zone_start; z < zone_end; z++)
8477         {
8478             if (zones->shift[z][dim] > 0)
8479             {
8480                 dim = dd->dim[d];
8481                 if (!dd->bGridJump || d == 0)
8482                 {
8483                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8484                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8485                 }
8486                 else
8487                 {
8488                     /* Here we take the lower limit of the zone from
8489                      * the lowest domain of the zone below.
8490                      */
8491                     if (z < 4)
8492                     {
8493                         zones->size[z].x0[dim] =
8494                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8495                     }
8496                     else
8497                     {
8498                         if (d == 1)
8499                         {
8500                             zones->size[z].x0[dim] =
8501                                 zones->size[zone_perm[2][z-4]].x0[dim];
8502                         }
8503                         else
8504                         {
8505                             zones->size[z].x0[dim] =
8506                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8507                         }
8508                     }
8509                     /* A temporary limit, is updated below */
8510                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8511
8512                     if (bDistMB)
8513                     {
8514                         for (zi = 0; zi < zones->nizone; zi++)
8515                         {
8516                             if (zones->shift[zi][dim] == 0)
8517                             {
8518                                 /* This takes the whole zone into account.
8519                                  * With multiple pulses this will lead
8520                                  * to a larger zone then strictly necessary.
8521                                  */
8522                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8523                                                              zones->size[zi].x1[dim]+rcmbs);
8524                             }
8525                         }
8526                     }
8527                 }
8528             }
8529         }
8530
8531         /* Loop over the i-zones to set the upper limit of each
8532          * j-zone they see.
8533          */
8534         for (zi = 0; zi < zones->nizone; zi++)
8535         {
8536             if (zones->shift[zi][dim] == 0)
8537             {
8538                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8539                 {
8540                     if (zones->shift[z][dim] > 0)
8541                     {
8542                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8543                                                      zones->size[zi].x1[dim]+rcs);
8544                     }
8545                 }
8546             }
8547         }
8548     }
8549
8550     for (z = zone_start; z < zone_end; z++)
8551     {
8552         /* Initialization only required to keep the compiler happy */
8553         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8554         int  nc, c;
8555
8556         /* To determine the bounding box for a zone we need to find
8557          * the extreme corners of 4, 2 or 1 corners.
8558          */
8559         nc = 1 << (ddbox->npbcdim - 1);
8560
8561         for (c = 0; c < nc; c++)
8562         {
8563             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8564             corner[XX] = 0;
8565             if ((c & 1) == 0)
8566             {
8567                 corner[YY] = zones->size[z].x0[YY];
8568             }
8569             else
8570             {
8571                 corner[YY] = zones->size[z].x1[YY];
8572             }
8573             if ((c & 2) == 0)
8574             {
8575                 corner[ZZ] = zones->size[z].x0[ZZ];
8576             }
8577             else
8578             {
8579                 corner[ZZ] = zones->size[z].x1[ZZ];
8580             }
8581             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8582             {
8583                 /* With 1D domain decomposition the cg's are not in
8584                  * the triclinic box, but triclinic x-y and rectangular y-z.
8585                  * Shift y back, so it will later end up at 0.
8586                  */
8587                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8588             }
8589             /* Apply the triclinic couplings */
8590             for (i = YY; i < ddbox->npbcdim; i++)
8591             {
8592                 for (j = XX; j < i; j++)
8593                 {
8594                     corner[j] += corner[i]*box[i][j]/box[i][i];
8595                 }
8596             }
8597             if (c == 0)
8598             {
8599                 copy_rvec(corner, corner_min);
8600                 copy_rvec(corner, corner_max);
8601             }
8602             else
8603             {
8604                 for (i = 0; i < DIM; i++)
8605                 {
8606                     corner_min[i] = min(corner_min[i], corner[i]);
8607                     corner_max[i] = max(corner_max[i], corner[i]);
8608                 }
8609             }
8610         }
8611         /* Copy the extreme cornes without offset along x */
8612         for (i = 0; i < DIM; i++)
8613         {
8614             zones->size[z].bb_x0[i] = corner_min[i];
8615             zones->size[z].bb_x1[i] = corner_max[i];
8616         }
8617         /* Add the offset along x */
8618         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8619         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8620     }
8621
8622     if (zone_start == 0)
8623     {
8624         vol = 1;
8625         for (dim = 0; dim < DIM; dim++)
8626         {
8627             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8628         }
8629         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8630     }
8631
8632     if (debug)
8633     {
8634         for (z = zone_start; z < zone_end; z++)
8635         {
8636             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8637                     z,
8638                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8639                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8640                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8641             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8642                     z,
8643                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8644                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8645                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8646         }
8647     }
8648 }
8649
8650 static int comp_cgsort(const void *a, const void *b)
8651 {
8652     int           comp;
8653
8654     gmx_cgsort_t *cga, *cgb;
8655     cga = (gmx_cgsort_t *)a;
8656     cgb = (gmx_cgsort_t *)b;
8657
8658     comp = cga->nsc - cgb->nsc;
8659     if (comp == 0)
8660     {
8661         comp = cga->ind_gl - cgb->ind_gl;
8662     }
8663
8664     return comp;
8665 }
8666
8667 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8668                          int *a, int *buf)
8669 {
8670     int i;
8671
8672     /* Order the data */
8673     for (i = 0; i < n; i++)
8674     {
8675         buf[i] = a[sort[i].ind];
8676     }
8677
8678     /* Copy back to the original array */
8679     for (i = 0; i < n; i++)
8680     {
8681         a[i] = buf[i];
8682     }
8683 }
8684
8685 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8686                          rvec *v, rvec *buf)
8687 {
8688     int i;
8689
8690     /* Order the data */
8691     for (i = 0; i < n; i++)
8692     {
8693         copy_rvec(v[sort[i].ind], buf[i]);
8694     }
8695
8696     /* Copy back to the original array */
8697     for (i = 0; i < n; i++)
8698     {
8699         copy_rvec(buf[i], v[i]);
8700     }
8701 }
8702
8703 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8704                            rvec *v, rvec *buf)
8705 {
8706     int a, atot, cg, cg0, cg1, i;
8707
8708     if (cgindex == NULL)
8709     {
8710         /* Avoid the useless loop of the atoms within a cg */
8711         order_vec_cg(ncg, sort, v, buf);
8712
8713         return;
8714     }
8715
8716     /* Order the data */
8717     a = 0;
8718     for (cg = 0; cg < ncg; cg++)
8719     {
8720         cg0 = cgindex[sort[cg].ind];
8721         cg1 = cgindex[sort[cg].ind+1];
8722         for (i = cg0; i < cg1; i++)
8723         {
8724             copy_rvec(v[i], buf[a]);
8725             a++;
8726         }
8727     }
8728     atot = a;
8729
8730     /* Copy back to the original array */
8731     for (a = 0; a < atot; a++)
8732     {
8733         copy_rvec(buf[a], v[a]);
8734     }
8735 }
8736
8737 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8738                          int nsort_new, gmx_cgsort_t *sort_new,
8739                          gmx_cgsort_t *sort1)
8740 {
8741     int i1, i2, i_new;
8742
8743     /* The new indices are not very ordered, so we qsort them */
8744     qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8745
8746     /* sort2 is already ordered, so now we can merge the two arrays */
8747     i1    = 0;
8748     i2    = 0;
8749     i_new = 0;
8750     while (i2 < nsort2 || i_new < nsort_new)
8751     {
8752         if (i2 == nsort2)
8753         {
8754             sort1[i1++] = sort_new[i_new++];
8755         }
8756         else if (i_new == nsort_new)
8757         {
8758             sort1[i1++] = sort2[i2++];
8759         }
8760         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8761                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8762                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8763         {
8764             sort1[i1++] = sort2[i2++];
8765         }
8766         else
8767         {
8768             sort1[i1++] = sort_new[i_new++];
8769         }
8770     }
8771 }
8772
8773 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8774 {
8775     gmx_domdec_sort_t *sort;
8776     gmx_cgsort_t      *cgsort, *sort_i;
8777     int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
8778     int                sort_last, sort_skip;
8779
8780     sort = dd->comm->sort;
8781
8782     a = fr->ns.grid->cell_index;
8783
8784     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8785
8786     if (ncg_home_old >= 0)
8787     {
8788         /* The charge groups that remained in the same ns grid cell
8789          * are completely ordered. So we can sort efficiently by sorting
8790          * the charge groups that did move into the stationary list.
8791          */
8792         ncg_new   = 0;
8793         nsort2    = 0;
8794         nsort_new = 0;
8795         for (i = 0; i < dd->ncg_home; i++)
8796         {
8797             /* Check if this cg did not move to another node */
8798             if (a[i] < moved)
8799             {
8800                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8801                 {
8802                     /* This cg is new on this node or moved ns grid cell */
8803                     if (nsort_new >= sort->sort_new_nalloc)
8804                     {
8805                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8806                         srenew(sort->sort_new, sort->sort_new_nalloc);
8807                     }
8808                     sort_i = &(sort->sort_new[nsort_new++]);
8809                 }
8810                 else
8811                 {
8812                     /* This cg did not move */
8813                     sort_i = &(sort->sort2[nsort2++]);
8814                 }
8815                 /* Sort on the ns grid cell indices
8816                  * and the global topology index.
8817                  * index_gl is irrelevant with cell ns,
8818                  * but we set it here anyhow to avoid a conditional.
8819                  */
8820                 sort_i->nsc    = a[i];
8821                 sort_i->ind_gl = dd->index_gl[i];
8822                 sort_i->ind    = i;
8823                 ncg_new++;
8824             }
8825         }
8826         if (debug)
8827         {
8828             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8829                     nsort2, nsort_new);
8830         }
8831         /* Sort efficiently */
8832         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8833                      sort->sort);
8834     }
8835     else
8836     {
8837         cgsort  = sort->sort;
8838         ncg_new = 0;
8839         for (i = 0; i < dd->ncg_home; i++)
8840         {
8841             /* Sort on the ns grid cell indices
8842              * and the global topology index
8843              */
8844             cgsort[i].nsc    = a[i];
8845             cgsort[i].ind_gl = dd->index_gl[i];
8846             cgsort[i].ind    = i;
8847             if (cgsort[i].nsc < moved)
8848             {
8849                 ncg_new++;
8850             }
8851         }
8852         if (debug)
8853         {
8854             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8855         }
8856         /* Determine the order of the charge groups using qsort */
8857         qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8858     }
8859
8860     return ncg_new;
8861 }
8862
8863 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8864 {
8865     gmx_cgsort_t *sort;
8866     int           ncg_new, i, *a, na;
8867
8868     sort = dd->comm->sort->sort;
8869
8870     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8871
8872     ncg_new = 0;
8873     for (i = 0; i < na; i++)
8874     {
8875         if (a[i] >= 0)
8876         {
8877             sort[ncg_new].ind = a[i];
8878             ncg_new++;
8879         }
8880     }
8881
8882     return ncg_new;
8883 }
8884
8885 static void dd_sort_state(gmx_domdec_t *dd, int ePBC,
8886                           rvec *cgcm, t_forcerec *fr, t_state *state,
8887                           int ncg_home_old)
8888 {
8889     gmx_domdec_sort_t *sort;
8890     gmx_cgsort_t      *cgsort, *sort_i;
8891     int               *cgindex;
8892     int                ncg_new, i, *ibuf, cgsize;
8893     rvec              *vbuf;
8894
8895     sort = dd->comm->sort;
8896
8897     if (dd->ncg_home > sort->sort_nalloc)
8898     {
8899         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8900         srenew(sort->sort, sort->sort_nalloc);
8901         srenew(sort->sort2, sort->sort_nalloc);
8902     }
8903     cgsort = sort->sort;
8904
8905     switch (fr->cutoff_scheme)
8906     {
8907         case ecutsGROUP:
8908             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8909             break;
8910         case ecutsVERLET:
8911             ncg_new = dd_sort_order_nbnxn(dd, fr);
8912             break;
8913         default:
8914             gmx_incons("unimplemented");
8915             ncg_new = 0;
8916     }
8917
8918     /* We alloc with the old size, since cgindex is still old */
8919     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8920     vbuf = dd->comm->vbuf.v;
8921
8922     if (dd->comm->bCGs)
8923     {
8924         cgindex = dd->cgindex;
8925     }
8926     else
8927     {
8928         cgindex = NULL;
8929     }
8930
8931     /* Remove the charge groups which are no longer at home here */
8932     dd->ncg_home = ncg_new;
8933     if (debug)
8934     {
8935         fprintf(debug, "Set the new home charge group count to %d\n",
8936                 dd->ncg_home);
8937     }
8938
8939     /* Reorder the state */
8940     for (i = 0; i < estNR; i++)
8941     {
8942         if (EST_DISTR(i) && (state->flags & (1<<i)))
8943         {
8944             switch (i)
8945             {
8946                 case estX:
8947                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
8948                     break;
8949                 case estV:
8950                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
8951                     break;
8952                 case estSDX:
8953                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
8954                     break;
8955                 case estCGP:
8956                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
8957                     break;
8958                 case estLD_RNG:
8959                 case estLD_RNGI:
8960                 case estDISRE_INITF:
8961                 case estDISRE_RM3TAV:
8962                 case estORIRE_INITF:
8963                 case estORIRE_DTAV:
8964                     /* No ordering required */
8965                     break;
8966                 default:
8967                     gmx_incons("Unknown state entry encountered in dd_sort_state");
8968                     break;
8969             }
8970         }
8971     }
8972     if (fr->cutoff_scheme == ecutsGROUP)
8973     {
8974         /* Reorder cgcm */
8975         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
8976     }
8977
8978     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8979     {
8980         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8981         srenew(sort->ibuf, sort->ibuf_nalloc);
8982     }
8983     ibuf = sort->ibuf;
8984     /* Reorder the global cg index */
8985     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
8986     /* Reorder the cginfo */
8987     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
8988     /* Rebuild the local cg index */
8989     if (dd->comm->bCGs)
8990     {
8991         ibuf[0] = 0;
8992         for (i = 0; i < dd->ncg_home; i++)
8993         {
8994             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8995             ibuf[i+1] = ibuf[i] + cgsize;
8996         }
8997         for (i = 0; i < dd->ncg_home+1; i++)
8998         {
8999             dd->cgindex[i] = ibuf[i];
9000         }
9001     }
9002     else
9003     {
9004         for (i = 0; i < dd->ncg_home+1; i++)
9005         {
9006             dd->cgindex[i] = i;
9007         }
9008     }
9009     /* Set the home atom number */
9010     dd->nat_home = dd->cgindex[dd->ncg_home];
9011
9012     if (fr->cutoff_scheme == ecutsVERLET)
9013     {
9014         /* The atoms are now exactly in grid order, update the grid order */
9015         nbnxn_set_atomorder(fr->nbv->nbs);
9016     }
9017     else
9018     {
9019         /* Copy the sorted ns cell indices back to the ns grid struct */
9020         for (i = 0; i < dd->ncg_home; i++)
9021         {
9022             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9023         }
9024         fr->ns.grid->nr = dd->ncg_home;
9025     }
9026 }
9027
9028 static void add_dd_statistics(gmx_domdec_t *dd)
9029 {
9030     gmx_domdec_comm_t *comm;
9031     int                ddnat;
9032
9033     comm = dd->comm;
9034
9035     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9036     {
9037         comm->sum_nat[ddnat-ddnatZONE] +=
9038             comm->nat[ddnat] - comm->nat[ddnat-1];
9039     }
9040     comm->ndecomp++;
9041 }
9042
9043 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9044 {
9045     gmx_domdec_comm_t *comm;
9046     int                ddnat;
9047
9048     comm = dd->comm;
9049
9050     /* Reset all the statistics and counters for total run counting */
9051     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9052     {
9053         comm->sum_nat[ddnat-ddnatZONE] = 0;
9054     }
9055     comm->ndecomp   = 0;
9056     comm->nload     = 0;
9057     comm->load_step = 0;
9058     comm->load_sum  = 0;
9059     comm->load_max  = 0;
9060     clear_ivec(comm->load_lim);
9061     comm->load_mdf = 0;
9062     comm->load_pme = 0;
9063 }
9064
9065 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9066 {
9067     gmx_domdec_comm_t *comm;
9068     int                ddnat;
9069     double             av;
9070
9071     comm = cr->dd->comm;
9072
9073     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9074
9075     if (fplog == NULL)
9076     {
9077         return;
9078     }
9079
9080     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9081
9082     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9083     {
9084         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9085         switch (ddnat)
9086         {
9087             case ddnatZONE:
9088                 fprintf(fplog,
9089                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9090                         2, av);
9091                 break;
9092             case ddnatVSITE:
9093                 if (cr->dd->vsite_comm)
9094                 {
9095                     fprintf(fplog,
9096                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9097                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9098                             av);
9099                 }
9100                 break;
9101             case ddnatCON:
9102                 if (cr->dd->constraint_comm)
9103                 {
9104                     fprintf(fplog,
9105                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9106                             1 + ir->nLincsIter, av);
9107                 }
9108                 break;
9109             default:
9110                 gmx_incons(" Unknown type for DD statistics");
9111         }
9112     }
9113     fprintf(fplog, "\n");
9114
9115     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9116     {
9117         print_dd_load_av(fplog, cr->dd);
9118     }
9119 }
9120
/* (Re)partition the system over the domain decomposition for MD step 'step'.
 *
 * Depending on the input, one of three paths sets up the home charge groups:
 *  - bMasterState: the old indices are cleared and the state is distributed
 *    from state_global;
 *  - state_local->ddp_count != dd->ddp_count: the local state belongs to an
 *    older partitioning, the indices are rebuilt from state_local;
 *  - otherwise: only the non-home indices are cleared and the charge groups
 *    are redistributed.
 * After that the routine optionally sorts the home charge groups, sets up
 * the communication, builds the local topology, sets up the special
 * (vsite/constraint) atom communication and updates mdatoms, shells,
 * constraints and pull/rotation groups. Finally the statistics are updated
 * and the partitioning counters (comm->partition_step, dd->ddp_count,
 * state_local->ddp_count) are stored.
 *
 * fplog         - log file, passed on to the routines that write to it
 * step          - the current MD step
 * cr            - communication record; cr->dd is the DD struct used here
 * bMasterState  - when TRUE, distribute from state_global
 * nstglobalcomm - global communication interval; used as divisor in
 *                 step % nstglobalcomm, so it must be non-zero
 * state_global  - global state, only accessed here when bMasterState is TRUE
 * top_global    - global topology
 * ir            - input record
 * state_local   - local state, updated to match the new partitioning
 * f             - pointer to the local force array pointer; passed on to
 *                 routines together with the state (NOTE(review): presumably
 *                 so it can be reallocated along with the state - confirm)
 * mdatoms       - local atom data, refilled with atoms2md()
 * top_local     - local topology, rebuilt by dd_make_local_top()
 * fr            - force record
 * vsite         - virtual site data, may be NULL
 * shellfc       - shell/flexible constraint data, may be NULL
 * constr        - constraint data, may be NULL
 * nrnb          - flop counters
 * wcycle        - wall cycle counters
 * bVerbose      - when TRUE, the DD master also prints the load verbosely
 */
void dd_partition_system(FILE                *fplog,
                         gmx_large_int_t      step,
                         t_commrec           *cr,
                         gmx_bool             bMasterState,
                         int                  nstglobalcomm,
                         t_state             *state_global,
                         gmx_mtop_t          *top_global,
                         t_inputrec          *ir,
                         t_state             *state_local,
                         rvec               **f,
                         t_mdatoms           *mdatoms,
                         gmx_localtop_t      *top_local,
                         t_forcerec          *fr,
                         gmx_vsite_t         *vsite,
                         gmx_shellfc_t        shellfc,
                         gmx_constr_t         constr,
                         t_nrnb              *nrnb,
                         gmx_wallcycle_t      wcycle,
                         gmx_bool             bVerbose)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    gmx_ddbox_t        ddbox = {0};
    t_block           *cgs_gl;
    gmx_large_int_t    step_pcoupl;
    rvec               cell_ns_x0, cell_ns_x1;
    int                i, j, n, cg0 = 0, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
    gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
    gmx_bool           bRedist, bSortCG, bResortAll;
    ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
    real               grid_density;
    char               sbuf[22];

    dd   = cr->dd;
    comm = dd->comm;

    bBoxChanged = (bMasterState || DEFORM(*ir));
    if (ir->epc != epcNO)
    {
        /* With nstpcouple > 1 pressure coupling happens
         * one step after calculating the pressure.
         * Box scaling happens at the end of the MD step,
         * after the DD partitioning.
         * We therefore have to do DLB in the first partitioning
         * after an MD step where P-coupling occurred.
         * We need to determine the last step in which p-coupling occurred.
         * MRS -- need to validate this for vv?
         */
        n = ir->nstpcouple;
        if (n == 1)
        {
            step_pcoupl = step - 1;
        }
        else
        {
            step_pcoupl = ((step - 1)/n)*n + 1;
        }
        /* P-coupling happened at or after the previous partitioning step */
        if (step_pcoupl >= comm->partition_step)
        {
            bBoxChanged = TRUE;
        }
    }

    bNStGlobalComm = (step % nstglobalcomm == 0);

    if (!comm->bDynLoadBal)
    {
        bDoDLB = FALSE;
    }
    else
    {
        /* Should we do dynamic load balancing this step?
         * Since it requires (possibly expensive) global communication,
         * we might want to do DLB less frequently.
         */
        if (bBoxChanged || ir->epc != epcNO)
        {
            bDoDLB = bBoxChanged;
        }
        else
        {
            bDoDLB = bNStGlobalComm;
        }
    }

    /* Check if we have recorded loads on the nodes */
    if (comm->bRecordLoad && dd_load_count(comm))
    {
        /* Note: bCheckDLB and bLogLoad are only assigned and only used
         * within this branch.
         */
        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
        {
            /* Check if we should use DLB at the second partitioning
             * and every 100 partitionings,
             * so the extra communication cost is negligible.
             */
            n         = max(100, nstglobalcomm);
            bCheckDLB = (comm->n_load_collect == 0 ||
                         comm->n_load_have % n == n-1);
        }
        else
        {
            bCheckDLB = FALSE;
        }

        /* Print load every nstlog, first and last step to the log file */
        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
                    comm->n_load_collect == 0 ||
                    (ir->nsteps >= 0 &&
                     (step + ir->nstlist > ir->init_step + ir->nsteps)));

        /* Avoid extra communication due to verbose screen output
         * when nstglobalcomm is set.
         */
        if (bDoDLB || bLogLoad || bCheckDLB ||
            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
        {
            get_load_distribution(dd, wcycle);
            if (DDMASTER(dd))
            {
                if (bLogLoad)
                {
                    dd_print_load(fplog, dd, step-1);
                }
                if (bVerbose)
                {
                    dd_print_load_verbose(dd);
                }
            }
            comm->n_load_collect++;

            if (bCheckDLB)
            {
                /* Since the timings are node dependent, the master decides */
                if (DDMASTER(dd))
                {
                    bTurnOnDLB =
                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
                    if (debug)
                    {
                        fprintf(debug, "step %s, imb loss %f\n",
                                gmx_step_str(step, sbuf),
                                dd_force_imb_perf_loss(dd));
                    }
                }
                /* bTurnOnDLB is only assigned on the DD master above;
                 * NOTE(review): presumably dd_bcast overwrites it on the
                 * other ranks with the master's value - confirm.
                 */
                dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
                if (bTurnOnDLB)
                {
                    turn_on_dlb(fplog, cr, step);
                    bDoDLB = TRUE;
                }
            }
        }
        comm->n_load_have++;
    }

    cgs_gl = &comm->cgs_gl;

    bRedist = FALSE;
    if (bMasterState)
    {
        /* Clear the old state */
        clear_dd_indices(dd, 0, 0);

        set_ddbox(dd, bMasterState, cr, ir, state_global->box,
                  TRUE, cgs_gl, state_global->x, &ddbox);

        get_cg_distribution(fplog, step, dd, cgs_gl,
                            state_global->box, &ddbox, state_global->x);

        dd_distribute_state(dd, cgs_gl,
                            state_global, state_local, f);

        dd_make_local_cgs(dd, &top_local->cgs);

        /* Ensure that we have space for the new distribution */
        dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);

        if (fr->cutoff_scheme == ecutsGROUP)
        {
            calc_cgcm(fplog, 0, dd->ncg_home,
                      &top_local->cgs, state_local->x, fr->cg_cm);
        }

        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);

        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);

        cg0 = 0;
    }
    else if (state_local->ddp_count != dd->ddp_count)
    {
        /* The local state does not match the current partitioning count;
         * it is only allowed to be older, never newer.
         */
        if (state_local->ddp_count > dd->ddp_count)
        {
            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
        }

        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
        {
            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
        }

        /* Clear the old state */
        clear_dd_indices(dd, 0, 0);

        /* Build the new indices */
        rebuild_cgindex(dd, cgs_gl->index, state_local);
        make_dd_indices(dd, cgs_gl->index, 0);

        if (fr->cutoff_scheme == ecutsGROUP)
        {
            /* Redetermine the cg COMs */
            calc_cgcm(fplog, 0, dd->ncg_home,
                      &top_local->cgs, state_local->x, fr->cg_cm);
        }

        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);

        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);

        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
                  TRUE, &top_local->cgs, state_local->x, &ddbox);

        /* Redistribution is only done here when DLB is active */
        bRedist = comm->bDynLoadBal;
    }
    else
    {
        /* We have the full state, only redistribute the cgs */

        /* Clear the non-home indices */
        clear_dd_indices(dd, dd->ncg_home, dd->nat_home);

        /* Avoid global communication for dim's without pbc and -gcom */
        if (!bNStGlobalComm)
        {
            copy_rvec(comm->box0, ddbox.box0    );
            copy_rvec(comm->box_size, ddbox.box_size);
        }
        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
                  bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);

        bBoxChanged = TRUE;
        bRedist     = TRUE;
    }
    /* For dim's without pbc and -gcom */
    copy_rvec(ddbox.box0, comm->box0    );
    copy_rvec(ddbox.box_size, comm->box_size);

    set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
                      step, wcycle);

    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
    {
        write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
    }

    /* Check if we should sort the charge groups */
    if (comm->nstSortCG > 0)
    {
        bSortCG = (bMasterState ||
                   (bRedist && (step % comm->nstSortCG == 0)));
    }
    else
    {
        bSortCG = FALSE;
    }

    /* Store the home cg count from before the redistribution;
     * it is passed to dd_sort_state below unless everything is resorted.
     */
    ncg_home_old = dd->ncg_home;

    ncg_moved = 0;
    if (bRedist)
    {
        wallcycle_sub_start(wcycle, ewcsDD_REDIST);

        dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
                           state_local, f, fr, mdatoms,
                           !bSortCG, nrnb, &cg0, &ncg_moved);

        wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
    }

    get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
                          dd, &ddbox,
                          &comm->cell_x0, &comm->cell_x1,
                          dd->ncg_home, fr->cg_cm,
                          cell_ns_x0, cell_ns_x1, &grid_density);

    if (bBoxChanged)
    {
        comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
    }

    /* Store the old grid dimensions, to compare with the new ones
     * when sorting. With the Verlet scheme only the XX and YY entries
     * are filled in; ZZ keeps its initial value of 0.
     */
    switch (fr->cutoff_scheme)
    {
        case ecutsGROUP:
            copy_ivec(fr->ns.grid->n, ncells_old);
            grid_first(fplog, fr->ns.grid, dd, &ddbox, fr->ePBC,
                       state_local->box, cell_ns_x0, cell_ns_x1,
                       fr->rlistlong, grid_density);
            break;
        case ecutsVERLET:
            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
            break;
        default:
            gmx_incons("unimplemented");
    }
    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
    copy_ivec(ddbox.tric_dir, comm->tric_dir);

    if (bSortCG)
    {
        wallcycle_sub_start(wcycle, ewcsDD_GRID);

        /* Sort the state on charge group position.
         * This enables exact restarts from this step.
         * It also improves performance by about 15% with larger numbers
         * of atoms per node.
         */

        /* Fill the ns grid with the home cell,
         * so we can sort with the indices.
         */
        set_zones_ncg_home(dd);

        switch (fr->cutoff_scheme)
        {
            case ecutsVERLET:
                set_zones_size(dd, state_local->box, &ddbox, 0, 1);

                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
                                  0,
                                  comm->zones.size[0].bb_x0,
                                  comm->zones.size[0].bb_x1,
                                  0, dd->ncg_home,
                                  comm->zones.dens_zone0,
                                  fr->cginfo,
                                  state_local->x,
                                  ncg_moved, bRedist ? comm->moved : NULL,
                                  fr->nbv->grp[eintLocal].kernel_type,
                                  fr->nbv->grp[eintLocal].nbat);

                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
                break;
            case ecutsGROUP:
                fill_grid(fplog, &comm->zones, fr->ns.grid, dd->ncg_home,
                          0, dd->ncg_home, fr->cg_cm);

                copy_ivec(fr->ns.grid->n, ncells_new);
                break;
            default:
                gmx_incons("unimplemented");
        }

        bResortAll = bMasterState;

        /* Check if we can use the old order and ns grid cell indices
         * of the charge groups to sort the charge groups efficiently.
         */
        if (ncells_new[XX] != ncells_old[XX] ||
            ncells_new[YY] != ncells_old[YY] ||
            ncells_new[ZZ] != ncells_old[ZZ])
        {
            bResortAll = TRUE;
        }

        if (debug)
        {
            fprintf(debug, "Step %s, sorting the %d home charge groups\n",
                    gmx_step_str(step, sbuf), dd->ncg_home);
        }
        /* An old home cg count of -1 is passed when everything is resorted */
        dd_sort_state(dd, ir->ePBC, fr->cg_cm, fr, state_local,
                      bResortAll ? -1 : ncg_home_old);
        /* Rebuild all the indices */
        cg0 = 0;
        ga2la_clear(dd->ga2la);

        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
    }

    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);

    /* Set up the communication and communicate the coordinates */
    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);

    /* Set the indices */
    make_dd_indices(dd, cgs_gl->index, cg0);

    /* Set the charge group boundaries for neighbor searching */
    set_cg_boundaries(&comm->zones);

    if (fr->cutoff_scheme == ecutsVERLET)
    {
        /* When sorting, the size of zone 0 was already set above,
         * so start from zone 1 in that case.
         */
        set_zones_size(dd, state_local->box, &ddbox,
                       bSortCG ? 1 : 0, comm->zones.n);
    }

    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);

    /*
       write_dd_pdb("dd_home",step,"dump",top_global,cr,
                 -1,state_local->x,state_local->box);
     */

    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);

    /* Extract a local topology from the global topology */
    for (i = 0; i < dd->ndim; i++)
    {
        np[dd->dim[i]] = comm->cd[i].np;
    }
    dd_make_local_top(fplog, dd, &comm->zones, dd->npbcdim, state_local->box,
                      comm->cellsize_min, np,
                      fr,
                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
                      vsite, top_global, top_local);

    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);

    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);

    /* Set up the special atom communication.
     * n is the running atom count: when a type needs no special setup,
     * its comm->nat entry equals that of the previous type.
     */
    n = comm->nat[ddnatZONE];
    for (i = ddnatZONE+1; i < ddnatNR; i++)
    {
        switch (i)
        {
            case ddnatVSITE:
                if (vsite && vsite->n_intercg_vsite)
                {
                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
                }
                break;
            case ddnatCON:
                if (dd->bInterCGcons || dd->bInterCGsettles)
                {
                    /* Only for inter-cg constraints we need special code */
                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
                                                  constr, ir->nProjOrder,
                                                  top_local->idef.il);
                }
                break;
            default:
                gmx_incons("Unknown special atom type setup");
        }
        comm->nat[i] = n;
    }

    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);

    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);

    /* Make space for the extra coordinates for virtual site
     * or constraint communication.
     */
    state_local->natoms = comm->nat[ddnatNR-1];
    if (state_local->natoms > state_local->nalloc)
    {
        dd_realloc_state(state_local, f, state_local->natoms);
    }

    /* Determine the atom count passed to forcerec_set_ranges below
     * as nat_f_novirsum; it is 0 when fr->bF_NoVirSum is not set.
     */
    if (fr->bF_NoVirSum)
    {
        if (vsite && vsite->n_intercg_vsite)
        {
            nat_f_novirsum = comm->nat[ddnatVSITE];
        }
        else
        {
            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
            {
                nat_f_novirsum = dd->nat_tot;
            }
            else
            {
                nat_f_novirsum = dd->nat_home;
            }
        }
    }
    else
    {
        nat_f_novirsum = 0;
    }

    /* Set the number of atoms required for the force calculation.
     * Forces need to be constrained when using a twin-range setup
     * or with energy minimization. For simple simulations we could
     * avoid some allocation, zeroing and copying, but this is
     * probably not worth the complications and checking.
     */
    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);

    /* We make the all mdatoms up to nat_tot_con.
     * We could save some work by only setting invmass
     * between nat_tot and nat_tot_con.
     */
    /* This call also sets the new number of home particles to dd->nat_home */
    atoms2md(top_global, ir,
             comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms);

    /* Now we have the charges we can sort the FE interactions */
    dd_sort_local_top(dd, mdatoms, top_local);

    if (vsite != NULL)
    {
        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
        split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
    }

    if (shellfc)
    {
        /* Make the local shell stuff, currently no communication is done */
        make_local_shells(cr, mdatoms, shellfc);
    }

    if (ir->implicit_solvent)
    {
        make_local_gb(cr, fr->born, ir->gb_algorithm);
    }

    init_bonded_thread_force_reduction(fr, &top_local->idef);

    if (!(cr->duty & DUTY_PME))
    {
        /* Send the charges to our PME only node */
        gmx_pme_send_q(cr, mdatoms->nChargePerturbed,
                       mdatoms->chargeA, mdatoms->chargeB,
                       dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
    }

    if (constr)
    {
        set_constraints(constr, top_local, ir, mdatoms, cr);
    }

    if (ir->ePull != epullNO)
    {
        /* Update the local pull groups */
        dd_make_local_pull_groups(dd, ir->pull, mdatoms);
    }

    if (ir->bRot)
    {
        /* Update the local rotation groups */
        dd_make_local_rotation_groups(dd, ir->rot);
    }


    add_dd_statistics(dd);

    /* Make sure we only count the cycles for this DD partitioning */
    clear_dd_cycle_counts(dd);

    /* Because the order of the atoms might have changed since
     * the last vsite construction, we need to communicate the constructing
     * atom coordinates again (for spreading the forces this MD step).
     */
    dd_move_x_vsites(dd, state_local->box, state_local->x);

    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);

    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
    {
        dd_move_x(dd, state_local->box, state_local->x);
        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
                     -1, state_local->x, state_local->box);
    }

    /* Store the partitioning step */
    comm->partition_step = step;

    /* Increase the DD partitioning counter */
    dd->ddp_count++;
    /* The state currently matches this DD partitioning count, store it */
    state_local->ddp_count = dd->ddp_count;
    if (bMasterState)
    {
        /* The DD master node knows the complete cg distribution,
         * store the count so we can possibly skip the cg info communication.
         * When the charge groups were sorted, 0 is stored instead.
         */
        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
    }

    if (comm->DD_debug > 0)
    {
        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
                                "after partitioning");
    }
}