src/gromacs/mdlib/domdec.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  * This file is part of Gromacs        Copyright (c) 1991-2008
   5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation; either version 2
  10  * of the License, or (at your option) any later version.
  11  *
  12  * To help us fund GROMACS development, we humbly ask that you cite
  13  * the research papers on the package. Check out http://www.gromacs.org
  14  *
  15  * And Hey:
  16  * Gnomes, ROck Monsters And Chili Sauce
  17  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 #include <config.h>
  21 #endif
  22
  23 #include <stdio.h>
  24 #include <time.h>
  25 #include <math.h>
  26 #include <string.h>
  27 #include <stdlib.h>
  28 #include "typedefs.h"
  29 #include "smalloc.h"
  30 #include "gmx_fatal.h"
  31 #include "gmx_fatal_collective.h"
  32 #include "vec.h"
  33 #include "domdec.h"
  34 #include "domdec_network.h"
  35 #include "nrnb.h"
  36 #include "pbc.h"
  37 #include "chargegroup.h"
  38 #include "constr.h"
  39 #include "mdatoms.h"
  40 #include "names.h"
  41 #include "pdbio.h"
  42 #include "futil.h"
  43 #include "force.h"
  44 #include "pme.h"
  45 #include "pull.h"
  46 #include "pull_rotation.h"
  47 #include "gmx_wallcycle.h"
  48 #include "mdrun.h"
  49 #include "nsgrid.h"
  50 #include "shellfc.h"
  51 #include "mtop_util.h"
  52 #include "gmxfio.h"
  53 #include "gmx_ga2la.h"
  54 #include "gmx_sort.h"
  55 #include "macros.h"
  56 #include "nbnxn_search.h"
  57 #include "bondf.h"
  58 #include "gmx_omp_nthreads.h"
  59
  60 #include "gromacs/utility/gmxmpi.h"
  61
  62 #define DDRANK(dd, rank)    (rank)
  63 #define DDMASTERRANK(dd)   (dd->masterrank)
  64
  65 typedef struct gmx_domdec_master
  66 {
  67     /* The cell boundaries */
  68     real **cell_x;
  69     /* The global charge group division */
  70     int   *ncg;    /* Number of home charge groups for each node */
  71     int   *index;  /* Index of nnodes+1 into cg */
  72     int   *cg;     /* Global charge group index */
  73     int   *nat;    /* Number of home atoms for each node. */
  74     int   *ibuf;   /* Buffer for communication */
  75     rvec  *vbuf;   /* Buffer for state scattering and gathering */
  76 } gmx_domdec_master_t;
  77
  78 typedef struct
  79 {
  80     /* The numbers of charge groups to send and receive for each cell
  81      * that requires communication, the last entry contains the total
  82      * number of atoms that needs to be communicated.
  83      */
  84     int  nsend[DD_MAXIZONE+2];
  85     int  nrecv[DD_MAXIZONE+2];
  86     /* The charge groups to send */
  87     int *index;
  88     int  nalloc;
  89     /* The atom range for non-in-place communication */
  90     int  cell2at0[DD_MAXIZONE];
  91     int  cell2at1[DD_MAXIZONE];
  92 } gmx_domdec_ind_t;
  93
  94 typedef struct
  95 {
  96     int               np;       /* Number of grid pulses in this dimension */
  97     int               np_dlb;   /* For dlb, for use with edlbAUTO          */
  98     gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
  99     int               np_nalloc;
 100     gmx_bool          bInPlace; /* Can we communicate in place?            */
 101 } gmx_domdec_comm_dim_t;
 102
 103 typedef struct
 104 {
 105     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 106     real     *cell_f;      /* State var.: cell boundaries, box relative      */
 107     real     *old_cell_f;  /* Temp. var.: old cell size                      */
 108     real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 109     real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 110     real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 111     real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 112     gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 113     real     *buf_ncd;     /* Temp. var.                                     */
 114 } gmx_domdec_root_t;
 115
 116 #define DD_NLOAD_MAX 9
 117
 118 /* Here floats are accurate enough, since these variables
 119  * only influence the load balancing, not the actual MD results.
 120  */
 121 typedef struct
 122 {
 123     int    nload;
 124     float *load;
 125     float  sum;
 126     float  max;
 127     float  sum_m;
 128     float  cvol_min;
 129     float  mdf;
 130     float  pme;
 131     int    flags;
 132 } gmx_domdec_load_t;
 133
 134 typedef struct
 135 {
 136     int  nsc;
 137     int  ind_gl;
 138     int  ind;
 139 } gmx_cgsort_t;
 140
 141 typedef struct
 142 {
 143     gmx_cgsort_t *sort;
 144     gmx_cgsort_t *sort2;
 145     int           sort_nalloc;
 146     gmx_cgsort_t *sort_new;
 147     int           sort_new_nalloc;
 148     int          *ibuf;
 149     int           ibuf_nalloc;
 150 } gmx_domdec_sort_t;
 151
 152 typedef struct
 153 {
 154     rvec *v;
 155     int   nalloc;
 156 } vec_rvec_t;
 157
 158 /* This enum determines the order of the coordinates.
 159  * ddnatHOME and ddnatZONE should be first and second,
 160  * the others can be ordered as wanted.
 161  */
 162 enum {
 163     ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 164 };
 165
 166 enum {
 167     edlbAUTO, edlbNO, edlbYES, edlbNR
 168 };
 169 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 170
 171 typedef struct
 172 {
 173     int      dim;       /* The dimension                                          */
 174     gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 175     int      nslab;     /* The number of PME slabs in this dimension              */
 176     real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 177     int     *pp_min;    /* The minimum pp node location, size nslab               */
 178     int     *pp_max;    /* The maximum pp node location,size nslab                */
 179     int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 180 } gmx_ddpme_t;
 181
 182 typedef struct
 183 {
 184     real min0;    /* The minimum bottom of this zone                        */
 185     real max1;    /* The maximum top of this zone                           */
 186     real min1;    /* The minimum top of this zone                           */
 187     real mch0;    /* The maximum bottom communicaton height for this zone   */
 188     real mch1;    /* The maximum top communicaton height for this zone      */
 189     real p1_0;    /* The bottom value of the first cell in this zone        */
 190     real p1_1;    /* The top value of the first cell in this zone           */
 191 } gmx_ddzone_t;
 192
 193 typedef struct
 194 {
 195     gmx_domdec_ind_t ind;
 196     int             *ibuf;
 197     int              ibuf_nalloc;
 198     vec_rvec_t       vbuf;
 199     int              nsend;
 200     int              nat;
 201     int              nsend_zone;
 202 } dd_comm_setup_work_t;
 203
 204 typedef struct gmx_domdec_comm
 205 {
 206     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 207      * unless stated otherwise.
 208      */
 209
 210     /* The number of decomposition dimensions for PME, 0: no PME */
 211     int         npmedecompdim;
 212     /* The number of nodes doing PME (PP/PME or only PME) */
 213     int         npmenodes;
 214     int         npmenodes_x;
 215     int         npmenodes_y;
 216     /* The communication setup including the PME only nodes */
 217     gmx_bool    bCartesianPP_PME;
 218     ivec        ntot;
 219     int         cartpmedim;
 220     int        *pmenodes;          /* size npmenodes                         */
 221     int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 222                                     * but with bCartesianPP_PME              */
 223     gmx_ddpme_t ddpme[2];
 224
 225     /* The DD particle-particle nodes only */
 226     gmx_bool bCartesianPP;
 227     int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 228
 229     /* The global charge groups */
 230     t_block cgs_gl;
 231
 232     /* Should we sort the cgs */
 233     int                nstSortCG;
 234     gmx_domdec_sort_t *sort;
 235
 236     /* Are there charge groups? */
 237     gmx_bool bCGs;
 238
 239     /* Are there bonded and multi-body interactions between charge groups? */
 240     gmx_bool bInterCGBondeds;
 241     gmx_bool bInterCGMultiBody;
 242
 243     /* Data for the optional bonded interaction atom communication range */
 244     gmx_bool  bBondComm;
 245     t_blocka *cglink;
 246     char     *bLocalCG;
 247
 248     /* The DLB option */
 249     int      eDLB;
 250     /* Are we actually using DLB? */
 251     gmx_bool bDynLoadBal;
 252
 253     /* Cell sizes for static load balancing, first index cartesian */
 254     real **slb_frac;
 255
 256     /* The width of the communicated boundaries */
 257     real     cutoff_mbody;
 258     real     cutoff;
 259     /* The minimum cell size (including triclinic correction) */
 260     rvec     cellsize_min;
 261     /* For dlb, for use with edlbAUTO */
 262     rvec     cellsize_min_dlb;
 263     /* The lower limit for the DD cell size with DLB */
 264     real     cellsize_limit;
 265     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 266     gmx_bool bVacDLBNoLimit;
 267
 268     /* With PME load balancing we set limits on DLB */
 269     gmx_bool bPMELoadBalDLBLimits;
 270     /* DLB needs to take into account that we want to allow this maximum
 271      * cut-off (for PME load balancing), this could limit cell boundaries.
 272      */
 273     real PMELoadBal_max_cutoff;
 274
 275     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 276     ivec tric_dir;
 277     /* box0 and box_size are required with dim's without pbc and -gcom */
 278     rvec box0;
 279     rvec box_size;
 280
 281     /* The cell boundaries */
 282     rvec cell_x0;
 283     rvec cell_x1;
 284
 285     /* The old location of the cell boundaries, to check cg displacements */
 286     rvec old_cell_x0;
 287     rvec old_cell_x1;
 288
 289     /* The communication setup and charge group boundaries for the zones */
 290     gmx_domdec_zones_t zones;
 291
 292     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 293      * cell boundaries of neighboring cells for dynamic load balancing.
 294      */
 295     gmx_ddzone_t zone_d1[2];
 296     gmx_ddzone_t zone_d2[2][2];
 297
 298     /* The coordinate/force communication setup and indices */
 299     gmx_domdec_comm_dim_t cd[DIM];
 300     /* The maximum number of cells to communicate with in one dimension */
 301     int                   maxpulse;
 302
 303     /* Which cg distribution is stored on the master node */
 304     int master_cg_ddp_count;
 305
 306     /* The number of cg's received from the direct neighbors */
 307     int  zone_ncg1[DD_MAXZONE];
 308
 309     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 310     int  nat[ddnatNR];
 311
 312     /* Array for signalling if atoms have moved to another domain */
 313     int  *moved;
 314     int   moved_nalloc;
 315
 316     /* Communication buffer for general use */
 317     int  *buf_int;
 318     int   nalloc_int;
 319
 320     /* Communication buffer for general use */
 321     vec_rvec_t vbuf;
 322
 323     /* Temporary storage for thread parallel communication setup */
 324     int                   nth;
 325     dd_comm_setup_work_t *dth;
 326
 327     /* Communication buffers only used with multiple grid pulses */
 328     int       *buf_int2;
 329     int        nalloc_int2;
 330     vec_rvec_t vbuf2;
 331
 332     /* Communication buffers for local redistribution */
 333     int  **cggl_flag;
 334     int    cggl_flag_nalloc[DIM*2];
 335     rvec **cgcm_state;
 336     int    cgcm_state_nalloc[DIM*2];
 337
 338     /* Cell sizes for dynamic load balancing */
 339     gmx_domdec_root_t **root;
 340     real               *cell_f_row;
 341     real                cell_f0[DIM];
 342     real                cell_f1[DIM];
 343     real                cell_f_max0[DIM];
 344     real                cell_f_min1[DIM];
 345
 346     /* Stuff for load communication */
 347     gmx_bool           bRecordLoad;
 348     gmx_domdec_load_t *load;
 349 #ifdef GMX_MPI
 350     MPI_Comm          *mpi_comm_load;
 351 #endif
 352
 353     /* Maximum DLB scaling per load balancing step in percent */
 354     int dlb_scale_lim;
 355
 356     /* Cycle counters */
 357     float  cycl[ddCyclNr];
 358     int    cycl_n[ddCyclNr];
 359     float  cycl_max[ddCyclNr];
 360     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 361     int    eFlop;
 362     double flop;
 363     int    flop_n;
 364     /* Have often have did we have load measurements */
 365     int    n_load_have;
 366     /* Have often have we collected the load measurements */
 367     int    n_load_collect;
 368
 369     /* Statistics */
 370     double sum_nat[ddnatNR-ddnatZONE];
 371     int    ndecomp;
 372     int    nload;
 373     double load_step;
 374     double load_sum;
 375     double load_max;
 376     ivec   load_lim;
 377     double load_mdf;
 378     double load_pme;
 379
 380     /* The last partition step */
 381     gmx_large_int_t partition_step;
 382
 383     /* Debugging */
 384     int  nstDDDump;
 385     int  nstDDDumpGrid;
 386     int  DD_debug;
 387 } gmx_domdec_comm_t;
 388
 389 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 390 #define DD_CGIBS 2
 391
 392 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 393 #define DD_FLAG_NRCG  65535
 394 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 395 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 396
 397 /* Zone permutation required to obtain consecutive charge groups
 398  * for neighbor searching.
 399  */
 400 static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 401
 402 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 403  * components see only j zones with that component 0.
 404  */
 405
 406 /* The DD zone order */
 407 static const ivec dd_zo[DD_MAXZONE] =
 408 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 409
 410 /* The 3D setup */
 411 #define dd_z3n  8
 412 #define dd_zp3n 4
 413 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 414
 415 /* The 2D setup */
 416 #define dd_z2n  4
 417 #define dd_zp2n 2
 418 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 419
 420 /* The 1D setup */
 421 #define dd_z1n  2
 422 #define dd_zp1n 1
 423 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 424
 425 /* Factors used to avoid problems due to rounding issues */
 426 #define DD_CELL_MARGIN       1.0001
 427 #define DD_CELL_MARGIN2      1.00005
 428 /* Factor to account for pressure scaling during nstlist steps */
 429 #define DD_PRES_SCALE_MARGIN 1.02
 430
 431 /* Allowed performance loss before we DLB or warn */
 432 #define DD_PERF_LOSS 0.05
 433
 434 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 435
 436 /* Use separate MPI send and receive commands
 437  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 438  * This saves memory (and some copying for small nnodes).
 439  * For high parallelization scatter and gather calls are used.
 440  */
 441 #define GMX_DD_NNODES_SENDRECV 4
 442
 443
 444 /*
 445    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 446
 447    static void index2xyz(ivec nc,int ind,ivec xyz)
 448    {
 449    xyz[XX] = ind % nc[XX];
 450    xyz[YY] = (ind / nc[XX]) % nc[YY];
 451    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 452    }
 453  */
 454
 455 /* This order is required to minimize the coordinate communication in PME
 456  * which uses decomposition in the x direction.
 457  */
 458 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 459
 460 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 461 {
 462     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 463     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 464     xyz[ZZ] = ind % nc[ZZ];
 465 }
 466
 467 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 468 {
 469     int ddindex;
 470     int ddnodeid = -1;
 471
 472     ddindex = dd_index(dd->nc, c);
 473     if (dd->comm->bCartesianPP_PME)
 474     {
 475         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 476     }
 477     else if (dd->comm->bCartesianPP)
 478     {
 479 #ifdef GMX_MPI
 480         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 481 #endif
 482     }
 483     else
 484     {
 485         ddnodeid = ddindex;
 486     }
 487
 488     return ddnodeid;
 489 }
 490
 491 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 492 {
 493     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 494 }
 495
 496 int ddglatnr(gmx_domdec_t *dd, int i)
 497 {
 498     int atnr;
 499
 500     if (dd == NULL)
 501     {
 502         atnr = i + 1;
 503     }
 504     else
 505     {
 506         if (i >= dd->comm->nat[ddnatNR-1])
 507         {
 508             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 509         }
 510         atnr = dd->gatindex[i] + 1;
 511     }
 512
 513     return atnr;
 514 }
 515
 516 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 517 {
 518     return &dd->comm->cgs_gl;
 519 }
 520
 521 static void vec_rvec_init(vec_rvec_t *v)
 522 {
 523     v->nalloc = 0;
 524     v->v      = NULL;
 525 }
 526
 527 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 528 {
 529     if (n > v->nalloc)
 530     {
 531         v->nalloc = over_alloc_dd(n);
 532         srenew(v->v, v->nalloc);
 533     }
 534 }
 535
 536 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 537 {
 538     int i;
 539
 540     if (state->ddp_count != dd->ddp_count)
 541     {
 542         gmx_incons("The state does not the domain decomposition state");
 543     }
 544
 545     state->ncg_gl = dd->ncg_home;
 546     if (state->ncg_gl > state->cg_gl_nalloc)
 547     {
 548         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 549         srenew(state->cg_gl, state->cg_gl_nalloc);
 550     }
 551     for (i = 0; i < state->ncg_gl; i++)
 552     {
 553         state->cg_gl[i] = dd->index_gl[i];
 554     }
 555
 556     state->ddp_count_cg_gl = dd->ddp_count;
 557 }
 558
 559 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 560 {
 561     return &dd->comm->zones;
 562 }
 563
 564 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 565                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 566 {
 567     gmx_domdec_zones_t *zones;
 568     int                 izone, d, dim;
 569
 570     zones = &dd->comm->zones;
 571
 572     izone = 0;
 573     while (icg >= zones->izone[izone].cg1)
 574     {
 575         izone++;
 576     }
 577
 578     if (izone == 0)
 579     {
 580         *jcg0 = icg;
 581     }
 582     else if (izone < zones->nizone)
 583     {
 584         *jcg0 = zones->izone[izone].jcg0;
 585     }
 586     else
 587     {
 588         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 589                   icg, izone, zones->nizone);
 590     }
 591
 592     *jcg1 = zones->izone[izone].jcg1;
 593
 594     for (d = 0; d < dd->ndim; d++)
 595     {
 596         dim         = dd->dim[d];
 597         shift0[dim] = zones->izone[izone].shift0[dim];
 598         shift1[dim] = zones->izone[izone].shift1[dim];
 599         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 600         {
 601             /* A conservative approach, this can be optimized */
 602             shift0[dim] -= 1;
 603             shift1[dim] += 1;
 604         }
 605     }
 606 }
 607
 608 int dd_natoms_vsite(gmx_domdec_t *dd)
 609 {
 610     return dd->comm->nat[ddnatVSITE];
 611 }
 612
 613 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 614 {
 615     *at_start = dd->comm->nat[ddnatCON-1];
 616     *at_end   = dd->comm->nat[ddnatCON];
 617 }
 618
 619 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 620 {
 621     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 622     int                   *index, *cgindex;
 623     gmx_domdec_comm_t     *comm;
 624     gmx_domdec_comm_dim_t *cd;
 625     gmx_domdec_ind_t      *ind;
 626     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 627     gmx_bool               bPBC, bScrew;
 628
 629     comm = dd->comm;
 630
 631     cgindex = dd->cgindex;
 632
 633     buf = comm->vbuf.v;
 634
 635     nzone   = 1;
 636     nat_tot = dd->nat_home;
 637     for (d = 0; d < dd->ndim; d++)
 638     {
 639         bPBC   = (dd->ci[dd->dim[d]] == 0);
 640         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 641         if (bPBC)
 642         {
 643             copy_rvec(box[dd->dim[d]], shift);
 644         }
 645         cd = &comm->cd[d];
 646         for (p = 0; p < cd->np; p++)
 647         {
 648             ind   = &cd->ind[p];
 649             index = ind->index;
 650             n     = 0;
 651             if (!bPBC)
 652             {
 653                 for (i = 0; i < ind->nsend[nzone]; i++)
 654                 {
 655                     at0 = cgindex[index[i]];
 656                     at1 = cgindex[index[i]+1];
 657                     for (j = at0; j < at1; j++)
 658                     {
 659                         copy_rvec(x[j], buf[n]);
 660                         n++;
 661                     }
 662                 }
 663             }
 664             else if (!bScrew)
 665             {
 666                 for (i = 0; i < ind->nsend[nzone]; i++)
 667                 {
 668                     at0 = cgindex[index[i]];
 669                     at1 = cgindex[index[i]+1];
 670                     for (j = at0; j < at1; j++)
 671                     {
 672                         /* We need to shift the coordinates */
 673                         rvec_add(x[j], shift, buf[n]);
 674                         n++;
 675                     }
 676                 }
 677             }
 678             else
 679             {
 680                 for (i = 0; i < ind->nsend[nzone]; i++)
 681                 {
 682                     at0 = cgindex[index[i]];
 683                     at1 = cgindex[index[i]+1];
 684                     for (j = at0; j < at1; j++)
 685                     {
 686                         /* Shift x */
 687                         buf[n][XX] = x[j][XX] + shift[XX];
 688                         /* Rotate y and z.
 689                          * This operation requires a special shift force
 690                          * treatment, which is performed in calc_vir.
 691                          */
 692                         buf[n][YY] = box[YY][YY] - x[j][YY];
 693                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 694                         n++;
 695                     }
 696                 }
 697             }
 698
 699             if (cd->bInPlace)
 700             {
 701                 rbuf = x + nat_tot;
 702             }
 703             else
 704             {
 705                 rbuf = comm->vbuf2.v;
 706             }
 707             /* Send and receive the coordinates */
 708             dd_sendrecv_rvec(dd, d, dddirBackward,
 709                              buf,  ind->nsend[nzone+1],
 710                              rbuf, ind->nrecv[nzone+1]);
 711             if (!cd->bInPlace)
 712             {
 713                 j = 0;
 714                 for (zone = 0; zone < nzone; zone++)
 715                 {
 716                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 717                     {
 718                         copy_rvec(rbuf[j], x[i]);
 719                         j++;
 720                     }
 721                 }
 722             }
 723             nat_tot += ind->nrecv[nzone+1];
 724         }
 725         nzone += nzone;
 726     }
 727 }
 728
 729 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 730 {
 731     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 732     int                   *index, *cgindex;
 733     gmx_domdec_comm_t     *comm;
 734     gmx_domdec_comm_dim_t *cd;
 735     gmx_domdec_ind_t      *ind;
 736     rvec                  *buf, *sbuf;
 737     ivec                   vis;
 738     int                    is;
 739     gmx_bool               bPBC, bScrew;
 740
 741     comm = dd->comm;
 742
 743     cgindex = dd->cgindex;
 744
 745     buf = comm->vbuf.v;
 746
 747     n       = 0;
 748     nzone   = comm->zones.n/2;
 749     nat_tot = dd->nat_tot;
 750     for (d = dd->ndim-1; d >= 0; d--)
 751     {
 752         bPBC   = (dd->ci[dd->dim[d]] == 0);
 753         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 754         if (fshift == NULL && !bScrew)
 755         {
 756             bPBC = FALSE;
 757         }
 758         /* Determine which shift vector we need */
 759         clear_ivec(vis);
 760         vis[dd->dim[d]] = 1;
 761         is              = IVEC2IS(vis);
 762
 763         cd = &comm->cd[d];
 764         for (p = cd->np-1; p >= 0; p--)
 765         {
 766             ind      = &cd->ind[p];
 767             nat_tot -= ind->nrecv[nzone+1];
 768             if (cd->bInPlace)
 769             {
 770                 sbuf = f + nat_tot;
 771             }
 772             else
 773             {
 774                 sbuf = comm->vbuf2.v;
 775                 j    = 0;
 776                 for (zone = 0; zone < nzone; zone++)
 777                 {
 778                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 779                     {
 780                         copy_rvec(f[i], sbuf[j]);
 781                         j++;
 782                     }
 783                 }
 784             }
 785             /* Communicate the forces */
 786             dd_sendrecv_rvec(dd, d, dddirForward,
 787                              sbuf, ind->nrecv[nzone+1],
 788                              buf,  ind->nsend[nzone+1]);
 789             index = ind->index;
 790             /* Add the received forces */
 791             n = 0;
 792             if (!bPBC)
 793             {
 794                 for (i = 0; i < ind->nsend[nzone]; i++)
 795                 {
 796                     at0 = cgindex[index[i]];
 797                     at1 = cgindex[index[i]+1];
 798                     for (j = at0; j < at1; j++)
 799                     {
 800                         rvec_inc(f[j], buf[n]);
 801                         n++;
 802                     }
 803                 }
 804             }
 805             else if (!bScrew)
 806             {
 807                 for (i = 0; i < ind->nsend[nzone]; i++)
 808                 {
 809                     at0 = cgindex[index[i]];
 810                     at1 = cgindex[index[i]+1];
 811                     for (j = at0; j < at1; j++)
 812                     {
 813                         rvec_inc(f[j], buf[n]);
 814                         /* Add this force to the shift force */
 815                         rvec_inc(fshift[is], buf[n]);
 816                         n++;
 817                     }
 818                 }
 819             }
 820             else
 821             {
 822                 for (i = 0; i < ind->nsend[nzone]; i++)
 823                 {
 824                     at0 = cgindex[index[i]];
 825                     at1 = cgindex[index[i]+1];
 826                     for (j = at0; j < at1; j++)
 827                     {
 828                         /* Rotate the force */
 829                         f[j][XX] += buf[n][XX];
 830                         f[j][YY] -= buf[n][YY];
 831                         f[j][ZZ] -= buf[n][ZZ];
 832                         if (fshift)
 833                         {
 834                             /* Add this force to the shift force */
 835                             rvec_inc(fshift[is], buf[n]);
 836                         }
 837                         n++;
 838                     }
 839                 }
 840             }
 841         }
 842         nzone /= 2;
 843     }
 844 }
 845
 846 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 847 {
 848     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 849     int                   *index, *cgindex;
 850     gmx_domdec_comm_t     *comm;
 851     gmx_domdec_comm_dim_t *cd;
 852     gmx_domdec_ind_t      *ind;
 853     real                  *buf, *rbuf;
 854
 855     comm = dd->comm;
 856
 857     cgindex = dd->cgindex;
 858
 859     buf = &comm->vbuf.v[0][0];
 860
 861     nzone   = 1;
 862     nat_tot = dd->nat_home;
 863     for (d = 0; d < dd->ndim; d++)
 864     {
 865         cd = &comm->cd[d];
 866         for (p = 0; p < cd->np; p++)
 867         {
 868             ind   = &cd->ind[p];
 869             index = ind->index;
 870             n     = 0;
 871             for (i = 0; i < ind->nsend[nzone]; i++)
 872             {
 873                 at0 = cgindex[index[i]];
 874                 at1 = cgindex[index[i]+1];
 875                 for (j = at0; j < at1; j++)
 876                 {
 877                     buf[n] = v[j];
 878                     n++;
 879                 }
 880             }
 881
 882             if (cd->bInPlace)
 883             {
 884                 rbuf = v + nat_tot;
 885             }
 886             else
 887             {
 888                 rbuf = &comm->vbuf2.v[0][0];
 889             }
 890             /* Send and receive the coordinates */
 891             dd_sendrecv_real(dd, d, dddirBackward,
 892                              buf,  ind->nsend[nzone+1],
 893                              rbuf, ind->nrecv[nzone+1]);
 894             if (!cd->bInPlace)
 895             {
 896                 j = 0;
 897                 for (zone = 0; zone < nzone; zone++)
 898                 {
 899                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 900                     {
 901                         v[i] = rbuf[j];
 902                         j++;
 903                     }
 904                 }
 905             }
 906             nat_tot += ind->nrecv[nzone+1];
 907         }
 908         nzone += nzone;
 909     }
 910 }
 911
 912 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 913 {
 914     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 915     int                   *index, *cgindex;
 916     gmx_domdec_comm_t     *comm;
 917     gmx_domdec_comm_dim_t *cd;
 918     gmx_domdec_ind_t      *ind;
 919     real                  *buf, *sbuf;
 920
 921     comm = dd->comm;
 922
 923     cgindex = dd->cgindex;
 924
 925     buf = &comm->vbuf.v[0][0];
 926
 927     n       = 0;
 928     nzone   = comm->zones.n/2;
 929     nat_tot = dd->nat_tot;
 930     for (d = dd->ndim-1; d >= 0; d--)
 931     {
 932         cd = &comm->cd[d];
 933         for (p = cd->np-1; p >= 0; p--)
 934         {
 935             ind      = &cd->ind[p];
 936             nat_tot -= ind->nrecv[nzone+1];
 937             if (cd->bInPlace)
 938             {
 939                 sbuf = v + nat_tot;
 940             }
 941             else
 942             {
 943                 sbuf = &comm->vbuf2.v[0][0];
 944                 j    = 0;
 945                 for (zone = 0; zone < nzone; zone++)
 946                 {
 947                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 948                     {
 949                         sbuf[j] = v[i];
 950                         j++;
 951                     }
 952                 }
 953             }
 954             /* Communicate the forces */
 955             dd_sendrecv_real(dd, d, dddirForward,
 956                              sbuf, ind->nrecv[nzone+1],
 957                              buf,  ind->nsend[nzone+1]);
 958             index = ind->index;
 959             /* Add the received forces */
 960             n = 0;
 961             for (i = 0; i < ind->nsend[nzone]; i++)
 962             {
 963                 at0 = cgindex[index[i]];
 964                 at1 = cgindex[index[i]+1];
 965                 for (j = at0; j < at1; j++)
 966                 {
 967                     v[j] += buf[n];
 968                     n++;
 969                 }
 970             }
 971         }
 972         nzone /= 2;
 973     }
 974 }
 975
 976 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 977 {
 978     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 979             d, i, j,
 980             zone->min0, zone->max1,
 981             zone->mch0, zone->mch0,
 982             zone->p1_0, zone->p1_1);
 983 }
 984
 985
 986 #define DDZONECOMM_MAXZONE  5
 987 #define DDZONECOMM_BUFSIZE  3
 988
 989 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 990                                int ddimind, int direction,
 991                                gmx_ddzone_t *buf_s, int n_s,
 992                                gmx_ddzone_t *buf_r, int n_r)
 993 {
 994 #define ZBS  DDZONECOMM_BUFSIZE
 995     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
 996     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
 997     int  i;
 998
 999     for (i = 0; i < n_s; i++)
1000     {
1001         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1002         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1003         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1004         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1005         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1006         vbuf_s[i*ZBS+1][2] = 0;
1007         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1008         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1009         vbuf_s[i*ZBS+2][2] = 0;
1010     }
1011
1012     dd_sendrecv_rvec(dd, ddimind, direction,
1013                      vbuf_s, n_s*ZBS,
1014                      vbuf_r, n_r*ZBS);
1015
1016     for (i = 0; i < n_r; i++)
1017     {
1018         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1019         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1020         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1021         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1022         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1023         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1024         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1025     }
1026
1027 #undef ZBS
1028 }
1029
1030 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
1031                           rvec cell_ns_x0, rvec cell_ns_x1)
1032 {
1033     int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
1034     gmx_ddzone_t      *zp;
1035     gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
1036     gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
1037     gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
1038     rvec               extr_s[2], extr_r[2];
1039     rvec               dh;
1040     real               dist_d, c = 0, det;
1041     gmx_domdec_comm_t *comm;
1042     gmx_bool           bPBC, bUse;
1043
1044     comm = dd->comm;
1045
1046     for (d = 1; d < dd->ndim; d++)
1047     {
1048         dim      = dd->dim[d];
1049         zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1050         zp->min0 = cell_ns_x0[dim];
1051         zp->max1 = cell_ns_x1[dim];
1052         zp->min1 = cell_ns_x1[dim];
1053         zp->mch0 = cell_ns_x0[dim];
1054         zp->mch1 = cell_ns_x1[dim];
1055         zp->p1_0 = cell_ns_x0[dim];
1056         zp->p1_1 = cell_ns_x1[dim];
1057     }
1058
1059     for (d = dd->ndim-2; d >= 0; d--)
1060     {
1061         dim  = dd->dim[d];
1062         bPBC = (dim < ddbox->npbcdim);
1063
1064         /* Use an rvec to store two reals */
1065         extr_s[d][0] = comm->cell_f0[d+1];
1066         extr_s[d][1] = comm->cell_f1[d+1];
1067         extr_s[d][2] = comm->cell_f1[d+1];
1068
1069         pos = 0;
1070         /* Store the extremes in the backward sending buffer,
1071          * so the get updated separately from the forward communication.
1072          */
1073         for (d1 = d; d1 < dd->ndim-1; d1++)
1074         {
1075             /* We invert the order to be able to use the same loop for buf_e */
1076             buf_s[pos].min0 = extr_s[d1][1];
1077             buf_s[pos].max1 = extr_s[d1][0];
1078             buf_s[pos].min1 = extr_s[d1][2];
1079             buf_s[pos].mch0 = 0;
1080             buf_s[pos].mch1 = 0;
1081             /* Store the cell corner of the dimension we communicate along */
1082             buf_s[pos].p1_0 = comm->cell_x0[dim];
1083             buf_s[pos].p1_1 = 0;
1084             pos++;
1085         }
1086
1087         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1088         pos++;
1089
1090         if (dd->ndim == 3 && d == 0)
1091         {
1092             buf_s[pos] = comm->zone_d2[0][1];
1093             pos++;
1094             buf_s[pos] = comm->zone_d1[0];
1095             pos++;
1096         }
1097
1098         /* We only need to communicate the extremes
1099          * in the forward direction
1100          */
1101         npulse = comm->cd[d].np;
1102         if (bPBC)
1103         {
1104             /* Take the minimum to avoid double communication */
1105             npulse_min = min(npulse, dd->nc[dim]-1-npulse);
1106         }
1107         else
1108         {
1109             /* Without PBC we should really not communicate over
1110              * the boundaries, but implementing that complicates
1111              * the communication setup and therefore we simply
1112              * do all communication, but ignore some data.
1113              */
1114             npulse_min = npulse;
1115         }
1116         for (p = 0; p < npulse_min; p++)
1117         {
1118             /* Communicate the extremes forward */
1119             bUse = (bPBC || dd->ci[dim] > 0);
1120
1121             dd_sendrecv_rvec(dd, d, dddirForward,
1122                              extr_s+d, dd->ndim-d-1,
1123                              extr_r+d, dd->ndim-d-1);
1124
1125             if (bUse)
1126             {
1127                 for (d1 = d; d1 < dd->ndim-1; d1++)
1128                 {
1129                     extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
1130                     extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
1131                     extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
1132                 }
1133             }
1134         }
1135
1136         buf_size = pos;
1137         for (p = 0; p < npulse; p++)
1138         {
1139             /* Communicate all the zone information backward */
1140             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1141
1142             dd_sendrecv_ddzone(dd, d, dddirBackward,
1143                                buf_s, buf_size,
1144                                buf_r, buf_size);
1145
1146             clear_rvec(dh);
1147             if (p > 0)
1148             {
1149                 for (d1 = d+1; d1 < dd->ndim; d1++)
1150                 {
1151                     /* Determine the decrease of maximum required
1152                      * communication height along d1 due to the distance along d,
1153                      * this avoids a lot of useless atom communication.
1154                      */
1155                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1156
1157                     if (ddbox->tric_dir[dim])
1158                     {
1159                         /* c is the off-diagonal coupling between the cell planes
1160                          * along directions d and d1.
1161                          */
1162                         c = ddbox->v[dim][dd->dim[d1]][dim];
1163                     }
1164                     else
1165                     {
1166                         c = 0;
1167                     }
1168                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1169                     if (det > 0)
1170                     {
1171                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1172                     }
1173                     else
1174                     {
1175                         /* A negative value signals out of range */
1176                         dh[d1] = -1;
1177                     }
1178                 }
1179             }
1180
1181             /* Accumulate the extremes over all pulses */
1182             for (i = 0; i < buf_size; i++)
1183             {
1184                 if (p == 0)
1185                 {
1186                     buf_e[i] = buf_r[i];
1187                 }
1188                 else
1189                 {
1190                     if (bUse)
1191                     {
1192                         buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
1193                         buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
1194                         buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
1195                     }
1196
1197                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1198                     {
1199                         d1 = 1;
1200                     }
1201                     else
1202                     {
1203                         d1 = d + 1;
1204                     }
1205                     if (bUse && dh[d1] >= 0)
1206                     {
1207                         buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
1208                         buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
1209                     }
1210                 }
1211                 /* Copy the received buffer to the send buffer,
1212                  * to pass the data through with the next pulse.
1213                  */
1214                 buf_s[i] = buf_r[i];
1215             }
1216             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1217                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1218             {
1219                 /* Store the extremes */
1220                 pos = 0;
1221
1222                 for (d1 = d; d1 < dd->ndim-1; d1++)
1223                 {
1224                     extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
1225                     extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
1226                     extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
1227                     pos++;
1228                 }
1229
1230                 if (d == 1 || (d == 0 && dd->ndim == 3))
1231                 {
1232                     for (i = d; i < 2; i++)
1233                     {
1234                         comm->zone_d2[1-d][i] = buf_e[pos];
1235                         pos++;
1236                     }
1237                 }
1238                 if (d == 0)
1239                 {
1240                     comm->zone_d1[1] = buf_e[pos];
1241                     pos++;
1242                 }
1243             }
1244         }
1245     }
1246
1247     if (dd->ndim >= 2)
1248     {
1249         dim = dd->dim[1];
1250         for (i = 0; i < 2; i++)
1251         {
1252             if (debug)
1253             {
1254                 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1255             }
1256             cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1257             cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1258         }
1259     }
1260     if (dd->ndim >= 3)
1261     {
1262         dim = dd->dim[2];
1263         for (i = 0; i < 2; i++)
1264         {
1265             for (j = 0; j < 2; j++)
1266             {
1267                 if (debug)
1268                 {
1269                     print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1270                 }
1271                 cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1272                 cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1273             }
1274         }
1275     }
1276     for (d = 1; d < dd->ndim; d++)
1277     {
1278         comm->cell_f_max0[d] = extr_s[d-1][0];
1279         comm->cell_f_min1[d] = extr_s[d-1][1];
1280         if (debug)
1281         {
1282             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1283                     d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1284         }
1285     }
1286 }
1287
1288 static void dd_collect_cg(gmx_domdec_t *dd,
1289                           t_state      *state_local)
1290 {
1291     gmx_domdec_master_t *ma = NULL;
1292     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1293     t_block             *cgs_gl;
1294
1295     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1296     {
1297         /* The master has the correct distribution */
1298         return;
1299     }
1300
1301     if (state_local->ddp_count == dd->ddp_count)
1302     {
1303         ncg_home = dd->ncg_home;
1304         cg       = dd->index_gl;
1305         nat_home = dd->nat_home;
1306     }
1307     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1308     {
1309         cgs_gl = &dd->comm->cgs_gl;
1310
1311         ncg_home = state_local->ncg_gl;
1312         cg       = state_local->cg_gl;
1313         nat_home = 0;
1314         for (i = 0; i < ncg_home; i++)
1315         {
1316             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1317         }
1318     }
1319     else
1320     {
1321         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1322     }
1323
1324     buf2[0] = dd->ncg_home;
1325     buf2[1] = dd->nat_home;
1326     if (DDMASTER(dd))
1327     {
1328         ma   = dd->ma;
1329         ibuf = ma->ibuf;
1330     }
1331     else
1332     {
1333         ibuf = NULL;
1334     }
1335     /* Collect the charge group and atom counts on the master */
1336     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1337
1338     if (DDMASTER(dd))
1339     {
1340         ma->index[0] = 0;
1341         for (i = 0; i < dd->nnodes; i++)
1342         {
1343             ma->ncg[i]     = ma->ibuf[2*i];
1344             ma->nat[i]     = ma->ibuf[2*i+1];
1345             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1346
1347         }
1348         /* Make byte counts and indices */
1349         for (i = 0; i < dd->nnodes; i++)
1350         {
1351             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1352             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1353         }
1354         if (debug)
1355         {
1356             fprintf(debug, "Initial charge group distribution: ");
1357             for (i = 0; i < dd->nnodes; i++)
1358             {
1359                 fprintf(debug, " %d", ma->ncg[i]);
1360             }
1361             fprintf(debug, "\n");
1362         }
1363     }
1364
1365     /* Collect the charge group indices on the master */
1366     dd_gatherv(dd,
1367                dd->ncg_home*sizeof(int), dd->index_gl,
1368                DDMASTER(dd) ? ma->ibuf : NULL,
1369                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1370                DDMASTER(dd) ? ma->cg : NULL);
1371
1372     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1373 }
1374
1375 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1376                                     rvec *lv, rvec *v)
1377 {
1378     gmx_domdec_master_t *ma;
1379     int                  n, i, c, a, nalloc = 0;
1380     rvec                *buf = NULL;
1381     t_block             *cgs_gl;
1382
1383     ma = dd->ma;
1384
1385     if (!DDMASTER(dd))
1386     {
1387 #ifdef GMX_MPI
1388         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1389                  dd->rank, dd->mpi_comm_all);
1390 #endif
1391     }
1392     else
1393     {
1394         /* Copy the master coordinates to the global array */
1395         cgs_gl = &dd->comm->cgs_gl;
1396
1397         n = DDMASTERRANK(dd);
1398         a = 0;
1399         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1400         {
1401             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1402             {
1403                 copy_rvec(lv[a++], v[c]);
1404             }
1405         }
1406
1407         for (n = 0; n < dd->nnodes; n++)
1408         {
1409             if (n != dd->rank)
1410             {
1411                 if (ma->nat[n] > nalloc)
1412                 {
1413                     nalloc = over_alloc_dd(ma->nat[n]);
1414                     srenew(buf, nalloc);
1415                 }
1416 #ifdef GMX_MPI
1417                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1418                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1419 #endif
1420                 a = 0;
1421                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1422                 {
1423                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1424                     {
1425                         copy_rvec(buf[a++], v[c]);
1426                     }
1427                 }
1428             }
1429         }
1430         sfree(buf);
1431     }
1432 }
1433
1434 static void get_commbuffer_counts(gmx_domdec_t *dd,
1435                                   int **counts, int **disps)
1436 {
1437     gmx_domdec_master_t *ma;
1438     int                  n;
1439
1440     ma = dd->ma;
1441
1442     /* Make the rvec count and displacment arrays */
1443     *counts  = ma->ibuf;
1444     *disps   = ma->ibuf + dd->nnodes;
1445     for (n = 0; n < dd->nnodes; n++)
1446     {
1447         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1448         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1449     }
1450 }
1451
1452 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1453                                    rvec *lv, rvec *v)
1454 {
1455     gmx_domdec_master_t *ma;
1456     int                 *rcounts = NULL, *disps = NULL;
1457     int                  n, i, c, a;
1458     rvec                *buf = NULL;
1459     t_block             *cgs_gl;
1460
1461     ma = dd->ma;
1462
1463     if (DDMASTER(dd))
1464     {
1465         get_commbuffer_counts(dd, &rcounts, &disps);
1466
1467         buf = ma->vbuf;
1468     }
1469
1470     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1471
1472     if (DDMASTER(dd))
1473     {
1474         cgs_gl = &dd->comm->cgs_gl;
1475
1476         a = 0;
1477         for (n = 0; n < dd->nnodes; n++)
1478         {
1479             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1480             {
1481                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1482                 {
1483                     copy_rvec(buf[a++], v[c]);
1484                 }
1485             }
1486         }
1487     }
1488 }
1489
1490 void dd_collect_vec(gmx_domdec_t *dd,
1491                     t_state *state_local, rvec *lv, rvec *v)
1492 {
1493     gmx_domdec_master_t *ma;
1494     int                  n, i, c, a, nalloc = 0;
1495     rvec                *buf = NULL;
1496
1497     dd_collect_cg(dd, state_local);
1498
1499     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1500     {
1501         dd_collect_vec_sendrecv(dd, lv, v);
1502     }
1503     else
1504     {
1505         dd_collect_vec_gatherv(dd, lv, v);
1506     }
1507 }
1508
1509
1510 void dd_collect_state(gmx_domdec_t *dd,
1511                       t_state *state_local, t_state *state)
1512 {
1513     int est, i, j, nh;
1514
1515     nh = state->nhchainlength;
1516
1517     if (DDMASTER(dd))
1518     {
1519         for (i = 0; i < efptNR; i++)
1520         {
1521             state->lambda[i] = state_local->lambda[i];
1522         }
1523         state->fep_state = state_local->fep_state;
1524         state->veta      = state_local->veta;
1525         state->vol0      = state_local->vol0;
1526         copy_mat(state_local->box, state->box);
1527         copy_mat(state_local->boxv, state->boxv);
1528         copy_mat(state_local->svir_prev, state->svir_prev);
1529         copy_mat(state_local->fvir_prev, state->fvir_prev);
1530         copy_mat(state_local->pres_prev, state->pres_prev);
1531
1532
1533         for (i = 0; i < state_local->ngtc; i++)
1534         {
1535             for (j = 0; j < nh; j++)
1536             {
1537                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1538                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1539             }
1540             state->therm_integral[i] = state_local->therm_integral[i];
1541         }
1542         for (i = 0; i < state_local->nnhpres; i++)
1543         {
1544             for (j = 0; j < nh; j++)
1545             {
1546                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1547                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1548             }
1549         }
1550     }
1551     for (est = 0; est < estNR; est++)
1552     {
1553         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1554         {
1555             switch (est)
1556             {
1557                 case estX:
1558                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1559                     break;
1560                 case estV:
1561                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1562                     break;
1563                 case estSDX:
1564                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1565                     break;
1566                 case estCGP:
1567                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1568                     break;
1569                 case estLD_RNG:
1570                     if (state->nrngi == 1)
1571                     {
1572                         if (DDMASTER(dd))
1573                         {
1574                             for (i = 0; i < state_local->nrng; i++)
1575                             {
1576                                 state->ld_rng[i] = state_local->ld_rng[i];
1577                             }
1578                         }
1579                     }
1580                     else
1581                     {
1582                         dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
1583                                   state_local->ld_rng, state->ld_rng);
1584                     }
1585                     break;
1586                 case estLD_RNGI:
1587                     if (state->nrngi == 1)
1588                     {
1589                         if (DDMASTER(dd))
1590                         {
1591                             state->ld_rngi[0] = state_local->ld_rngi[0];
1592                         }
1593                     }
1594                     else
1595                     {
1596                         dd_gather(dd, sizeof(state->ld_rngi[0]),
1597                                   state_local->ld_rngi, state->ld_rngi);
1598                     }
1599                     break;
1600                 case estDISRE_INITF:
1601                 case estDISRE_RM3TAV:
1602                 case estORIRE_INITF:
1603                 case estORIRE_DTAV:
1604                     break;
1605                 default:
1606                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1607             }
1608         }
1609     }
1610 }
1611
1612 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1613 {
1614     int est;
1615
1616     if (debug)
1617     {
1618         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1619     }
1620
1621     state->nalloc = over_alloc_dd(nalloc);
1622
1623     for (est = 0; est < estNR; est++)
1624     {
1625         if (EST_DISTR(est) && (state->flags & (1<<est)))
1626         {
1627             switch (est)
1628             {
1629                 case estX:
1630                     srenew(state->x, state->nalloc);
1631                     break;
1632                 case estV:
1633                     srenew(state->v, state->nalloc);
1634                     break;
1635                 case estSDX:
1636                     srenew(state->sd_X, state->nalloc);
1637                     break;
1638                 case estCGP:
1639                     srenew(state->cg_p, state->nalloc);
1640                     break;
1641                 case estLD_RNG:
1642                 case estLD_RNGI:
1643                 case estDISRE_INITF:
1644                 case estDISRE_RM3TAV:
1645                 case estORIRE_INITF:
1646                 case estORIRE_DTAV:
1647                     /* No reallocation required */
1648                     break;
1649                 default:
1650                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1651             }
1652         }
1653     }
1654
1655     if (f != NULL)
1656     {
1657         srenew(*f, state->nalloc);
1658     }
1659 }
1660
1661 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1662                                int nalloc)
1663 {
1664     if (nalloc > fr->cg_nalloc)
1665     {
1666         if (debug)
1667         {
1668             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1669         }
1670         fr->cg_nalloc = over_alloc_dd(nalloc);
1671         srenew(fr->cginfo, fr->cg_nalloc);
1672         if (fr->cutoff_scheme == ecutsGROUP)
1673         {
1674             srenew(fr->cg_cm, fr->cg_nalloc);
1675         }
1676     }
1677     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1678     {
1679         /* We don't use charge groups, we use x in state to set up
1680          * the atom communication.
1681          */
1682         dd_realloc_state(state, f, nalloc);
1683     }
1684 }
1685
1686 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1687                                        rvec *v, rvec *lv)
1688 {
1689     gmx_domdec_master_t *ma;
1690     int                  n, i, c, a, nalloc = 0;
1691     rvec                *buf = NULL;
1692
1693     if (DDMASTER(dd))
1694     {
1695         ma  = dd->ma;
1696
1697         for (n = 0; n < dd->nnodes; n++)
1698         {
1699             if (n != dd->rank)
1700             {
1701                 if (ma->nat[n] > nalloc)
1702                 {
1703                     nalloc = over_alloc_dd(ma->nat[n]);
1704                     srenew(buf, nalloc);
1705                 }
1706                 /* Use lv as a temporary buffer */
1707                 a = 0;
1708                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1709                 {
1710                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1711                     {
1712                         copy_rvec(v[c], buf[a++]);
1713                     }
1714                 }
1715                 if (a != ma->nat[n])
1716                 {
1717                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1718                               a, ma->nat[n]);
1719                 }
1720
1721 #ifdef GMX_MPI
1722                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1723                          DDRANK(dd, n), n, dd->mpi_comm_all);
1724 #endif
1725             }
1726         }
1727         sfree(buf);
1728         n = DDMASTERRANK(dd);
1729         a = 0;
1730         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1731         {
1732             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1733             {
1734                 copy_rvec(v[c], lv[a++]);
1735             }
1736         }
1737     }
1738     else
1739     {
1740 #ifdef GMX_MPI
1741         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1742                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1743 #endif
1744     }
1745 }
1746
1747 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1748                                        rvec *v, rvec *lv)
1749 {
1750     gmx_domdec_master_t *ma;
1751     int                 *scounts = NULL, *disps = NULL;
1752     int                  n, i, c, a, nalloc = 0;
1753     rvec                *buf = NULL;
1754
1755     if (DDMASTER(dd))
1756     {
1757         ma  = dd->ma;
1758
1759         get_commbuffer_counts(dd, &scounts, &disps);
1760
1761         buf = ma->vbuf;
1762         a   = 0;
1763         for (n = 0; n < dd->nnodes; n++)
1764         {
1765             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1766             {
1767                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1768                 {
1769                     copy_rvec(v[c], buf[a++]);
1770                 }
1771             }
1772         }
1773     }
1774
1775     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1776 }
1777
1778 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1779 {
1780     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1781     {
1782         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1783     }
1784     else
1785     {
1786         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1787     }
1788 }
1789
1790 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1791                                 t_state *state, t_state *state_local,
1792                                 rvec **f)
1793 {
1794     int  i, j, nh;
1795
1796     nh = state->nhchainlength;
1797
1798     if (DDMASTER(dd))
1799     {
1800         for (i = 0; i < efptNR; i++)
1801         {
1802             state_local->lambda[i] = state->lambda[i];
1803         }
1804         state_local->fep_state = state->fep_state;
1805         state_local->veta      = state->veta;
1806         state_local->vol0      = state->vol0;
1807         copy_mat(state->box, state_local->box);
1808         copy_mat(state->box_rel, state_local->box_rel);
1809         copy_mat(state->boxv, state_local->boxv);
1810         copy_mat(state->svir_prev, state_local->svir_prev);
1811         copy_mat(state->fvir_prev, state_local->fvir_prev);
1812         for (i = 0; i < state_local->ngtc; i++)
1813         {
1814             for (j = 0; j < nh; j++)
1815             {
1816                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1817                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1818             }
1819             state_local->therm_integral[i] = state->therm_integral[i];
1820         }
1821         for (i = 0; i < state_local->nnhpres; i++)
1822         {
1823             for (j = 0; j < nh; j++)
1824             {
1825                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1826                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1827             }
1828         }
1829     }
1830     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1831     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1832     dd_bcast(dd, sizeof(real), &state_local->veta);
1833     dd_bcast(dd, sizeof(real), &state_local->vol0);
1834     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1835     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1836     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1837     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1838     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1839     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1840     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1841     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1842     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1843     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1844
1845     if (dd->nat_home > state_local->nalloc)
1846     {
1847         dd_realloc_state(state_local, f, dd->nat_home);
1848     }
1849     for (i = 0; i < estNR; i++)
1850     {
1851         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1852         {
1853             switch (i)
1854             {
1855                 case estX:
1856                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1857                     break;
1858                 case estV:
1859                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1860                     break;
1861                 case estSDX:
1862                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1863                     break;
1864                 case estCGP:
1865                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1866                     break;
1867                 case estLD_RNG:
1868                     if (state->nrngi == 1)
1869                     {
1870                         dd_bcastc(dd,
1871                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
1872                                   state->ld_rng, state_local->ld_rng);
1873                     }
1874                     else
1875                     {
1876                         dd_scatter(dd,
1877                                    state_local->nrng*sizeof(state_local->ld_rng[0]),
1878                                    state->ld_rng, state_local->ld_rng);
1879                     }
1880                     break;
1881                 case estLD_RNGI:
1882                     if (state->nrngi == 1)
1883                     {
1884                         dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
1885                                   state->ld_rngi, state_local->ld_rngi);
1886                     }
1887                     else
1888                     {
1889                         dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
1890                                    state->ld_rngi, state_local->ld_rngi);
1891                     }
1892                     break;
1893                 case estDISRE_INITF:
1894                 case estDISRE_RM3TAV:
1895                 case estORIRE_INITF:
1896                 case estORIRE_DTAV:
1897                     /* Not implemented yet */
1898                     break;
1899                 default:
1900                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1901             }
1902         }
1903     }
1904 }
1905
1906 static char dim2char(int dim)
1907 {
1908     char c = '?';
1909
1910     switch (dim)
1911     {
1912         case XX: c = 'X'; break;
1913         case YY: c = 'Y'; break;
1914         case ZZ: c = 'Z'; break;
1915         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1916     }
1917
1918     return c;
1919 }
1920
1921 static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step,
1922                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1923 {
1924     rvec   grid_s[2], *grid_r = NULL, cx, r;
1925     char   fname[STRLEN], format[STRLEN], buf[22];
1926     FILE  *out;
1927     int    a, i, d, z, y, x;
1928     matrix tric;
1929     real   vol;
1930
1931     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1932     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1933
1934     if (DDMASTER(dd))
1935     {
1936         snew(grid_r, 2*dd->nnodes);
1937     }
1938
1939     dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
1940
1941     if (DDMASTER(dd))
1942     {
1943         for (d = 0; d < DIM; d++)
1944         {
1945             for (i = 0; i < DIM; i++)
1946             {
1947                 if (d == i)
1948                 {
1949                     tric[d][i] = 1;
1950                 }
1951                 else
1952                 {
1953                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1954                     {
1955                         tric[d][i] = box[i][d]/box[i][i];
1956                     }
1957                     else
1958                     {
1959                         tric[d][i] = 0;
1960                     }
1961                 }
1962             }
1963         }
1964         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1965         sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
1966         out = gmx_fio_fopen(fname, "w");
1967         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1968         a = 1;
1969         for (i = 0; i < dd->nnodes; i++)
1970         {
1971             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1972             for (d = 0; d < DIM; d++)
1973             {
1974                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1975             }
1976             for (z = 0; z < 2; z++)
1977             {
1978                 for (y = 0; y < 2; y++)
1979                 {
1980                     for (x = 0; x < 2; x++)
1981                     {
1982                         cx[XX] = grid_r[i*2+x][XX];
1983                         cx[YY] = grid_r[i*2+y][YY];
1984                         cx[ZZ] = grid_r[i*2+z][ZZ];
1985                         mvmul(tric, cx, r);
1986                         fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
1987                                 ' ', 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
1988                     }
1989                 }
1990             }
1991             for (d = 0; d < DIM; d++)
1992             {
1993                 for (x = 0; x < 4; x++)
1994                 {
1995                     switch (d)
1996                     {
1997                         case 0: y = 1 + i*8 + 2*x; break;
1998                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
1999                         case 2: y = 1 + i*8 + x; break;
2000                     }
2001                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
2002                 }
2003             }
2004         }
2005         gmx_fio_fclose(out);
2006         sfree(grid_r);
2007     }
2008 }
2009
2010 void write_dd_pdb(const char *fn, gmx_large_int_t step, const char *title,
2011                   gmx_mtop_t *mtop, t_commrec *cr,
2012                   int natoms, rvec x[], matrix box)
2013 {
2014     char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
2015     FILE         *out;
2016     int           i, ii, resnr, c;
2017     char         *atomname, *resname;
2018     real          b;
2019     gmx_domdec_t *dd;
2020
2021     dd = cr->dd;
2022     if (natoms == -1)
2023     {
2024         natoms = dd->comm->nat[ddnatVSITE];
2025     }
2026
2027     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
2028
2029     sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
2030     sprintf(format4, "%s%s\n", get_pdbformat4(), "%6.2f%6.2f");
2031
2032     out = gmx_fio_fopen(fname, "w");
2033
2034     fprintf(out, "TITLE     %s\n", title);
2035     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2036     for (i = 0; i < natoms; i++)
2037     {
2038         ii = dd->gatindex[i];
2039         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
2040         if (i < dd->comm->nat[ddnatZONE])
2041         {
2042             c = 0;
2043             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2044             {
2045                 c++;
2046             }
2047             b = c;
2048         }
2049         else if (i < dd->comm->nat[ddnatVSITE])
2050         {
2051             b = dd->comm->zones.n;
2052         }
2053         else
2054         {
2055             b = dd->comm->zones.n + 1;
2056         }
2057         fprintf(out, strlen(atomname) < 4 ? format : format4,
2058                 "ATOM", (ii+1)%100000,
2059                 atomname, resname, ' ', resnr%10000, ' ',
2060                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
2061     }
2062     fprintf(out, "TER\n");
2063
2064     gmx_fio_fclose(out);
2065 }
2066
2067 real dd_cutoff_mbody(gmx_domdec_t *dd)
2068 {
2069     gmx_domdec_comm_t *comm;
2070     int                di;
2071     real               r;
2072
2073     comm = dd->comm;
2074
2075     r = -1;
2076     if (comm->bInterCGBondeds)
2077     {
2078         if (comm->cutoff_mbody > 0)
2079         {
2080             r = comm->cutoff_mbody;
2081         }
2082         else
2083         {
2084             /* cutoff_mbody=0 means we do not have DLB */
2085             r = comm->cellsize_min[dd->dim[0]];
2086             for (di = 1; di < dd->ndim; di++)
2087             {
2088                 r = min(r, comm->cellsize_min[dd->dim[di]]);
2089             }
2090             if (comm->bBondComm)
2091             {
2092                 r = max(r, comm->cutoff_mbody);
2093             }
2094             else
2095             {
2096                 r = min(r, comm->cutoff);
2097             }
2098         }
2099     }
2100
2101     return r;
2102 }
2103
2104 real dd_cutoff_twobody(gmx_domdec_t *dd)
2105 {
2106     real r_mb;
2107
2108     r_mb = dd_cutoff_mbody(dd);
2109
2110     return max(dd->comm->cutoff, r_mb);
2111 }
2112
2113
2114 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
2115 {
2116     int nc, ntot;
2117
2118     nc   = dd->nc[dd->comm->cartpmedim];
2119     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2120     copy_ivec(coord, coord_pme);
2121     coord_pme[dd->comm->cartpmedim] =
2122         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2123 }
2124
2125 static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
2126 {
2127     /* Here we assign a PME node to communicate with this DD node
2128      * by assuming that the major index of both is x.
2129      * We add cr->npmenodes/2 to obtain an even distribution.
2130      */
2131     return (ddindex*npme + npme/2)/ndd;
2132 }
2133
2134 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
2135 {
2136     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
2137 }
2138
2139 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
2140 {
2141     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
2142 }
2143
2144 static int *dd_pmenodes(t_commrec *cr)
2145 {
2146     int *pmenodes;
2147     int  n, i, p0, p1;
2148
2149     snew(pmenodes, cr->npmenodes);
2150     n = 0;
2151     for (i = 0; i < cr->dd->nnodes; i++)
2152     {
2153         p0 = cr_ddindex2pmeindex(cr, i);
2154         p1 = cr_ddindex2pmeindex(cr, i+1);
2155         if (i+1 == cr->dd->nnodes || p1 > p0)
2156         {
2157             if (debug)
2158             {
2159                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
2160             }
2161             pmenodes[n] = i + 1 + n;
2162             n++;
2163         }
2164     }
2165
2166     return pmenodes;
2167 }
2168
2169 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
2170 {
2171     gmx_domdec_t *dd;
2172     ivec          coords, coords_pme, nc;
2173     int           slab;
2174
2175     dd = cr->dd;
2176     /*
2177        if (dd->comm->bCartesian) {
2178        gmx_ddindex2xyz(dd->nc,ddindex,coords);
2179        dd_coords2pmecoords(dd,coords,coords_pme);
2180        copy_ivec(dd->ntot,nc);
2181        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2182        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2183
2184        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2185        } else {
2186        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2187        }
2188      */
2189     coords[XX] = x;
2190     coords[YY] = y;
2191     coords[ZZ] = z;
2192     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
2193
2194     return slab;
2195 }
2196
2197 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
2198 {
2199     gmx_domdec_comm_t *comm;
2200     ivec               coords;
2201     int                ddindex, nodeid = -1;
2202
2203     comm = cr->dd->comm;
2204
2205     coords[XX] = x;
2206     coords[YY] = y;
2207     coords[ZZ] = z;
2208     if (comm->bCartesianPP_PME)
2209     {
2210 #ifdef GMX_MPI
2211         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
2212 #endif
2213     }
2214     else
2215     {
2216         ddindex = dd_index(cr->dd->nc, coords);
2217         if (comm->bCartesianPP)
2218         {
2219             nodeid = comm->ddindex2simnodeid[ddindex];
2220         }
2221         else
2222         {
2223             if (comm->pmenodes)
2224             {
2225                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
2226             }
2227             else
2228             {
2229                 nodeid = ddindex;
2230             }
2231         }
2232     }
2233
2234     return nodeid;
2235 }
2236
2237 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
2238 {
2239     gmx_domdec_t      *dd;
2240     gmx_domdec_comm_t *comm;
2241     ivec               coord, coord_pme;
2242     int                i;
2243     int                pmenode = -1;
2244
2245     dd   = cr->dd;
2246     comm = dd->comm;
2247
2248     /* This assumes a uniform x domain decomposition grid cell size */
2249     if (comm->bCartesianPP_PME)
2250     {
2251 #ifdef GMX_MPI
2252         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2253         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2254         {
2255             /* This is a PP node */
2256             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2257             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2258         }
2259 #endif
2260     }
2261     else if (comm->bCartesianPP)
2262     {
2263         if (sim_nodeid < dd->nnodes)
2264         {
2265             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2266         }
2267     }
2268     else
2269     {
2270         /* This assumes DD cells with identical x coordinates
2271          * are numbered sequentially.
2272          */
2273         if (dd->comm->pmenodes == NULL)
2274         {
2275             if (sim_nodeid < dd->nnodes)
2276             {
2277                 /* The DD index equals the nodeid */
2278                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2279             }
2280         }
2281         else
2282         {
2283             i = 0;
2284             while (sim_nodeid > dd->comm->pmenodes[i])
2285             {
2286                 i++;
2287             }
2288             if (sim_nodeid < dd->comm->pmenodes[i])
2289             {
2290                 pmenode = dd->comm->pmenodes[i];
2291             }
2292         }
2293     }
2294
2295     return pmenode;
2296 }
2297
2298 void get_pme_nnodes(const gmx_domdec_t *dd,
2299                     int *npmenodes_x, int *npmenodes_y)
2300 {
2301     if (dd != NULL)
2302     {
2303         *npmenodes_x = dd->comm->npmenodes_x;
2304         *npmenodes_y = dd->comm->npmenodes_y;
2305     }
2306     else
2307     {
2308         *npmenodes_x = 1;
2309         *npmenodes_y = 1;
2310     }
2311 }
2312
2313 gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
2314 {
2315     gmx_bool bPMEOnlyNode;
2316
2317     if (DOMAINDECOMP(cr))
2318     {
2319         bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
2320     }
2321     else
2322     {
2323         bPMEOnlyNode = FALSE;
2324     }
2325
2326     return bPMEOnlyNode;
2327 }
2328
2329 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2330                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2331 {
2332     gmx_domdec_t *dd;
2333     int           x, y, z;
2334     ivec          coord, coord_pme;
2335
2336     dd = cr->dd;
2337
2338     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2339
2340     *nmy_ddnodes = 0;
2341     for (x = 0; x < dd->nc[XX]; x++)
2342     {
2343         for (y = 0; y < dd->nc[YY]; y++)
2344         {
2345             for (z = 0; z < dd->nc[ZZ]; z++)
2346             {
2347                 if (dd->comm->bCartesianPP_PME)
2348                 {
2349                     coord[XX] = x;
2350                     coord[YY] = y;
2351                     coord[ZZ] = z;
2352                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2353                     if (dd->ci[XX] == coord_pme[XX] &&
2354                         dd->ci[YY] == coord_pme[YY] &&
2355                         dd->ci[ZZ] == coord_pme[ZZ])
2356                     {
2357                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2358                     }
2359                 }
2360                 else
2361                 {
2362                     /* The slab corresponds to the nodeid in the PME group */
2363                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2364                     {
2365                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2366                     }
2367                 }
2368             }
2369         }
2370     }
2371
2372     /* The last PP-only node is the peer node */
2373     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2374
2375     if (debug)
2376     {
2377         fprintf(debug, "Receive coordinates from PP nodes:");
2378         for (x = 0; x < *nmy_ddnodes; x++)
2379         {
2380             fprintf(debug, " %d", (*my_ddnodes)[x]);
2381         }
2382         fprintf(debug, "\n");
2383     }
2384 }
2385
2386 static gmx_bool receive_vir_ener(t_commrec *cr)
2387 {
2388     gmx_domdec_comm_t *comm;
2389     int                pmenode, coords[DIM], rank;
2390     gmx_bool           bReceive;
2391
2392     bReceive = TRUE;
2393     if (cr->npmenodes < cr->dd->nnodes)
2394     {
2395         comm = cr->dd->comm;
2396         if (comm->bCartesianPP_PME)
2397         {
2398             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2399 #ifdef GMX_MPI
2400             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2401             coords[comm->cartpmedim]++;
2402             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2403             {
2404                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2405                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2406                 {
2407                     /* This is not the last PP node for pmenode */
2408                     bReceive = FALSE;
2409                 }
2410             }
2411 #endif
2412         }
2413         else
2414         {
2415             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2416             if (cr->sim_nodeid+1 < cr->nnodes &&
2417                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2418             {
2419                 /* This is not the last PP node for pmenode */
2420                 bReceive = FALSE;
2421             }
2422         }
2423     }
2424
2425     return bReceive;
2426 }
2427
2428 static void set_zones_ncg_home(gmx_domdec_t *dd)
2429 {
2430     gmx_domdec_zones_t *zones;
2431     int                 i;
2432
2433     zones = &dd->comm->zones;
2434
2435     zones->cg_range[0] = 0;
2436     for (i = 1; i < zones->n+1; i++)
2437     {
2438         zones->cg_range[i] = dd->ncg_home;
2439     }
2440     /* zone_ncg1[0] should always be equal to ncg_home */
2441     dd->comm->zone_ncg1[0] = dd->ncg_home;
2442 }
2443
2444 static void rebuild_cgindex(gmx_domdec_t *dd,
2445                             const int *gcgs_index, t_state *state)
2446 {
2447     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2448
2449     ind        = state->cg_gl;
2450     dd_cg_gl   = dd->index_gl;
2451     cgindex    = dd->cgindex;
2452     nat        = 0;
2453     cgindex[0] = nat;
2454     for (i = 0; i < state->ncg_gl; i++)
2455     {
2456         cgindex[i]  = nat;
2457         cg_gl       = ind[i];
2458         dd_cg_gl[i] = cg_gl;
2459         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2460     }
2461     cgindex[i] = nat;
2462
2463     dd->ncg_home = state->ncg_gl;
2464     dd->nat_home = nat;
2465
2466     set_zones_ncg_home(dd);
2467 }
2468
2469 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2470 {
2471     while (cg >= cginfo_mb->cg_end)
2472     {
2473         cginfo_mb++;
2474     }
2475
2476     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2477 }
2478
2479 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2480                           t_forcerec *fr, char *bLocalCG)
2481 {
2482     cginfo_mb_t *cginfo_mb;
2483     int         *cginfo;
2484     int          cg;
2485
2486     if (fr != NULL)
2487     {
2488         cginfo_mb = fr->cginfo_mb;
2489         cginfo    = fr->cginfo;
2490
2491         for (cg = cg0; cg < cg1; cg++)
2492         {
2493             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2494         }
2495     }
2496
2497     if (bLocalCG != NULL)
2498     {
2499         for (cg = cg0; cg < cg1; cg++)
2500         {
2501             bLocalCG[index_gl[cg]] = TRUE;
2502         }
2503     }
2504 }
2505
2506 static void make_dd_indices(gmx_domdec_t *dd,
2507                             const int *gcgs_index, int cg_start)
2508 {
2509     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2510     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2511     gmx_ga2la_t *ga2la;
2512     char        *bLocalCG;
2513     gmx_bool     bCGs;
2514
2515     bLocalCG = dd->comm->bLocalCG;
2516
2517     if (dd->nat_tot > dd->gatindex_nalloc)
2518     {
2519         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2520         srenew(dd->gatindex, dd->gatindex_nalloc);
2521     }
2522
2523     nzone      = dd->comm->zones.n;
2524     zone2cg    = dd->comm->zones.cg_range;
2525     zone_ncg1  = dd->comm->zone_ncg1;
2526     index_gl   = dd->index_gl;
2527     gatindex   = dd->gatindex;
2528     bCGs       = dd->comm->bCGs;
2529
2530     if (zone2cg[1] != dd->ncg_home)
2531     {
2532         gmx_incons("dd->ncg_zone is not up to date");
2533     }
2534
2535     /* Make the local to global and global to local atom index */
2536     a = dd->cgindex[cg_start];
2537     for (zone = 0; zone < nzone; zone++)
2538     {
2539         if (zone == 0)
2540         {
2541             cg0 = cg_start;
2542         }
2543         else
2544         {
2545             cg0 = zone2cg[zone];
2546         }
2547         cg1    = zone2cg[zone+1];
2548         cg1_p1 = cg0 + zone_ncg1[zone];
2549
2550         for (cg = cg0; cg < cg1; cg++)
2551         {
2552             zone1 = zone;
2553             if (cg >= cg1_p1)
2554             {
2555                 /* Signal that this cg is from more than one pulse away */
2556                 zone1 += nzone;
2557             }
2558             cg_gl = index_gl[cg];
2559             if (bCGs)
2560             {
2561                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2562                 {
2563                     gatindex[a] = a_gl;
2564                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2565                     a++;
2566                 }
2567             }
2568             else
2569             {
2570                 gatindex[a] = cg_gl;
2571                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2572                 a++;
2573             }
2574         }
2575     }
2576 }
2577
2578 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2579                           const char *where)
2580 {
2581     int ncg, i, ngl, nerr;
2582
2583     nerr = 0;
2584     if (bLocalCG == NULL)
2585     {
2586         return nerr;
2587     }
2588     for (i = 0; i < dd->ncg_tot; i++)
2589     {
2590         if (!bLocalCG[dd->index_gl[i]])
2591         {
2592             fprintf(stderr,
2593                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2594             nerr++;
2595         }
2596     }
2597     ngl = 0;
2598     for (i = 0; i < ncg_sys; i++)
2599     {
2600         if (bLocalCG[i])
2601         {
2602             ngl++;
2603         }
2604     }
2605     if (ngl != dd->ncg_tot)
2606     {
2607         fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2608         nerr++;
2609     }
2610
2611     return nerr;
2612 }
2613
2614 static void check_index_consistency(gmx_domdec_t *dd,
2615                                     int natoms_sys, int ncg_sys,
2616                                     const char *where)
2617 {
2618     int   nerr, ngl, i, a, cell;
2619     int  *have;
2620
2621     nerr = 0;
2622
2623     if (dd->comm->DD_debug > 1)
2624     {
2625         snew(have, natoms_sys);
2626         for (a = 0; a < dd->nat_tot; a++)
2627         {
2628             if (have[dd->gatindex[a]] > 0)
2629             {
2630                 fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2631             }
2632             else
2633             {
2634                 have[dd->gatindex[a]] = a + 1;
2635             }
2636         }
2637         sfree(have);
2638     }
2639
2640     snew(have, dd->nat_tot);
2641
2642     ngl  = 0;
2643     for (i = 0; i < natoms_sys; i++)
2644     {
2645         if (ga2la_get(dd->ga2la, i, &a, &cell))
2646         {
2647             if (a >= dd->nat_tot)
2648             {
2649                 fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2650                 nerr++;
2651             }
2652             else
2653             {
2654                 have[a] = 1;
2655                 if (dd->gatindex[a] != i)
2656                 {
2657                     fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2658                     nerr++;
2659                 }
2660             }
2661             ngl++;
2662         }
2663     }
2664     if (ngl != dd->nat_tot)
2665     {
2666         fprintf(stderr,
2667                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2668                 dd->rank, where, ngl, dd->nat_tot);
2669     }
2670     for (a = 0; a < dd->nat_tot; a++)
2671     {
2672         if (have[a] == 0)
2673         {
2674             fprintf(stderr,
2675                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2676                     dd->rank, where, a+1, dd->gatindex[a]+1);
2677         }
2678     }
2679     sfree(have);
2680
2681     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2682
2683     if (nerr > 0)
2684     {
2685         gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies",
2686                   dd->rank, where, nerr);
2687     }
2688 }
2689
2690 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2691 {
2692     int   i;
2693     char *bLocalCG;
2694
2695     if (a_start == 0)
2696     {
2697         /* Clear the whole list without searching */
2698         ga2la_clear(dd->ga2la);
2699     }
2700     else
2701     {
2702         for (i = a_start; i < dd->nat_tot; i++)
2703         {
2704             ga2la_del(dd->ga2la, dd->gatindex[i]);
2705         }
2706     }
2707
2708     bLocalCG = dd->comm->bLocalCG;
2709     if (bLocalCG)
2710     {
2711         for (i = cg_start; i < dd->ncg_tot; i++)
2712         {
2713             bLocalCG[dd->index_gl[i]] = FALSE;
2714         }
2715     }
2716
2717     dd_clear_local_vsite_indices(dd);
2718
2719     if (dd->constraints)
2720     {
2721         dd_clear_local_constraint_indices(dd);
2722     }
2723 }
2724
2725 /* This function should be used for moving the domain boudaries during DLB,
2726  * for obtaining the minimum cell size. It checks the initially set limit
2727  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2728  * and, possibly, a longer cut-off limit set for PME load balancing.
2729  */
2730 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2731 {
2732     real cellsize_min;
2733
2734     cellsize_min = comm->cellsize_min[dim];
2735
2736     if (!comm->bVacDLBNoLimit)
2737     {
2738         /* The cut-off might have changed, e.g. by PME load balacning,
2739          * from the value used to set comm->cellsize_min, so check it.
2740          */
2741         cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2742
2743         if (comm->bPMELoadBalDLBLimits)
2744         {
2745             /* Check for the cut-off limit set by the PME load balancing */
2746             cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2747         }
2748     }
2749
2750     return cellsize_min;
2751 }
2752
2753 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2754                             int dim_ind)
2755 {
2756     real grid_jump_limit;
2757
2758     /* The distance between the boundaries of cells at distance
2759      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2760      * and by the fact that cells should not be shifted by more than
2761      * half their size, such that cg's only shift by one cell
2762      * at redecomposition.
2763      */
2764     grid_jump_limit = comm->cellsize_limit;
2765     if (!comm->bVacDLBNoLimit)
2766     {
2767         if (comm->bPMELoadBalDLBLimits)
2768         {
2769             cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
2770         }
2771         grid_jump_limit = max(grid_jump_limit,
2772                               cutoff/comm->cd[dim_ind].np);
2773     }
2774
2775     return grid_jump_limit;
2776 }
2777
2778 static gmx_bool check_grid_jump(gmx_large_int_t step,
2779                                 gmx_domdec_t   *dd,
2780                                 real            cutoff,
2781                                 gmx_ddbox_t    *ddbox,
2782                                 gmx_bool        bFatal)
2783 {
2784     gmx_domdec_comm_t *comm;
2785     int                d, dim;
2786     real               limit, bfac;
2787     gmx_bool           bInvalid;
2788
2789     bInvalid = FALSE;
2790
2791     comm = dd->comm;
2792
2793     for (d = 1; d < dd->ndim; d++)
2794     {
2795         dim   = dd->dim[d];
2796         limit = grid_jump_limit(comm, cutoff, d);
2797         bfac  = ddbox->box_size[dim];
2798         if (ddbox->tric_dir[dim])
2799         {
2800             bfac *= ddbox->skew_fac[dim];
2801         }
2802         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2803                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2804         {
2805             bInvalid = TRUE;
2806
2807             if (bFatal)
2808             {
2809                 char buf[22];
2810
2811                 /* This error should never be triggered under normal
2812                  * circumstances, but you never know ...
2813                  */
2814                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2815                           gmx_step_str(step, buf),
2816                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2817             }
2818         }
2819     }
2820
2821     return bInvalid;
2822 }
2823
2824 static int dd_load_count(gmx_domdec_comm_t *comm)
2825 {
2826     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2827 }
2828
2829 static float dd_force_load(gmx_domdec_comm_t *comm)
2830 {
2831     float load;
2832
2833     if (comm->eFlop)
2834     {
2835         load = comm->flop;
2836         if (comm->eFlop > 1)
2837         {
2838             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2839         }
2840     }
2841     else
2842     {
2843         load = comm->cycl[ddCyclF];
2844         if (comm->cycl_n[ddCyclF] > 1)
2845         {
2846             /* Subtract the maximum of the last n cycle counts
2847              * to get rid of possible high counts due to other soures,
2848              * for instance system activity, that would otherwise
2849              * affect the dynamic load balancing.
2850              */
2851             load -= comm->cycl_max[ddCyclF];
2852         }
2853     }
2854
2855     return load;
2856 }
2857
2858 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2859 {
2860     gmx_domdec_comm_t *comm;
2861     int                i;
2862
2863     comm = dd->comm;
2864
2865     snew(*dim_f, dd->nc[dim]+1);
2866     (*dim_f)[0] = 0;
2867     for (i = 1; i < dd->nc[dim]; i++)
2868     {
2869         if (comm->slb_frac[dim])
2870         {
2871             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2872         }
2873         else
2874         {
2875             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2876         }
2877     }
2878     (*dim_f)[dd->nc[dim]] = 1;
2879 }
2880
2881 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2882 {
2883     int  pmeindex, slab, nso, i;
2884     ivec xyz;
2885
2886     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2887     {
2888         ddpme->dim = YY;
2889     }
2890     else
2891     {
2892         ddpme->dim = dimind;
2893     }
2894     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2895
2896     ddpme->nslab = (ddpme->dim == 0 ?
2897                     dd->comm->npmenodes_x :
2898                     dd->comm->npmenodes_y);
2899
2900     if (ddpme->nslab <= 1)
2901     {
2902         return;
2903     }
2904
2905     nso = dd->comm->npmenodes/ddpme->nslab;
2906     /* Determine for each PME slab the PP location range for dimension dim */
2907     snew(ddpme->pp_min, ddpme->nslab);
2908     snew(ddpme->pp_max, ddpme->nslab);
2909     for (slab = 0; slab < ddpme->nslab; slab++)
2910     {
2911         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2912         ddpme->pp_max[slab] = 0;
2913     }
2914     for (i = 0; i < dd->nnodes; i++)
2915     {
2916         ddindex2xyz(dd->nc, i, xyz);
2917         /* For y only use our y/z slab.
2918          * This assumes that the PME x grid size matches the DD grid size.
2919          */
2920         if (dimind == 0 || xyz[XX] == dd->ci[XX])
2921         {
2922             pmeindex = ddindex2pmeindex(dd, i);
2923             if (dimind == 0)
2924             {
2925                 slab = pmeindex/nso;
2926             }
2927             else
2928             {
2929                 slab = pmeindex % ddpme->nslab;
2930             }
2931             ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
2932             ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
2933         }
2934     }
2935
2936     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2937 }
2938
2939 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2940 {
2941     if (dd->comm->ddpme[0].dim == XX)
2942     {
2943         return dd->comm->ddpme[0].maxshift;
2944     }
2945     else
2946     {
2947         return 0;
2948     }
2949 }
2950
2951 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2952 {
2953     if (dd->comm->ddpme[0].dim == YY)
2954     {
2955         return dd->comm->ddpme[0].maxshift;
2956     }
2957     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2958     {
2959         return dd->comm->ddpme[1].maxshift;
2960     }
2961     else
2962     {
2963         return 0;
2964     }
2965 }
2966
2967 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2968                              gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
2969 {
2970     gmx_domdec_comm_t *comm;
2971     int                nc, ns, s;
2972     int               *xmin, *xmax;
2973     real               range, pme_boundary;
2974     int                sh;
2975
2976     comm = dd->comm;
2977     nc   = dd->nc[ddpme->dim];
2978     ns   = ddpme->nslab;
2979
2980     if (!ddpme->dim_match)
2981     {
2982         /* PP decomposition is not along dim: the worst situation */
2983         sh = ns/2;
2984     }
2985     else if (ns <= 3 || (bUniform && ns == nc))
2986     {
2987         /* The optimal situation */
2988         sh = 1;
2989     }
2990     else
2991     {
2992         /* We need to check for all pme nodes which nodes they
2993          * could possibly need to communicate with.
2994          */
2995         xmin = ddpme->pp_min;
2996         xmax = ddpme->pp_max;
2997         /* Allow for atoms to be maximally 2/3 times the cut-off
2998          * out of their DD cell. This is a reasonable balance between
2999          * between performance and support for most charge-group/cut-off
3000          * combinations.
3001          */
3002         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
3003         /* Avoid extra communication when we are exactly at a boundary */
3004         range *= 0.999;
3005
3006         sh = 1;
3007         for (s = 0; s < ns; s++)
3008         {
3009             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
3010             pme_boundary = (real)s/ns;
3011             while (sh+1 < ns &&
3012                    ((s-(sh+1) >= 0 &&
3013                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
3014                     (s-(sh+1) <  0 &&
3015                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
3016             {
3017                 sh++;
3018             }
3019             pme_boundary = (real)(s+1)/ns;
3020             while (sh+1 < ns &&
3021                    ((s+(sh+1) <  ns &&
3022                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
3023                     (s+(sh+1) >= ns &&
3024                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
3025             {
3026                 sh++;
3027             }
3028         }
3029     }
3030
3031     ddpme->maxshift = sh;
3032
3033     if (debug)
3034     {
3035         fprintf(debug, "PME slab communication range for dim %d is %d\n",
3036                 ddpme->dim, ddpme->maxshift);
3037     }
3038 }
3039
3040 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3041 {
3042     int d, dim;
3043
3044     for (d = 0; d < dd->ndim; d++)
3045     {
3046         dim = dd->dim[d];
3047         if (dim < ddbox->nboundeddim &&
3048             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3049             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3050         {
3051             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3052                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3053                       dd->nc[dim], dd->comm->cellsize_limit);
3054         }
3055     }
3056 }
3057
3058 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
3059                                   gmx_bool bMaster, ivec npulse)
3060 {
3061     gmx_domdec_comm_t *comm;
3062     int                d, j;
3063     rvec               cellsize_min;
3064     real              *cell_x, cell_dx, cellsize;
3065
3066     comm = dd->comm;
3067
3068     for (d = 0; d < DIM; d++)
3069     {
3070         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3071         npulse[d]       = 1;
3072         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3073         {
3074             /* Uniform grid */
3075             cell_dx = ddbox->box_size[d]/dd->nc[d];
3076             if (bMaster)
3077             {
3078                 for (j = 0; j < dd->nc[d]+1; j++)
3079                 {
3080                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3081                 }
3082             }
3083             else
3084             {
3085                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3086                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3087             }
3088             cellsize = cell_dx*ddbox->skew_fac[d];
3089             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3090             {
3091                 npulse[d]++;
3092             }
3093             cellsize_min[d] = cellsize;
3094         }
3095         else
3096         {
3097             /* Statically load balanced grid */
3098             /* Also when we are not doing a master distribution we determine
3099              * all cell borders in a loop to obtain identical values
3100              * to the master distribution case and to determine npulse.
3101              */
3102             if (bMaster)
3103             {
3104                 cell_x = dd->ma->cell_x[d];
3105             }
3106             else
3107             {
3108                 snew(cell_x, dd->nc[d]+1);
3109             }
3110             cell_x[0] = ddbox->box0[d];
3111             for (j = 0; j < dd->nc[d]; j++)
3112             {
3113                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
3114                 cell_x[j+1] = cell_x[j] + cell_dx;
3115                 cellsize    = cell_dx*ddbox->skew_fac[d];
3116                 while (cellsize*npulse[d] < comm->cutoff &&
3117                        npulse[d] < dd->nc[d]-1)
3118                 {
3119                     npulse[d]++;
3120                 }
3121                 cellsize_min[d] = min(cellsize_min[d], cellsize);
3122             }
3123             if (!bMaster)
3124             {
3125                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3126                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3127                 sfree(cell_x);
3128             }
3129         }
3130         /* The following limitation is to avoid that a cell would receive
3131          * some of its own home charge groups back over the periodic boundary.
3132          * Double charge groups cause trouble with the global indices.
3133          */
3134         if (d < ddbox->npbcdim &&
3135             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3136         {
3137             gmx_fatal_collective(FARGS, NULL, dd,
3138                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3139                                  dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
3140                                  comm->cutoff,
3141                                  dd->nc[d], dd->nc[d],
3142                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
3143         }
3144     }
3145
3146     if (!comm->bDynLoadBal)
3147     {
3148         copy_rvec(cellsize_min, comm->cellsize_min);
3149     }
3150
3151     for (d = 0; d < comm->npmedecompdim; d++)
3152     {
3153         set_pme_maxshift(dd, &comm->ddpme[d],
3154                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
3155                          comm->ddpme[d].slb_dim_f);
3156     }
3157 }
3158
3159
3160 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3161                                                   int d, int dim, gmx_domdec_root_t *root,
3162                                                   gmx_ddbox_t *ddbox,
3163                                                   gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[])
3164 {
3165     gmx_domdec_comm_t *comm;
3166     int                ncd, i, j, nmin, nmin_old;
3167     gmx_bool           bLimLo, bLimHi;
3168     real              *cell_size;
3169     real               fac, halfway, cellsize_limit_f_i, region_size;
3170     gmx_bool           bPBC, bLastHi = FALSE;
3171     int                nrange[] = {range[0], range[1]};
3172
3173     region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
3174
3175     comm = dd->comm;
3176
3177     ncd = dd->nc[dim];
3178
3179     bPBC = (dim < ddbox->npbcdim);
3180
3181     cell_size = root->buf_ncd;
3182
3183     if (debug)
3184     {
3185         fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
3186     }
3187
3188     /* First we need to check if the scaling does not make cells
3189      * smaller than the smallest allowed size.
3190      * We need to do this iteratively, since if a cell is too small,
3191      * it needs to be enlarged, which makes all the other cells smaller,
3192      * which could in turn make another cell smaller than allowed.
3193      */
3194     for (i = range[0]; i < range[1]; i++)
3195     {
3196         root->bCellMin[i] = FALSE;
3197     }
3198     nmin = 0;
3199     do
3200     {
3201         nmin_old = nmin;
3202         /* We need the total for normalization */
3203         fac = 0;
3204         for (i = range[0]; i < range[1]; i++)
3205         {
3206             if (root->bCellMin[i] == FALSE)
3207             {
3208                 fac += cell_size[i];
3209             }
3210         }
3211         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3212         /* Determine the cell boundaries */
3213         for (i = range[0]; i < range[1]; i++)
3214         {
3215             if (root->bCellMin[i] == FALSE)
3216             {
3217                 cell_size[i] *= fac;
3218                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3219                 {
3220                     cellsize_limit_f_i = 0;
3221                 }
3222                 else
3223                 {
3224                     cellsize_limit_f_i = cellsize_limit_f;
3225                 }
3226                 if (cell_size[i] < cellsize_limit_f_i)
3227                 {
3228                     root->bCellMin[i] = TRUE;
3229                     cell_size[i]      = cellsize_limit_f_i;
3230                     nmin++;
3231                 }
3232             }
3233             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3234         }
3235     }
3236     while (nmin > nmin_old);
3237
3238     i            = range[1]-1;
3239     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3240     /* For this check we should not use DD_CELL_MARGIN,
3241      * but a slightly smaller factor,
3242      * since rounding could get use below the limit.
3243      */
3244     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3245     {
3246         char buf[22];
3247         gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3248                   gmx_step_str(step, buf),
3249                   dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3250                   ncd, comm->cellsize_min[dim]);
3251     }
3252
3253     root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
3254
3255     if (!bUniform)
3256     {
3257         /* Check if the boundary did not displace more than halfway
3258          * each of the cells it bounds, as this could cause problems,
3259          * especially when the differences between cell sizes are large.
3260          * If changes are applied, they will not make cells smaller
3261          * than the cut-off, as we check all the boundaries which
3262          * might be affected by a change and if the old state was ok,
3263          * the cells will at most be shrunk back to their old size.
3264          */
3265         for (i = range[0]+1; i < range[1]; i++)
3266         {
3267             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3268             if (root->cell_f[i] < halfway)
3269             {
3270                 root->cell_f[i] = halfway;
3271                 /* Check if the change also causes shifts of the next boundaries */
3272                 for (j = i+1; j < range[1]; j++)
3273                 {
3274                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3275                     {
3276                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3277                     }
3278                 }
3279             }
3280             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3281             if (root->cell_f[i] > halfway)
3282             {
3283                 root->cell_f[i] = halfway;
3284                 /* Check if the change also causes shifts of the next boundaries */
3285                 for (j = i-1; j >= range[0]+1; j--)
3286                 {
3287                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3288                     {
3289                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3290                     }
3291                 }
3292             }
3293         }
3294     }
3295
3296     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3297     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3298      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3299      * for a and b nrange is used */
3300     if (d > 0)
3301     {
3302         /* Take care of the staggering of the cell boundaries */
3303         if (bUniform)
3304         {
3305             for (i = range[0]; i < range[1]; i++)
3306             {
3307                 root->cell_f_max0[i] = root->cell_f[i];
3308                 root->cell_f_min1[i] = root->cell_f[i+1];
3309             }
3310         }
3311         else
3312         {
3313             for (i = range[0]+1; i < range[1]; i++)
3314             {
3315                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3316                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3317                 if (bLimLo && bLimHi)
3318                 {
3319                     /* Both limits violated, try the best we can */
3320                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3321                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3322                     nrange[0]       = range[0];
3323                     nrange[1]       = i;
3324                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3325
3326                     nrange[0] = i;
3327                     nrange[1] = range[1];
3328                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3329
3330                     return;
3331                 }
3332                 else if (bLimLo)
3333                 {
3334                     /* root->cell_f[i] = root->bound_min[i]; */
3335                     nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3336                     bLastHi   = FALSE;
3337                 }
3338                 else if (bLimHi && !bLastHi)
3339                 {
3340                     bLastHi = TRUE;
3341                     if (nrange[1] < range[1])   /* found a LimLo before */
3342                     {
3343                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3344                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3345                         nrange[0] = nrange[1];
3346                     }
3347                     root->cell_f[i] = root->bound_max[i];
3348                     nrange[1]       = i;
3349                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3350                     nrange[0] = i;
3351                     nrange[1] = range[1];
3352                 }
3353             }
3354             if (nrange[1] < range[1])   /* found last a LimLo */
3355             {
3356                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3357                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3358                 nrange[0] = nrange[1];
3359                 nrange[1] = range[1];
3360                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3361             }
3362             else if (nrange[0] > range[0]) /* found at least one LimHi */
3363             {
3364                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3365             }
3366         }
3367     }
3368 }
3369
3370
3371 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3372                                        int d, int dim, gmx_domdec_root_t *root,
3373                                        gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3374                                        gmx_bool bUniform, gmx_large_int_t step)
3375 {
3376     gmx_domdec_comm_t *comm;
3377     int                ncd, d1, i, j, pos;
3378     real              *cell_size;
3379     real               load_aver, load_i, imbalance, change, change_max, sc;
3380     real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3381     real               change_limit;
3382     real               relax = 0.5;
3383     gmx_bool           bPBC;
3384     int                range[] = { 0, 0 };
3385
3386     comm = dd->comm;
3387
3388     /* Convert the maximum change from the input percentage to a fraction */
3389     change_limit = comm->dlb_scale_lim*0.01;
3390
3391     ncd = dd->nc[dim];
3392
3393     bPBC = (dim < ddbox->npbcdim);
3394
3395     cell_size = root->buf_ncd;
3396
3397     /* Store the original boundaries */
3398     for (i = 0; i < ncd+1; i++)
3399     {
3400         root->old_cell_f[i] = root->cell_f[i];
3401     }
3402     if (bUniform)
3403     {
3404         for (i = 0; i < ncd; i++)
3405         {
3406             cell_size[i] = 1.0/ncd;
3407         }
3408     }
3409     else if (dd_load_count(comm))
3410     {
3411         load_aver  = comm->load[d].sum_m/ncd;
3412         change_max = 0;
3413         for (i = 0; i < ncd; i++)
3414         {
3415             /* Determine the relative imbalance of cell i */
3416             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3417             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3418             /* Determine the change of the cell size using underrelaxation */
3419             change     = -relax*imbalance;
3420             change_max = max(change_max, max(change, -change));
3421         }
3422         /* Limit the amount of scaling.
3423          * We need to use the same rescaling for all cells in one row,
3424          * otherwise the load balancing might not converge.
3425          */
3426         sc = relax;
3427         if (change_max > change_limit)
3428         {
3429             sc *= change_limit/change_max;
3430         }
3431         for (i = 0; i < ncd; i++)
3432         {
3433             /* Determine the relative imbalance of cell i */
3434             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3435             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3436             /* Determine the change of the cell size using underrelaxation */
3437             change       = -sc*imbalance;
3438             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3439         }
3440     }
3441
3442     cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3443     cellsize_limit_f *= DD_CELL_MARGIN;
3444     dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3445     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3446     if (ddbox->tric_dir[dim])
3447     {
3448         cellsize_limit_f /= ddbox->skew_fac[dim];
3449         dist_min_f       /= ddbox->skew_fac[dim];
3450     }
3451     if (bDynamicBox && d > 0)
3452     {
3453         dist_min_f *= DD_PRES_SCALE_MARGIN;
3454     }
3455     if (d > 0 && !bUniform)
3456     {
3457         /* Make sure that the grid is not shifted too much */
3458         for (i = 1; i < ncd; i++)
3459         {
3460             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3461             {
3462                 gmx_incons("Inconsistent DD boundary staggering limits!");
3463             }
3464             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3465             space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3466             if (space > 0)
3467             {
3468                 root->bound_min[i] += 0.5*space;
3469             }
3470             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3471             space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3472             if (space < 0)
3473             {
3474                 root->bound_max[i] += 0.5*space;
3475             }
3476             if (debug)
3477             {
3478                 fprintf(debug,
3479                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3480                         d, i,
3481                         root->cell_f_max0[i-1] + dist_min_f,
3482                         root->bound_min[i], root->cell_f[i], root->bound_max[i],
3483                         root->cell_f_min1[i] - dist_min_f);
3484             }
3485         }
3486     }
3487     range[1]          = ncd;
3488     root->cell_f[0]   = 0;
3489     root->cell_f[ncd] = 1;
3490     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3491
3492
3493     /* After the checks above, the cells should obey the cut-off
3494      * restrictions, but it does not hurt to check.
3495      */
3496     for (i = 0; i < ncd; i++)
3497     {
3498         if (debug)
3499         {
3500             fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
3501                     dim, i, root->cell_f[i], root->cell_f[i+1]);
3502         }
3503
3504         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3505             root->cell_f[i+1] - root->cell_f[i] <
3506             cellsize_limit_f/DD_CELL_MARGIN)
3507         {
3508             char buf[22];
3509             fprintf(stderr,
3510                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3511                     gmx_step_str(step, buf), dim2char(dim), i,
3512                     (root->cell_f[i+1] - root->cell_f[i])
3513                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3514         }
3515     }
3516
3517     pos = ncd + 1;
3518     /* Store the cell boundaries of the lower dimensions at the end */
3519     for (d1 = 0; d1 < d; d1++)
3520     {
3521         root->cell_f[pos++] = comm->cell_f0[d1];
3522         root->cell_f[pos++] = comm->cell_f1[d1];
3523     }
3524
3525     if (d < comm->npmedecompdim)
3526     {
3527         /* The master determines the maximum shift for
3528          * the coordinate communication between separate PME nodes.
3529          */
3530         set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3531     }
3532     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3533     if (d >= 1)
3534     {
3535         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3536     }
3537 }
3538
3539 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3540                                              gmx_ddbox_t *ddbox, int dimind)
3541 {
3542     gmx_domdec_comm_t *comm;
3543     int                dim;
3544
3545     comm = dd->comm;
3546
3547     /* Set the cell dimensions */
3548     dim                = dd->dim[dimind];
3549     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3550     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3551     if (dim >= ddbox->nboundeddim)
3552     {
3553         comm->cell_x0[dim] += ddbox->box0[dim];
3554         comm->cell_x1[dim] += ddbox->box0[dim];
3555     }
3556 }
3557
3558 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3559                                          int d, int dim, real *cell_f_row,
3560                                          gmx_ddbox_t *ddbox)
3561 {
3562     gmx_domdec_comm_t *comm;
3563     int                d1, dim1, pos;
3564
3565     comm = dd->comm;
3566
3567 #ifdef GMX_MPI
3568     /* Each node would only need to know two fractions,
3569      * but it is probably cheaper to broadcast the whole array.
3570      */
3571     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3572               0, comm->mpi_comm_load[d]);
3573 #endif
3574     /* Copy the fractions for this dimension from the buffer */
3575     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3576     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3577     /* The whole array was communicated, so set the buffer position */
3578     pos = dd->nc[dim] + 1;
3579     for (d1 = 0; d1 <= d; d1++)
3580     {
3581         if (d1 < d)
3582         {
3583             /* Copy the cell fractions of the lower dimensions */
3584             comm->cell_f0[d1] = cell_f_row[pos++];
3585             comm->cell_f1[d1] = cell_f_row[pos++];
3586         }
3587         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3588     }
3589     /* Convert the communicated shift from float to int */
3590     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3591     if (d >= 1)
3592     {
3593         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3594     }
3595 }
3596
3597 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3598                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3599                                          gmx_bool bUniform, gmx_large_int_t step)
3600 {
3601     gmx_domdec_comm_t *comm;
3602     int                d, dim, d1;
3603     gmx_bool           bRowMember, bRowRoot;
3604     real              *cell_f_row;
3605
3606     comm = dd->comm;
3607
3608     for (d = 0; d < dd->ndim; d++)
3609     {
3610         dim        = dd->dim[d];
3611         bRowMember = TRUE;
3612         bRowRoot   = TRUE;
3613         for (d1 = d; d1 < dd->ndim; d1++)
3614         {
3615             if (dd->ci[dd->dim[d1]] > 0)
3616             {
3617                 if (d1 > d)
3618                 {
3619                     bRowMember = FALSE;
3620                 }
3621                 bRowRoot = FALSE;
3622             }
3623         }
3624         if (bRowMember)
3625         {
3626             if (bRowRoot)
3627             {
3628                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3629                                            ddbox, bDynamicBox, bUniform, step);
3630                 cell_f_row = comm->root[d]->cell_f;
3631             }
3632             else
3633             {
3634                 cell_f_row = comm->cell_f_row;
3635             }
3636             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3637         }
3638     }
3639 }
3640
3641 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3642 {
3643     int d;
3644
3645     /* This function assumes the box is static and should therefore
3646      * not be called when the box has changed since the last
3647      * call to dd_partition_system.
3648      */
3649     for (d = 0; d < dd->ndim; d++)
3650     {
3651         relative_to_absolute_cell_bounds(dd, ddbox, d);
3652     }
3653 }
3654
3655
3656
3657 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3658                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3659                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3660                                   gmx_wallcycle_t wcycle)
3661 {
3662     gmx_domdec_comm_t *comm;
3663     int                dim;
3664
3665     comm = dd->comm;
3666
3667     if (bDoDLB)
3668     {
3669         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3670         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3671         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3672     }
3673     else if (bDynamicBox)
3674     {
3675         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3676     }
3677
3678     /* Set the dimensions for which no DD is used */
3679     for (dim = 0; dim < DIM; dim++)
3680     {
3681         if (dd->nc[dim] == 1)
3682         {
3683             comm->cell_x0[dim] = 0;
3684             comm->cell_x1[dim] = ddbox->box_size[dim];
3685             if (dim >= ddbox->nboundeddim)
3686             {
3687                 comm->cell_x0[dim] += ddbox->box0[dim];
3688                 comm->cell_x1[dim] += ddbox->box0[dim];
3689             }
3690         }
3691     }
3692 }
3693
3694 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3695 {
3696     int                    d, np, i;
3697     gmx_domdec_comm_dim_t *cd;
3698
3699     for (d = 0; d < dd->ndim; d++)
3700     {
3701         cd = &dd->comm->cd[d];
3702         np = npulse[dd->dim[d]];
3703         if (np > cd->np_nalloc)
3704         {
3705             if (debug)
3706             {
3707                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3708                         dim2char(dd->dim[d]), np);
3709             }
3710             if (DDMASTER(dd) && cd->np_nalloc > 0)
3711             {
3712                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3713             }
3714             srenew(cd->ind, np);
3715             for (i = cd->np_nalloc; i < np; i++)
3716             {
3717                 cd->ind[i].index  = NULL;
3718                 cd->ind[i].nalloc = 0;
3719             }
3720             cd->np_nalloc = np;
3721         }
3722         cd->np = np;
3723     }
3724 }
3725
3726
3727 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3728                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3729                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3730                               gmx_wallcycle_t wcycle)
3731 {
3732     gmx_domdec_comm_t *comm;
3733     int                d;
3734     ivec               npulse;
3735
3736     comm = dd->comm;
3737
3738     /* Copy the old cell boundaries for the cg displacement check */
3739     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3740     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3741
3742     if (comm->bDynLoadBal)
3743     {
3744         if (DDMASTER(dd))
3745         {
3746             check_box_size(dd, ddbox);
3747         }
3748         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3749     }
3750     else
3751     {
3752         set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse);
3753         realloc_comm_ind(dd, npulse);
3754     }
3755
3756     if (debug)
3757     {
3758         for (d = 0; d < DIM; d++)
3759         {
3760             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3761                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3762         }
3763     }
3764 }
3765
3766 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3767                                   gmx_ddbox_t *ddbox,
3768                                   rvec cell_ns_x0, rvec cell_ns_x1,
3769                                   gmx_large_int_t step)
3770 {
3771     gmx_domdec_comm_t *comm;
3772     int                dim_ind, dim;
3773
3774     comm = dd->comm;
3775
3776     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3777     {
3778         dim = dd->dim[dim_ind];
3779
3780         /* Without PBC we don't have restrictions on the outer cells */
3781         if (!(dim >= ddbox->npbcdim &&
3782               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3783             comm->bDynLoadBal &&
3784             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3785             comm->cellsize_min[dim])
3786         {
3787             char buf[22];
3788             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3789                       gmx_step_str(step, buf), dim2char(dim),
3790                       comm->cell_x1[dim] - comm->cell_x0[dim],
3791                       ddbox->skew_fac[dim],
3792                       dd->comm->cellsize_min[dim],
3793                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3794         }
3795     }
3796
3797     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3798     {
3799         /* Communicate the boundaries and update cell_ns_x0/1 */
3800         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3801         if (dd->bGridJump && dd->ndim > 1)
3802         {
3803             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3804         }
3805     }
3806 }
3807
3808 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3809 {
3810     if (YY < npbcdim)
3811     {
3812         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3813     }
3814     else
3815     {
3816         tcm[YY][XX] = 0;
3817     }
3818     if (ZZ < npbcdim)
3819     {
3820         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3821         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3822     }
3823     else
3824     {
3825         tcm[ZZ][XX] = 0;
3826         tcm[ZZ][YY] = 0;
3827     }
3828 }
3829
3830 static void check_screw_box(matrix box)
3831 {
3832     /* Mathematical limitation */
3833     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3834     {
3835         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3836     }
3837
3838     /* Limitation due to the asymmetry of the eighth shell method */
3839     if (box[ZZ][YY] != 0)
3840     {
3841         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3842     }
3843 }
3844
3845 static void distribute_cg(FILE *fplog, gmx_large_int_t step,
3846                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3847                           gmx_domdec_t *dd)
3848 {
3849     gmx_domdec_master_t *ma;
3850     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3851     int                  i, icg, j, k, k0, k1, d, npbcdim;
3852     matrix               tcm;
3853     rvec                 box_size, cg_cm;
3854     ivec                 ind;
3855     real                 nrcg, inv_ncg, pos_d;
3856     atom_id             *cgindex;
3857     gmx_bool             bUnbounded, bScrew;
3858
3859     ma = dd->ma;
3860
3861     if (tmp_ind == NULL)
3862     {
3863         snew(tmp_nalloc, dd->nnodes);
3864         snew(tmp_ind, dd->nnodes);
3865         for (i = 0; i < dd->nnodes; i++)
3866         {
3867             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3868             snew(tmp_ind[i], tmp_nalloc[i]);
3869         }
3870     }
3871
3872     /* Clear the count */
3873     for (i = 0; i < dd->nnodes; i++)
3874     {
3875         ma->ncg[i] = 0;
3876         ma->nat[i] = 0;
3877     }
3878
3879     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3880
3881     cgindex = cgs->index;
3882
3883     /* Compute the center of geometry for all charge groups */
3884     for (icg = 0; icg < cgs->nr; icg++)
3885     {
3886         k0      = cgindex[icg];
3887         k1      = cgindex[icg+1];
3888         nrcg    = k1 - k0;
3889         if (nrcg == 1)
3890         {
3891             copy_rvec(pos[k0], cg_cm);
3892         }
3893         else
3894         {
3895             inv_ncg = 1.0/nrcg;
3896
3897             clear_rvec(cg_cm);
3898             for (k = k0; (k < k1); k++)
3899             {
3900                 rvec_inc(cg_cm, pos[k]);
3901             }
3902             for (d = 0; (d < DIM); d++)
3903             {
3904                 cg_cm[d] *= inv_ncg;
3905             }
3906         }
3907         /* Put the charge group in the box and determine the cell index */
3908         for (d = DIM-1; d >= 0; d--)
3909         {
3910             pos_d = cg_cm[d];
3911             if (d < dd->npbcdim)
3912             {
3913                 bScrew = (dd->bScrewPBC && d == XX);
3914                 if (tric_dir[d] && dd->nc[d] > 1)
3915                 {
3916                     /* Use triclinic coordintates for this dimension */
3917                     for (j = d+1; j < DIM; j++)
3918                     {
3919                         pos_d += cg_cm[j]*tcm[j][d];
3920                     }
3921                 }
3922                 while (pos_d >= box[d][d])
3923                 {
3924                     pos_d -= box[d][d];
3925                     rvec_dec(cg_cm, box[d]);
3926                     if (bScrew)
3927                     {
3928                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3929                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3930                     }
3931                     for (k = k0; (k < k1); k++)
3932                     {
3933                         rvec_dec(pos[k], box[d]);
3934                         if (bScrew)
3935                         {
3936                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3937                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3938                         }
3939                     }
3940                 }
3941                 while (pos_d < 0)
3942                 {
3943                     pos_d += box[d][d];
3944                     rvec_inc(cg_cm, box[d]);
3945                     if (bScrew)
3946                     {
3947                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3948                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3949                     }
3950                     for (k = k0; (k < k1); k++)
3951                     {
3952                         rvec_inc(pos[k], box[d]);
3953                         if (bScrew)
3954                         {
3955                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3956                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3957                         }
3958                     }
3959                 }
3960             }
3961             /* This could be done more efficiently */
3962             ind[d] = 0;
3963             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3964             {
3965                 ind[d]++;
3966             }
3967         }
3968         i = dd_index(dd->nc, ind);
3969         if (ma->ncg[i] == tmp_nalloc[i])
3970         {
3971             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3972             srenew(tmp_ind[i], tmp_nalloc[i]);
3973         }
3974         tmp_ind[i][ma->ncg[i]] = icg;
3975         ma->ncg[i]++;
3976         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3977     }
3978
3979     k1 = 0;
3980     for (i = 0; i < dd->nnodes; i++)
3981     {
3982         ma->index[i] = k1;
3983         for (k = 0; k < ma->ncg[i]; k++)
3984         {
3985             ma->cg[k1++] = tmp_ind[i][k];
3986         }
3987     }
3988     ma->index[dd->nnodes] = k1;
3989
3990     for (i = 0; i < dd->nnodes; i++)
3991     {
3992         sfree(tmp_ind[i]);
3993     }
3994     sfree(tmp_ind);
3995     sfree(tmp_nalloc);
3996
3997     if (fplog)
3998     {
3999         char buf[22];
4000         fprintf(fplog, "Charge group distribution at step %s:",
4001                 gmx_step_str(step, buf));
4002         for (i = 0; i < dd->nnodes; i++)
4003         {
4004             fprintf(fplog, " %d", ma->ncg[i]);
4005         }
4006         fprintf(fplog, "\n");
4007     }
4008 }
4009
4010 static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd,
4011                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
4012                                 rvec pos[])
4013 {
4014     gmx_domdec_master_t *ma = NULL;
4015     ivec                 npulse;
4016     int                  i, cg_gl;
4017     int                 *ibuf, buf2[2] = { 0, 0 };
4018     gmx_bool             bMaster = DDMASTER(dd);
4019     if (bMaster)
4020     {
4021         ma = dd->ma;
4022
4023         if (dd->bScrewPBC)
4024         {
4025             check_screw_box(box);
4026         }
4027
4028         set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse);
4029
4030         distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
4031         for (i = 0; i < dd->nnodes; i++)
4032         {
4033             ma->ibuf[2*i]   = ma->ncg[i];
4034             ma->ibuf[2*i+1] = ma->nat[i];
4035         }
4036         ibuf = ma->ibuf;
4037     }
4038     else
4039     {
4040         ibuf = NULL;
4041     }
4042     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
4043
4044     dd->ncg_home = buf2[0];
4045     dd->nat_home = buf2[1];
4046     dd->ncg_tot  = dd->ncg_home;
4047     dd->nat_tot  = dd->nat_home;
4048     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4049     {
4050         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4051         srenew(dd->index_gl, dd->cg_nalloc);
4052         srenew(dd->cgindex, dd->cg_nalloc+1);
4053     }
4054     if (bMaster)
4055     {
4056         for (i = 0; i < dd->nnodes; i++)
4057         {
4058             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
4059             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4060         }
4061     }
4062
4063     dd_scatterv(dd,
4064                 DDMASTER(dd) ? ma->ibuf : NULL,
4065                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
4066                 DDMASTER(dd) ? ma->cg : NULL,
4067                 dd->ncg_home*sizeof(int), dd->index_gl);
4068
4069     /* Determine the home charge group sizes */
4070     dd->cgindex[0] = 0;
4071     for (i = 0; i < dd->ncg_home; i++)
4072     {
4073         cg_gl            = dd->index_gl[i];
4074         dd->cgindex[i+1] =
4075             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4076     }
4077
4078     if (debug)
4079     {
4080         fprintf(debug, "Home charge groups:\n");
4081         for (i = 0; i < dd->ncg_home; i++)
4082         {
4083             fprintf(debug, " %d", dd->index_gl[i]);
4084             if (i % 10 == 9)
4085             {
4086                 fprintf(debug, "\n");
4087             }
4088         }
4089         fprintf(debug, "\n");
4090     }
4091 }
4092
4093 static int compact_and_copy_vec_at(int ncg, int *move,
4094                                    int *cgindex,
4095                                    int nvec, int vec,
4096                                    rvec *src, gmx_domdec_comm_t *comm,
4097                                    gmx_bool bCompact)
4098 {
4099     int m, icg, i, i0, i1, nrcg;
4100     int home_pos;
4101     int pos_vec[DIM*2];
4102
4103     home_pos = 0;
4104
4105     for (m = 0; m < DIM*2; m++)
4106     {
4107         pos_vec[m] = 0;
4108     }
4109
4110     i0 = 0;
4111     for (icg = 0; icg < ncg; icg++)
4112     {
4113         i1 = cgindex[icg+1];
4114         m  = move[icg];
4115         if (m == -1)
4116         {
4117             if (bCompact)
4118             {
4119                 /* Compact the home array in place */
4120                 for (i = i0; i < i1; i++)
4121                 {
4122                     copy_rvec(src[i], src[home_pos++]);
4123                 }
4124             }
4125         }
4126         else
4127         {
4128             /* Copy to the communication buffer */
4129             nrcg        = i1 - i0;
4130             pos_vec[m] += 1 + vec*nrcg;
4131             for (i = i0; i < i1; i++)
4132             {
4133                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
4134             }
4135             pos_vec[m] += (nvec - vec - 1)*nrcg;
4136         }
4137         if (!bCompact)
4138         {
4139             home_pos += i1 - i0;
4140         }
4141         i0 = i1;
4142     }
4143
4144     return home_pos;
4145 }
4146
4147 static int compact_and_copy_vec_cg(int ncg, int *move,
4148                                    int *cgindex,
4149                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
4150                                    gmx_bool bCompact)
4151 {
4152     int m, icg, i0, i1, nrcg;
4153     int home_pos;
4154     int pos_vec[DIM*2];
4155
4156     home_pos = 0;
4157
4158     for (m = 0; m < DIM*2; m++)
4159     {
4160         pos_vec[m] = 0;
4161     }
4162
4163     i0 = 0;
4164     for (icg = 0; icg < ncg; icg++)
4165     {
4166         i1 = cgindex[icg+1];
4167         m  = move[icg];
4168         if (m == -1)
4169         {
4170             if (bCompact)
4171             {
4172                 /* Compact the home array in place */
4173                 copy_rvec(src[icg], src[home_pos++]);
4174             }
4175         }
4176         else
4177         {
4178             nrcg = i1 - i0;
4179             /* Copy to the communication buffer */
4180             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
4181             pos_vec[m] += 1 + nrcg*nvec;
4182         }
4183         i0 = i1;
4184     }
4185     if (!bCompact)
4186     {
4187         home_pos = ncg;
4188     }
4189
4190     return home_pos;
4191 }
4192
4193 static int compact_ind(int ncg, int *move,
4194                        int *index_gl, int *cgindex,
4195                        int *gatindex,
4196                        gmx_ga2la_t ga2la, char *bLocalCG,
4197                        int *cginfo)
4198 {
4199     int cg, nat, a0, a1, a, a_gl;
4200     int home_pos;
4201
4202     home_pos = 0;
4203     nat      = 0;
4204     for (cg = 0; cg < ncg; cg++)
4205     {
4206         a0 = cgindex[cg];
4207         a1 = cgindex[cg+1];
4208         if (move[cg] == -1)
4209         {
4210             /* Compact the home arrays in place.
4211              * Anything that can be done here avoids access to global arrays.
4212              */
4213             cgindex[home_pos] = nat;
4214             for (a = a0; a < a1; a++)
4215             {
4216                 a_gl          = gatindex[a];
4217                 gatindex[nat] = a_gl;
4218                 /* The cell number stays 0, so we don't need to set it */
4219                 ga2la_change_la(ga2la, a_gl, nat);
4220                 nat++;
4221             }
4222             index_gl[home_pos] = index_gl[cg];
4223             cginfo[home_pos]   = cginfo[cg];
4224             /* The charge group remains local, so bLocalCG does not change */
4225             home_pos++;
4226         }
4227         else
4228         {
4229             /* Clear the global indices */
4230             for (a = a0; a < a1; a++)
4231             {
4232                 ga2la_del(ga2la, gatindex[a]);
4233             }
4234             if (bLocalCG)
4235             {
4236                 bLocalCG[index_gl[cg]] = FALSE;
4237             }
4238         }
4239     }
4240     cgindex[home_pos] = nat;
4241
4242     return home_pos;
4243 }
4244
4245 static void clear_and_mark_ind(int ncg, int *move,
4246                                int *index_gl, int *cgindex, int *gatindex,
4247                                gmx_ga2la_t ga2la, char *bLocalCG,
4248                                int *cell_index)
4249 {
4250     int cg, a0, a1, a;
4251
4252     for (cg = 0; cg < ncg; cg++)
4253     {
4254         if (move[cg] >= 0)
4255         {
4256             a0 = cgindex[cg];
4257             a1 = cgindex[cg+1];
4258             /* Clear the global indices */
4259             for (a = a0; a < a1; a++)
4260             {
4261                 ga2la_del(ga2la, gatindex[a]);
4262             }
4263             if (bLocalCG)
4264             {
4265                 bLocalCG[index_gl[cg]] = FALSE;
4266             }
4267             /* Signal that this cg has moved using the ns cell index.
4268              * Here we set it to -1. fill_grid will change it
4269              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4270              */
4271             cell_index[cg] = -1;
4272         }
4273     }
4274 }
4275
4276 static void print_cg_move(FILE *fplog,
4277                           gmx_domdec_t *dd,
4278                           gmx_large_int_t step, int cg, int dim, int dir,
4279                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4280                           rvec cm_old, rvec cm_new, real pos_d)
4281 {
4282     gmx_domdec_comm_t *comm;
4283     char               buf[22];
4284
4285     comm = dd->comm;
4286
4287     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4288     if (bHaveLimitdAndCMOld)
4289     {
4290         fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4291                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4292     }
4293     else
4294     {
4295         fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4296                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4297     }
4298     fprintf(fplog, "distance out of cell %f\n",
4299             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4300     if (bHaveLimitdAndCMOld)
4301     {
4302         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4303                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4304     }
4305     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4306             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4307     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4308             dim2char(dim),
4309             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4310     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4311             dim2char(dim),
4312             comm->cell_x0[dim], comm->cell_x1[dim]);
4313 }
4314
4315 static void cg_move_error(FILE *fplog,
4316                           gmx_domdec_t *dd,
4317                           gmx_large_int_t step, int cg, int dim, int dir,
4318                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4319                           rvec cm_old, rvec cm_new, real pos_d)
4320 {
4321     if (fplog)
4322     {
4323         print_cg_move(fplog, dd, step, cg, dim, dir,
4324                       bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4325     }
4326     print_cg_move(stderr, dd, step, cg, dim, dir,
4327                   bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4328     gmx_fatal(FARGS,
4329               "A charge group moved too far between two domain decomposition steps\n"
4330               "This usually means that your system is not well equilibrated");
4331 }
4332
4333 static void rotate_state_atom(t_state *state, int a)
4334 {
4335     int est;
4336
4337     for (est = 0; est < estNR; est++)
4338     {
4339         if (EST_DISTR(est) && (state->flags & (1<<est)))
4340         {
4341             switch (est)
4342             {
4343                 case estX:
4344                     /* Rotate the complete state; for a rectangular box only */
4345                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4346                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4347                     break;
4348                 case estV:
4349                     state->v[a][YY] = -state->v[a][YY];
4350                     state->v[a][ZZ] = -state->v[a][ZZ];
4351                     break;
4352                 case estSDX:
4353                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4354                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4355                     break;
4356                 case estCGP:
4357                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4358                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4359                     break;
4360                 case estDISRE_INITF:
4361                 case estDISRE_RM3TAV:
4362                 case estORIRE_INITF:
4363                 case estORIRE_DTAV:
4364                     /* These are distances, so not affected by rotation */
4365                     break;
4366                 default:
4367                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4368             }
4369         }
4370     }
4371 }
4372
4373 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4374 {
4375     if (natoms > comm->moved_nalloc)
4376     {
4377         /* Contents should be preserved here */
4378         comm->moved_nalloc = over_alloc_dd(natoms);
4379         srenew(comm->moved, comm->moved_nalloc);
4380     }
4381
4382     return comm->moved;
4383 }
4384
4385 static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
4386                          gmx_domdec_t *dd,
4387                          t_state *state,
4388                          ivec tric_dir, matrix tcm,
4389                          rvec cell_x0, rvec cell_x1,
4390                          rvec limitd, rvec limit0, rvec limit1,
4391                          const int *cgindex,
4392                          int cg_start, int cg_end,
4393                          rvec *cg_cm,
4394                          int *move)
4395 {
4396     int      npbcdim;
4397     int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4398     int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4399     int      flag;
4400     gmx_bool bScrew;
4401     ivec     dev;
4402     real     inv_ncg, pos_d;
4403     rvec     cm_new;
4404
4405     npbcdim = dd->npbcdim;
4406
4407     for (cg = cg_start; cg < cg_end; cg++)
4408     {
4409         k0   = cgindex[cg];
4410         k1   = cgindex[cg+1];
4411         nrcg = k1 - k0;
4412         if (nrcg == 1)
4413         {
4414             copy_rvec(state->x[k0], cm_new);
4415         }
4416         else
4417         {
4418             inv_ncg = 1.0/nrcg;
4419
4420             clear_rvec(cm_new);
4421             for (k = k0; (k < k1); k++)
4422             {
4423                 rvec_inc(cm_new, state->x[k]);
4424             }
4425             for (d = 0; (d < DIM); d++)
4426             {
4427                 cm_new[d] = inv_ncg*cm_new[d];
4428             }
4429         }
4430
4431         clear_ivec(dev);
4432         /* Do pbc and check DD cell boundary crossings */
4433         for (d = DIM-1; d >= 0; d--)
4434         {
4435             if (dd->nc[d] > 1)
4436             {
4437                 bScrew = (dd->bScrewPBC && d == XX);
4438                 /* Determine the location of this cg in lattice coordinates */
4439                 pos_d = cm_new[d];
4440                 if (tric_dir[d])
4441                 {
4442                     for (d2 = d+1; d2 < DIM; d2++)
4443                     {
4444                         pos_d += cm_new[d2]*tcm[d2][d];
4445                     }
4446                 }
4447                 /* Put the charge group in the triclinic unit-cell */
4448                 if (pos_d >= cell_x1[d])
4449                 {
4450                     if (pos_d >= limit1[d])
4451                     {
4452                         cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
4453                                       cg_cm[cg], cm_new, pos_d);
4454                     }
4455                     dev[d] = 1;
4456                     if (dd->ci[d] == dd->nc[d] - 1)
4457                     {
4458                         rvec_dec(cm_new, state->box[d]);
4459                         if (bScrew)
4460                         {
4461                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4462                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4463                         }
4464                         for (k = k0; (k < k1); k++)
4465                         {
4466                             rvec_dec(state->x[k], state->box[d]);
4467                             if (bScrew)
4468                             {
4469                                 rotate_state_atom(state, k);
4470                             }
4471                         }
4472                     }
4473                 }
4474                 else if (pos_d < cell_x0[d])
4475                 {
4476                     if (pos_d < limit0[d])
4477                     {
4478                         cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
4479                                       cg_cm[cg], cm_new, pos_d);
4480                     }
4481                     dev[d] = -1;
4482                     if (dd->ci[d] == 0)
4483                     {
4484                         rvec_inc(cm_new, state->box[d]);
4485                         if (bScrew)
4486                         {
4487                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4488                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4489                         }
4490                         for (k = k0; (k < k1); k++)
4491                         {
4492                             rvec_inc(state->x[k], state->box[d]);
4493                             if (bScrew)
4494                             {
4495                                 rotate_state_atom(state, k);
4496                             }
4497                         }
4498                     }
4499                 }
4500             }
4501             else if (d < npbcdim)
4502             {
4503                 /* Put the charge group in the rectangular unit-cell */
4504                 while (cm_new[d] >= state->box[d][d])
4505                 {
4506                     rvec_dec(cm_new, state->box[d]);
4507                     for (k = k0; (k < k1); k++)
4508                     {
4509                         rvec_dec(state->x[k], state->box[d]);
4510                     }
4511                 }
4512                 while (cm_new[d] < 0)
4513                 {
4514                     rvec_inc(cm_new, state->box[d]);
4515                     for (k = k0; (k < k1); k++)
4516                     {
4517                         rvec_inc(state->x[k], state->box[d]);
4518                     }
4519                 }
4520             }
4521         }
4522
4523         copy_rvec(cm_new, cg_cm[cg]);
4524
4525         /* Determine where this cg should go */
4526         flag = 0;
4527         mc   = -1;
4528         for (d = 0; d < dd->ndim; d++)
4529         {
4530             dim = dd->dim[d];
4531             if (dev[dim] == 1)
4532             {
4533                 flag |= DD_FLAG_FW(d);
4534                 if (mc == -1)
4535                 {
4536                     mc = d*2;
4537                 }
4538             }
4539             else if (dev[dim] == -1)
4540             {
4541                 flag |= DD_FLAG_BW(d);
4542                 if (mc == -1)
4543                 {
4544                     if (dd->nc[dim] > 2)
4545                     {
4546                         mc = d*2 + 1;
4547                     }
4548                     else
4549                     {
4550                         mc = d*2;
4551                     }
4552                 }
4553             }
4554         }
4555         /* Temporarily store the flag in move */
4556         move[cg] = mc + flag;
4557     }
4558 }
4559
4560 static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step,
4561                                gmx_domdec_t *dd, ivec tric_dir,
4562                                t_state *state, rvec **f,
4563                                t_forcerec *fr,
4564                                gmx_bool bCompact,
4565                                t_nrnb *nrnb,
4566                                int *ncg_stay_home,
4567                                int *ncg_moved)
4568 {
4569     int               *move;
4570     int                npbcdim;
4571     int                ncg[DIM*2], nat[DIM*2];
4572     int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4573     int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4574     int                sbuf[2], rbuf[2];
4575     int                home_pos_cg, home_pos_at, buf_pos;
4576     int                flag;
4577     gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4578     gmx_bool           bScrew;
4579     ivec               dev;
4580     real               inv_ncg, pos_d;
4581     matrix             tcm;
4582     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
4583     atom_id           *cgindex;
4584     cginfo_mb_t       *cginfo_mb;
4585     gmx_domdec_comm_t *comm;
4586     int               *moved;
4587     int                nthread, thread;
4588
4589     if (dd->bScrewPBC)
4590     {
4591         check_screw_box(state->box);
4592     }
4593
4594     comm  = dd->comm;
4595     if (fr->cutoff_scheme == ecutsGROUP)
4596     {
4597         cg_cm = fr->cg_cm;
4598     }
4599
4600     for (i = 0; i < estNR; i++)
4601     {
4602         if (EST_DISTR(i))
4603         {
4604             switch (i)
4605             {
4606                 case estX: /* Always present */ break;
4607                 case estV:   bV   = (state->flags & (1<<i)); break;
4608                 case estSDX: bSDX = (state->flags & (1<<i)); break;
4609                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4610                 case estLD_RNG:
4611                 case estLD_RNGI:
4612                 case estDISRE_INITF:
4613                 case estDISRE_RM3TAV:
4614                 case estORIRE_INITF:
4615                 case estORIRE_DTAV:
4616                     /* No processing required */
4617                     break;
4618                 default:
4619                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4620             }
4621         }
4622     }
4623
4624     if (dd->ncg_tot > comm->nalloc_int)
4625     {
4626         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4627         srenew(comm->buf_int, comm->nalloc_int);
4628     }
4629     move = comm->buf_int;
4630
4631     /* Clear the count */
4632     for (c = 0; c < dd->ndim*2; c++)
4633     {
4634         ncg[c] = 0;
4635         nat[c] = 0;
4636     }
4637
4638     npbcdim = dd->npbcdim;
4639
4640     for (d = 0; (d < DIM); d++)
4641     {
4642         limitd[d] = dd->comm->cellsize_min[d];
4643         if (d >= npbcdim && dd->ci[d] == 0)
4644         {
4645             cell_x0[d] = -GMX_FLOAT_MAX;
4646         }
4647         else
4648         {
4649             cell_x0[d] = comm->cell_x0[d];
4650         }
4651         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4652         {
4653             cell_x1[d] = GMX_FLOAT_MAX;
4654         }
4655         else
4656         {
4657             cell_x1[d] = comm->cell_x1[d];
4658         }
4659         if (d < npbcdim)
4660         {
4661             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4662             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4663         }
4664         else
4665         {
4666             /* We check after communication if a charge group moved
4667              * more than one cell. Set the pre-comm check limit to float_max.
4668              */
4669             limit0[d] = -GMX_FLOAT_MAX;
4670             limit1[d] =  GMX_FLOAT_MAX;
4671         }
4672     }
4673
4674     make_tric_corr_matrix(npbcdim, state->box, tcm);
4675
4676     cgindex = dd->cgindex;
4677
4678     nthread = gmx_omp_nthreads_get(emntDomdec);
4679
4680     /* Compute the center of geometry for all home charge groups
4681      * and put them in the box and determine where they should go.
4682      */
4683 #pragma omp parallel for num_threads(nthread) schedule(static)
4684     for (thread = 0; thread < nthread; thread++)
4685     {
4686         calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4687                      cell_x0, cell_x1, limitd, limit0, limit1,
4688                      cgindex,
4689                      ( thread   *dd->ncg_home)/nthread,
4690                      ((thread+1)*dd->ncg_home)/nthread,
4691                      fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4692                      move);
4693     }
4694
4695     for (cg = 0; cg < dd->ncg_home; cg++)
4696     {
4697         if (move[cg] >= 0)
4698         {
4699             mc       = move[cg];
4700             flag     = mc & ~DD_FLAG_NRCG;
4701             mc       = mc & DD_FLAG_NRCG;
4702             move[cg] = mc;
4703
4704             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4705             {
4706                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4707                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4708             }
4709             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4710             /* We store the cg size in the lower 16 bits
4711              * and the place where the charge group should go
4712              * in the next 6 bits. This saves some communication volume.
4713              */
4714             nrcg = cgindex[cg+1] - cgindex[cg];
4715             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4716             ncg[mc] += 1;
4717             nat[mc] += nrcg;
4718         }
4719     }
4720
4721     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4722     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4723
4724     *ncg_moved = 0;
4725     for (i = 0; i < dd->ndim*2; i++)
4726     {
4727         *ncg_moved += ncg[i];
4728     }
4729
4730     nvec = 1;
4731     if (bV)
4732     {
4733         nvec++;
4734     }
4735     if (bSDX)
4736     {
4737         nvec++;
4738     }
4739     if (bCGP)
4740     {
4741         nvec++;
4742     }
4743
4744     /* Make sure the communication buffers are large enough */
4745     for (mc = 0; mc < dd->ndim*2; mc++)
4746     {
4747         nvr = ncg[mc] + nat[mc]*nvec;
4748         if (nvr > comm->cgcm_state_nalloc[mc])
4749         {
4750             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4751             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4752         }
4753     }
4754
4755     switch (fr->cutoff_scheme)
4756     {
4757         case ecutsGROUP:
4758             /* Recalculating cg_cm might be cheaper than communicating,
4759              * but that could give rise to rounding issues.
4760              */
4761             home_pos_cg =
4762                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4763                                         nvec, cg_cm, comm, bCompact);
4764             break;
4765         case ecutsVERLET:
4766             /* Without charge groups we send the moved atom coordinates
4767              * over twice. This is so the code below can be used without
4768              * many conditionals for both for with and without charge groups.
4769              */
4770             home_pos_cg =
4771                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4772                                         nvec, state->x, comm, FALSE);
4773             if (bCompact)
4774             {
4775                 home_pos_cg -= *ncg_moved;
4776             }
4777             break;
4778         default:
4779             gmx_incons("unimplemented");
4780             home_pos_cg = 0;
4781     }
4782
4783     vec         = 0;
4784     home_pos_at =
4785         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4786                                 nvec, vec++, state->x, comm, bCompact);
4787     if (bV)
4788     {
4789         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4790                                 nvec, vec++, state->v, comm, bCompact);
4791     }
4792     if (bSDX)
4793     {
4794         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4795                                 nvec, vec++, state->sd_X, comm, bCompact);
4796     }
4797     if (bCGP)
4798     {
4799         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4800                                 nvec, vec++, state->cg_p, comm, bCompact);
4801     }
4802
4803     if (bCompact)
4804     {
4805         compact_ind(dd->ncg_home, move,
4806                     dd->index_gl, dd->cgindex, dd->gatindex,
4807                     dd->ga2la, comm->bLocalCG,
4808                     fr->cginfo);
4809     }
4810     else
4811     {
4812         if (fr->cutoff_scheme == ecutsVERLET)
4813         {
4814             moved = get_moved(comm, dd->ncg_home);
4815
4816             for (k = 0; k < dd->ncg_home; k++)
4817             {
4818                 moved[k] = 0;
4819             }
4820         }
4821         else
4822         {
4823             moved = fr->ns.grid->cell_index;
4824         }
4825
4826         clear_and_mark_ind(dd->ncg_home, move,
4827                            dd->index_gl, dd->cgindex, dd->gatindex,
4828                            dd->ga2la, comm->bLocalCG,
4829                            moved);
4830     }
4831
4832     cginfo_mb = fr->cginfo_mb;
4833
4834     *ncg_stay_home = home_pos_cg;
4835     for (d = 0; d < dd->ndim; d++)
4836     {
4837         dim      = dd->dim[d];
4838         ncg_recv = 0;
4839         nat_recv = 0;
4840         nvr      = 0;
4841         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4842         {
4843             cdd = d*2 + dir;
4844             /* Communicate the cg and atom counts */
4845             sbuf[0] = ncg[cdd];
4846             sbuf[1] = nat[cdd];
4847             if (debug)
4848             {
4849                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4850                         d, dir, sbuf[0], sbuf[1]);
4851             }
4852             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4853
4854             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4855             {
4856                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4857                 srenew(comm->buf_int, comm->nalloc_int);
4858             }
4859
4860             /* Communicate the charge group indices, sizes and flags */
4861             dd_sendrecv_int(dd, d, dir,
4862                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4863                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4864
4865             nvs = ncg[cdd] + nat[cdd]*nvec;
4866             i   = rbuf[0]  + rbuf[1] *nvec;
4867             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4868
4869             /* Communicate cgcm and state */
4870             dd_sendrecv_rvec(dd, d, dir,
4871                              comm->cgcm_state[cdd], nvs,
4872                              comm->vbuf.v+nvr, i);
4873             ncg_recv += rbuf[0];
4874             nat_recv += rbuf[1];
4875             nvr      += i;
4876         }
4877
4878         /* Process the received charge groups */
4879         buf_pos = 0;
4880         for (cg = 0; cg < ncg_recv; cg++)
4881         {
4882             flag = comm->buf_int[cg*DD_CGIBS+1];
4883
4884             if (dim >= npbcdim && dd->nc[dim] > 2)
4885             {
4886                 /* No pbc in this dim and more than one domain boundary.
4887                  * We do a separate check if a charge group didn't move too far.
4888                  */
4889                 if (((flag & DD_FLAG_FW(d)) &&
4890                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4891                     ((flag & DD_FLAG_BW(d)) &&
4892                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4893                 {
4894                     cg_move_error(fplog, dd, step, cg, dim,
4895                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4896                                   FALSE, 0,
4897                                   comm->vbuf.v[buf_pos],
4898                                   comm->vbuf.v[buf_pos],
4899                                   comm->vbuf.v[buf_pos][dim]);
4900                 }
4901             }
4902
4903             mc = -1;
4904             if (d < dd->ndim-1)
4905             {
4906                 /* Check which direction this cg should go */
4907                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4908                 {
4909                     if (dd->bGridJump)
4910                     {
4911                         /* The cell boundaries for dimension d2 are not equal
4912                          * for each cell row of the lower dimension(s),
4913                          * therefore we might need to redetermine where
4914                          * this cg should go.
4915                          */
4916                         dim2 = dd->dim[d2];
4917                         /* If this cg crosses the box boundary in dimension d2
4918                          * we can use the communicated flag, so we do not
4919                          * have to worry about pbc.
4920                          */
4921                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4922                                (flag & DD_FLAG_FW(d2))) ||
4923                               (dd->ci[dim2] == 0 &&
4924                                (flag & DD_FLAG_BW(d2)))))
4925                         {
4926                             /* Clear the two flags for this dimension */
4927                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4928                             /* Determine the location of this cg
4929                              * in lattice coordinates
4930                              */
4931                             pos_d = comm->vbuf.v[buf_pos][dim2];
4932                             if (tric_dir[dim2])
4933                             {
4934                                 for (d3 = dim2+1; d3 < DIM; d3++)
4935                                 {
4936                                     pos_d +=
4937                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4938                                 }
4939                             }
4940                             /* Check of we are not at the box edge.
4941                              * pbc is only handled in the first step above,
4942                              * but this check could move over pbc while
4943                              * the first step did not due to different rounding.
4944                              */
4945                             if (pos_d >= cell_x1[dim2] &&
4946                                 dd->ci[dim2] != dd->nc[dim2]-1)
4947                             {
4948                                 flag |= DD_FLAG_FW(d2);
4949                             }
4950                             else if (pos_d < cell_x0[dim2] &&
4951                                      dd->ci[dim2] != 0)
4952                             {
4953                                 flag |= DD_FLAG_BW(d2);
4954                             }
4955                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4956                         }
4957                     }
4958                     /* Set to which neighboring cell this cg should go */
4959                     if (flag & DD_FLAG_FW(d2))
4960                     {
4961                         mc = d2*2;
4962                     }
4963                     else if (flag & DD_FLAG_BW(d2))
4964                     {
4965                         if (dd->nc[dd->dim[d2]] > 2)
4966                         {
4967                             mc = d2*2+1;
4968                         }
4969                         else
4970                         {
4971                             mc = d2*2;
4972                         }
4973                     }
4974                 }
4975             }
4976
4977             nrcg = flag & DD_FLAG_NRCG;
4978             if (mc == -1)
4979             {
4980                 if (home_pos_cg+1 > dd->cg_nalloc)
4981                 {
4982                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4983                     srenew(dd->index_gl, dd->cg_nalloc);
4984                     srenew(dd->cgindex, dd->cg_nalloc+1);
4985                 }
4986                 /* Set the global charge group index and size */
4987                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
4988                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4989                 /* Copy the state from the buffer */
4990                 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
4991                 if (fr->cutoff_scheme == ecutsGROUP)
4992                 {
4993                     cg_cm = fr->cg_cm;
4994                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4995                 }
4996                 buf_pos++;
4997
4998                 /* Set the cginfo */
4999                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
5000                                                    dd->index_gl[home_pos_cg]);
5001                 if (comm->bLocalCG)
5002                 {
5003                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
5004                 }
5005
5006                 if (home_pos_at+nrcg > state->nalloc)
5007                 {
5008                     dd_realloc_state(state, f, home_pos_at+nrcg);
5009                 }
5010                 for (i = 0; i < nrcg; i++)
5011                 {
5012                     copy_rvec(comm->vbuf.v[buf_pos++],
5013                               state->x[home_pos_at+i]);
5014                 }
5015                 if (bV)
5016                 {
5017                     for (i = 0; i < nrcg; i++)
5018                     {
5019                         copy_rvec(comm->vbuf.v[buf_pos++],
5020                                   state->v[home_pos_at+i]);
5021                     }
5022                 }
5023                 if (bSDX)
5024                 {
5025                     for (i = 0; i < nrcg; i++)
5026                     {
5027                         copy_rvec(comm->vbuf.v[buf_pos++],
5028                                   state->sd_X[home_pos_at+i]);
5029                     }
5030                 }
5031                 if (bCGP)
5032                 {
5033                     for (i = 0; i < nrcg; i++)
5034                     {
5035                         copy_rvec(comm->vbuf.v[buf_pos++],
5036                                   state->cg_p[home_pos_at+i]);
5037                     }
5038                 }
5039                 home_pos_cg += 1;
5040                 home_pos_at += nrcg;
5041             }
5042             else
5043             {
5044                 /* Reallocate the buffers if necessary  */
5045                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
5046                 {
5047                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
5048                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
5049                 }
5050                 nvr = ncg[mc] + nat[mc]*nvec;
5051                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
5052                 {
5053                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
5054                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
5055                 }
5056                 /* Copy from the receive to the send buffers */
5057                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
5058                        comm->buf_int + cg*DD_CGIBS,
5059                        DD_CGIBS*sizeof(int));
5060                 memcpy(comm->cgcm_state[mc][nvr],
5061                        comm->vbuf.v[buf_pos],
5062                        (1+nrcg*nvec)*sizeof(rvec));
5063                 buf_pos += 1 + nrcg*nvec;
5064                 ncg[mc] += 1;
5065                 nat[mc] += nrcg;
5066             }
5067         }
5068     }
5069
5070     /* With sorting (!bCompact) the indices are now only partially up to date
5071      * and ncg_home and nat_home are not the real count, since there are
5072      * "holes" in the arrays for the charge groups that moved to neighbors.
5073      */
5074     if (fr->cutoff_scheme == ecutsVERLET)
5075     {
5076         moved = get_moved(comm, home_pos_cg);
5077
5078         for (i = dd->ncg_home; i < home_pos_cg; i++)
5079         {
5080             moved[i] = 0;
5081         }
5082     }
5083     dd->ncg_home = home_pos_cg;
5084     dd->nat_home = home_pos_at;
5085
5086     if (debug)
5087     {
5088         fprintf(debug,
5089                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5090                 *ncg_moved, dd->ncg_home-*ncg_moved);
5091
5092     }
5093 }
5094
5095 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
5096 {
5097     dd->comm->cycl[ddCycl] += cycles;
5098     dd->comm->cycl_n[ddCycl]++;
5099     if (cycles > dd->comm->cycl_max[ddCycl])
5100     {
5101         dd->comm->cycl_max[ddCycl] = cycles;
5102     }
5103 }
5104
5105 static double force_flop_count(t_nrnb *nrnb)
5106 {
5107     int         i;
5108     double      sum;
5109     const char *name;
5110
5111     sum = 0;
5112     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
5113     {
5114         /* To get closer to the real timings, we half the count
5115          * for the normal loops and again half it for water loops.
5116          */
5117         name = nrnb_str(i);
5118         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5119         {
5120             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5121         }
5122         else
5123         {
5124             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5125         }
5126     }
5127     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
5128     {
5129         name = nrnb_str(i);
5130         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5131         {
5132             sum += nrnb->n[i]*cost_nrnb(i);
5133         }
5134     }
5135     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
5136     {
5137         sum += nrnb->n[i]*cost_nrnb(i);
5138     }
5139
5140     return sum;
5141 }
5142
5143 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
5144 {
5145     if (dd->comm->eFlop)
5146     {
5147         dd->comm->flop -= force_flop_count(nrnb);
5148     }
5149 }
5150 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
5151 {
5152     if (dd->comm->eFlop)
5153     {
5154         dd->comm->flop += force_flop_count(nrnb);
5155         dd->comm->flop_n++;
5156     }
5157 }
5158
5159 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5160 {
5161     int i;
5162
5163     for (i = 0; i < ddCyclNr; i++)
5164     {
5165         dd->comm->cycl[i]     = 0;
5166         dd->comm->cycl_n[i]   = 0;
5167         dd->comm->cycl_max[i] = 0;
5168     }
5169     dd->comm->flop   = 0;
5170     dd->comm->flop_n = 0;
5171 }
5172
5173 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
5174 {
5175     gmx_domdec_comm_t *comm;
5176     gmx_domdec_load_t *load;
5177     gmx_domdec_root_t *root = NULL;
5178     int                d, dim, cid, i, pos;
5179     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
5180     gmx_bool           bSepPME;
5181
5182     if (debug)
5183     {
5184         fprintf(debug, "get_load_distribution start\n");
5185     }
5186
5187     wallcycle_start(wcycle, ewcDDCOMMLOAD);
5188
5189     comm = dd->comm;
5190
5191     bSepPME = (dd->pme_nodeid >= 0);
5192
5193     for (d = dd->ndim-1; d >= 0; d--)
5194     {
5195         dim = dd->dim[d];
5196         /* Check if we participate in the communication in this dimension */
5197         if (d == dd->ndim-1 ||
5198             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5199         {
5200             load = &comm->load[d];
5201             if (dd->bGridJump)
5202             {
5203                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5204             }
5205             pos = 0;
5206             if (d == dd->ndim-1)
5207             {
5208                 sbuf[pos++] = dd_force_load(comm);
5209                 sbuf[pos++] = sbuf[0];
5210                 if (dd->bGridJump)
5211                 {
5212                     sbuf[pos++] = sbuf[0];
5213                     sbuf[pos++] = cell_frac;
5214                     if (d > 0)
5215                     {
5216                         sbuf[pos++] = comm->cell_f_max0[d];
5217                         sbuf[pos++] = comm->cell_f_min1[d];
5218                     }
5219                 }
5220                 if (bSepPME)
5221                 {
5222                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5223                     sbuf[pos++] = comm->cycl[ddCyclPME];
5224                 }
5225             }
5226             else
5227             {
5228                 sbuf[pos++] = comm->load[d+1].sum;
5229                 sbuf[pos++] = comm->load[d+1].max;
5230                 if (dd->bGridJump)
5231                 {
5232                     sbuf[pos++] = comm->load[d+1].sum_m;
5233                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5234                     sbuf[pos++] = comm->load[d+1].flags;
5235                     if (d > 0)
5236                     {
5237                         sbuf[pos++] = comm->cell_f_max0[d];
5238                         sbuf[pos++] = comm->cell_f_min1[d];
5239                     }
5240                 }
5241                 if (bSepPME)
5242                 {
5243                     sbuf[pos++] = comm->load[d+1].mdf;
5244                     sbuf[pos++] = comm->load[d+1].pme;
5245                 }
5246             }
5247             load->nload = pos;
5248             /* Communicate a row in DD direction d.
5249              * The communicators are setup such that the root always has rank 0.
5250              */
5251 #ifdef GMX_MPI
5252             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5253                        load->load, load->nload*sizeof(float), MPI_BYTE,
5254                        0, comm->mpi_comm_load[d]);
5255 #endif
5256             if (dd->ci[dim] == dd->master_ci[dim])
5257             {
5258                 /* We are the root, process this row */
5259                 if (comm->bDynLoadBal)
5260                 {
5261                     root = comm->root[d];
5262                 }
5263                 load->sum      = 0;
5264                 load->max      = 0;
5265                 load->sum_m    = 0;
5266                 load->cvol_min = 1;
5267                 load->flags    = 0;
5268                 load->mdf      = 0;
5269                 load->pme      = 0;
5270                 pos            = 0;
5271                 for (i = 0; i < dd->nc[dim]; i++)
5272                 {
5273                     load->sum += load->load[pos++];
5274                     load->max  = max(load->max, load->load[pos]);
5275                     pos++;
5276                     if (dd->bGridJump)
5277                     {
5278                         if (root->bLimited)
5279                         {
5280                             /* This direction could not be load balanced properly,
5281                              * therefore we need to use the maximum iso the average load.
5282                              */
5283                             load->sum_m = max(load->sum_m, load->load[pos]);
5284                         }
5285                         else
5286                         {
5287                             load->sum_m += load->load[pos];
5288                         }
5289                         pos++;
5290                         load->cvol_min = min(load->cvol_min, load->load[pos]);
5291                         pos++;
5292                         if (d < dd->ndim-1)
5293                         {
5294                             load->flags = (int)(load->load[pos++] + 0.5);
5295                         }
5296                         if (d > 0)
5297                         {
5298                             root->cell_f_max0[i] = load->load[pos++];
5299                             root->cell_f_min1[i] = load->load[pos++];
5300                         }
5301                     }
5302                     if (bSepPME)
5303                     {
5304                         load->mdf = max(load->mdf, load->load[pos]);
5305                         pos++;
5306                         load->pme = max(load->pme, load->load[pos]);
5307                         pos++;
5308                     }
5309                 }
5310                 if (comm->bDynLoadBal && root->bLimited)
5311                 {
5312                     load->sum_m *= dd->nc[dim];
5313                     load->flags |= (1<<d);
5314                 }
5315             }
5316         }
5317     }
5318
5319     if (DDMASTER(dd))
5320     {
5321         comm->nload      += dd_load_count(comm);
5322         comm->load_step  += comm->cycl[ddCyclStep];
5323         comm->load_sum   += comm->load[0].sum;
5324         comm->load_max   += comm->load[0].max;
5325         if (comm->bDynLoadBal)
5326         {
5327             for (d = 0; d < dd->ndim; d++)
5328             {
5329                 if (comm->load[0].flags & (1<<d))
5330                 {
5331                     comm->load_lim[d]++;
5332                 }
5333             }
5334         }
5335         if (bSepPME)
5336         {
5337             comm->load_mdf += comm->load[0].mdf;
5338             comm->load_pme += comm->load[0].pme;
5339         }
5340     }
5341
5342     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5343
5344     if (debug)
5345     {
5346         fprintf(debug, "get_load_distribution finished\n");
5347     }
5348 }
5349
5350 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5351 {
5352     /* Return the relative performance loss on the total run time
5353      * due to the force calculation load imbalance.
5354      */
5355     if (dd->comm->nload > 0)
5356     {
5357         return
5358             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5359             (dd->comm->load_step*dd->nnodes);
5360     }
5361     else
5362     {
5363         return 0;
5364     }
5365 }
5366
5367 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5368 {
5369     char               buf[STRLEN];
5370     int                npp, npme, nnodes, d, limp;
5371     float              imbal, pme_f_ratio, lossf, lossp = 0;
5372     gmx_bool           bLim;
5373     gmx_domdec_comm_t *comm;
5374
5375     comm = dd->comm;
5376     if (DDMASTER(dd) && comm->nload > 0)
5377     {
5378         npp    = dd->nnodes;
5379         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5380         nnodes = npp + npme;
5381         imbal  = comm->load_max*npp/comm->load_sum - 1;
5382         lossf  = dd_force_imb_perf_loss(dd);
5383         sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5384         fprintf(fplog, "%s", buf);
5385         fprintf(stderr, "\n");
5386         fprintf(stderr, "%s", buf);
5387         sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5388         fprintf(fplog, "%s", buf);
5389         fprintf(stderr, "%s", buf);
5390         bLim = FALSE;
5391         if (comm->bDynLoadBal)
5392         {
5393             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5394             for (d = 0; d < dd->ndim; d++)
5395             {
5396                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5397                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5398                 if (limp >= 50)
5399                 {
5400                     bLim = TRUE;
5401                 }
5402             }
5403             sprintf(buf+strlen(buf), "\n");
5404             fprintf(fplog, "%s", buf);
5405             fprintf(stderr, "%s", buf);
5406         }
5407         if (npme > 0)
5408         {
5409             pme_f_ratio = comm->load_pme/comm->load_mdf;
5410             lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
5411             if (lossp <= 0)
5412             {
5413                 lossp *= (float)npme/(float)nnodes;
5414             }
5415             else
5416             {
5417                 lossp *= (float)npp/(float)nnodes;
5418             }
5419             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5420             fprintf(fplog, "%s", buf);
5421             fprintf(stderr, "%s", buf);
5422             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5423             fprintf(fplog, "%s", buf);
5424             fprintf(stderr, "%s", buf);
5425         }
5426         fprintf(fplog, "\n");
5427         fprintf(stderr, "\n");
5428
5429         if (lossf >= DD_PERF_LOSS)
5430         {
5431             sprintf(buf,
5432                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5433                     "      in the domain decomposition.\n", lossf*100);
5434             if (!comm->bDynLoadBal)
5435             {
5436                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5437             }
5438             else if (bLim)
5439             {
5440                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5441             }
5442             fprintf(fplog, "%s\n", buf);
5443             fprintf(stderr, "%s\n", buf);
5444         }
5445         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5446         {
5447             sprintf(buf,
5448                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5449                     "      had %s work to do than the PP nodes.\n"
5450                     "      You might want to %s the number of PME nodes\n"
5451                     "      or %s the cut-off and the grid spacing.\n",
5452                     fabs(lossp*100),
5453                     (lossp < 0) ? "less"     : "more",
5454                     (lossp < 0) ? "decrease" : "increase",
5455                     (lossp < 0) ? "decrease" : "increase");
5456             fprintf(fplog, "%s\n", buf);
5457             fprintf(stderr, "%s\n", buf);
5458         }
5459     }
5460 }
5461
5462 static float dd_vol_min(gmx_domdec_t *dd)
5463 {
5464     return dd->comm->load[0].cvol_min*dd->nnodes;
5465 }
5466
5467 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5468 {
5469     return dd->comm->load[0].flags;
5470 }
5471
5472 static float dd_f_imbal(gmx_domdec_t *dd)
5473 {
5474     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5475 }
5476
5477 float dd_pme_f_ratio(gmx_domdec_t *dd)
5478 {
5479     if (dd->comm->cycl_n[ddCyclPME] > 0)
5480     {
5481         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5482     }
5483     else
5484     {
5485         return -1.0;
5486     }
5487 }
5488
5489 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step)
5490 {
5491     int  flags, d;
5492     char buf[22];
5493
5494     flags = dd_load_flags(dd);
5495     if (flags)
5496     {
5497         fprintf(fplog,
5498                 "DD  load balancing is limited by minimum cell size in dimension");
5499         for (d = 0; d < dd->ndim; d++)
5500         {
5501             if (flags & (1<<d))
5502             {
5503                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5504             }
5505         }
5506         fprintf(fplog, "\n");
5507     }
5508     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5509     if (dd->comm->bDynLoadBal)
5510     {
5511         fprintf(fplog, "  vol min/aver %5.3f%c",
5512                 dd_vol_min(dd), flags ? '!' : ' ');
5513     }
5514     fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5515     if (dd->comm->cycl_n[ddCyclPME])
5516     {
5517         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5518     }
5519     fprintf(fplog, "\n\n");
5520 }
5521
5522 static void dd_print_load_verbose(gmx_domdec_t *dd)
5523 {
5524     if (dd->comm->bDynLoadBal)
5525     {
5526         fprintf(stderr, "vol %4.2f%c ",
5527                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5528     }
5529     fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5530     if (dd->comm->cycl_n[ddCyclPME])
5531     {
5532         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5533     }
5534 }
5535
5536 #ifdef GMX_MPI
5537 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5538 {
5539     MPI_Comm           c_row;
5540     int                dim, i, rank;
5541     ivec               loc_c;
5542     gmx_domdec_root_t *root;
5543     gmx_bool           bPartOfGroup = FALSE;
5544
5545     dim = dd->dim[dim_ind];
5546     copy_ivec(loc, loc_c);
5547     for (i = 0; i < dd->nc[dim]; i++)
5548     {
5549         loc_c[dim] = i;
5550         rank       = dd_index(dd->nc, loc_c);
5551         if (rank == dd->rank)
5552         {
5553             /* This process is part of the group */
5554             bPartOfGroup = TRUE;
5555         }
5556     }
5557     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5558                    &c_row);
5559     if (bPartOfGroup)
5560     {
5561         dd->comm->mpi_comm_load[dim_ind] = c_row;
5562         if (dd->comm->eDLB != edlbNO)
5563         {
5564             if (dd->ci[dim] == dd->master_ci[dim])
5565             {
5566                 /* This is the root process of this row */
5567                 snew(dd->comm->root[dim_ind], 1);
5568                 root = dd->comm->root[dim_ind];
5569                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5570                 snew(root->old_cell_f, dd->nc[dim]+1);
5571                 snew(root->bCellMin, dd->nc[dim]);
5572                 if (dim_ind > 0)
5573                 {
5574                     snew(root->cell_f_max0, dd->nc[dim]);
5575                     snew(root->cell_f_min1, dd->nc[dim]);
5576                     snew(root->bound_min, dd->nc[dim]);
5577                     snew(root->bound_max, dd->nc[dim]);
5578                 }
5579                 snew(root->buf_ncd, dd->nc[dim]);
5580             }
5581             else
5582             {
5583                 /* This is not a root process, we only need to receive cell_f */
5584                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5585             }
5586         }
5587         if (dd->ci[dim] == dd->master_ci[dim])
5588         {
5589             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5590         }
5591     }
5592 }
5593 #endif
5594
5595 static void make_load_communicators(gmx_domdec_t *dd)
5596 {
5597 #ifdef GMX_MPI
5598     int  dim0, dim1, i, j;
5599     ivec loc;
5600
5601     if (debug)
5602     {
5603         fprintf(debug, "Making load communicators\n");
5604     }
5605
5606     snew(dd->comm->load, dd->ndim);
5607     snew(dd->comm->mpi_comm_load, dd->ndim);
5608
5609     clear_ivec(loc);
5610     make_load_communicator(dd, 0, loc);
5611     if (dd->ndim > 1)
5612     {
5613         dim0 = dd->dim[0];
5614         for (i = 0; i < dd->nc[dim0]; i++)
5615         {
5616             loc[dim0] = i;
5617             make_load_communicator(dd, 1, loc);
5618         }
5619     }
5620     if (dd->ndim > 2)
5621     {
5622         dim0 = dd->dim[0];
5623         for (i = 0; i < dd->nc[dim0]; i++)
5624         {
5625             loc[dim0] = i;
5626             dim1      = dd->dim[1];
5627             for (j = 0; j < dd->nc[dim1]; j++)
5628             {
5629                 loc[dim1] = j;
5630                 make_load_communicator(dd, 2, loc);
5631             }
5632         }
5633     }
5634
5635     if (debug)
5636     {
5637         fprintf(debug, "Finished making load communicators\n");
5638     }
5639 #endif
5640 }
5641
5642 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5643 {
5644     gmx_bool                bZYX;
5645     int                     d, dim, i, j, m;
5646     ivec                    tmp, s;
5647     int                     nzone, nzonep;
5648     ivec                    dd_zp[DD_MAXIZONE];
5649     gmx_domdec_zones_t     *zones;
5650     gmx_domdec_ns_ranges_t *izone;
5651
5652     for (d = 0; d < dd->ndim; d++)
5653     {
5654         dim = dd->dim[d];
5655         copy_ivec(dd->ci, tmp);
5656         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5657         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5658         copy_ivec(dd->ci, tmp);
5659         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5660         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5661         if (debug)
5662         {
5663             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5664                     dd->rank, dim,
5665                     dd->neighbor[d][0],
5666                     dd->neighbor[d][1]);
5667         }
5668     }
5669
5670     if (fplog)
5671     {
5672         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5673                 dd->ndim,
5674                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5675                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5676     }
5677     switch (dd->ndim)
5678     {
5679         case 3:
5680             nzone  = dd_z3n;
5681             nzonep = dd_zp3n;
5682             for (i = 0; i < nzonep; i++)
5683             {
5684                 copy_ivec(dd_zp3[i], dd_zp[i]);
5685             }
5686             break;
5687         case 2:
5688             nzone  = dd_z2n;
5689             nzonep = dd_zp2n;
5690             for (i = 0; i < nzonep; i++)
5691             {
5692                 copy_ivec(dd_zp2[i], dd_zp[i]);
5693             }
5694             break;
5695         case 1:
5696             nzone  = dd_z1n;
5697             nzonep = dd_zp1n;
5698             for (i = 0; i < nzonep; i++)
5699             {
5700                 copy_ivec(dd_zp1[i], dd_zp[i]);
5701             }
5702             break;
5703         default:
5704             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5705             nzone  = 0;
5706             nzonep = 0;
5707     }
5708
5709     zones = &dd->comm->zones;
5710
5711     for (i = 0; i < nzone; i++)
5712     {
5713         m = 0;
5714         clear_ivec(zones->shift[i]);
5715         for (d = 0; d < dd->ndim; d++)
5716         {
5717             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5718         }
5719     }
5720
5721     zones->n = nzone;
5722     for (i = 0; i < nzone; i++)
5723     {
5724         for (d = 0; d < DIM; d++)
5725         {
5726             s[d] = dd->ci[d] - zones->shift[i][d];
5727             if (s[d] < 0)
5728             {
5729                 s[d] += dd->nc[d];
5730             }
5731             else if (s[d] >= dd->nc[d])
5732             {
5733                 s[d] -= dd->nc[d];
5734             }
5735         }
5736     }
5737     zones->nizone = nzonep;
5738     for (i = 0; i < zones->nizone; i++)
5739     {
5740         if (dd_zp[i][0] != i)
5741         {
5742             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5743         }
5744         izone     = &zones->izone[i];
5745         izone->j0 = dd_zp[i][1];
5746         izone->j1 = dd_zp[i][2];
5747         for (dim = 0; dim < DIM; dim++)
5748         {
5749             if (dd->nc[dim] == 1)
5750             {
5751                 /* All shifts should be allowed */
5752                 izone->shift0[dim] = -1;
5753                 izone->shift1[dim] = 1;
5754             }
5755             else
5756             {
5757                 /*
5758                    izone->shift0[d] = 0;
5759                    izone->shift1[d] = 0;
5760                    for(j=izone->j0; j<izone->j1; j++) {
5761                    if (dd->shift[j][d] > dd->shift[i][d])
5762                    izone->shift0[d] = -1;
5763                    if (dd->shift[j][d] < dd->shift[i][d])
5764                    izone->shift1[d] = 1;
5765                    }
5766                  */
5767
5768                 int shift_diff;
5769
5770                 /* Assume the shift are not more than 1 cell */
5771                 izone->shift0[dim] = 1;
5772                 izone->shift1[dim] = -1;
5773                 for (j = izone->j0; j < izone->j1; j++)
5774                 {
5775                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5776                     if (shift_diff < izone->shift0[dim])
5777                     {
5778                         izone->shift0[dim] = shift_diff;
5779                     }
5780                     if (shift_diff > izone->shift1[dim])
5781                     {
5782                         izone->shift1[dim] = shift_diff;
5783                     }
5784                 }
5785             }
5786         }
5787     }
5788
5789     if (dd->comm->eDLB != edlbNO)
5790     {
5791         snew(dd->comm->root, dd->ndim);
5792     }
5793
5794     if (dd->comm->bRecordLoad)
5795     {
5796         make_load_communicators(dd);
5797     }
5798 }
5799
5800 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int reorder)
5801 {
5802     gmx_domdec_t      *dd;
5803     gmx_domdec_comm_t *comm;
5804     int                i, rank, *buf;
5805     ivec               periods;
5806 #ifdef GMX_MPI
5807     MPI_Comm           comm_cart;
5808 #endif
5809
5810     dd   = cr->dd;
5811     comm = dd->comm;
5812
5813 #ifdef GMX_MPI
5814     if (comm->bCartesianPP)
5815     {
5816         /* Set up cartesian communication for the particle-particle part */
5817         if (fplog)
5818         {
5819             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5820                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5821         }
5822
5823         for (i = 0; i < DIM; i++)
5824         {
5825             periods[i] = TRUE;
5826         }
5827         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5828                         &comm_cart);
5829         /* We overwrite the old communicator with the new cartesian one */
5830         cr->mpi_comm_mygroup = comm_cart;
5831     }
5832
5833     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5834     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5835
5836     if (comm->bCartesianPP_PME)
5837     {
5838         /* Since we want to use the original cartesian setup for sim,
5839          * and not the one after split, we need to make an index.
5840          */
5841         snew(comm->ddindex2ddnodeid, dd->nnodes);
5842         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5843         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5844         /* Get the rank of the DD master,
5845          * above we made sure that the master node is a PP node.
5846          */
5847         if (MASTER(cr))
5848         {
5849             rank = dd->rank;
5850         }
5851         else
5852         {
5853             rank = 0;
5854         }
5855         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5856     }
5857     else if (comm->bCartesianPP)
5858     {
5859         if (cr->npmenodes == 0)
5860         {
5861             /* The PP communicator is also
5862              * the communicator for this simulation
5863              */
5864             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5865         }
5866         cr->nodeid = dd->rank;
5867
5868         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5869
5870         /* We need to make an index to go from the coordinates
5871          * to the nodeid of this simulation.
5872          */
5873         snew(comm->ddindex2simnodeid, dd->nnodes);
5874         snew(buf, dd->nnodes);
5875         if (cr->duty & DUTY_PP)
5876         {
5877             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5878         }
5879         /* Communicate the ddindex to simulation nodeid index */
5880         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5881                       cr->mpi_comm_mysim);
5882         sfree(buf);
5883
5884         /* Determine the master coordinates and rank.
5885          * The DD master should be the same node as the master of this sim.
5886          */
5887         for (i = 0; i < dd->nnodes; i++)
5888         {
5889             if (comm->ddindex2simnodeid[i] == 0)
5890             {
5891                 ddindex2xyz(dd->nc, i, dd->master_ci);
5892                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5893             }
5894         }
5895         if (debug)
5896         {
5897             fprintf(debug, "The master rank is %d\n", dd->masterrank);
5898         }
5899     }
5900     else
5901     {
5902         /* No Cartesian communicators */
5903         /* We use the rank in dd->comm->all as DD index */
5904         ddindex2xyz(dd->nc, dd->rank, dd->ci);
5905         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5906         dd->masterrank = 0;
5907         clear_ivec(dd->master_ci);
5908     }
5909 #endif
5910
5911     if (fplog)
5912     {
5913         fprintf(fplog,
5914                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5915                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5916     }
5917     if (debug)
5918     {
5919         fprintf(debug,
5920                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5921                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5922     }
5923 }
5924
5925 static void receive_ddindex2simnodeid(t_commrec *cr)
5926 {
5927     gmx_domdec_t      *dd;
5928
5929     gmx_domdec_comm_t *comm;
5930     int               *buf;
5931
5932     dd   = cr->dd;
5933     comm = dd->comm;
5934
5935 #ifdef GMX_MPI
5936     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5937     {
5938         snew(comm->ddindex2simnodeid, dd->nnodes);
5939         snew(buf, dd->nnodes);
5940         if (cr->duty & DUTY_PP)
5941         {
5942             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5943         }
5944 #ifdef GMX_MPI
5945         /* Communicate the ddindex to simulation nodeid index */
5946         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5947                       cr->mpi_comm_mysim);
5948 #endif
5949         sfree(buf);
5950     }
5951 #endif
5952 }
5953
5954 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5955                                                      int ncg, int natoms)
5956 {
5957     gmx_domdec_master_t *ma;
5958     int                  i;
5959
5960     snew(ma, 1);
5961
5962     snew(ma->ncg, dd->nnodes);
5963     snew(ma->index, dd->nnodes+1);
5964     snew(ma->cg, ncg);
5965     snew(ma->nat, dd->nnodes);
5966     snew(ma->ibuf, dd->nnodes*2);
5967     snew(ma->cell_x, DIM);
5968     for (i = 0; i < DIM; i++)
5969     {
5970         snew(ma->cell_x[i], dd->nc[i]+1);
5971     }
5972
5973     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5974     {
5975         ma->vbuf = NULL;
5976     }
5977     else
5978     {
5979         snew(ma->vbuf, natoms);
5980     }
5981
5982     return ma;
5983 }
5984
5985 static void split_communicator(FILE *fplog, t_commrec *cr, int dd_node_order,
5986                                int reorder)
5987 {
5988     gmx_domdec_t      *dd;
5989     gmx_domdec_comm_t *comm;
5990     int                i, rank;
5991     gmx_bool           bDiv[DIM];
5992     ivec               periods;
5993 #ifdef GMX_MPI
5994     MPI_Comm           comm_cart;
5995 #endif
5996
5997     dd   = cr->dd;
5998     comm = dd->comm;
5999
6000     if (comm->bCartesianPP)
6001     {
6002         for (i = 1; i < DIM; i++)
6003         {
6004             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
6005         }
6006         if (bDiv[YY] || bDiv[ZZ])
6007         {
6008             comm->bCartesianPP_PME = TRUE;
6009             /* If we have 2D PME decomposition, which is always in x+y,
6010              * we stack the PME only nodes in z.
6011              * Otherwise we choose the direction that provides the thinnest slab
6012              * of PME only nodes as this will have the least effect
6013              * on the PP communication.
6014              * But for the PME communication the opposite might be better.
6015              */
6016             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
6017                              !bDiv[YY] ||
6018                              dd->nc[YY] > dd->nc[ZZ]))
6019             {
6020                 comm->cartpmedim = ZZ;
6021             }
6022             else
6023             {
6024                 comm->cartpmedim = YY;
6025             }
6026             comm->ntot[comm->cartpmedim]
6027                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
6028         }
6029         else if (fplog)
6030         {
6031             fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
6032             fprintf(fplog,
6033                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
6034         }
6035     }
6036
6037 #ifdef GMX_MPI
6038     if (comm->bCartesianPP_PME)
6039     {
6040         if (fplog)
6041         {
6042             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
6043         }
6044
6045         for (i = 0; i < DIM; i++)
6046         {
6047             periods[i] = TRUE;
6048         }
6049         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
6050                         &comm_cart);
6051
6052         MPI_Comm_rank(comm_cart, &rank);
6053         if (MASTERNODE(cr) && rank != 0)
6054         {
6055             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
6056         }
6057
6058         /* With this assigment we loose the link to the original communicator
6059          * which will usually be MPI_COMM_WORLD, unless have multisim.
6060          */
6061         cr->mpi_comm_mysim = comm_cart;
6062         cr->sim_nodeid     = rank;
6063
6064         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
6065
6066         if (fplog)
6067         {
6068             fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
6069                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6070         }
6071
6072         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
6073         {
6074             cr->duty = DUTY_PP;
6075         }
6076         if (cr->npmenodes == 0 ||
6077             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
6078         {
6079             cr->duty = DUTY_PME;
6080         }
6081
6082         /* Split the sim communicator into PP and PME only nodes */
6083         MPI_Comm_split(cr->mpi_comm_mysim,
6084                        cr->duty,
6085                        dd_index(comm->ntot, dd->ci),
6086                        &cr->mpi_comm_mygroup);
6087     }
6088     else
6089     {
6090         switch (dd_node_order)
6091         {
6092             case ddnoPP_PME:
6093                 if (fplog)
6094                 {
6095                     fprintf(fplog, "Order of the nodes: PP first, PME last\n");
6096                 }
6097                 break;
6098             case ddnoINTERLEAVE:
6099                 /* Interleave the PP-only and PME-only nodes,
6100                  * as on clusters with dual-core machines this will double
6101                  * the communication bandwidth of the PME processes
6102                  * and thus speed up the PP <-> PME and inter PME communication.
6103                  */
6104                 if (fplog)
6105                 {
6106                     fprintf(fplog, "Interleaving PP and PME nodes\n");
6107                 }
6108                 comm->pmenodes = dd_pmenodes(cr);
6109                 break;
6110             case ddnoCARTESIAN:
6111                 break;
6112             default:
6113                 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
6114         }
6115
6116         if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6117         {
6118             cr->duty = DUTY_PME;
6119         }
6120         else
6121         {
6122             cr->duty = DUTY_PP;
6123         }
6124
6125         /* Split the sim communicator into PP and PME only nodes */
6126         MPI_Comm_split(cr->mpi_comm_mysim,
6127                        cr->duty,
6128                        cr->nodeid,
6129                        &cr->mpi_comm_mygroup);
6130         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6131     }
6132 #endif
6133
6134     if (fplog)
6135     {
6136         fprintf(fplog, "This is a %s only node\n\n",
6137                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6138     }
6139 }
6140
6141 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6142 {
6143     gmx_domdec_t      *dd;
6144     gmx_domdec_comm_t *comm;
6145     int                CartReorder;
6146
6147     dd   = cr->dd;
6148     comm = dd->comm;
6149
6150     copy_ivec(dd->nc, comm->ntot);
6151
6152     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6153     comm->bCartesianPP_PME = FALSE;
6154
6155     /* Reorder the nodes by default. This might change the MPI ranks.
6156      * Real reordering is only supported on very few architectures,
6157      * Blue Gene is one of them.
6158      */
6159     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6160
6161     if (cr->npmenodes > 0)
6162     {
6163         /* Split the communicator into a PP and PME part */
6164         split_communicator(fplog, cr, dd_node_order, CartReorder);
6165         if (comm->bCartesianPP_PME)
6166         {
6167             /* We (possibly) reordered the nodes in split_communicator,
6168              * so it is no longer required in make_pp_communicator.
6169              */
6170             CartReorder = FALSE;
6171         }
6172     }
6173     else
6174     {
6175         /* All nodes do PP and PME */
6176 #ifdef GMX_MPI
6177         /* We do not require separate communicators */
6178         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6179 #endif
6180     }
6181
6182     if (cr->duty & DUTY_PP)
6183     {
6184         /* Copy or make a new PP communicator */
6185         make_pp_communicator(fplog, cr, CartReorder);
6186     }
6187     else
6188     {
6189         receive_ddindex2simnodeid(cr);
6190     }
6191
6192     if (!(cr->duty & DUTY_PME))
6193     {
6194         /* Set up the commnuication to our PME node */
6195         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6196         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6197         if (debug)
6198         {
6199             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6200                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6201         }
6202     }
6203     else
6204     {
6205         dd->pme_nodeid = -1;
6206     }
6207
6208     if (DDMASTER(dd))
6209     {
6210         dd->ma = init_gmx_domdec_master_t(dd,
6211                                           comm->cgs_gl.nr,
6212                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6213     }
6214 }
6215
6216 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6217 {
6218     real  *slb_frac, tot;
6219     int    i, n;
6220     double dbl;
6221
6222     slb_frac = NULL;
6223     if (nc > 1 && size_string != NULL)
6224     {
6225         if (fplog)
6226         {
6227             fprintf(fplog, "Using static load balancing for the %s direction\n",
6228                     dir);
6229         }
6230         snew(slb_frac, nc);
6231         tot = 0;
6232         for (i = 0; i < nc; i++)
6233         {
6234             dbl = 0;
6235             sscanf(size_string, "%lf%n", &dbl, &n);
6236             if (dbl == 0)
6237             {
6238                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6239             }
6240             slb_frac[i]  = dbl;
6241             size_string += n;
6242             tot         += slb_frac[i];
6243         }
6244         /* Normalize */
6245         if (fplog)
6246         {
6247             fprintf(fplog, "Relative cell sizes:");
6248         }
6249         for (i = 0; i < nc; i++)
6250         {
6251             slb_frac[i] /= tot;
6252             if (fplog)
6253             {
6254                 fprintf(fplog, " %5.3f", slb_frac[i]);
6255             }
6256         }
6257         if (fplog)
6258         {
6259             fprintf(fplog, "\n");
6260         }
6261     }
6262
6263     return slb_frac;
6264 }
6265
6266 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6267 {
6268     int                  n, nmol, ftype;
6269     gmx_mtop_ilistloop_t iloop;
6270     t_ilist             *il;
6271
6272     n     = 0;
6273     iloop = gmx_mtop_ilistloop_init(mtop);
6274     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6275     {
6276         for (ftype = 0; ftype < F_NRE; ftype++)
6277         {
6278             if ((interaction_function[ftype].flags & IF_BOND) &&
6279                 NRAL(ftype) >  2)
6280             {
6281                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6282             }
6283         }
6284     }
6285
6286     return n;
6287 }
6288
6289 static int dd_nst_env(FILE *fplog, const char *env_var, int def)
6290 {
6291     char *val;
6292     int   nst;
6293
6294     nst = def;
6295     val = getenv(env_var);
6296     if (val)
6297     {
6298         if (sscanf(val, "%d", &nst) <= 0)
6299         {
6300             nst = 1;
6301         }
6302         if (fplog)
6303         {
6304             fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6305                     env_var, val, nst);
6306         }
6307     }
6308
6309     return nst;
6310 }
6311
6312 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6313 {
6314     if (MASTER(cr))
6315     {
6316         fprintf(stderr, "\n%s\n", warn_string);
6317     }
6318     if (fplog)
6319     {
6320         fprintf(fplog, "\n%s\n", warn_string);
6321     }
6322 }
6323
6324 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6325                                   t_inputrec *ir, FILE *fplog)
6326 {
6327     if (ir->ePBC == epbcSCREW &&
6328         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6329     {
6330         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6331     }
6332
6333     if (ir->ns_type == ensSIMPLE)
6334     {
6335         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6336     }
6337
6338     if (ir->nstlist == 0)
6339     {
6340         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6341     }
6342
6343     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6344     {
6345         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6346     }
6347 }
6348
6349 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6350 {
6351     int  di, d;
6352     real r;
6353
6354     r = ddbox->box_size[XX];
6355     for (di = 0; di < dd->ndim; di++)
6356     {
6357         d = dd->dim[di];
6358         /* Check using the initial average cell size */
6359         r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6360     }
6361
6362     return r;
6363 }
6364
6365 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6366                              const char *dlb_opt, gmx_bool bRecordLoad,
6367                              unsigned long Flags, t_inputrec *ir)
6368 {
6369     gmx_domdec_t *dd;
6370     int           eDLB = -1;
6371     char          buf[STRLEN];
6372
6373     switch (dlb_opt[0])
6374     {
6375         case 'a': eDLB = edlbAUTO; break;
6376         case 'n': eDLB = edlbNO;   break;
6377         case 'y': eDLB = edlbYES;  break;
6378         default: gmx_incons("Unknown dlb_opt");
6379     }
6380
6381     if (Flags & MD_RERUN)
6382     {
6383         return edlbNO;
6384     }
6385
6386     if (!EI_DYNAMICS(ir->eI))
6387     {
6388         if (eDLB == edlbYES)
6389         {
6390             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6391             dd_warning(cr, fplog, buf);
6392         }
6393
6394         return edlbNO;
6395     }
6396
6397     if (!bRecordLoad)
6398     {
6399         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6400
6401         return edlbNO;
6402     }
6403
6404     if (Flags & MD_REPRODUCIBLE)
6405     {
6406         switch (eDLB)
6407         {
6408             case edlbNO:
6409                 break;
6410             case edlbAUTO:
6411                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6412                 eDLB = edlbNO;
6413                 break;
6414             case edlbYES:
6415                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6416                 break;
6417             default:
6418                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
6419                 break;
6420         }
6421     }
6422
6423     return eDLB;
6424 }
6425
6426 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6427 {
6428     int dim;
6429
6430     dd->ndim = 0;
6431     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6432     {
6433         /* Decomposition order z,y,x */
6434         if (fplog)
6435         {
6436             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6437         }
6438         for (dim = DIM-1; dim >= 0; dim--)
6439         {
6440             if (dd->nc[dim] > 1)
6441             {
6442                 dd->dim[dd->ndim++] = dim;
6443             }
6444         }
6445     }
6446     else
6447     {
6448         /* Decomposition order x,y,z */
6449         for (dim = 0; dim < DIM; dim++)
6450         {
6451             if (dd->nc[dim] > 1)
6452             {
6453                 dd->dim[dd->ndim++] = dim;
6454             }
6455         }
6456     }
6457 }
6458
6459 static gmx_domdec_comm_t *init_dd_comm()
6460 {
6461     gmx_domdec_comm_t *comm;
6462     int                i;
6463
6464     snew(comm, 1);
6465     snew(comm->cggl_flag, DIM*2);
6466     snew(comm->cgcm_state, DIM*2);
6467     for (i = 0; i < DIM*2; i++)
6468     {
6469         comm->cggl_flag_nalloc[i]  = 0;
6470         comm->cgcm_state_nalloc[i] = 0;
6471     }
6472
6473     comm->nalloc_int = 0;
6474     comm->buf_int    = NULL;
6475
6476     vec_rvec_init(&comm->vbuf);
6477
6478     comm->n_load_have    = 0;
6479     comm->n_load_collect = 0;
6480
6481     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6482     {
6483         comm->sum_nat[i] = 0;
6484     }
6485     comm->ndecomp   = 0;
6486     comm->nload     = 0;
6487     comm->load_step = 0;
6488     comm->load_sum  = 0;
6489     comm->load_max  = 0;
6490     clear_ivec(comm->load_lim);
6491     comm->load_mdf  = 0;
6492     comm->load_pme  = 0;
6493
6494     return comm;
6495 }
6496
6497 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6498                                         unsigned long Flags,
6499                                         ivec nc,
6500                                         real comm_distance_min, real rconstr,
6501                                         const char *dlb_opt, real dlb_scale,
6502                                         const char *sizex, const char *sizey, const char *sizez,
6503                                         gmx_mtop_t *mtop, t_inputrec *ir,
6504                                         matrix box, rvec *x,
6505                                         gmx_ddbox_t *ddbox,
6506                                         int *npme_x, int *npme_y)
6507 {
6508     gmx_domdec_t      *dd;
6509     gmx_domdec_comm_t *comm;
6510     int                recload;
6511     int                d, i, j;
6512     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6513     gmx_bool           bC;
6514     char               buf[STRLEN];
6515
6516     if (fplog)
6517     {
6518         fprintf(fplog,
6519                 "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
6520     }
6521
6522     snew(dd, 1);
6523
6524     dd->comm = init_dd_comm();
6525     comm     = dd->comm;
6526     snew(comm->cggl_flag, DIM*2);
6527     snew(comm->cgcm_state, DIM*2);
6528
6529     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6530     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6531
6532     dd->bSendRecv2      = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0);
6533     comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10);
6534     comm->eFlop         = dd_nst_env(fplog, "GMX_DLB_FLOP", 0);
6535     recload             = dd_nst_env(fplog, "GMX_DD_LOAD", 1);
6536     comm->nstSortCG     = dd_nst_env(fplog, "GMX_DD_SORT", 1);
6537     comm->nstDDDump     = dd_nst_env(fplog, "GMX_DD_DUMP", 0);
6538     comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0);
6539     comm->DD_debug      = dd_nst_env(fplog, "GMX_DD_DEBUG", 0);
6540
6541     dd->pme_recv_f_alloc = 0;
6542     dd->pme_recv_f_buf   = NULL;
6543
6544     if (dd->bSendRecv2 && fplog)
6545     {
6546         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6547     }
6548     if (comm->eFlop)
6549     {
6550         if (fplog)
6551         {
6552             fprintf(fplog, "Will load balance based on FLOP count\n");
6553         }
6554         if (comm->eFlop > 1)
6555         {
6556             srand(1+cr->nodeid);
6557         }
6558         comm->bRecordLoad = TRUE;
6559     }
6560     else
6561     {
6562         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6563
6564     }
6565
6566     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6567
6568     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6569     if (fplog)
6570     {
6571         fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
6572     }
6573     dd->bGridJump              = comm->bDynLoadBal;
6574     comm->bPMELoadBalDLBLimits = FALSE;
6575
6576     if (comm->nstSortCG)
6577     {
6578         if (fplog)
6579         {
6580             if (comm->nstSortCG == 1)
6581             {
6582                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6583             }
6584             else
6585             {
6586                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6587                         comm->nstSortCG);
6588             }
6589         }
6590         snew(comm->sort, 1);
6591     }
6592     else
6593     {
6594         if (fplog)
6595         {
6596             fprintf(fplog, "Will not sort the charge groups\n");
6597         }
6598     }
6599
6600     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6601
6602     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6603     if (comm->bInterCGBondeds)
6604     {
6605         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6606     }
6607     else
6608     {
6609         comm->bInterCGMultiBody = FALSE;
6610     }
6611
6612     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6613     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6614
6615     if (ir->rlistlong == 0)
6616     {
6617         /* Set the cut-off to some very large value,
6618          * so we don't need if statements everywhere in the code.
6619          * We use sqrt, since the cut-off is squared in some places.
6620          */
6621         comm->cutoff   = GMX_CUTOFF_INF;
6622     }
6623     else
6624     {
6625         comm->cutoff   = ir->rlistlong;
6626     }
6627     comm->cutoff_mbody = 0;
6628
6629     comm->cellsize_limit = 0;
6630     comm->bBondComm      = FALSE;
6631
6632     if (comm->bInterCGBondeds)
6633     {
6634         if (comm_distance_min > 0)
6635         {
6636             comm->cutoff_mbody = comm_distance_min;
6637             if (Flags & MD_DDBONDCOMM)
6638             {
6639                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6640             }
6641             else
6642             {
6643                 comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
6644             }
6645             r_bonded_limit = comm->cutoff_mbody;
6646         }
6647         else if (ir->bPeriodicMols)
6648         {
6649             /* Can not easily determine the required cut-off */
6650             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6651             comm->cutoff_mbody = comm->cutoff/2;
6652             r_bonded_limit     = comm->cutoff_mbody;
6653         }
6654         else
6655         {
6656             if (MASTER(cr))
6657             {
6658                 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6659                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6660             }
6661             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6662             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6663
6664             /* We use an initial margin of 10% for the minimum cell size,
6665              * except when we are just below the non-bonded cut-off.
6666              */
6667             if (Flags & MD_DDBONDCOMM)
6668             {
6669                 if (max(r_2b, r_mb) > comm->cutoff)
6670                 {
6671                     r_bonded        = max(r_2b, r_mb);
6672                     r_bonded_limit  = 1.1*r_bonded;
6673                     comm->bBondComm = TRUE;
6674                 }
6675                 else
6676                 {
6677                     r_bonded       = r_mb;
6678                     r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
6679                 }
6680                 /* We determine cutoff_mbody later */
6681             }
6682             else
6683             {
6684                 /* No special bonded communication,
6685                  * simply increase the DD cut-off.
6686                  */
6687                 r_bonded_limit     = 1.1*max(r_2b, r_mb);
6688                 comm->cutoff_mbody = r_bonded_limit;
6689                 comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
6690             }
6691         }
6692         comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
6693         if (fplog)
6694         {
6695             fprintf(fplog,
6696                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6697                     comm->cellsize_limit);
6698         }
6699     }
6700
6701     if (dd->bInterCGcons && rconstr <= 0)
6702     {
6703         /* There is a cell size limit due to the constraints (P-LINCS) */
6704         rconstr = constr_r_max(fplog, mtop, ir);
6705         if (fplog)
6706         {
6707             fprintf(fplog,
6708                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6709                     rconstr);
6710             if (rconstr > comm->cellsize_limit)
6711             {
6712                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6713             }
6714         }
6715     }
6716     else if (rconstr > 0 && fplog)
6717     {
6718         /* Here we do not check for dd->bInterCGcons,
6719          * because one can also set a cell size limit for virtual sites only
6720          * and at this point we don't know yet if there are intercg v-sites.
6721          */
6722         fprintf(fplog,
6723                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6724                 rconstr);
6725     }
6726     comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
6727
6728     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6729
6730     if (nc[XX] > 0)
6731     {
6732         copy_ivec(nc, dd->nc);
6733         set_dd_dim(fplog, dd);
6734         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6735
6736         if (cr->npmenodes == -1)
6737         {
6738             cr->npmenodes = 0;
6739         }
6740         acs = average_cellsize_min(dd, ddbox);
6741         if (acs < comm->cellsize_limit)
6742         {
6743             if (fplog)
6744             {
6745                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6746             }
6747             gmx_fatal_collective(FARGS, cr, NULL,
6748                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6749                                  acs, comm->cellsize_limit);
6750         }
6751     }
6752     else
6753     {
6754         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6755
6756         /* We need to choose the optimal DD grid and possibly PME nodes */
6757         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6758                                comm->eDLB != edlbNO, dlb_scale,
6759                                comm->cellsize_limit, comm->cutoff,
6760                                comm->bInterCGBondeds);
6761
6762         if (dd->nc[XX] == 0)
6763         {
6764             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6765             sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
6766                     !bC ? "-rdd" : "-rcon",
6767                     comm->eDLB != edlbNO ? " or -dds" : "",
6768                     bC ? " or your LINCS settings" : "");
6769
6770             gmx_fatal_collective(FARGS, cr, NULL,
6771                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6772                                  "%s\n"
6773                                  "Look in the log file for details on the domain decomposition",
6774                                  cr->nnodes-cr->npmenodes, limit, buf);
6775         }
6776         set_dd_dim(fplog, dd);
6777     }
6778
6779     if (fplog)
6780     {
6781         fprintf(fplog,
6782                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6783                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6784     }
6785
6786     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6787     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6788     {
6789         gmx_fatal_collective(FARGS, cr, NULL,
6790                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6791                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6792     }
6793     if (cr->npmenodes > dd->nnodes)
6794     {
6795         gmx_fatal_collective(FARGS, cr, NULL,
6796                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6797     }
6798     if (cr->npmenodes > 0)
6799     {
6800         comm->npmenodes = cr->npmenodes;
6801     }
6802     else
6803     {
6804         comm->npmenodes = dd->nnodes;
6805     }
6806
6807     if (EEL_PME(ir->coulombtype))
6808     {
6809         /* The following choices should match those
6810          * in comm_cost_est in domdec_setup.c.
6811          * Note that here the checks have to take into account
6812          * that the decomposition might occur in a different order than xyz
6813          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6814          * in which case they will not match those in comm_cost_est,
6815          * but since that is mainly for testing purposes that's fine.
6816          */
6817         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6818             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6819             getenv("GMX_PMEONEDD") == NULL)
6820         {
6821             comm->npmedecompdim = 2;
6822             comm->npmenodes_x   = dd->nc[XX];
6823             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6824         }
6825         else
6826         {
6827             /* In case nc is 1 in both x and y we could still choose to
6828              * decompose pme in y instead of x, but we use x for simplicity.
6829              */
6830             comm->npmedecompdim = 1;
6831             if (dd->dim[0] == YY)
6832             {
6833                 comm->npmenodes_x = 1;
6834                 comm->npmenodes_y = comm->npmenodes;
6835             }
6836             else
6837             {
6838                 comm->npmenodes_x = comm->npmenodes;
6839                 comm->npmenodes_y = 1;
6840             }
6841         }
6842         if (fplog)
6843         {
6844             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6845                     comm->npmenodes_x, comm->npmenodes_y, 1);
6846         }
6847     }
6848     else
6849     {
6850         comm->npmedecompdim = 0;
6851         comm->npmenodes_x   = 0;
6852         comm->npmenodes_y   = 0;
6853     }
6854
6855     /* Technically we don't need both of these,
6856      * but it simplifies code not having to recalculate it.
6857      */
6858     *npme_x = comm->npmenodes_x;
6859     *npme_y = comm->npmenodes_y;
6860
6861     snew(comm->slb_frac, DIM);
6862     if (comm->eDLB == edlbNO)
6863     {
6864         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6865         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6866         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6867     }
6868
6869     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6870     {
6871         if (comm->bBondComm || comm->eDLB != edlbNO)
6872         {
6873             /* Set the bonded communication distance to halfway
6874              * the minimum and the maximum,
6875              * since the extra communication cost is nearly zero.
6876              */
6877             acs                = average_cellsize_min(dd, ddbox);
6878             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6879             if (comm->eDLB != edlbNO)
6880             {
6881                 /* Check if this does not limit the scaling */
6882                 comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
6883             }
6884             if (!comm->bBondComm)
6885             {
6886                 /* Without bBondComm do not go beyond the n.b. cut-off */
6887                 comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
6888                 if (comm->cellsize_limit >= comm->cutoff)
6889                 {
6890                     /* We don't loose a lot of efficieny
6891                      * when increasing it to the n.b. cut-off.
6892                      * It can even be slightly faster, because we need
6893                      * less checks for the communication setup.
6894                      */
6895                     comm->cutoff_mbody = comm->cutoff;
6896                 }
6897             }
6898             /* Check if we did not end up below our original limit */
6899             comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
6900
6901             if (comm->cutoff_mbody > comm->cellsize_limit)
6902             {
6903                 comm->cellsize_limit = comm->cutoff_mbody;
6904             }
6905         }
6906         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6907     }
6908
6909     if (debug)
6910     {
6911         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6912                 "cellsize limit %f\n",
6913                 comm->bBondComm, comm->cellsize_limit);
6914     }
6915
6916     if (MASTER(cr))
6917     {
6918         check_dd_restrictions(cr, dd, ir, fplog);
6919     }
6920
6921     comm->partition_step = INT_MIN;
6922     dd->ddp_count        = 0;
6923
6924     clear_dd_cycle_counts(dd);
6925
6926     return dd;
6927 }
6928
6929 static void set_dlb_limits(gmx_domdec_t *dd)
6930
6931 {
6932     int d;
6933
6934     for (d = 0; d < dd->ndim; d++)
6935     {
6936         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
6937         dd->comm->cellsize_min[dd->dim[d]] =
6938             dd->comm->cellsize_min_dlb[dd->dim[d]];
6939     }
6940 }
6941
6942
6943 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
6944 {
6945     gmx_domdec_t      *dd;
6946     gmx_domdec_comm_t *comm;
6947     real               cellsize_min;
6948     int                d, nc, i;
6949     char               buf[STRLEN];
6950
6951     dd   = cr->dd;
6952     comm = dd->comm;
6953
6954     if (fplog)
6955     {
6956         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
6957     }
6958
6959     cellsize_min = comm->cellsize_min[dd->dim[0]];
6960     for (d = 1; d < dd->ndim; d++)
6961     {
6962         cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6963     }
6964
6965     if (cellsize_min < comm->cellsize_limit*1.05)
6966     {
6967         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6968
6969         /* Change DLB from "auto" to "no". */
6970         comm->eDLB = edlbNO;
6971
6972         return;
6973     }
6974
6975     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
6976     comm->bDynLoadBal = TRUE;
6977     dd->bGridJump     = TRUE;
6978
6979     set_dlb_limits(dd);
6980
6981     /* We can set the required cell size info here,
6982      * so we do not need to communicate this.
6983      * The grid is completely uniform.
6984      */
6985     for (d = 0; d < dd->ndim; d++)
6986     {
6987         if (comm->root[d])
6988         {
6989             comm->load[d].sum_m = comm->load[d].sum;
6990
6991             nc = dd->nc[dd->dim[d]];
6992             for (i = 0; i < nc; i++)
6993             {
6994                 comm->root[d]->cell_f[i]    = i/(real)nc;
6995                 if (d > 0)
6996                 {
6997                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6998                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6999                 }
7000             }
7001             comm->root[d]->cell_f[nc] = 1.0;
7002         }
7003     }
7004 }
7005
7006 static char *init_bLocalCG(gmx_mtop_t *mtop)
7007 {
7008     int   ncg, cg;
7009     char *bLocalCG;
7010
7011     ncg = ncg_mtop(mtop);
7012     snew(bLocalCG, ncg);
7013     for (cg = 0; cg < ncg; cg++)
7014     {
7015         bLocalCG[cg] = FALSE;
7016     }
7017
7018     return bLocalCG;
7019 }
7020
7021 void dd_init_bondeds(FILE *fplog,
7022                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
7023                      gmx_vsite_t *vsite,
7024                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
7025 {
7026     gmx_domdec_comm_t *comm;
7027     gmx_bool           bBondComm;
7028     int                d;
7029
7030     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
7031
7032     comm = dd->comm;
7033
7034     if (comm->bBondComm)
7035     {
7036         /* Communicate atoms beyond the cut-off for bonded interactions */
7037         comm = dd->comm;
7038
7039         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
7040
7041         comm->bLocalCG = init_bLocalCG(mtop);
7042     }
7043     else
7044     {
7045         /* Only communicate atoms based on cut-off */
7046         comm->cglink   = NULL;
7047         comm->bLocalCG = NULL;
7048     }
7049 }
7050
7051 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
7052                               t_inputrec *ir,
7053                               gmx_bool bDynLoadBal, real dlb_scale,
7054                               gmx_ddbox_t *ddbox)
7055 {
7056     gmx_domdec_comm_t *comm;
7057     int                d;
7058     ivec               np;
7059     real               limit, shrink;
7060     char               buf[64];
7061
7062     if (fplog == NULL)
7063     {
7064         return;
7065     }
7066
7067     comm = dd->comm;
7068
7069     if (bDynLoadBal)
7070     {
7071         fprintf(fplog, "The maximum number of communication pulses is:");
7072         for (d = 0; d < dd->ndim; d++)
7073         {
7074             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
7075         }
7076         fprintf(fplog, "\n");
7077         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
7078         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
7079         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
7080         for (d = 0; d < DIM; d++)
7081         {
7082             if (dd->nc[d] > 1)
7083             {
7084                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7085                 {
7086                     shrink = 0;
7087                 }
7088                 else
7089                 {
7090                     shrink =
7091                         comm->cellsize_min_dlb[d]/
7092                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7093                 }
7094                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
7095             }
7096         }
7097         fprintf(fplog, "\n");
7098     }
7099     else
7100     {
7101         set_dd_cell_sizes_slb(dd, ddbox, FALSE, np);
7102         fprintf(fplog, "The initial number of communication pulses is:");
7103         for (d = 0; d < dd->ndim; d++)
7104         {
7105             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
7106         }
7107         fprintf(fplog, "\n");
7108         fprintf(fplog, "The initial domain decomposition cell size is:");
7109         for (d = 0; d < DIM; d++)
7110         {
7111             if (dd->nc[d] > 1)
7112             {
7113                 fprintf(fplog, " %c %.2f nm",
7114                         dim2char(d), dd->comm->cellsize_min[d]);
7115             }
7116         }
7117         fprintf(fplog, "\n\n");
7118     }
7119
7120     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7121     {
7122         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7123         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7124                 "non-bonded interactions", "", comm->cutoff);
7125
7126         if (bDynLoadBal)
7127         {
7128             limit = dd->comm->cellsize_limit;
7129         }
7130         else
7131         {
7132             if (dynamic_dd_box(ddbox, ir))
7133             {
7134                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7135             }
7136             limit = dd->comm->cellsize_min[XX];
7137             for (d = 1; d < DIM; d++)
7138             {
7139                 limit = min(limit, dd->comm->cellsize_min[d]);
7140             }
7141         }
7142
7143         if (comm->bInterCGBondeds)
7144         {
7145             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7146                     "two-body bonded interactions", "(-rdd)",
7147                     max(comm->cutoff, comm->cutoff_mbody));
7148             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7149                     "multi-body bonded interactions", "(-rdd)",
7150                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
7151         }
7152         if (dd->vsite_comm)
7153         {
7154             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7155                     "virtual site constructions", "(-rcon)", limit);
7156         }
7157         if (dd->constraint_comm)
7158         {
7159             sprintf(buf, "atoms separated by up to %d constraints",
7160                     1+ir->nProjOrder);
7161             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7162                     buf, "(-rcon)", limit);
7163         }
7164         fprintf(fplog, "\n");
7165     }
7166
7167     fflush(fplog);
7168 }
7169
7170 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7171                                 real               dlb_scale,
7172                                 const t_inputrec  *ir,
7173                                 const gmx_ddbox_t *ddbox)
7174 {
7175     gmx_domdec_comm_t *comm;
7176     int                d, dim, npulse, npulse_d_max, npulse_d;
7177     gmx_bool           bNoCutOff;
7178
7179     comm = dd->comm;
7180
7181     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7182
7183     /* Determine the maximum number of comm. pulses in one dimension */
7184
7185     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7186
7187     /* Determine the maximum required number of grid pulses */
7188     if (comm->cellsize_limit >= comm->cutoff)
7189     {
7190         /* Only a single pulse is required */
7191         npulse = 1;
7192     }
7193     else if (!bNoCutOff && comm->cellsize_limit > 0)
7194     {
7195         /* We round down slightly here to avoid overhead due to the latency
7196          * of extra communication calls when the cut-off
7197          * would be only slightly longer than the cell size.
7198          * Later cellsize_limit is redetermined,
7199          * so we can not miss interactions due to this rounding.
7200          */
7201         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7202     }
7203     else
7204     {
7205         /* There is no cell size limit */
7206         npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7207     }
7208
7209     if (!bNoCutOff && npulse > 1)
7210     {
7211         /* See if we can do with less pulses, based on dlb_scale */
7212         npulse_d_max = 0;
7213         for (d = 0; d < dd->ndim; d++)
7214         {
7215             dim      = dd->dim[d];
7216             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7217                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7218             npulse_d_max = max(npulse_d_max, npulse_d);
7219         }
7220         npulse = min(npulse, npulse_d_max);
7221     }
7222
7223     /* This env var can override npulse */
7224     d = dd_nst_env(debug, "GMX_DD_NPULSE", 0);
7225     if (d > 0)
7226     {
7227         npulse = d;
7228     }
7229
7230     comm->maxpulse       = 1;
7231     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7232     for (d = 0; d < dd->ndim; d++)
7233     {
7234         comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
7235         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7236         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7237         comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
7238         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7239         {
7240             comm->bVacDLBNoLimit = FALSE;
7241         }
7242     }
7243
7244     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7245     if (!comm->bVacDLBNoLimit)
7246     {
7247         comm->cellsize_limit = max(comm->cellsize_limit,
7248                                    comm->cutoff/comm->maxpulse);
7249     }
7250     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7251     /* Set the minimum cell size for each DD dimension */
7252     for (d = 0; d < dd->ndim; d++)
7253     {
7254         if (comm->bVacDLBNoLimit ||
7255             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7256         {
7257             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7258         }
7259         else
7260         {
7261             comm->cellsize_min_dlb[dd->dim[d]] =
7262                 comm->cutoff/comm->cd[d].np_dlb;
7263         }
7264     }
7265     if (comm->cutoff_mbody <= 0)
7266     {
7267         comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
7268     }
7269     if (comm->bDynLoadBal)
7270     {
7271         set_dlb_limits(dd);
7272     }
7273 }
7274
7275 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7276 {
7277     /* If each molecule is a single charge group
7278      * or we use domain decomposition for each periodic dimension,
7279      * we do not need to take pbc into account for the bonded interactions.
7280      */
7281     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7282             !(dd->nc[XX] > 1 &&
7283               dd->nc[YY] > 1 &&
7284               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7285 }
7286
7287 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7288                        t_inputrec *ir, gmx_ddbox_t *ddbox)
7289 {
7290     gmx_domdec_comm_t *comm;
7291     int                natoms_tot;
7292     real               vol_frac;
7293
7294     comm = dd->comm;
7295
7296     /* Initialize the thread data.
7297      * This can not be done in init_domain_decomposition,
7298      * as the numbers of threads is determined later.
7299      */
7300     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7301     if (comm->nth > 1)
7302     {
7303         snew(comm->dth, comm->nth);
7304     }
7305
7306     if (EEL_PME(ir->coulombtype))
7307     {
7308         init_ddpme(dd, &comm->ddpme[0], 0);
7309         if (comm->npmedecompdim >= 2)
7310         {
7311             init_ddpme(dd, &comm->ddpme[1], 1);
7312         }
7313     }
7314     else
7315     {
7316         comm->npmenodes = 0;
7317         if (dd->pme_nodeid >= 0)
7318         {
7319             gmx_fatal_collective(FARGS, NULL, dd,
7320                                  "Can not have separate PME nodes without PME electrostatics");
7321         }
7322     }
7323
7324     if (debug)
7325     {
7326         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7327     }
7328     if (comm->eDLB != edlbNO)
7329     {
7330         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7331     }
7332
7333     print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
7334     if (comm->eDLB == edlbAUTO)
7335     {
7336         if (fplog)
7337         {
7338             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7339         }
7340         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7341     }
7342
7343     if (ir->ePBC == epbcNONE)
7344     {
7345         vol_frac = 1 - 1/(double)dd->nnodes;
7346     }
7347     else
7348     {
7349         vol_frac =
7350             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7351     }
7352     if (debug)
7353     {
7354         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7355     }
7356     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7357
7358     dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
7359 }
7360
7361 static gmx_bool test_dd_cutoff(t_commrec *cr,
7362                                t_state *state, t_inputrec *ir,
7363                                real cutoff_req)
7364 {
7365     gmx_domdec_t *dd;
7366     gmx_ddbox_t   ddbox;
7367     int           d, dim, np;
7368     real          inv_cell_size;
7369     int           LocallyLimited;
7370
7371     dd = cr->dd;
7372
7373     set_ddbox(dd, FALSE, cr, ir, state->box,
7374               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7375
7376     LocallyLimited = 0;
7377
7378     for (d = 0; d < dd->ndim; d++)
7379     {
7380         dim = dd->dim[d];
7381
7382         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7383         if (dynamic_dd_box(&ddbox, ir))
7384         {
7385             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7386         }
7387
7388         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7389
7390         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7391             dd->comm->cd[d].np_dlb > 0)
7392         {
7393             if (np > dd->comm->cd[d].np_dlb)
7394             {
7395                 return FALSE;
7396             }
7397
7398             /* If a current local cell size is smaller than the requested
7399              * cut-off, we could still fix it, but this gets very complicated.
7400              * Without fixing here, we might actually need more checks.
7401              */
7402             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7403             {
7404                 LocallyLimited = 1;
7405             }
7406         }
7407     }
7408
7409     if (dd->comm->eDLB != edlbNO)
7410     {
7411         /* If DLB is not active yet, we don't need to check the grid jumps.
7412          * Actually we shouldn't, because then the grid jump data is not set.
7413          */
7414         if (dd->comm->bDynLoadBal &&
7415             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7416         {
7417             LocallyLimited = 1;
7418         }
7419
7420         gmx_sumi(1, &LocallyLimited, cr);
7421
7422         if (LocallyLimited > 0)
7423         {
7424             return FALSE;
7425         }
7426     }
7427
7428     return TRUE;
7429 }
7430
7431 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7432                           real cutoff_req)
7433 {
7434     gmx_bool bCutoffAllowed;
7435
7436     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7437
7438     if (bCutoffAllowed)
7439     {
7440         cr->dd->comm->cutoff = cutoff_req;
7441     }
7442
7443     return bCutoffAllowed;
7444 }
7445
7446 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7447 {
7448     gmx_domdec_comm_t *comm;
7449
7450     comm = cr->dd->comm;
7451
7452     /* Turn on the DLB limiting (might have been on already) */
7453     comm->bPMELoadBalDLBLimits = TRUE;
7454
7455     /* Change the cut-off limit */
7456     comm->PMELoadBal_max_cutoff = comm->cutoff;
7457 }
7458
7459 static void merge_cg_buffers(int ncell,
7460                              gmx_domdec_comm_dim_t *cd, int pulse,
7461                              int  *ncg_cell,
7462                              int  *index_gl, int  *recv_i,
7463                              rvec *cg_cm,    rvec *recv_vr,
7464                              int *cgindex,
7465                              cginfo_mb_t *cginfo_mb, int *cginfo)
7466 {
7467     gmx_domdec_ind_t *ind, *ind_p;
7468     int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
7469     int               shift, shift_at;
7470
7471     ind = &cd->ind[pulse];
7472
7473     /* First correct the already stored data */
7474     shift = ind->nrecv[ncell];
7475     for (cell = ncell-1; cell >= 0; cell--)
7476     {
7477         shift -= ind->nrecv[cell];
7478         if (shift > 0)
7479         {
7480             /* Move the cg's present from previous grid pulses */
7481             cg0                = ncg_cell[ncell+cell];
7482             cg1                = ncg_cell[ncell+cell+1];
7483             cgindex[cg1+shift] = cgindex[cg1];
7484             for (cg = cg1-1; cg >= cg0; cg--)
7485             {
7486                 index_gl[cg+shift] = index_gl[cg];
7487                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7488                 cgindex[cg+shift] = cgindex[cg];
7489                 cginfo[cg+shift]  = cginfo[cg];
7490             }
7491             /* Correct the already stored send indices for the shift */
7492             for (p = 1; p <= pulse; p++)
7493             {
7494                 ind_p = &cd->ind[p];
7495                 cg0   = 0;
7496                 for (c = 0; c < cell; c++)
7497                 {
7498                     cg0 += ind_p->nsend[c];
7499                 }
7500                 cg1 = cg0 + ind_p->nsend[cell];
7501                 for (cg = cg0; cg < cg1; cg++)
7502                 {
7503                     ind_p->index[cg] += shift;
7504                 }
7505             }
7506         }
7507     }
7508
7509     /* Merge in the communicated buffers */
7510     shift    = 0;
7511     shift_at = 0;
7512     cg0      = 0;
7513     for (cell = 0; cell < ncell; cell++)
7514     {
7515         cg1 = ncg_cell[ncell+cell+1] + shift;
7516         if (shift_at > 0)
7517         {
7518             /* Correct the old cg indices */
7519             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7520             {
7521                 cgindex[cg+1] += shift_at;
7522             }
7523         }
7524         for (cg = 0; cg < ind->nrecv[cell]; cg++)
7525         {
7526             /* Copy this charge group from the buffer */
7527             index_gl[cg1] = recv_i[cg0];
7528             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7529             /* Add it to the cgindex */
7530             cg_gl          = index_gl[cg1];
7531             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
7532             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
7533             cgindex[cg1+1] = cgindex[cg1] + nat;
7534             cg0++;
7535             cg1++;
7536             shift_at += nat;
7537         }
7538         shift                 += ind->nrecv[cell];
7539         ncg_cell[ncell+cell+1] = cg1;
7540     }
7541 }
7542
7543 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7544                                int nzone, int cg0, const int *cgindex)
7545 {
7546     int cg, zone, p;
7547
7548     /* Store the atom block boundaries for easy copying of communication buffers
7549      */
7550     cg = cg0;
7551     for (zone = 0; zone < nzone; zone++)
7552     {
7553         for (p = 0; p < cd->np; p++)
7554         {
7555             cd->ind[p].cell2at0[zone] = cgindex[cg];
7556             cg += cd->ind[p].nrecv[zone];
7557             cd->ind[p].cell2at1[zone] = cgindex[cg];
7558         }
7559     }
7560 }
7561
7562 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7563 {
7564     int      i;
7565     gmx_bool bMiss;
7566
7567     bMiss = FALSE;
7568     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7569     {
7570         if (!bLocalCG[link->a[i]])
7571         {
7572             bMiss = TRUE;
7573         }
7574     }
7575
7576     return bMiss;
7577 }
7578
7579 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7580 typedef struct {
7581     real c[DIM][4]; /* the corners for the non-bonded communication */
7582     real cr0;       /* corner for rounding */
7583     real cr1[4];    /* corners for rounding */
7584     real bc[DIM];   /* corners for bounded communication */
7585     real bcr1;      /* corner for rounding for bonded communication */
7586 } dd_corners_t;
7587
7588 /* Determine the corners of the domain(s) we are communicating with */
7589 static void
7590 set_dd_corners(const gmx_domdec_t *dd,
7591                int dim0, int dim1, int dim2,
7592                gmx_bool bDistMB,
7593                dd_corners_t *c)
7594 {
7595     const gmx_domdec_comm_t  *comm;
7596     const gmx_domdec_zones_t *zones;
7597     int i, j;
7598
7599     comm = dd->comm;
7600
7601     zones = &comm->zones;
7602
7603     /* Keep the compiler happy */
7604     c->cr0  = 0;
7605     c->bcr1 = 0;
7606
7607     /* The first dimension is equal for all cells */
7608     c->c[0][0] = comm->cell_x0[dim0];
7609     if (bDistMB)
7610     {
7611         c->bc[0] = c->c[0][0];
7612     }
7613     if (dd->ndim >= 2)
7614     {
7615         dim1 = dd->dim[1];
7616         /* This cell row is only seen from the first row */
7617         c->c[1][0] = comm->cell_x0[dim1];
7618         /* All rows can see this row */
7619         c->c[1][1] = comm->cell_x0[dim1];
7620         if (dd->bGridJump)
7621         {
7622             c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7623             if (bDistMB)
7624             {
7625                 /* For the multi-body distance we need the maximum */
7626                 c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7627             }
7628         }
7629         /* Set the upper-right corner for rounding */
7630         c->cr0 = comm->cell_x1[dim0];
7631
7632         if (dd->ndim >= 3)
7633         {
7634             dim2 = dd->dim[2];
7635             for (j = 0; j < 4; j++)
7636             {
7637                 c->c[2][j] = comm->cell_x0[dim2];
7638             }
7639             if (dd->bGridJump)
7640             {
7641                 /* Use the maximum of the i-cells that see a j-cell */
7642                 for (i = 0; i < zones->nizone; i++)
7643                 {
7644                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7645                     {
7646                         if (j >= 4)
7647                         {
7648                             c->c[2][j-4] =
7649                                 max(c->c[2][j-4],
7650                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7651                         }
7652                     }
7653                 }
7654                 if (bDistMB)
7655                 {
7656                     /* For the multi-body distance we need the maximum */
7657                     c->bc[2] = comm->cell_x0[dim2];
7658                     for (i = 0; i < 2; i++)
7659                     {
7660                         for (j = 0; j < 2; j++)
7661                         {
7662                             c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
7663                         }
7664                     }
7665                 }
7666             }
7667
7668             /* Set the upper-right corner for rounding */
7669             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7670              * Only cell (0,0,0) can see cell 7 (1,1,1)
7671              */
7672             c->cr1[0] = comm->cell_x1[dim1];
7673             c->cr1[3] = comm->cell_x1[dim1];
7674             if (dd->bGridJump)
7675             {
7676                 c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7677                 if (bDistMB)
7678                 {
7679                     /* For the multi-body distance we need the maximum */
7680                     c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7681                 }
7682             }
7683         }
7684     }
7685 }
7686
7687 /* Determine which cg's we need to send in this pulse from this zone */
7688 static void
7689 get_zone_pulse_cgs(gmx_domdec_t *dd,
7690                    int zonei, int zone,
7691                    int cg0, int cg1,
7692                    const int *index_gl,
7693                    const int *cgindex,
7694                    int dim, int dim_ind,
7695                    int dim0, int dim1, int dim2,
7696                    real r_comm2, real r_bcomm2,
7697                    matrix box,
7698                    ivec tric_dist,
7699                    rvec *normal,
7700                    real skew_fac2_d, real skew_fac_01,
7701                    rvec *v_d, rvec *v_0, rvec *v_1,
7702                    const dd_corners_t *c,
7703                    rvec sf2_round,
7704                    gmx_bool bDistBonded,
7705                    gmx_bool bBondComm,
7706                    gmx_bool bDist2B,
7707                    gmx_bool bDistMB,
7708                    rvec *cg_cm,
7709                    int *cginfo,
7710                    gmx_domdec_ind_t *ind,
7711                    int **ibuf, int *ibuf_nalloc,
7712                    vec_rvec_t *vbuf,
7713                    int *nsend_ptr,
7714                    int *nat_ptr,
7715                    int *nsend_z_ptr)
7716 {
7717     gmx_domdec_comm_t *comm;
7718     gmx_bool           bScrew;
7719     gmx_bool           bDistMB_pulse;
7720     int                cg, i;
7721     real               r2, rb2, r, tric_sh;
7722     rvec               rn, rb;
7723     int                dimd;
7724     int                nsend_z, nsend, nat;
7725
7726     comm = dd->comm;
7727
7728     bScrew = (dd->bScrewPBC && dim == XX);
7729
7730     bDistMB_pulse = (bDistMB && bDistBonded);
7731
7732     nsend_z = 0;
7733     nsend   = *nsend_ptr;
7734     nat     = *nat_ptr;
7735
7736     for (cg = cg0; cg < cg1; cg++)
7737     {
7738         r2  = 0;
7739         rb2 = 0;
7740         if (tric_dist[dim_ind] == 0)
7741         {
7742             /* Rectangular direction, easy */
7743             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7744             if (r > 0)
7745             {
7746                 r2 += r*r;
7747             }
7748             if (bDistMB_pulse)
7749             {
7750                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7751                 if (r > 0)
7752                 {
7753                     rb2 += r*r;
7754                 }
7755             }
7756             /* Rounding gives at most a 16% reduction
7757              * in communicated atoms
7758              */
7759             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7760             {
7761                 r = cg_cm[cg][dim0] - c->cr0;
7762                 /* This is the first dimension, so always r >= 0 */
7763                 r2 += r*r;
7764                 if (bDistMB_pulse)
7765                 {
7766                     rb2 += r*r;
7767                 }
7768             }
7769             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7770             {
7771                 r = cg_cm[cg][dim1] - c->cr1[zone];
7772                 if (r > 0)
7773                 {
7774                     r2 += r*r;
7775                 }
7776                 if (bDistMB_pulse)
7777                 {
7778                     r = cg_cm[cg][dim1] - c->bcr1;
7779                     if (r > 0)
7780                     {
7781                         rb2 += r*r;
7782                     }
7783                 }
7784             }
7785         }
7786         else
7787         {
7788             /* Triclinic direction, more complicated */
7789             clear_rvec(rn);
7790             clear_rvec(rb);
7791             /* Rounding, conservative as the skew_fac multiplication
7792              * will slightly underestimate the distance.
7793              */
7794             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7795             {
7796                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7797                 for (i = dim0+1; i < DIM; i++)
7798                 {
7799                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7800                 }
7801                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7802                 if (bDistMB_pulse)
7803                 {
7804                     rb[dim0] = rn[dim0];
7805                     rb2      = r2;
7806                 }
7807                 /* Take care that the cell planes along dim0 might not
7808                  * be orthogonal to those along dim1 and dim2.
7809                  */
7810                 for (i = 1; i <= dim_ind; i++)
7811                 {
7812                     dimd = dd->dim[i];
7813                     if (normal[dim0][dimd] > 0)
7814                     {
7815                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7816                         if (bDistMB_pulse)
7817                         {
7818                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7819                         }
7820                     }
7821                 }
7822             }
7823             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7824             {
7825                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7826                 tric_sh   = 0;
7827                 for (i = dim1+1; i < DIM; i++)
7828                 {
7829                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7830                 }
7831                 rn[dim1] += tric_sh;
7832                 if (rn[dim1] > 0)
7833                 {
7834                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7835                     /* Take care of coupling of the distances
7836                      * to the planes along dim0 and dim1 through dim2.
7837                      */
7838                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7839                     /* Take care that the cell planes along dim1
7840                      * might not be orthogonal to that along dim2.
7841                      */
7842                     if (normal[dim1][dim2] > 0)
7843                     {
7844                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7845                     }
7846                 }
7847                 if (bDistMB_pulse)
7848                 {
7849                     rb[dim1] +=
7850                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7851                     if (rb[dim1] > 0)
7852                     {
7853                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7854                         /* Take care of coupling of the distances
7855                          * to the planes along dim0 and dim1 through dim2.
7856                          */
7857                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7858                         /* Take care that the cell planes along dim1
7859                          * might not be orthogonal to that along dim2.
7860                          */
7861                         if (normal[dim1][dim2] > 0)
7862                         {
7863                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7864                         }
7865                     }
7866                 }
7867             }
7868             /* The distance along the communication direction */
7869             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7870             tric_sh  = 0;
7871             for (i = dim+1; i < DIM; i++)
7872             {
7873                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7874             }
7875             rn[dim] += tric_sh;
7876             if (rn[dim] > 0)
7877             {
7878                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7879                 /* Take care of coupling of the distances
7880                  * to the planes along dim0 and dim1 through dim2.
7881                  */
7882                 if (dim_ind == 1 && zonei == 1)
7883                 {
7884                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7885                 }
7886             }
7887             if (bDistMB_pulse)
7888             {
7889                 clear_rvec(rb);
7890                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7891                 if (rb[dim] > 0)
7892                 {
7893                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7894                     /* Take care of coupling of the distances
7895                      * to the planes along dim0 and dim1 through dim2.
7896                      */
7897                     if (dim_ind == 1 && zonei == 1)
7898                     {
7899                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7900                     }
7901                 }
7902             }
7903         }
7904
7905         if (r2 < r_comm2 ||
7906             (bDistBonded &&
7907              ((bDistMB && rb2 < r_bcomm2) ||
7908               (bDist2B && r2  < r_bcomm2)) &&
7909              (!bBondComm ||
7910               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7911                missing_link(comm->cglink, index_gl[cg],
7912                             comm->bLocalCG)))))
7913         {
7914             /* Make an index to the local charge groups */
7915             if (nsend+1 > ind->nalloc)
7916             {
7917                 ind->nalloc = over_alloc_large(nsend+1);
7918                 srenew(ind->index, ind->nalloc);
7919             }
7920             if (nsend+1 > *ibuf_nalloc)
7921             {
7922                 *ibuf_nalloc = over_alloc_large(nsend+1);
7923                 srenew(*ibuf, *ibuf_nalloc);
7924             }
7925             ind->index[nsend] = cg;
7926             (*ibuf)[nsend]    = index_gl[cg];
7927             nsend_z++;
7928             vec_rvec_check_alloc(vbuf, nsend+1);
7929
7930             if (dd->ci[dim] == 0)
7931             {
7932                 /* Correct cg_cm for pbc */
7933                 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7934                 if (bScrew)
7935                 {
7936                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7937                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7938                 }
7939             }
7940             else
7941             {
7942                 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7943             }
7944             nsend++;
7945             nat += cgindex[cg+1] - cgindex[cg];
7946         }
7947     }
7948
7949     *nsend_ptr   = nsend;
7950     *nat_ptr     = nat;
7951     *nsend_z_ptr = nsend_z;
7952 }
7953
7954 static void setup_dd_communication(gmx_domdec_t *dd,
7955                                    matrix box, gmx_ddbox_t *ddbox,
7956                                    t_forcerec *fr, t_state *state, rvec **f)
7957 {
7958     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7959     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
7960     int                    c, i, j, cg, cg_gl, nrcg;
7961     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7962     gmx_domdec_comm_t     *comm;
7963     gmx_domdec_zones_t    *zones;
7964     gmx_domdec_comm_dim_t *cd;
7965     gmx_domdec_ind_t      *ind;
7966     cginfo_mb_t           *cginfo_mb;
7967     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
7968     real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
7969     dd_corners_t           corners;
7970     ivec                   tric_dist;
7971     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
7972     real                   skew_fac2_d, skew_fac_01;
7973     rvec                   sf2_round;
7974     int                    nsend, nat;
7975     int                    th;
7976
7977     if (debug)
7978     {
7979         fprintf(debug, "Setting up DD communication\n");
7980     }
7981
7982     comm  = dd->comm;
7983
7984     switch (fr->cutoff_scheme)
7985     {
7986         case ecutsGROUP:
7987             cg_cm = fr->cg_cm;
7988             break;
7989         case ecutsVERLET:
7990             cg_cm = state->x;
7991             break;
7992         default:
7993             gmx_incons("unimplemented");
7994             cg_cm = NULL;
7995     }
7996
7997     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
7998     {
7999         dim = dd->dim[dim_ind];
8000
8001         /* Check if we need to use triclinic distances */
8002         tric_dist[dim_ind] = 0;
8003         for (i = 0; i <= dim_ind; i++)
8004         {
8005             if (ddbox->tric_dir[dd->dim[i]])
8006             {
8007                 tric_dist[dim_ind] = 1;
8008             }
8009         }
8010     }
8011
8012     bBondComm = comm->bBondComm;
8013
8014     /* Do we need to determine extra distances for multi-body bondeds? */
8015     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8016
8017     /* Do we need to determine extra distances for only two-body bondeds? */
8018     bDist2B = (bBondComm && !bDistMB);
8019
8020     r_comm2  = sqr(comm->cutoff);
8021     r_bcomm2 = sqr(comm->cutoff_mbody);
8022
8023     if (debug)
8024     {
8025         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
8026     }
8027
8028     zones = &comm->zones;
8029
8030     dim0 = dd->dim[0];
8031     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8032     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8033
8034     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8035
8036     /* Triclinic stuff */
8037     normal      = ddbox->normal;
8038     skew_fac_01 = 0;
8039     if (dd->ndim >= 2)
8040     {
8041         v_0 = ddbox->v[dim0];
8042         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8043         {
8044             /* Determine the coupling coefficient for the distances
8045              * to the cell planes along dim0 and dim1 through dim2.
8046              * This is required for correct rounding.
8047              */
8048             skew_fac_01 =
8049                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8050             if (debug)
8051             {
8052                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8053             }
8054         }
8055     }
8056     if (dd->ndim >= 3)
8057     {
8058         v_1 = ddbox->v[dim1];
8059     }
8060
8061     zone_cg_range = zones->cg_range;
8062     index_gl      = dd->index_gl;
8063     cgindex       = dd->cgindex;
8064     cginfo_mb     = fr->cginfo_mb;
8065
8066     zone_cg_range[0]   = 0;
8067     zone_cg_range[1]   = dd->ncg_home;
8068     comm->zone_ncg1[0] = dd->ncg_home;
8069     pos_cg             = dd->ncg_home;
8070
8071     nat_tot = dd->nat_home;
8072     nzone   = 1;
8073     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8074     {
8075         dim = dd->dim[dim_ind];
8076         cd  = &comm->cd[dim_ind];
8077
8078         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8079         {
8080             /* No pbc in this dimension, the first node should not comm. */
8081             nzone_send = 0;
8082         }
8083         else
8084         {
8085             nzone_send = nzone;
8086         }
8087
8088         v_d         = ddbox->v[dim];
8089         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8090
8091         cd->bInPlace = TRUE;
8092         for (p = 0; p < cd->np; p++)
8093         {
8094             /* Only atoms communicated in the first pulse are used
8095              * for multi-body bonded interactions or for bBondComm.
8096              */
8097             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8098
8099             ind   = &cd->ind[p];
8100             nsend = 0;
8101             nat   = 0;
8102             for (zone = 0; zone < nzone_send; zone++)
8103             {
8104                 if (tric_dist[dim_ind] && dim_ind > 0)
8105                 {
8106                     /* Determine slightly more optimized skew_fac's
8107                      * for rounding.
8108                      * This reduces the number of communicated atoms
8109                      * by about 10% for 3D DD of rhombic dodecahedra.
8110                      */
8111                     for (dimd = 0; dimd < dim; dimd++)
8112                     {
8113                         sf2_round[dimd] = 1;
8114                         if (ddbox->tric_dir[dimd])
8115                         {
8116                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8117                             {
8118                                 /* If we are shifted in dimension i
8119                                  * and the cell plane is tilted forward
8120                                  * in dimension i, skip this coupling.
8121                                  */
8122                                 if (!(zones->shift[nzone+zone][i] &&
8123                                       ddbox->v[dimd][i][dimd] >= 0))
8124                                 {
8125                                     sf2_round[dimd] +=
8126                                         sqr(ddbox->v[dimd][i][dimd]);
8127                                 }
8128                             }
8129                             sf2_round[dimd] = 1/sf2_round[dimd];
8130                         }
8131                     }
8132                 }
8133
8134                 zonei = zone_perm[dim_ind][zone];
8135                 if (p == 0)
8136                 {
8137                     /* Here we permutate the zones to obtain a convenient order
8138                      * for neighbor searching
8139                      */
8140                     cg0 = zone_cg_range[zonei];
8141                     cg1 = zone_cg_range[zonei+1];
8142                 }
8143                 else
8144                 {
8145                     /* Look only at the cg's received in the previous grid pulse
8146                      */
8147                     cg1 = zone_cg_range[nzone+zone+1];
8148                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8149                 }
8150
8151 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8152                 for (th = 0; th < comm->nth; th++)
8153                 {
8154                     gmx_domdec_ind_t *ind_p;
8155                     int             **ibuf_p, *ibuf_nalloc_p;
8156                     vec_rvec_t       *vbuf_p;
8157                     int              *nsend_p, *nat_p;
8158                     int              *nsend_zone_p;
8159                     int               cg0_th, cg1_th;
8160
8161                     if (th == 0)
8162                     {
8163                         /* Thread 0 writes in the comm buffers */
8164                         ind_p         = ind;
8165                         ibuf_p        = &comm->buf_int;
8166                         ibuf_nalloc_p = &comm->nalloc_int;
8167                         vbuf_p        = &comm->vbuf;
8168                         nsend_p       = &nsend;
8169                         nat_p         = &nat;
8170                         nsend_zone_p  = &ind->nsend[zone];
8171                     }
8172                     else
8173                     {
8174                         /* Other threads write into temp buffers */
8175                         ind_p         = &comm->dth[th].ind;
8176                         ibuf_p        = &comm->dth[th].ibuf;
8177                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8178                         vbuf_p        = &comm->dth[th].vbuf;
8179                         nsend_p       = &comm->dth[th].nsend;
8180                         nat_p         = &comm->dth[th].nat;
8181                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8182
8183                         comm->dth[th].nsend      = 0;
8184                         comm->dth[th].nat        = 0;
8185                         comm->dth[th].nsend_zone = 0;
8186                     }
8187
8188                     if (comm->nth == 1)
8189                     {
8190                         cg0_th = cg0;
8191                         cg1_th = cg1;
8192                     }
8193                     else
8194                     {
8195                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8196                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8197                     }
8198
8199                     /* Get the cg's for this pulse in this zone */
8200                     get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8201                                        index_gl, cgindex,
8202                                        dim, dim_ind, dim0, dim1, dim2,
8203                                        r_comm2, r_bcomm2,
8204                                        box, tric_dist,
8205                                        normal, skew_fac2_d, skew_fac_01,
8206                                        v_d, v_0, v_1, &corners, sf2_round,
8207                                        bDistBonded, bBondComm,
8208                                        bDist2B, bDistMB,
8209                                        cg_cm, fr->cginfo,
8210                                        ind_p,
8211                                        ibuf_p, ibuf_nalloc_p,
8212                                        vbuf_p,
8213                                        nsend_p, nat_p,
8214                                        nsend_zone_p);
8215                 }
8216
8217                 /* Append data of threads>=1 to the communication buffers */
8218                 for (th = 1; th < comm->nth; th++)
8219                 {
8220                     dd_comm_setup_work_t *dth;
8221                     int                   i, ns1;
8222
8223                     dth = &comm->dth[th];
8224
8225                     ns1 = nsend + dth->nsend_zone;
8226                     if (ns1 > ind->nalloc)
8227                     {
8228                         ind->nalloc = over_alloc_dd(ns1);
8229                         srenew(ind->index, ind->nalloc);
8230                     }
8231                     if (ns1 > comm->nalloc_int)
8232                     {
8233                         comm->nalloc_int = over_alloc_dd(ns1);
8234                         srenew(comm->buf_int, comm->nalloc_int);
8235                     }
8236                     if (ns1 > comm->vbuf.nalloc)
8237                     {
8238                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8239                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8240                     }
8241
8242                     for (i = 0; i < dth->nsend_zone; i++)
8243                     {
8244                         ind->index[nsend]    = dth->ind.index[i];
8245                         comm->buf_int[nsend] = dth->ibuf[i];
8246                         copy_rvec(dth->vbuf.v[i],
8247                                   comm->vbuf.v[nsend]);
8248                         nsend++;
8249                     }
8250                     nat              += dth->nat;
8251                     ind->nsend[zone] += dth->nsend_zone;
8252                 }
8253             }
8254             /* Clear the counts in case we do not have pbc */
8255             for (zone = nzone_send; zone < nzone; zone++)
8256             {
8257                 ind->nsend[zone] = 0;
8258             }
8259             ind->nsend[nzone]   = nsend;
8260             ind->nsend[nzone+1] = nat;
8261             /* Communicate the number of cg's and atoms to receive */
8262             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8263                             ind->nsend, nzone+2,
8264                             ind->nrecv, nzone+2);
8265
8266             /* The rvec buffer is also required for atom buffers of size nsend
8267              * in dd_move_x and dd_move_f.
8268              */
8269             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8270
8271             if (p > 0)
8272             {
8273                 /* We can receive in place if only the last zone is not empty */
8274                 for (zone = 0; zone < nzone-1; zone++)
8275                 {
8276                     if (ind->nrecv[zone] > 0)
8277                     {
8278                         cd->bInPlace = FALSE;
8279                     }
8280                 }
8281                 if (!cd->bInPlace)
8282                 {
8283                     /* The int buffer is only required here for the cg indices */
8284                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8285                     {
8286                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8287                         srenew(comm->buf_int2, comm->nalloc_int2);
8288                     }
8289                     /* The rvec buffer is also required for atom buffers
8290                      * of size nrecv in dd_move_x and dd_move_f.
8291                      */
8292                     i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8293                     vec_rvec_check_alloc(&comm->vbuf2, i);
8294                 }
8295             }
8296
8297             /* Make space for the global cg indices */
8298             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8299                 || dd->cg_nalloc == 0)
8300             {
8301                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8302                 srenew(index_gl, dd->cg_nalloc);
8303                 srenew(cgindex, dd->cg_nalloc+1);
8304             }
8305             /* Communicate the global cg indices */
8306             if (cd->bInPlace)
8307             {
8308                 recv_i = index_gl + pos_cg;
8309             }
8310             else
8311             {
8312                 recv_i = comm->buf_int2;
8313             }
8314             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8315                             comm->buf_int, nsend,
8316                             recv_i,        ind->nrecv[nzone]);
8317
8318             /* Make space for cg_cm */
8319             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8320             if (fr->cutoff_scheme == ecutsGROUP)
8321             {
8322                 cg_cm = fr->cg_cm;
8323             }
8324             else
8325             {
8326                 cg_cm = state->x;
8327             }
8328             /* Communicate cg_cm */
8329             if (cd->bInPlace)
8330             {
8331                 recv_vr = cg_cm + pos_cg;
8332             }
8333             else
8334             {
8335                 recv_vr = comm->vbuf2.v;
8336             }
8337             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8338                              comm->vbuf.v, nsend,
8339                              recv_vr,      ind->nrecv[nzone]);
8340
8341             /* Make the charge group index */
8342             if (cd->bInPlace)
8343             {
8344                 zone = (p == 0 ? 0 : nzone - 1);
8345                 while (zone < nzone)
8346                 {
8347                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8348                     {
8349                         cg_gl              = index_gl[pos_cg];
8350                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8351                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8352                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8353                         if (bBondComm)
8354                         {
8355                             /* Update the charge group presence,
8356                              * so we can use it in the next pass of the loop.
8357                              */
8358                             comm->bLocalCG[cg_gl] = TRUE;
8359                         }
8360                         pos_cg++;
8361                     }
8362                     if (p == 0)
8363                     {
8364                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8365                     }
8366                     zone++;
8367                     zone_cg_range[nzone+zone] = pos_cg;
8368                 }
8369             }
8370             else
8371             {
8372                 /* This part of the code is never executed with bBondComm. */
8373                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8374                                  index_gl, recv_i, cg_cm, recv_vr,
8375                                  cgindex, fr->cginfo_mb, fr->cginfo);
8376                 pos_cg += ind->nrecv[nzone];
8377             }
8378             nat_tot += ind->nrecv[nzone+1];
8379         }
8380         if (!cd->bInPlace)
8381         {
8382             /* Store the atom block for easy copying of communication buffers */
8383             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8384         }
8385         nzone += nzone;
8386     }
8387     dd->index_gl = index_gl;
8388     dd->cgindex  = cgindex;
8389
8390     dd->ncg_tot          = zone_cg_range[zones->n];
8391     dd->nat_tot          = nat_tot;
8392     comm->nat[ddnatHOME] = dd->nat_home;
8393     for (i = ddnatZONE; i < ddnatNR; i++)
8394     {
8395         comm->nat[i] = dd->nat_tot;
8396     }
8397
8398     if (!bBondComm)
8399     {
8400         /* We don't need to update cginfo, since that was alrady done above.
8401          * So we pass NULL for the forcerec.
8402          */
8403         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8404                       NULL, comm->bLocalCG);
8405     }
8406
8407     if (debug)
8408     {
8409         fprintf(debug, "Finished setting up DD communication, zones:");
8410         for (c = 0; c < zones->n; c++)
8411         {
8412             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8413         }
8414         fprintf(debug, "\n");
8415     }
8416 }
8417
8418 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8419 {
8420     int c;
8421
8422     for (c = 0; c < zones->nizone; c++)
8423     {
8424         zones->izone[c].cg1  = zones->cg_range[c+1];
8425         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8426         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8427     }
8428 }
8429
8430 static void set_zones_size(gmx_domdec_t *dd,
8431                            matrix box, const gmx_ddbox_t *ddbox,
8432                            int zone_start, int zone_end)
8433 {
8434     gmx_domdec_comm_t  *comm;
8435     gmx_domdec_zones_t *zones;
8436     gmx_bool            bDistMB;
8437     int                 z, zi, zj0, zj1, d, dim;
8438     real                rcs, rcmbs;
8439     int                 i, j;
8440     real                size_j, add_tric;
8441     real                vol;
8442
8443     comm = dd->comm;
8444
8445     zones = &comm->zones;
8446
8447     /* Do we need to determine extra distances for multi-body bondeds? */
8448     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8449
8450     for (z = zone_start; z < zone_end; z++)
8451     {
8452         /* Copy cell limits to zone limits.
8453          * Valid for non-DD dims and non-shifted dims.
8454          */
8455         copy_rvec(comm->cell_x0, zones->size[z].x0);
8456         copy_rvec(comm->cell_x1, zones->size[z].x1);
8457     }
8458
8459     for (d = 0; d < dd->ndim; d++)
8460     {
8461         dim = dd->dim[d];
8462
8463         for (z = 0; z < zones->n; z++)
8464         {
8465             /* With a staggered grid we have different sizes
8466              * for non-shifted dimensions.
8467              */
8468             if (dd->bGridJump && zones->shift[z][dim] == 0)
8469             {
8470                 if (d == 1)
8471                 {
8472                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8473                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8474                 }
8475                 else if (d == 2)
8476                 {
8477                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8478                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8479                 }
8480             }
8481         }
8482
8483         rcs   = comm->cutoff;
8484         rcmbs = comm->cutoff_mbody;
8485         if (ddbox->tric_dir[dim])
8486         {
8487             rcs   /= ddbox->skew_fac[dim];
8488             rcmbs /= ddbox->skew_fac[dim];
8489         }
8490
8491         /* Set the lower limit for the shifted zone dimensions */
8492         for (z = zone_start; z < zone_end; z++)
8493         {
8494             if (zones->shift[z][dim] > 0)
8495             {
8496                 dim = dd->dim[d];
8497                 if (!dd->bGridJump || d == 0)
8498                 {
8499                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8500                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8501                 }
8502                 else
8503                 {
8504                     /* Here we take the lower limit of the zone from
8505                      * the lowest domain of the zone below.
8506                      */
8507                     if (z < 4)
8508                     {
8509                         zones->size[z].x0[dim] =
8510                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8511                     }
8512                     else
8513                     {
8514                         if (d == 1)
8515                         {
8516                             zones->size[z].x0[dim] =
8517                                 zones->size[zone_perm[2][z-4]].x0[dim];
8518                         }
8519                         else
8520                         {
8521                             zones->size[z].x0[dim] =
8522                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8523                         }
8524                     }
8525                     /* A temporary limit, is updated below */
8526                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8527
8528                     if (bDistMB)
8529                     {
8530                         for (zi = 0; zi < zones->nizone; zi++)
8531                         {
8532                             if (zones->shift[zi][dim] == 0)
8533                             {
8534                                 /* This takes the whole zone into account.
8535                                  * With multiple pulses this will lead
8536                                  * to a larger zone then strictly necessary.
8537                                  */
8538                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8539                                                              zones->size[zi].x1[dim]+rcmbs);
8540                             }
8541                         }
8542                     }
8543                 }
8544             }
8545         }
8546
8547         /* Loop over the i-zones to set the upper limit of each
8548          * j-zone they see.
8549          */
8550         for (zi = 0; zi < zones->nizone; zi++)
8551         {
8552             if (zones->shift[zi][dim] == 0)
8553             {
8554                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8555                 {
8556                     if (zones->shift[z][dim] > 0)
8557                     {
8558                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8559                                                      zones->size[zi].x1[dim]+rcs);
8560                     }
8561                 }
8562             }
8563         }
8564     }
8565
8566     for (z = zone_start; z < zone_end; z++)
8567     {
8568         /* Initialization only required to keep the compiler happy */
8569         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8570         int  nc, c;
8571
8572         /* To determine the bounding box for a zone we need to find
8573          * the extreme corners of 4, 2 or 1 corners.
8574          */
8575         nc = 1 << (ddbox->npbcdim - 1);
8576
8577         for (c = 0; c < nc; c++)
8578         {
8579             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8580             corner[XX] = 0;
8581             if ((c & 1) == 0)
8582             {
8583                 corner[YY] = zones->size[z].x0[YY];
8584             }
8585             else
8586             {
8587                 corner[YY] = zones->size[z].x1[YY];
8588             }
8589             if ((c & 2) == 0)
8590             {
8591                 corner[ZZ] = zones->size[z].x0[ZZ];
8592             }
8593             else
8594             {
8595                 corner[ZZ] = zones->size[z].x1[ZZ];
8596             }
8597             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8598             {
8599                 /* With 1D domain decomposition the cg's are not in
8600                  * the triclinic box, but triclinic x-y and rectangular y-z.
8601                  * Shift y back, so it will later end up at 0.
8602                  */
8603                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8604             }
8605             /* Apply the triclinic couplings */
8606             for (i = YY; i < ddbox->npbcdim; i++)
8607             {
8608                 for (j = XX; j < i; j++)
8609                 {
8610                     corner[j] += corner[i]*box[i][j]/box[i][i];
8611                 }
8612             }
8613             if (c == 0)
8614             {
8615                 copy_rvec(corner, corner_min);
8616                 copy_rvec(corner, corner_max);
8617             }
8618             else
8619             {
8620                 for (i = 0; i < DIM; i++)
8621                 {
8622                     corner_min[i] = min(corner_min[i], corner[i]);
8623                     corner_max[i] = max(corner_max[i], corner[i]);
8624                 }
8625             }
8626         }
8627         /* Copy the extreme cornes without offset along x */
8628         for (i = 0; i < DIM; i++)
8629         {
8630             zones->size[z].bb_x0[i] = corner_min[i];
8631             zones->size[z].bb_x1[i] = corner_max[i];
8632         }
8633         /* Add the offset along x */
8634         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8635         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8636     }
8637
8638     if (zone_start == 0)
8639     {
8640         vol = 1;
8641         for (dim = 0; dim < DIM; dim++)
8642         {
8643             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8644         }
8645         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8646     }
8647
8648     if (debug)
8649     {
8650         for (z = zone_start; z < zone_end; z++)
8651         {
8652             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8653                     z,
8654                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8655                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8656                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8657             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8658                     z,
8659                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8660                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8661                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8662         }
8663     }
8664 }
8665
8666 static int comp_cgsort(const void *a, const void *b)
8667 {
8668     int           comp;
8669
8670     gmx_cgsort_t *cga, *cgb;
8671     cga = (gmx_cgsort_t *)a;
8672     cgb = (gmx_cgsort_t *)b;
8673
8674     comp = cga->nsc - cgb->nsc;
8675     if (comp == 0)
8676     {
8677         comp = cga->ind_gl - cgb->ind_gl;
8678     }
8679
8680     return comp;
8681 }
8682
8683 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8684                          int *a, int *buf)
8685 {
8686     int i;
8687
8688     /* Order the data */
8689     for (i = 0; i < n; i++)
8690     {
8691         buf[i] = a[sort[i].ind];
8692     }
8693
8694     /* Copy back to the original array */
8695     for (i = 0; i < n; i++)
8696     {
8697         a[i] = buf[i];
8698     }
8699 }
8700
8701 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8702                          rvec *v, rvec *buf)
8703 {
8704     int i;
8705
8706     /* Order the data */
8707     for (i = 0; i < n; i++)
8708     {
8709         copy_rvec(v[sort[i].ind], buf[i]);
8710     }
8711
8712     /* Copy back to the original array */
8713     for (i = 0; i < n; i++)
8714     {
8715         copy_rvec(buf[i], v[i]);
8716     }
8717 }
8718
8719 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8720                            rvec *v, rvec *buf)
8721 {
8722     int a, atot, cg, cg0, cg1, i;
8723
8724     if (cgindex == NULL)
8725     {
8726         /* Avoid the useless loop of the atoms within a cg */
8727         order_vec_cg(ncg, sort, v, buf);
8728
8729         return;
8730     }
8731
8732     /* Order the data */
8733     a = 0;
8734     for (cg = 0; cg < ncg; cg++)
8735     {
8736         cg0 = cgindex[sort[cg].ind];
8737         cg1 = cgindex[sort[cg].ind+1];
8738         for (i = cg0; i < cg1; i++)
8739         {
8740             copy_rvec(v[i], buf[a]);
8741             a++;
8742         }
8743     }
8744     atot = a;
8745
8746     /* Copy back to the original array */
8747     for (a = 0; a < atot; a++)
8748     {
8749         copy_rvec(buf[a], v[a]);
8750     }
8751 }
8752
8753 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8754                          int nsort_new, gmx_cgsort_t *sort_new,
8755                          gmx_cgsort_t *sort1)
8756 {
8757     int i1, i2, i_new;
8758
8759     /* The new indices are not very ordered, so we qsort them */
8760     qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8761
8762     /* sort2 is already ordered, so now we can merge the two arrays */
8763     i1    = 0;
8764     i2    = 0;
8765     i_new = 0;
8766     while (i2 < nsort2 || i_new < nsort_new)
8767     {
8768         if (i2 == nsort2)
8769         {
8770             sort1[i1++] = sort_new[i_new++];
8771         }
8772         else if (i_new == nsort_new)
8773         {
8774             sort1[i1++] = sort2[i2++];
8775         }
8776         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8777                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8778                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8779         {
8780             sort1[i1++] = sort2[i2++];
8781         }
8782         else
8783         {
8784             sort1[i1++] = sort_new[i_new++];
8785         }
8786     }
8787 }
8788
8789 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8790 {
8791     gmx_domdec_sort_t *sort;
8792     gmx_cgsort_t      *cgsort, *sort_i;
8793     int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
8794     int                sort_last, sort_skip;
8795
8796     sort = dd->comm->sort;
8797
8798     a = fr->ns.grid->cell_index;
8799
8800     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8801
8802     if (ncg_home_old >= 0)
8803     {
8804         /* The charge groups that remained in the same ns grid cell
8805          * are completely ordered. So we can sort efficiently by sorting
8806          * the charge groups that did move into the stationary list.
8807          */
8808         ncg_new   = 0;
8809         nsort2    = 0;
8810         nsort_new = 0;
8811         for (i = 0; i < dd->ncg_home; i++)
8812         {
8813             /* Check if this cg did not move to another node */
8814             if (a[i] < moved)
8815             {
8816                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8817                 {
8818                     /* This cg is new on this node or moved ns grid cell */
8819                     if (nsort_new >= sort->sort_new_nalloc)
8820                     {
8821                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8822                         srenew(sort->sort_new, sort->sort_new_nalloc);
8823                     }
8824                     sort_i = &(sort->sort_new[nsort_new++]);
8825                 }
8826                 else
8827                 {
8828                     /* This cg did not move */
8829                     sort_i = &(sort->sort2[nsort2++]);
8830                 }
8831                 /* Sort on the ns grid cell indices
8832                  * and the global topology index.
8833                  * index_gl is irrelevant with cell ns,
8834                  * but we set it here anyhow to avoid a conditional.
8835                  */
8836                 sort_i->nsc    = a[i];
8837                 sort_i->ind_gl = dd->index_gl[i];
8838                 sort_i->ind    = i;
8839                 ncg_new++;
8840             }
8841         }
8842         if (debug)
8843         {
8844             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8845                     nsort2, nsort_new);
8846         }
8847         /* Sort efficiently */
8848         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8849                      sort->sort);
8850     }
8851     else
8852     {
8853         cgsort  = sort->sort;
8854         ncg_new = 0;
8855         for (i = 0; i < dd->ncg_home; i++)
8856         {
8857             /* Sort on the ns grid cell indices
8858              * and the global topology index
8859              */
8860             cgsort[i].nsc    = a[i];
8861             cgsort[i].ind_gl = dd->index_gl[i];
8862             cgsort[i].ind    = i;
8863             if (cgsort[i].nsc < moved)
8864             {
8865                 ncg_new++;
8866             }
8867         }
8868         if (debug)
8869         {
8870             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8871         }
8872         /* Determine the order of the charge groups using qsort */
8873         qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8874     }
8875
8876     return ncg_new;
8877 }
8878
8879 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8880 {
8881     gmx_cgsort_t *sort;
8882     int           ncg_new, i, *a, na;
8883
8884     sort = dd->comm->sort->sort;
8885
8886     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8887
8888     ncg_new = 0;
8889     for (i = 0; i < na; i++)
8890     {
8891         if (a[i] >= 0)
8892         {
8893             sort[ncg_new].ind = a[i];
8894             ncg_new++;
8895         }
8896     }
8897
8898     return ncg_new;
8899 }
8900
8901 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
8902                           int ncg_home_old)
8903 {
8904     gmx_domdec_sort_t *sort;
8905     gmx_cgsort_t      *cgsort, *sort_i;
8906     int               *cgindex;
8907     int                ncg_new, i, *ibuf, cgsize;
8908     rvec              *vbuf;
8909
8910     sort = dd->comm->sort;
8911
8912     if (dd->ncg_home > sort->sort_nalloc)
8913     {
8914         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8915         srenew(sort->sort, sort->sort_nalloc);
8916         srenew(sort->sort2, sort->sort_nalloc);
8917     }
8918     cgsort = sort->sort;
8919
8920     switch (fr->cutoff_scheme)
8921     {
8922         case ecutsGROUP:
8923             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8924             break;
8925         case ecutsVERLET:
8926             ncg_new = dd_sort_order_nbnxn(dd, fr);
8927             break;
8928         default:
8929             gmx_incons("unimplemented");
8930             ncg_new = 0;
8931     }
8932
8933     /* We alloc with the old size, since cgindex is still old */
8934     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8935     vbuf = dd->comm->vbuf.v;
8936
8937     if (dd->comm->bCGs)
8938     {
8939         cgindex = dd->cgindex;
8940     }
8941     else
8942     {
8943         cgindex = NULL;
8944     }
8945
8946     /* Remove the charge groups which are no longer at home here */
8947     dd->ncg_home = ncg_new;
8948     if (debug)
8949     {
8950         fprintf(debug, "Set the new home charge group count to %d\n",
8951                 dd->ncg_home);
8952     }
8953
8954     /* Reorder the state */
8955     for (i = 0; i < estNR; i++)
8956     {
8957         if (EST_DISTR(i) && (state->flags & (1<<i)))
8958         {
8959             switch (i)
8960             {
8961                 case estX:
8962                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
8963                     break;
8964                 case estV:
8965                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
8966                     break;
8967                 case estSDX:
8968                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
8969                     break;
8970                 case estCGP:
8971                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
8972                     break;
8973                 case estLD_RNG:
8974                 case estLD_RNGI:
8975                 case estDISRE_INITF:
8976                 case estDISRE_RM3TAV:
8977                 case estORIRE_INITF:
8978                 case estORIRE_DTAV:
8979                     /* No ordering required */
8980                     break;
8981                 default:
8982                     gmx_incons("Unknown state entry encountered in dd_sort_state");
8983                     break;
8984             }
8985         }
8986     }
8987     if (fr->cutoff_scheme == ecutsGROUP)
8988     {
8989         /* Reorder cgcm */
8990         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
8991     }
8992
8993     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8994     {
8995         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8996         srenew(sort->ibuf, sort->ibuf_nalloc);
8997     }
8998     ibuf = sort->ibuf;
8999     /* Reorder the global cg index */
9000     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9001     /* Reorder the cginfo */
9002     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9003     /* Rebuild the local cg index */
9004     if (dd->comm->bCGs)
9005     {
9006         ibuf[0] = 0;
9007         for (i = 0; i < dd->ncg_home; i++)
9008         {
9009             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9010             ibuf[i+1] = ibuf[i] + cgsize;
9011         }
9012         for (i = 0; i < dd->ncg_home+1; i++)
9013         {
9014             dd->cgindex[i] = ibuf[i];
9015         }
9016     }
9017     else
9018     {
9019         for (i = 0; i < dd->ncg_home+1; i++)
9020         {
9021             dd->cgindex[i] = i;
9022         }
9023     }
9024     /* Set the home atom number */
9025     dd->nat_home = dd->cgindex[dd->ncg_home];
9026
9027     if (fr->cutoff_scheme == ecutsVERLET)
9028     {
9029         /* The atoms are now exactly in grid order, update the grid order */
9030         nbnxn_set_atomorder(fr->nbv->nbs);
9031     }
9032     else
9033     {
9034         /* Copy the sorted ns cell indices back to the ns grid struct */
9035         for (i = 0; i < dd->ncg_home; i++)
9036         {
9037             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9038         }
9039         fr->ns.grid->nr = dd->ncg_home;
9040     }
9041 }
9042
9043 static void add_dd_statistics(gmx_domdec_t *dd)
9044 {
9045     gmx_domdec_comm_t *comm;
9046     int                ddnat;
9047
9048     comm = dd->comm;
9049
9050     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9051     {
9052         comm->sum_nat[ddnat-ddnatZONE] +=
9053             comm->nat[ddnat] - comm->nat[ddnat-1];
9054     }
9055     comm->ndecomp++;
9056 }
9057
9058 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9059 {
9060     gmx_domdec_comm_t *comm;
9061     int                ddnat;
9062
9063     comm = dd->comm;
9064
9065     /* Reset all the statistics and counters for total run counting */
9066     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9067     {
9068         comm->sum_nat[ddnat-ddnatZONE] = 0;
9069     }
9070     comm->ndecomp   = 0;
9071     comm->nload     = 0;
9072     comm->load_step = 0;
9073     comm->load_sum  = 0;
9074     comm->load_max  = 0;
9075     clear_ivec(comm->load_lim);
9076     comm->load_mdf = 0;
9077     comm->load_pme = 0;
9078 }
9079
9080 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9081 {
9082     gmx_domdec_comm_t *comm;
9083     int                ddnat;
9084     double             av;
9085
9086     comm = cr->dd->comm;
9087
9088     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9089
9090     if (fplog == NULL)
9091     {
9092         return;
9093     }
9094
9095     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9096
9097     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9098     {
9099         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9100         switch (ddnat)
9101         {
9102             case ddnatZONE:
9103                 fprintf(fplog,
9104                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9105                         2, av);
9106                 break;
9107             case ddnatVSITE:
9108                 if (cr->dd->vsite_comm)
9109                 {
9110                     fprintf(fplog,
9111                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9112                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9113                             av);
9114                 }
9115                 break;
9116             case ddnatCON:
9117                 if (cr->dd->constraint_comm)
9118                 {
9119                     fprintf(fplog,
9120                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9121                             1 + ir->nLincsIter, av);
9122                 }
9123                 break;
9124             default:
9125                 gmx_incons(" Unknown type for DD statistics");
9126         }
9127     }
9128     fprintf(fplog, "\n");
9129
9130     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9131     {
9132         print_dd_load_av(fplog, cr->dd);
9133     }
9134 }
9135
9136 void dd_partition_system(FILE                *fplog,
9137                          gmx_large_int_t      step,
9138                          t_commrec           *cr,
9139                          gmx_bool             bMasterState,
9140                          int                  nstglobalcomm,
9141                          t_state             *state_global,
9142                          gmx_mtop_t          *top_global,
9143                          t_inputrec          *ir,
9144                          t_state             *state_local,
9145                          rvec               **f,
9146                          t_mdatoms           *mdatoms,
9147                          gmx_localtop_t      *top_local,
9148                          t_forcerec          *fr,
9149                          gmx_vsite_t         *vsite,
9150                          gmx_shellfc_t        shellfc,
9151                          gmx_constr_t         constr,
9152                          t_nrnb              *nrnb,
9153                          gmx_wallcycle_t      wcycle,
9154                          gmx_bool             bVerbose)
9155 {
9156     gmx_domdec_t      *dd;
9157     gmx_domdec_comm_t *comm;
9158     gmx_ddbox_t        ddbox = {0};
9159     t_block           *cgs_gl;
9160     gmx_large_int_t    step_pcoupl;
9161     rvec               cell_ns_x0, cell_ns_x1;
9162     int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9163     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
9164     gmx_bool           bRedist, bSortCG, bResortAll;
9165     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9166     real               grid_density;
9167     char               sbuf[22];
9168
9169     dd   = cr->dd;
9170     comm = dd->comm;
9171
9172     bBoxChanged = (bMasterState || DEFORM(*ir));
9173     if (ir->epc != epcNO)
9174     {
9175         /* With nstpcouple > 1 pressure coupling happens.
9176          * one step after calculating the pressure.
9177          * Box scaling happens at the end of the MD step,
9178          * after the DD partitioning.
9179          * We therefore have to do DLB in the first partitioning
9180          * after an MD step where P-coupling occured.
9181          * We need to determine the last step in which p-coupling occurred.
9182          * MRS -- need to validate this for vv?
9183          */
9184         n = ir->nstpcouple;
9185         if (n == 1)
9186         {
9187             step_pcoupl = step - 1;
9188         }
9189         else
9190         {
9191             step_pcoupl = ((step - 1)/n)*n + 1;
9192         }
9193         if (step_pcoupl >= comm->partition_step)
9194         {
9195             bBoxChanged = TRUE;
9196         }
9197     }
9198
9199     bNStGlobalComm = (step % nstglobalcomm == 0);
9200
9201     if (!comm->bDynLoadBal)
9202     {
9203         bDoDLB = FALSE;
9204     }
9205     else
9206     {
9207         /* Should we do dynamic load balacing this step?
9208          * Since it requires (possibly expensive) global communication,
9209          * we might want to do DLB less frequently.
9210          */
9211         if (bBoxChanged || ir->epc != epcNO)
9212         {
9213             bDoDLB = bBoxChanged;
9214         }
9215         else
9216         {
9217             bDoDLB = bNStGlobalComm;
9218         }
9219     }
9220
9221     /* Check if we have recorded loads on the nodes */
9222     if (comm->bRecordLoad && dd_load_count(comm))
9223     {
9224         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9225         {
9226             /* Check if we should use DLB at the second partitioning
9227              * and every 100 partitionings,
9228              * so the extra communication cost is negligible.
9229              */
9230             n         = max(100, nstglobalcomm);
9231             bCheckDLB = (comm->n_load_collect == 0 ||
9232                          comm->n_load_have % n == n-1);
9233         }
9234         else
9235         {
9236             bCheckDLB = FALSE;
9237         }
9238
9239         /* Print load every nstlog, first and last step to the log file */
9240         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9241                     comm->n_load_collect == 0 ||
9242                     (ir->nsteps >= 0 &&
9243                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9244
9245         /* Avoid extra communication due to verbose screen output
9246          * when nstglobalcomm is set.
9247          */
9248         if (bDoDLB || bLogLoad || bCheckDLB ||
9249             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9250         {
9251             get_load_distribution(dd, wcycle);
9252             if (DDMASTER(dd))
9253             {
9254                 if (bLogLoad)
9255                 {
9256                     dd_print_load(fplog, dd, step-1);
9257                 }
9258                 if (bVerbose)
9259                 {
9260                     dd_print_load_verbose(dd);
9261                 }
9262             }
9263             comm->n_load_collect++;
9264
9265             if (bCheckDLB)
9266             {
9267                 /* Since the timings are node dependent, the master decides */
9268                 if (DDMASTER(dd))
9269                 {
9270                     bTurnOnDLB =
9271                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9272                     if (debug)
9273                     {
9274                         fprintf(debug, "step %s, imb loss %f\n",
9275                                 gmx_step_str(step, sbuf),
9276                                 dd_force_imb_perf_loss(dd));
9277                     }
9278                 }
9279                 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
9280                 if (bTurnOnDLB)
9281                 {
9282                     turn_on_dlb(fplog, cr, step);
9283                     bDoDLB = TRUE;
9284                 }
9285             }
9286         }
9287         comm->n_load_have++;
9288     }
9289
9290     cgs_gl = &comm->cgs_gl;
9291
9292     bRedist = FALSE;
9293     if (bMasterState)
9294     {
9295         /* Clear the old state */
9296         clear_dd_indices(dd, 0, 0);
9297         ncgindex_set = 0;
9298
9299         set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9300                   TRUE, cgs_gl, state_global->x, &ddbox);
9301
9302         get_cg_distribution(fplog, step, dd, cgs_gl,
9303                             state_global->box, &ddbox, state_global->x);
9304
9305         dd_distribute_state(dd, cgs_gl,
9306                             state_global, state_local, f);
9307
9308         dd_make_local_cgs(dd, &top_local->cgs);
9309
9310         /* Ensure that we have space for the new distribution */
9311         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9312
9313         if (fr->cutoff_scheme == ecutsGROUP)
9314         {
9315             calc_cgcm(fplog, 0, dd->ncg_home,
9316                       &top_local->cgs, state_local->x, fr->cg_cm);
9317         }
9318
9319         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9320
9321         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9322     }
9323     else if (state_local->ddp_count != dd->ddp_count)
9324     {
9325         if (state_local->ddp_count > dd->ddp_count)
9326         {
9327             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9328         }
9329
9330         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9331         {
9332             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9333         }
9334
9335         /* Clear the old state */
9336         clear_dd_indices(dd, 0, 0);
9337
9338         /* Build the new indices */
9339         rebuild_cgindex(dd, cgs_gl->index, state_local);
9340         make_dd_indices(dd, cgs_gl->index, 0);
9341         ncgindex_set = dd->ncg_home;
9342
9343         if (fr->cutoff_scheme == ecutsGROUP)
9344         {
9345             /* Redetermine the cg COMs */
9346             calc_cgcm(fplog, 0, dd->ncg_home,
9347                       &top_local->cgs, state_local->x, fr->cg_cm);
9348         }
9349
9350         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9351
9352         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9353
9354         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9355                   TRUE, &top_local->cgs, state_local->x, &ddbox);
9356
9357         bRedist = comm->bDynLoadBal;
9358     }
9359     else
9360     {
9361         /* We have the full state, only redistribute the cgs */
9362
9363         /* Clear the non-home indices */
9364         clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9365         ncgindex_set = 0;
9366
9367         /* Avoid global communication for dim's without pbc and -gcom */
9368         if (!bNStGlobalComm)
9369         {
9370             copy_rvec(comm->box0, ddbox.box0    );
9371             copy_rvec(comm->box_size, ddbox.box_size);
9372         }
9373         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9374                   bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
9375
9376         bBoxChanged = TRUE;
9377         bRedist     = TRUE;
9378     }
9379     /* For dim's without pbc and -gcom */
9380     copy_rvec(ddbox.box0, comm->box0    );
9381     copy_rvec(ddbox.box_size, comm->box_size);
9382
9383     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9384                       step, wcycle);
9385
9386     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9387     {
9388         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9389     }
9390
9391     /* Check if we should sort the charge groups */
9392     if (comm->nstSortCG > 0)
9393     {
9394         bSortCG = (bMasterState ||
9395                    (bRedist && (step % comm->nstSortCG == 0)));
9396     }
9397     else
9398     {
9399         bSortCG = FALSE;
9400     }
9401
9402     ncg_home_old = dd->ncg_home;
9403
9404     ncg_moved = 0;
9405     if (bRedist)
9406     {
9407         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9408
9409         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9410                            state_local, f, fr,
9411                            !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9412
9413         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9414     }
9415
9416     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9417                           dd, &ddbox,
9418                           &comm->cell_x0, &comm->cell_x1,
9419                           dd->ncg_home, fr->cg_cm,
9420                           cell_ns_x0, cell_ns_x1, &grid_density);
9421
9422     if (bBoxChanged)
9423     {
9424         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9425     }
9426
9427     switch (fr->cutoff_scheme)
9428     {
9429         case ecutsGROUP:
9430             copy_ivec(fr->ns.grid->n, ncells_old);
9431             grid_first(fplog, fr->ns.grid, dd, &ddbox,
9432                        state_local->box, cell_ns_x0, cell_ns_x1,
9433                        fr->rlistlong, grid_density);
9434             break;
9435         case ecutsVERLET:
9436             nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9437             break;
9438         default:
9439             gmx_incons("unimplemented");
9440     }
9441     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9442     copy_ivec(ddbox.tric_dir, comm->tric_dir);
9443
9444     if (bSortCG)
9445     {
9446         wallcycle_sub_start(wcycle, ewcsDD_GRID);
9447
9448         /* Sort the state on charge group position.
9449          * This enables exact restarts from this step.
9450          * It also improves performance by about 15% with larger numbers
9451          * of atoms per node.
9452          */
9453
9454         /* Fill the ns grid with the home cell,
9455          * so we can sort with the indices.
9456          */
9457         set_zones_ncg_home(dd);
9458
9459         switch (fr->cutoff_scheme)
9460         {
9461             case ecutsVERLET:
9462                 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9463
9464                 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9465                                   0,
9466                                   comm->zones.size[0].bb_x0,
9467                                   comm->zones.size[0].bb_x1,
9468                                   0, dd->ncg_home,
9469                                   comm->zones.dens_zone0,
9470                                   fr->cginfo,
9471                                   state_local->x,
9472                                   ncg_moved, bRedist ? comm->moved : NULL,
9473                                   fr->nbv->grp[eintLocal].kernel_type,
9474                                   fr->nbv->grp[eintLocal].nbat);
9475
9476                 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9477                 break;
9478             case ecutsGROUP:
9479                 fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
9480                           0, dd->ncg_home, fr->cg_cm);
9481
9482                 copy_ivec(fr->ns.grid->n, ncells_new);
9483                 break;
9484             default:
9485                 gmx_incons("unimplemented");
9486         }
9487
9488         bResortAll = bMasterState;
9489
9490         /* Check if we can user the old order and ns grid cell indices
9491          * of the charge groups to sort the charge groups efficiently.
9492          */
9493         if (ncells_new[XX] != ncells_old[XX] ||
9494             ncells_new[YY] != ncells_old[YY] ||
9495             ncells_new[ZZ] != ncells_old[ZZ])
9496         {
9497             bResortAll = TRUE;
9498         }
9499
9500         if (debug)
9501         {
9502             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9503                     gmx_step_str(step, sbuf), dd->ncg_home);
9504         }
9505         dd_sort_state(dd, fr->cg_cm, fr, state_local,
9506                       bResortAll ? -1 : ncg_home_old);
9507         /* Rebuild all the indices */
9508         ga2la_clear(dd->ga2la);
9509         ncgindex_set = 0;
9510
9511         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9512     }
9513
9514     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9515
9516     /* Setup up the communication and communicate the coordinates */
9517     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9518
9519     /* Set the indices */
9520     make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9521
9522     /* Set the charge group boundaries for neighbor searching */
9523     set_cg_boundaries(&comm->zones);
9524
9525     if (fr->cutoff_scheme == ecutsVERLET)
9526     {
9527         set_zones_size(dd, state_local->box, &ddbox,
9528                        bSortCG ? 1 : 0, comm->zones.n);
9529     }
9530
9531     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9532
9533     /*
9534        write_dd_pdb("dd_home",step,"dump",top_global,cr,
9535                  -1,state_local->x,state_local->box);
9536      */
9537
9538     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9539
9540     /* Extract a local topology from the global topology */
9541     for (i = 0; i < dd->ndim; i++)
9542     {
9543         np[dd->dim[i]] = comm->cd[i].np;
9544     }
9545     dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9546                       comm->cellsize_min, np,
9547                       fr,
9548                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
9549                       vsite, top_global, top_local);
9550
9551     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9552
9553     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9554
9555     /* Set up the special atom communication */
9556     n = comm->nat[ddnatZONE];
9557     for (i = ddnatZONE+1; i < ddnatNR; i++)
9558     {
9559         switch (i)
9560         {
9561             case ddnatVSITE:
9562                 if (vsite && vsite->n_intercg_vsite)
9563                 {
9564                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
9565                 }
9566                 break;
9567             case ddnatCON:
9568                 if (dd->bInterCGcons || dd->bInterCGsettles)
9569                 {
9570                     /* Only for inter-cg constraints we need special code */
9571                     n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9572                                                   constr, ir->nProjOrder,
9573                                                   top_local->idef.il);
9574                 }
9575                 break;
9576             default:
9577                 gmx_incons("Unknown special atom type setup");
9578         }
9579         comm->nat[i] = n;
9580     }
9581
9582     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9583
9584     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9585
9586     /* Make space for the extra coordinates for virtual site
9587      * or constraint communication.
9588      */
9589     state_local->natoms = comm->nat[ddnatNR-1];
9590     if (state_local->natoms > state_local->nalloc)
9591     {
9592         dd_realloc_state(state_local, f, state_local->natoms);
9593     }
9594
9595     if (fr->bF_NoVirSum)
9596     {
9597         if (vsite && vsite->n_intercg_vsite)
9598         {
9599             nat_f_novirsum = comm->nat[ddnatVSITE];
9600         }
9601         else
9602         {
9603             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9604             {
9605                 nat_f_novirsum = dd->nat_tot;
9606             }
9607             else
9608             {
9609                 nat_f_novirsum = dd->nat_home;
9610             }
9611         }
9612     }
9613     else
9614     {
9615         nat_f_novirsum = 0;
9616     }
9617
9618     /* Set the number of atoms required for the force calculation.
9619      * Forces need to be constrained when using a twin-range setup
9620      * or with energy minimization. For simple simulations we could
9621      * avoid some allocation, zeroing and copying, but this is
9622      * probably not worth the complications ande checking.
9623      */
9624     forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9625                         dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9626
9627     /* We make the all mdatoms up to nat_tot_con.
9628      * We could save some work by only setting invmass
9629      * between nat_tot and nat_tot_con.
9630      */
9631     /* This call also sets the new number of home particles to dd->nat_home */
9632     atoms2md(top_global, ir,
9633              comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms);
9634
9635     /* Now we have the charges we can sort the FE interactions */
9636     dd_sort_local_top(dd, mdatoms, top_local);
9637
9638     if (vsite != NULL)
9639     {
9640         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9641         split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
9642     }
9643
9644     if (shellfc)
9645     {
9646         /* Make the local shell stuff, currently no communication is done */
9647         make_local_shells(cr, mdatoms, shellfc);
9648     }
9649
9650     if (ir->implicit_solvent)
9651     {
9652         make_local_gb(cr, fr->born, ir->gb_algorithm);
9653     }
9654
9655     init_bonded_thread_force_reduction(fr, &top_local->idef);
9656
9657     if (!(cr->duty & DUTY_PME))
9658     {
9659         /* Send the charges to our PME only node */
9660         gmx_pme_send_q(cr, mdatoms->nChargePerturbed,
9661                        mdatoms->chargeA, mdatoms->chargeB,
9662                        dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9663     }
9664
9665     if (constr)
9666     {
9667         set_constraints(constr, top_local, ir, mdatoms, cr);
9668     }
9669
9670     if (ir->ePull != epullNO)
9671     {
9672         /* Update the local pull groups */
9673         dd_make_local_pull_groups(dd, ir->pull, mdatoms);
9674     }
9675
9676     if (ir->bRot)
9677     {
9678         /* Update the local rotation groups */
9679         dd_make_local_rotation_groups(dd, ir->rot);
9680     }
9681
9682
9683     add_dd_statistics(dd);
9684
9685     /* Make sure we only count the cycles for this DD partitioning */
9686     clear_dd_cycle_counts(dd);
9687
9688     /* Because the order of the atoms might have changed since
9689      * the last vsite construction, we need to communicate the constructing
9690      * atom coordinates again (for spreading the forces this MD step).
9691      */
9692     dd_move_x_vsites(dd, state_local->box, state_local->x);
9693
9694     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9695
9696     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9697     {
9698         dd_move_x(dd, state_local->box, state_local->x);
9699         write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9700                      -1, state_local->x, state_local->box);
9701     }
9702
9703     /* Store the partitioning step */
9704     comm->partition_step = step;
9705
9706     /* Increase the DD partitioning counter */
9707     dd->ddp_count++;
9708     /* The state currently matches this DD partitioning count, store it */
9709     state_local->ddp_count = dd->ddp_count;
9710     if (bMasterState)
9711     {
9712         /* The DD master node knows the complete cg distribution,
9713          * store the count so we can possibly skip the cg info communication.
9714          */
9715         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9716     }
9717
9718     if (comm->DD_debug > 0)
9719     {
9720         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9721         check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9722                                 "after partitioning");
9723     }
9724 }