src/gromacs/mdlib/domdec.c

   1 /* -*- mode: c; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4; c-file-style: "stroustrup"; -*-
   2  *
   3  *
   4  * This file is part of Gromacs        Copyright (c) 1991-2008
   5  * David van der Spoel, Erik Lindahl, Berk Hess, University of Groningen.
   6  *
   7  * This program is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU General Public License
   9  * as published by the Free Software Foundation; either version 2
  10  * of the License, or (at your option) any later version.
  11  *
  12  * To help us fund GROMACS development, we humbly ask that you cite
  13  * the research papers on the package. Check out http://www.gromacs.org
  14  *
  15  * And Hey:
  16  * Gnomes, ROck Monsters And Chili Sauce
  17  */
  18
  19 #ifdef HAVE_CONFIG_H
  20 #include <config.h>
  21 #endif
  22
  23 #include <stdio.h>
  24 #include <time.h>
  25 #include <math.h>
  26 #include <string.h>
  27 #include <stdlib.h>
  28 #include "typedefs.h"
  29 #include "smalloc.h"
  30 #include "gmx_fatal.h"
  31 #include "gmx_fatal_collective.h"
  32 #include "vec.h"
  33 #include "domdec.h"
  34 #include "domdec_network.h"
  35 #include "nrnb.h"
  36 #include "pbc.h"
  37 #include "chargegroup.h"
  38 #include "constr.h"
  39 #include "mdatoms.h"
  40 #include "names.h"
  41 #include "pdbio.h"
  42 #include "futil.h"
  43 #include "force.h"
  44 #include "pme.h"
  45 #include "pull.h"
  46 #include "pull_rotation.h"
  47 #include "gmx_wallcycle.h"
  48 #include "mdrun.h"
  49 #include "nsgrid.h"
  50 #include "shellfc.h"
  51 #include "mtop_util.h"
  52 #include "gmxfio.h"
  53 #include "gmx_ga2la.h"
  54 #include "gmx_sort.h"
  55 #include "macros.h"
  56 #include "nbnxn_search.h"
  57 #include "bondf.h"
  58 #include "gmx_omp_nthreads.h"
  59
  60 #ifdef GMX_LIB_MPI
  61 #include <mpi.h>
  62 #endif
  63 #ifdef GMX_THREAD_MPI
  64 #include "tmpi.h"
  65 #endif
  66
  67 #define DDRANK(dd, rank)    (rank)
  68 #define DDMASTERRANK(dd)   (dd->masterrank)
  69
  70 typedef struct gmx_domdec_master
  71 {
  72     /* The cell boundaries */
  73     real **cell_x;
  74     /* The global charge group division */
  75     int   *ncg;    /* Number of home charge groups for each node */
  76     int   *index;  /* Index of nnodes+1 into cg */
  77     int   *cg;     /* Global charge group index */
  78     int   *nat;    /* Number of home atoms for each node. */
  79     int   *ibuf;   /* Buffer for communication */
  80     rvec  *vbuf;   /* Buffer for state scattering and gathering */
  81 } gmx_domdec_master_t;
  82
  83 typedef struct
  84 {
  85     /* The numbers of charge groups to send and receive for each cell
  86      * that requires communication, the last entry contains the total
  87      * number of atoms that needs to be communicated.
  88      */
  89     int  nsend[DD_MAXIZONE+2];
  90     int  nrecv[DD_MAXIZONE+2];
  91     /* The charge groups to send */
  92     int *index;
  93     int  nalloc;
  94     /* The atom range for non-in-place communication */
  95     int  cell2at0[DD_MAXIZONE];
  96     int  cell2at1[DD_MAXIZONE];
  97 } gmx_domdec_ind_t;
  98
  99 typedef struct
 100 {
 101     int               np;       /* Number of grid pulses in this dimension */
 102     int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 103     gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 104     int               np_nalloc;
 105     gmx_bool          bInPlace; /* Can we communicate in place?            */
 106 } gmx_domdec_comm_dim_t;
 107
 108 typedef struct
 109 {
 110     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 111     real     *cell_f;      /* State var.: cell boundaries, box relative      */
 112     real     *old_cell_f;  /* Temp. var.: old cell size                      */
 113     real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 114     real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 115     real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 116     real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 117     gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 118     real     *buf_ncd;     /* Temp. var.                                     */
 119 } gmx_domdec_root_t;
 120
 121 #define DD_NLOAD_MAX 9
 122
 123 /* Here floats are accurate enough, since these variables
 124  * only influence the load balancing, not the actual MD results.
 125  */
 126 typedef struct
 127 {
 128     int    nload;
 129     float *load;
 130     float  sum;
 131     float  max;
 132     float  sum_m;
 133     float  cvol_min;
 134     float  mdf;
 135     float  pme;
 136     int    flags;
 137 } gmx_domdec_load_t;
 138
 139 typedef struct
 140 {
 141     int  nsc;
 142     int  ind_gl;
 143     int  ind;
 144 } gmx_cgsort_t;
 145
 146 typedef struct
 147 {
 148     gmx_cgsort_t *sort;
 149     gmx_cgsort_t *sort2;
 150     int           sort_nalloc;
 151     gmx_cgsort_t *sort_new;
 152     int           sort_new_nalloc;
 153     int          *ibuf;
 154     int           ibuf_nalloc;
 155 } gmx_domdec_sort_t;
 156
 157 typedef struct
 158 {
 159     rvec *v;
 160     int   nalloc;
 161 } vec_rvec_t;
 162
 163 /* This enum determines the order of the coordinates.
 164  * ddnatHOME and ddnatZONE should be first and second,
 165  * the others can be ordered as wanted.
 166  */
 167 enum {
 168     ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 169 };
 170
 171 enum {
 172     edlbAUTO, edlbNO, edlbYES, edlbNR
 173 };
 174 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 175
 176 typedef struct
 177 {
 178     int      dim;       /* The dimension                                          */
 179     gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 180     int      nslab;     /* The number of PME slabs in this dimension              */
 181     real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 182     int     *pp_min;    /* The minimum pp node location, size nslab               */
 183     int     *pp_max;    /* The maximum pp node location,size nslab                */
 184     int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 185 } gmx_ddpme_t;
 186
 187 typedef struct
 188 {
 189     real min0;    /* The minimum bottom of this zone                        */
 190     real max1;    /* The maximum top of this zone                           */
 191     real min1;    /* The minimum top of this zone                           */
 192     real mch0;    /* The maximum bottom communicaton height for this zone   */
 193     real mch1;    /* The maximum top communicaton height for this zone      */
 194     real p1_0;    /* The bottom value of the first cell in this zone        */
 195     real p1_1;    /* The top value of the first cell in this zone           */
 196 } gmx_ddzone_t;
 197
 198 typedef struct
 199 {
 200     gmx_domdec_ind_t ind;
 201     int             *ibuf;
 202     int              ibuf_nalloc;
 203     vec_rvec_t       vbuf;
 204     int              nsend;
 205     int              nat;
 206     int              nsend_zone;
 207 } dd_comm_setup_work_t;
 208
 209 typedef struct gmx_domdec_comm
 210 {
 211     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 212      * unless stated otherwise.
 213      */
 214
 215     /* The number of decomposition dimensions for PME, 0: no PME */
 216     int         npmedecompdim;
 217     /* The number of nodes doing PME (PP/PME or only PME) */
 218     int         npmenodes;
 219     int         npmenodes_x;
 220     int         npmenodes_y;
 221     /* The communication setup including the PME only nodes */
 222     gmx_bool    bCartesianPP_PME;
 223     ivec        ntot;
 224     int         cartpmedim;
 225     int        *pmenodes;          /* size npmenodes                         */
 226     int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 227                                     * but with bCartesianPP_PME              */
 228     gmx_ddpme_t ddpme[2];
 229
 230     /* The DD particle-particle nodes only */
 231     gmx_bool bCartesianPP;
 232     int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 233
 234     /* The global charge groups */
 235     t_block cgs_gl;
 236
 237     /* Should we sort the cgs */
 238     int                nstSortCG;
 239     gmx_domdec_sort_t *sort;
 240
 241     /* Are there charge groups? */
 242     gmx_bool bCGs;
 243
 244     /* Are there bonded and multi-body interactions between charge groups? */
 245     gmx_bool bInterCGBondeds;
 246     gmx_bool bInterCGMultiBody;
 247
 248     /* Data for the optional bonded interaction atom communication range */
 249     gmx_bool  bBondComm;
 250     t_blocka *cglink;
 251     char     *bLocalCG;
 252
 253     /* The DLB option */
 254     int      eDLB;
 255     /* Are we actually using DLB? */
 256     gmx_bool bDynLoadBal;
 257
 258     /* Cell sizes for static load balancing, first index cartesian */
 259     real **slb_frac;
 260
 261     /* The width of the communicated boundaries */
 262     real     cutoff_mbody;
 263     real     cutoff;
 264     /* The minimum cell size (including triclinic correction) */
 265     rvec     cellsize_min;
 266     /* For dlb, for use with edlbAUTO */
 267     rvec     cellsize_min_dlb;
 268     /* The lower limit for the DD cell size with DLB */
 269     real     cellsize_limit;
 270     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 271     gmx_bool bVacDLBNoLimit;
 272
 273     /* With PME load balancing we set limits on DLB */
 274     gmx_bool bPMELoadBalDLBLimits;
 275     /* DLB needs to take into account that we want to allow this maximum
 276      * cut-off (for PME load balancing), this could limit cell boundaries.
 277      */
 278     real PMELoadBal_max_cutoff;
 279
 280     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 281     ivec tric_dir;
 282     /* box0 and box_size are required with dim's without pbc and -gcom */
 283     rvec box0;
 284     rvec box_size;
 285
 286     /* The cell boundaries */
 287     rvec cell_x0;
 288     rvec cell_x1;
 289
 290     /* The old location of the cell boundaries, to check cg displacements */
 291     rvec old_cell_x0;
 292     rvec old_cell_x1;
 293
 294     /* The communication setup and charge group boundaries for the zones */
 295     gmx_domdec_zones_t zones;
 296
 297     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 298      * cell boundaries of neighboring cells for dynamic load balancing.
 299      */
 300     gmx_ddzone_t zone_d1[2];
 301     gmx_ddzone_t zone_d2[2][2];
 302
 303     /* The coordinate/force communication setup and indices */
 304     gmx_domdec_comm_dim_t cd[DIM];
 305     /* The maximum number of cells to communicate with in one dimension */
 306     int                   maxpulse;
 307
 308     /* Which cg distribution is stored on the master node */
 309     int master_cg_ddp_count;
 310
 311     /* The number of cg's received from the direct neighbors */
 312     int  zone_ncg1[DD_MAXZONE];
 313
 314     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 315     int  nat[ddnatNR];
 316
 317     /* Array for signalling if atoms have moved to another domain */
 318     int  *moved;
 319     int   moved_nalloc;
 320
 321     /* Communication buffer for general use */
 322     int  *buf_int;
 323     int   nalloc_int;
 324
 325     /* Communication buffer for general use */
 326     vec_rvec_t vbuf;
 327
 328     /* Temporary storage for thread parallel communication setup */
 329     int                   nth;
 330     dd_comm_setup_work_t *dth;
 331
 332     /* Communication buffers only used with multiple grid pulses */
 333     int       *buf_int2;
 334     int        nalloc_int2;
 335     vec_rvec_t vbuf2;
 336
 337     /* Communication buffers for local redistribution */
 338     int  **cggl_flag;
 339     int    cggl_flag_nalloc[DIM*2];
 340     rvec **cgcm_state;
 341     int    cgcm_state_nalloc[DIM*2];
 342
 343     /* Cell sizes for dynamic load balancing */
 344     gmx_domdec_root_t **root;
 345     real               *cell_f_row;
 346     real                cell_f0[DIM];
 347     real                cell_f1[DIM];
 348     real                cell_f_max0[DIM];
 349     real                cell_f_min1[DIM];
 350
 351     /* Stuff for load communication */
 352     gmx_bool           bRecordLoad;
 353     gmx_domdec_load_t *load;
 354 #ifdef GMX_MPI
 355     MPI_Comm          *mpi_comm_load;
 356 #endif
 357
 358     /* Maximum DLB scaling per load balancing step in percent */
 359     int dlb_scale_lim;
 360
 361     /* Cycle counters */
 362     float  cycl[ddCyclNr];
 363     int    cycl_n[ddCyclNr];
 364     float  cycl_max[ddCyclNr];
 365     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 366     int    eFlop;
 367     double flop;
 368     int    flop_n;
 369     /* Have often have did we have load measurements */
 370     int    n_load_have;
 371     /* Have often have we collected the load measurements */
 372     int    n_load_collect;
 373
 374     /* Statistics */
 375     double sum_nat[ddnatNR-ddnatZONE];
 376     int    ndecomp;
 377     int    nload;
 378     double load_step;
 379     double load_sum;
 380     double load_max;
 381     ivec   load_lim;
 382     double load_mdf;
 383     double load_pme;
 384
 385     /* The last partition step */
 386     gmx_large_int_t partition_step;
 387
 388     /* Debugging */
 389     int  nstDDDump;
 390     int  nstDDDumpGrid;
 391     int  DD_debug;
 392 } gmx_domdec_comm_t;
 393
 394 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 395 #define DD_CGIBS 2
 396
 397 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 398 #define DD_FLAG_NRCG  65535
 399 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 400 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 401
 402 /* Zone permutation required to obtain consecutive charge groups
 403  * for neighbor searching.
 404  */
 405 static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 406
 407 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 408  * components see only j zones with that component 0.
 409  */
 410
 411 /* The DD zone order */
 412 static const ivec dd_zo[DD_MAXZONE] =
 413 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 414
 415 /* The 3D setup */
 416 #define dd_z3n  8
 417 #define dd_zp3n 4
 418 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 419
 420 /* The 2D setup */
 421 #define dd_z2n  4
 422 #define dd_zp2n 2
 423 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 424
 425 /* The 1D setup */
 426 #define dd_z1n  2
 427 #define dd_zp1n 1
 428 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 429
 430 /* Factors used to avoid problems due to rounding issues */
 431 #define DD_CELL_MARGIN       1.0001
 432 #define DD_CELL_MARGIN2      1.00005
 433 /* Factor to account for pressure scaling during nstlist steps */
 434 #define DD_PRES_SCALE_MARGIN 1.02
 435
 436 /* Allowed performance loss before we DLB or warn */
 437 #define DD_PERF_LOSS 0.05
 438
 439 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 440
 441 /* Use separate MPI send and receive commands
 442  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 443  * This saves memory (and some copying for small nnodes).
 444  * For high parallelization scatter and gather calls are used.
 445  */
 446 #define GMX_DD_NNODES_SENDRECV 4
 447
 448
 449 /*
 450    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 451
 452    static void index2xyz(ivec nc,int ind,ivec xyz)
 453    {
 454    xyz[XX] = ind % nc[XX];
 455    xyz[YY] = (ind / nc[XX]) % nc[YY];
 456    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 457    }
 458  */
 459
 460 /* This order is required to minimize the coordinate communication in PME
 461  * which uses decomposition in the x direction.
 462  */
 463 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 464
 465 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 466 {
 467     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 468     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 469     xyz[ZZ] = ind % nc[ZZ];
 470 }
 471
 472 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 473 {
 474     int ddindex;
 475     int ddnodeid = -1;
 476
 477     ddindex = dd_index(dd->nc, c);
 478     if (dd->comm->bCartesianPP_PME)
 479     {
 480         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 481     }
 482     else if (dd->comm->bCartesianPP)
 483     {
 484 #ifdef GMX_MPI
 485         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 486 #endif
 487     }
 488     else
 489     {
 490         ddnodeid = ddindex;
 491     }
 492
 493     return ddnodeid;
 494 }
 495
 496 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 497 {
 498     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 499 }
 500
 501 int ddglatnr(gmx_domdec_t *dd, int i)
 502 {
 503     int atnr;
 504
 505     if (dd == NULL)
 506     {
 507         atnr = i + 1;
 508     }
 509     else
 510     {
 511         if (i >= dd->comm->nat[ddnatNR-1])
 512         {
 513             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 514         }
 515         atnr = dd->gatindex[i] + 1;
 516     }
 517
 518     return atnr;
 519 }
 520
 521 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 522 {
 523     return &dd->comm->cgs_gl;
 524 }
 525
 526 static void vec_rvec_init(vec_rvec_t *v)
 527 {
 528     v->nalloc = 0;
 529     v->v      = NULL;
 530 }
 531
 532 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 533 {
 534     if (n > v->nalloc)
 535     {
 536         v->nalloc = over_alloc_dd(n);
 537         srenew(v->v, v->nalloc);
 538     }
 539 }
 540
 541 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 542 {
 543     int i;
 544
 545     if (state->ddp_count != dd->ddp_count)
 546     {
 547         gmx_incons("The state does not the domain decomposition state");
 548     }
 549
 550     state->ncg_gl = dd->ncg_home;
 551     if (state->ncg_gl > state->cg_gl_nalloc)
 552     {
 553         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 554         srenew(state->cg_gl, state->cg_gl_nalloc);
 555     }
 556     for (i = 0; i < state->ncg_gl; i++)
 557     {
 558         state->cg_gl[i] = dd->index_gl[i];
 559     }
 560
 561     state->ddp_count_cg_gl = dd->ddp_count;
 562 }
 563
 564 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 565 {
 566     return &dd->comm->zones;
 567 }
 568
 569 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 570                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 571 {
 572     gmx_domdec_zones_t *zones;
 573     int                 izone, d, dim;
 574
 575     zones = &dd->comm->zones;
 576
 577     izone = 0;
 578     while (icg >= zones->izone[izone].cg1)
 579     {
 580         izone++;
 581     }
 582
 583     if (izone == 0)
 584     {
 585         *jcg0 = icg;
 586     }
 587     else if (izone < zones->nizone)
 588     {
 589         *jcg0 = zones->izone[izone].jcg0;
 590     }
 591     else
 592     {
 593         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 594                   icg, izone, zones->nizone);
 595     }
 596
 597     *jcg1 = zones->izone[izone].jcg1;
 598
 599     for (d = 0; d < dd->ndim; d++)
 600     {
 601         dim         = dd->dim[d];
 602         shift0[dim] = zones->izone[izone].shift0[dim];
 603         shift1[dim] = zones->izone[izone].shift1[dim];
 604         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 605         {
 606             /* A conservative approach, this can be optimized */
 607             shift0[dim] -= 1;
 608             shift1[dim] += 1;
 609         }
 610     }
 611 }
 612
 613 int dd_natoms_vsite(gmx_domdec_t *dd)
 614 {
 615     return dd->comm->nat[ddnatVSITE];
 616 }
 617
 618 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 619 {
 620     *at_start = dd->comm->nat[ddnatCON-1];
 621     *at_end   = dd->comm->nat[ddnatCON];
 622 }
 623
 624 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 625 {
 626     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 627     int                   *index, *cgindex;
 628     gmx_domdec_comm_t     *comm;
 629     gmx_domdec_comm_dim_t *cd;
 630     gmx_domdec_ind_t      *ind;
 631     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 632     gmx_bool               bPBC, bScrew;
 633
 634     comm = dd->comm;
 635
 636     cgindex = dd->cgindex;
 637
 638     buf = comm->vbuf.v;
 639
 640     nzone   = 1;
 641     nat_tot = dd->nat_home;
 642     for (d = 0; d < dd->ndim; d++)
 643     {
 644         bPBC   = (dd->ci[dd->dim[d]] == 0);
 645         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 646         if (bPBC)
 647         {
 648             copy_rvec(box[dd->dim[d]], shift);
 649         }
 650         cd = &comm->cd[d];
 651         for (p = 0; p < cd->np; p++)
 652         {
 653             ind   = &cd->ind[p];
 654             index = ind->index;
 655             n     = 0;
 656             if (!bPBC)
 657             {
 658                 for (i = 0; i < ind->nsend[nzone]; i++)
 659                 {
 660                     at0 = cgindex[index[i]];
 661                     at1 = cgindex[index[i]+1];
 662                     for (j = at0; j < at1; j++)
 663                     {
 664                         copy_rvec(x[j], buf[n]);
 665                         n++;
 666                     }
 667                 }
 668             }
 669             else if (!bScrew)
 670             {
 671                 for (i = 0; i < ind->nsend[nzone]; i++)
 672                 {
 673                     at0 = cgindex[index[i]];
 674                     at1 = cgindex[index[i]+1];
 675                     for (j = at0; j < at1; j++)
 676                     {
 677                         /* We need to shift the coordinates */
 678                         rvec_add(x[j], shift, buf[n]);
 679                         n++;
 680                     }
 681                 }
 682             }
 683             else
 684             {
 685                 for (i = 0; i < ind->nsend[nzone]; i++)
 686                 {
 687                     at0 = cgindex[index[i]];
 688                     at1 = cgindex[index[i]+1];
 689                     for (j = at0; j < at1; j++)
 690                     {
 691                         /* Shift x */
 692                         buf[n][XX] = x[j][XX] + shift[XX];
 693                         /* Rotate y and z.
 694                          * This operation requires a special shift force
 695                          * treatment, which is performed in calc_vir.
 696                          */
 697                         buf[n][YY] = box[YY][YY] - x[j][YY];
 698                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 699                         n++;
 700                     }
 701                 }
 702             }
 703
 704             if (cd->bInPlace)
 705             {
 706                 rbuf = x + nat_tot;
 707             }
 708             else
 709             {
 710                 rbuf = comm->vbuf2.v;
 711             }
 712             /* Send and receive the coordinates */
 713             dd_sendrecv_rvec(dd, d, dddirBackward,
 714                              buf,  ind->nsend[nzone+1],
 715                              rbuf, ind->nrecv[nzone+1]);
 716             if (!cd->bInPlace)
 717             {
 718                 j = 0;
 719                 for (zone = 0; zone < nzone; zone++)
 720                 {
 721                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 722                     {
 723                         copy_rvec(rbuf[j], x[i]);
 724                         j++;
 725                     }
 726                 }
 727             }
 728             nat_tot += ind->nrecv[nzone+1];
 729         }
 730         nzone += nzone;
 731     }
 732 }
 733
 734 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 735 {
 736     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 737     int                   *index, *cgindex;
 738     gmx_domdec_comm_t     *comm;
 739     gmx_domdec_comm_dim_t *cd;
 740     gmx_domdec_ind_t      *ind;
 741     rvec                  *buf, *sbuf;
 742     ivec                   vis;
 743     int                    is;
 744     gmx_bool               bPBC, bScrew;
 745
 746     comm = dd->comm;
 747
 748     cgindex = dd->cgindex;
 749
 750     buf = comm->vbuf.v;
 751
 752     n       = 0;
 753     nzone   = comm->zones.n/2;
 754     nat_tot = dd->nat_tot;
 755     for (d = dd->ndim-1; d >= 0; d--)
 756     {
 757         bPBC   = (dd->ci[dd->dim[d]] == 0);
 758         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 759         if (fshift == NULL && !bScrew)
 760         {
 761             bPBC = FALSE;
 762         }
 763         /* Determine which shift vector we need */
 764         clear_ivec(vis);
 765         vis[dd->dim[d]] = 1;
 766         is              = IVEC2IS(vis);
 767
 768         cd = &comm->cd[d];
 769         for (p = cd->np-1; p >= 0; p--)
 770         {
 771             ind      = &cd->ind[p];
 772             nat_tot -= ind->nrecv[nzone+1];
 773             if (cd->bInPlace)
 774             {
 775                 sbuf = f + nat_tot;
 776             }
 777             else
 778             {
 779                 sbuf = comm->vbuf2.v;
 780                 j    = 0;
 781                 for (zone = 0; zone < nzone; zone++)
 782                 {
 783                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 784                     {
 785                         copy_rvec(f[i], sbuf[j]);
 786                         j++;
 787                     }
 788                 }
 789             }
 790             /* Communicate the forces */
 791             dd_sendrecv_rvec(dd, d, dddirForward,
 792                              sbuf, ind->nrecv[nzone+1],
 793                              buf,  ind->nsend[nzone+1]);
 794             index = ind->index;
 795             /* Add the received forces */
 796             n = 0;
 797             if (!bPBC)
 798             {
 799                 for (i = 0; i < ind->nsend[nzone]; i++)
 800                 {
 801                     at0 = cgindex[index[i]];
 802                     at1 = cgindex[index[i]+1];
 803                     for (j = at0; j < at1; j++)
 804                     {
 805                         rvec_inc(f[j], buf[n]);
 806                         n++;
 807                     }
 808                 }
 809             }
 810             else if (!bScrew)
 811             {
 812                 for (i = 0; i < ind->nsend[nzone]; i++)
 813                 {
 814                     at0 = cgindex[index[i]];
 815                     at1 = cgindex[index[i]+1];
 816                     for (j = at0; j < at1; j++)
 817                     {
 818                         rvec_inc(f[j], buf[n]);
 819                         /* Add this force to the shift force */
 820                         rvec_inc(fshift[is], buf[n]);
 821                         n++;
 822                     }
 823                 }
 824             }
 825             else
 826             {
 827                 for (i = 0; i < ind->nsend[nzone]; i++)
 828                 {
 829                     at0 = cgindex[index[i]];
 830                     at1 = cgindex[index[i]+1];
 831                     for (j = at0; j < at1; j++)
 832                     {
 833                         /* Rotate the force */
 834                         f[j][XX] += buf[n][XX];
 835                         f[j][YY] -= buf[n][YY];
 836                         f[j][ZZ] -= buf[n][ZZ];
 837                         if (fshift)
 838                         {
 839                             /* Add this force to the shift force */
 840                             rvec_inc(fshift[is], buf[n]);
 841                         }
 842                         n++;
 843                     }
 844                 }
 845             }
 846         }
 847         nzone /= 2;
 848     }
 849 }
 850
 851 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 852 {
 853     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 854     int                   *index, *cgindex;
 855     gmx_domdec_comm_t     *comm;
 856     gmx_domdec_comm_dim_t *cd;
 857     gmx_domdec_ind_t      *ind;
 858     real                  *buf, *rbuf;
 859
 860     comm = dd->comm;
 861
 862     cgindex = dd->cgindex;
 863
 864     buf = &comm->vbuf.v[0][0];
 865
 866     nzone   = 1;
 867     nat_tot = dd->nat_home;
 868     for (d = 0; d < dd->ndim; d++)
 869     {
 870         cd = &comm->cd[d];
 871         for (p = 0; p < cd->np; p++)
 872         {
 873             ind   = &cd->ind[p];
 874             index = ind->index;
 875             n     = 0;
 876             for (i = 0; i < ind->nsend[nzone]; i++)
 877             {
 878                 at0 = cgindex[index[i]];
 879                 at1 = cgindex[index[i]+1];
 880                 for (j = at0; j < at1; j++)
 881                 {
 882                     buf[n] = v[j];
 883                     n++;
 884                 }
 885             }
 886
 887             if (cd->bInPlace)
 888             {
 889                 rbuf = v + nat_tot;
 890             }
 891             else
 892             {
 893                 rbuf = &comm->vbuf2.v[0][0];
 894             }
 895             /* Send and receive the coordinates */
 896             dd_sendrecv_real(dd, d, dddirBackward,
 897                              buf,  ind->nsend[nzone+1],
 898                              rbuf, ind->nrecv[nzone+1]);
 899             if (!cd->bInPlace)
 900             {
 901                 j = 0;
 902                 for (zone = 0; zone < nzone; zone++)
 903                 {
 904                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 905                     {
 906                         v[i] = rbuf[j];
 907                         j++;
 908                     }
 909                 }
 910             }
 911             nat_tot += ind->nrecv[nzone+1];
 912         }
 913         nzone += nzone;
 914     }
 915 }
 916
 917 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 918 {
 919     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 920     int                   *index, *cgindex;
 921     gmx_domdec_comm_t     *comm;
 922     gmx_domdec_comm_dim_t *cd;
 923     gmx_domdec_ind_t      *ind;
 924     real                  *buf, *sbuf;
 925
 926     comm = dd->comm;
 927
 928     cgindex = dd->cgindex;
 929
 930     buf = &comm->vbuf.v[0][0];
 931
 932     n       = 0;
 933     nzone   = comm->zones.n/2;
 934     nat_tot = dd->nat_tot;
 935     for (d = dd->ndim-1; d >= 0; d--)
 936     {
 937         cd = &comm->cd[d];
 938         for (p = cd->np-1; p >= 0; p--)
 939         {
 940             ind      = &cd->ind[p];
 941             nat_tot -= ind->nrecv[nzone+1];
 942             if (cd->bInPlace)
 943             {
 944                 sbuf = v + nat_tot;
 945             }
 946             else
 947             {
 948                 sbuf = &comm->vbuf2.v[0][0];
 949                 j    = 0;
 950                 for (zone = 0; zone < nzone; zone++)
 951                 {
 952                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 953                     {
 954                         sbuf[j] = v[i];
 955                         j++;
 956                     }
 957                 }
 958             }
 959             /* Communicate the forces */
 960             dd_sendrecv_real(dd, d, dddirForward,
 961                              sbuf, ind->nrecv[nzone+1],
 962                              buf,  ind->nsend[nzone+1]);
 963             index = ind->index;
 964             /* Add the received forces */
 965             n = 0;
 966             for (i = 0; i < ind->nsend[nzone]; i++)
 967             {
 968                 at0 = cgindex[index[i]];
 969                 at1 = cgindex[index[i]+1];
 970                 for (j = at0; j < at1; j++)
 971                 {
 972                     v[j] += buf[n];
 973                     n++;
 974                 }
 975             }
 976         }
 977         nzone /= 2;
 978     }
 979 }
 980
 981 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 982 {
 983     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 984             d, i, j,
 985             zone->min0, zone->max1,
 986             zone->mch0, zone->mch0,
 987             zone->p1_0, zone->p1_1);
 988 }
 989
 990
 991 #define DDZONECOMM_MAXZONE  5
 992 #define DDZONECOMM_BUFSIZE  3
 993
 994 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
 995                                int ddimind, int direction,
 996                                gmx_ddzone_t *buf_s, int n_s,
 997                                gmx_ddzone_t *buf_r, int n_r)
 998 {
 999 #define ZBS  DDZONECOMM_BUFSIZE
1000     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1001     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1002     int  i;
1003
1004     for (i = 0; i < n_s; i++)
1005     {
1006         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1007         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1008         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1009         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1010         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1011         vbuf_s[i*ZBS+1][2] = 0;
1012         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1013         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1014         vbuf_s[i*ZBS+2][2] = 0;
1015     }
1016
1017     dd_sendrecv_rvec(dd, ddimind, direction,
1018                      vbuf_s, n_s*ZBS,
1019                      vbuf_r, n_r*ZBS);
1020
1021     for (i = 0; i < n_r; i++)
1022     {
1023         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1024         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1025         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1026         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1027         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1028         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1029         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1030     }
1031
1032 #undef ZBS
1033 }
1034
1035 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
1036                           rvec cell_ns_x0, rvec cell_ns_x1)
1037 {
1038     int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
1039     gmx_ddzone_t      *zp;
1040     gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
1041     gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
1042     gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
1043     rvec               extr_s[2], extr_r[2];
1044     rvec               dh;
1045     real               dist_d, c = 0, det;
1046     gmx_domdec_comm_t *comm;
1047     gmx_bool           bPBC, bUse;
1048
1049     comm = dd->comm;
1050
1051     for (d = 1; d < dd->ndim; d++)
1052     {
1053         dim      = dd->dim[d];
1054         zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1055         zp->min0 = cell_ns_x0[dim];
1056         zp->max1 = cell_ns_x1[dim];
1057         zp->min1 = cell_ns_x1[dim];
1058         zp->mch0 = cell_ns_x0[dim];
1059         zp->mch1 = cell_ns_x1[dim];
1060         zp->p1_0 = cell_ns_x0[dim];
1061         zp->p1_1 = cell_ns_x1[dim];
1062     }
1063
1064     for (d = dd->ndim-2; d >= 0; d--)
1065     {
1066         dim  = dd->dim[d];
1067         bPBC = (dim < ddbox->npbcdim);
1068
1069         /* Use an rvec to store two reals */
1070         extr_s[d][0] = comm->cell_f0[d+1];
1071         extr_s[d][1] = comm->cell_f1[d+1];
1072         extr_s[d][2] = comm->cell_f1[d+1];
1073
1074         pos = 0;
1075         /* Store the extremes in the backward sending buffer,
1076          * so the get updated separately from the forward communication.
1077          */
1078         for (d1 = d; d1 < dd->ndim-1; d1++)
1079         {
1080             /* We invert the order to be able to use the same loop for buf_e */
1081             buf_s[pos].min0 = extr_s[d1][1];
1082             buf_s[pos].max1 = extr_s[d1][0];
1083             buf_s[pos].min1 = extr_s[d1][2];
1084             buf_s[pos].mch0 = 0;
1085             buf_s[pos].mch1 = 0;
1086             /* Store the cell corner of the dimension we communicate along */
1087             buf_s[pos].p1_0 = comm->cell_x0[dim];
1088             buf_s[pos].p1_1 = 0;
1089             pos++;
1090         }
1091
1092         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1093         pos++;
1094
1095         if (dd->ndim == 3 && d == 0)
1096         {
1097             buf_s[pos] = comm->zone_d2[0][1];
1098             pos++;
1099             buf_s[pos] = comm->zone_d1[0];
1100             pos++;
1101         }
1102
1103         /* We only need to communicate the extremes
1104          * in the forward direction
1105          */
1106         npulse = comm->cd[d].np;
1107         if (bPBC)
1108         {
1109             /* Take the minimum to avoid double communication */
1110             npulse_min = min(npulse, dd->nc[dim]-1-npulse);
1111         }
1112         else
1113         {
1114             /* Without PBC we should really not communicate over
1115              * the boundaries, but implementing that complicates
1116              * the communication setup and therefore we simply
1117              * do all communication, but ignore some data.
1118              */
1119             npulse_min = npulse;
1120         }
1121         for (p = 0; p < npulse_min; p++)
1122         {
1123             /* Communicate the extremes forward */
1124             bUse = (bPBC || dd->ci[dim] > 0);
1125
1126             dd_sendrecv_rvec(dd, d, dddirForward,
1127                              extr_s+d, dd->ndim-d-1,
1128                              extr_r+d, dd->ndim-d-1);
1129
1130             if (bUse)
1131             {
1132                 for (d1 = d; d1 < dd->ndim-1; d1++)
1133                 {
1134                     extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
1135                     extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
1136                     extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
1137                 }
1138             }
1139         }
1140
1141         buf_size = pos;
1142         for (p = 0; p < npulse; p++)
1143         {
1144             /* Communicate all the zone information backward */
1145             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1146
1147             dd_sendrecv_ddzone(dd, d, dddirBackward,
1148                                buf_s, buf_size,
1149                                buf_r, buf_size);
1150
1151             clear_rvec(dh);
1152             if (p > 0)
1153             {
1154                 for (d1 = d+1; d1 < dd->ndim; d1++)
1155                 {
1156                     /* Determine the decrease of maximum required
1157                      * communication height along d1 due to the distance along d,
1158                      * this avoids a lot of useless atom communication.
1159                      */
1160                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1161
1162                     if (ddbox->tric_dir[dim])
1163                     {
1164                         /* c is the off-diagonal coupling between the cell planes
1165                          * along directions d and d1.
1166                          */
1167                         c = ddbox->v[dim][dd->dim[d1]][dim];
1168                     }
1169                     else
1170                     {
1171                         c = 0;
1172                     }
1173                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1174                     if (det > 0)
1175                     {
1176                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1177                     }
1178                     else
1179                     {
1180                         /* A negative value signals out of range */
1181                         dh[d1] = -1;
1182                     }
1183                 }
1184             }
1185
1186             /* Accumulate the extremes over all pulses */
1187             for (i = 0; i < buf_size; i++)
1188             {
1189                 if (p == 0)
1190                 {
1191                     buf_e[i] = buf_r[i];
1192                 }
1193                 else
1194                 {
1195                     if (bUse)
1196                     {
1197                         buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
1198                         buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
1199                         buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
1200                     }
1201
1202                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1203                     {
1204                         d1 = 1;
1205                     }
1206                     else
1207                     {
1208                         d1 = d + 1;
1209                     }
1210                     if (bUse && dh[d1] >= 0)
1211                     {
1212                         buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
1213                         buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
1214                     }
1215                 }
1216                 /* Copy the received buffer to the send buffer,
1217                  * to pass the data through with the next pulse.
1218                  */
1219                 buf_s[i] = buf_r[i];
1220             }
1221             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1222                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1223             {
1224                 /* Store the extremes */
1225                 pos = 0;
1226
1227                 for (d1 = d; d1 < dd->ndim-1; d1++)
1228                 {
1229                     extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
1230                     extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
1231                     extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
1232                     pos++;
1233                 }
1234
1235                 if (d == 1 || (d == 0 && dd->ndim == 3))
1236                 {
1237                     for (i = d; i < 2; i++)
1238                     {
1239                         comm->zone_d2[1-d][i] = buf_e[pos];
1240                         pos++;
1241                     }
1242                 }
1243                 if (d == 0)
1244                 {
1245                     comm->zone_d1[1] = buf_e[pos];
1246                     pos++;
1247                 }
1248             }
1249         }
1250     }
1251
1252     if (dd->ndim >= 2)
1253     {
1254         dim = dd->dim[1];
1255         for (i = 0; i < 2; i++)
1256         {
1257             if (debug)
1258             {
1259                 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1260             }
1261             cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1262             cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1263         }
1264     }
1265     if (dd->ndim >= 3)
1266     {
1267         dim = dd->dim[2];
1268         for (i = 0; i < 2; i++)
1269         {
1270             for (j = 0; j < 2; j++)
1271             {
1272                 if (debug)
1273                 {
1274                     print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1275                 }
1276                 cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1277                 cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1278             }
1279         }
1280     }
1281     for (d = 1; d < dd->ndim; d++)
1282     {
1283         comm->cell_f_max0[d] = extr_s[d-1][0];
1284         comm->cell_f_min1[d] = extr_s[d-1][1];
1285         if (debug)
1286         {
1287             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1288                     d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1289         }
1290     }
1291 }
1292
1293 static void dd_collect_cg(gmx_domdec_t *dd,
1294                           t_state      *state_local)
1295 {
1296     gmx_domdec_master_t *ma = NULL;
1297     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1298     t_block             *cgs_gl;
1299
1300     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1301     {
1302         /* The master has the correct distribution */
1303         return;
1304     }
1305
1306     if (state_local->ddp_count == dd->ddp_count)
1307     {
1308         ncg_home = dd->ncg_home;
1309         cg       = dd->index_gl;
1310         nat_home = dd->nat_home;
1311     }
1312     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1313     {
1314         cgs_gl = &dd->comm->cgs_gl;
1315
1316         ncg_home = state_local->ncg_gl;
1317         cg       = state_local->cg_gl;
1318         nat_home = 0;
1319         for (i = 0; i < ncg_home; i++)
1320         {
1321             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1322         }
1323     }
1324     else
1325     {
1326         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1327     }
1328
1329     buf2[0] = dd->ncg_home;
1330     buf2[1] = dd->nat_home;
1331     if (DDMASTER(dd))
1332     {
1333         ma   = dd->ma;
1334         ibuf = ma->ibuf;
1335     }
1336     else
1337     {
1338         ibuf = NULL;
1339     }
1340     /* Collect the charge group and atom counts on the master */
1341     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1342
1343     if (DDMASTER(dd))
1344     {
1345         ma->index[0] = 0;
1346         for (i = 0; i < dd->nnodes; i++)
1347         {
1348             ma->ncg[i]     = ma->ibuf[2*i];
1349             ma->nat[i]     = ma->ibuf[2*i+1];
1350             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1351
1352         }
1353         /* Make byte counts and indices */
1354         for (i = 0; i < dd->nnodes; i++)
1355         {
1356             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1357             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1358         }
1359         if (debug)
1360         {
1361             fprintf(debug, "Initial charge group distribution: ");
1362             for (i = 0; i < dd->nnodes; i++)
1363             {
1364                 fprintf(debug, " %d", ma->ncg[i]);
1365             }
1366             fprintf(debug, "\n");
1367         }
1368     }
1369
1370     /* Collect the charge group indices on the master */
1371     dd_gatherv(dd,
1372                dd->ncg_home*sizeof(int), dd->index_gl,
1373                DDMASTER(dd) ? ma->ibuf : NULL,
1374                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1375                DDMASTER(dd) ? ma->cg : NULL);
1376
1377     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1378 }
1379
1380 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1381                                     rvec *lv, rvec *v)
1382 {
1383     gmx_domdec_master_t *ma;
1384     int                  n, i, c, a, nalloc = 0;
1385     rvec                *buf = NULL;
1386     t_block             *cgs_gl;
1387
1388     ma = dd->ma;
1389
1390     if (!DDMASTER(dd))
1391     {
1392 #ifdef GMX_MPI
1393         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1394                  dd->rank, dd->mpi_comm_all);
1395 #endif
1396     }
1397     else
1398     {
1399         /* Copy the master coordinates to the global array */
1400         cgs_gl = &dd->comm->cgs_gl;
1401
1402         n = DDMASTERRANK(dd);
1403         a = 0;
1404         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1405         {
1406             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1407             {
1408                 copy_rvec(lv[a++], v[c]);
1409             }
1410         }
1411
1412         for (n = 0; n < dd->nnodes; n++)
1413         {
1414             if (n != dd->rank)
1415             {
1416                 if (ma->nat[n] > nalloc)
1417                 {
1418                     nalloc = over_alloc_dd(ma->nat[n]);
1419                     srenew(buf, nalloc);
1420                 }
1421 #ifdef GMX_MPI
1422                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1423                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1424 #endif
1425                 a = 0;
1426                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1427                 {
1428                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1429                     {
1430                         copy_rvec(buf[a++], v[c]);
1431                     }
1432                 }
1433             }
1434         }
1435         sfree(buf);
1436     }
1437 }
1438
1439 static void get_commbuffer_counts(gmx_domdec_t *dd,
1440                                   int **counts, int **disps)
1441 {
1442     gmx_domdec_master_t *ma;
1443     int                  n;
1444
1445     ma = dd->ma;
1446
1447     /* Make the rvec count and displacment arrays */
1448     *counts  = ma->ibuf;
1449     *disps   = ma->ibuf + dd->nnodes;
1450     for (n = 0; n < dd->nnodes; n++)
1451     {
1452         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1453         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1454     }
1455 }
1456
1457 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1458                                    rvec *lv, rvec *v)
1459 {
1460     gmx_domdec_master_t *ma;
1461     int                 *rcounts = NULL, *disps = NULL;
1462     int                  n, i, c, a;
1463     rvec                *buf = NULL;
1464     t_block             *cgs_gl;
1465
1466     ma = dd->ma;
1467
1468     if (DDMASTER(dd))
1469     {
1470         get_commbuffer_counts(dd, &rcounts, &disps);
1471
1472         buf = ma->vbuf;
1473     }
1474
1475     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1476
1477     if (DDMASTER(dd))
1478     {
1479         cgs_gl = &dd->comm->cgs_gl;
1480
1481         a = 0;
1482         for (n = 0; n < dd->nnodes; n++)
1483         {
1484             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1485             {
1486                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1487                 {
1488                     copy_rvec(buf[a++], v[c]);
1489                 }
1490             }
1491         }
1492     }
1493 }
1494
1495 void dd_collect_vec(gmx_domdec_t *dd,
1496                     t_state *state_local, rvec *lv, rvec *v)
1497 {
1498     gmx_domdec_master_t *ma;
1499     int                  n, i, c, a, nalloc = 0;
1500     rvec                *buf = NULL;
1501
1502     dd_collect_cg(dd, state_local);
1503
1504     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1505     {
1506         dd_collect_vec_sendrecv(dd, lv, v);
1507     }
1508     else
1509     {
1510         dd_collect_vec_gatherv(dd, lv, v);
1511     }
1512 }
1513
1514
1515 void dd_collect_state(gmx_domdec_t *dd,
1516                       t_state *state_local, t_state *state)
1517 {
1518     int est, i, j, nh;
1519
1520     nh = state->nhchainlength;
1521
1522     if (DDMASTER(dd))
1523     {
1524         for (i = 0; i < efptNR; i++)
1525         {
1526             state->lambda[i] = state_local->lambda[i];
1527         }
1528         state->fep_state = state_local->fep_state;
1529         state->veta      = state_local->veta;
1530         state->vol0      = state_local->vol0;
1531         copy_mat(state_local->box, state->box);
1532         copy_mat(state_local->boxv, state->boxv);
1533         copy_mat(state_local->svir_prev, state->svir_prev);
1534         copy_mat(state_local->fvir_prev, state->fvir_prev);
1535         copy_mat(state_local->pres_prev, state->pres_prev);
1536
1537
1538         for (i = 0; i < state_local->ngtc; i++)
1539         {
1540             for (j = 0; j < nh; j++)
1541             {
1542                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1543                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1544             }
1545             state->therm_integral[i] = state_local->therm_integral[i];
1546         }
1547         for (i = 0; i < state_local->nnhpres; i++)
1548         {
1549             for (j = 0; j < nh; j++)
1550             {
1551                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1552                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1553             }
1554         }
1555     }
1556     for (est = 0; est < estNR; est++)
1557     {
1558         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1559         {
1560             switch (est)
1561             {
1562                 case estX:
1563                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1564                     break;
1565                 case estV:
1566                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1567                     break;
1568                 case estSDX:
1569                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1570                     break;
1571                 case estCGP:
1572                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1573                     break;
1574                 case estLD_RNG:
1575                     if (state->nrngi == 1)
1576                     {
1577                         if (DDMASTER(dd))
1578                         {
1579                             for (i = 0; i < state_local->nrng; i++)
1580                             {
1581                                 state->ld_rng[i] = state_local->ld_rng[i];
1582                             }
1583                         }
1584                     }
1585                     else
1586                     {
1587                         dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
1588                                   state_local->ld_rng, state->ld_rng);
1589                     }
1590                     break;
1591                 case estLD_RNGI:
1592                     if (state->nrngi == 1)
1593                     {
1594                         if (DDMASTER(dd))
1595                         {
1596                             state->ld_rngi[0] = state_local->ld_rngi[0];
1597                         }
1598                     }
1599                     else
1600                     {
1601                         dd_gather(dd, sizeof(state->ld_rngi[0]),
1602                                   state_local->ld_rngi, state->ld_rngi);
1603                     }
1604                     break;
1605                 case estDISRE_INITF:
1606                 case estDISRE_RM3TAV:
1607                 case estORIRE_INITF:
1608                 case estORIRE_DTAV:
1609                     break;
1610                 default:
1611                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1612             }
1613         }
1614     }
1615 }
1616
1617 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1618 {
1619     int est;
1620
1621     if (debug)
1622     {
1623         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1624     }
1625
1626     state->nalloc = over_alloc_dd(nalloc);
1627
1628     for (est = 0; est < estNR; est++)
1629     {
1630         if (EST_DISTR(est) && (state->flags & (1<<est)))
1631         {
1632             switch (est)
1633             {
1634                 case estX:
1635                     srenew(state->x, state->nalloc);
1636                     break;
1637                 case estV:
1638                     srenew(state->v, state->nalloc);
1639                     break;
1640                 case estSDX:
1641                     srenew(state->sd_X, state->nalloc);
1642                     break;
1643                 case estCGP:
1644                     srenew(state->cg_p, state->nalloc);
1645                     break;
1646                 case estLD_RNG:
1647                 case estLD_RNGI:
1648                 case estDISRE_INITF:
1649                 case estDISRE_RM3TAV:
1650                 case estORIRE_INITF:
1651                 case estORIRE_DTAV:
1652                     /* No reallocation required */
1653                     break;
1654                 default:
1655                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1656             }
1657         }
1658     }
1659
1660     if (f != NULL)
1661     {
1662         srenew(*f, state->nalloc);
1663     }
1664 }
1665
1666 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1667                                int nalloc)
1668 {
1669     if (nalloc > fr->cg_nalloc)
1670     {
1671         if (debug)
1672         {
1673             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1674         }
1675         fr->cg_nalloc = over_alloc_dd(nalloc);
1676         srenew(fr->cginfo, fr->cg_nalloc);
1677         if (fr->cutoff_scheme == ecutsGROUP)
1678         {
1679             srenew(fr->cg_cm, fr->cg_nalloc);
1680         }
1681     }
1682     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1683     {
1684         /* We don't use charge groups, we use x in state to set up
1685          * the atom communication.
1686          */
1687         dd_realloc_state(state, f, nalloc);
1688     }
1689 }
1690
1691 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1692                                        rvec *v, rvec *lv)
1693 {
1694     gmx_domdec_master_t *ma;
1695     int                  n, i, c, a, nalloc = 0;
1696     rvec                *buf = NULL;
1697
1698     if (DDMASTER(dd))
1699     {
1700         ma  = dd->ma;
1701
1702         for (n = 0; n < dd->nnodes; n++)
1703         {
1704             if (n != dd->rank)
1705             {
1706                 if (ma->nat[n] > nalloc)
1707                 {
1708                     nalloc = over_alloc_dd(ma->nat[n]);
1709                     srenew(buf, nalloc);
1710                 }
1711                 /* Use lv as a temporary buffer */
1712                 a = 0;
1713                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1714                 {
1715                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1716                     {
1717                         copy_rvec(v[c], buf[a++]);
1718                     }
1719                 }
1720                 if (a != ma->nat[n])
1721                 {
1722                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1723                               a, ma->nat[n]);
1724                 }
1725
1726 #ifdef GMX_MPI
1727                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1728                          DDRANK(dd, n), n, dd->mpi_comm_all);
1729 #endif
1730             }
1731         }
1732         sfree(buf);
1733         n = DDMASTERRANK(dd);
1734         a = 0;
1735         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1736         {
1737             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1738             {
1739                 copy_rvec(v[c], lv[a++]);
1740             }
1741         }
1742     }
1743     else
1744     {
1745 #ifdef GMX_MPI
1746         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1747                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1748 #endif
1749     }
1750 }
1751
1752 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1753                                        rvec *v, rvec *lv)
1754 {
1755     gmx_domdec_master_t *ma;
1756     int                 *scounts = NULL, *disps = NULL;
1757     int                  n, i, c, a, nalloc = 0;
1758     rvec                *buf = NULL;
1759
1760     if (DDMASTER(dd))
1761     {
1762         ma  = dd->ma;
1763
1764         get_commbuffer_counts(dd, &scounts, &disps);
1765
1766         buf = ma->vbuf;
1767         a   = 0;
1768         for (n = 0; n < dd->nnodes; n++)
1769         {
1770             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1771             {
1772                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1773                 {
1774                     copy_rvec(v[c], buf[a++]);
1775                 }
1776             }
1777         }
1778     }
1779
1780     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1781 }
1782
1783 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1784 {
1785     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1786     {
1787         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1788     }
1789     else
1790     {
1791         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1792     }
1793 }
1794
1795 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1796                                 t_state *state, t_state *state_local,
1797                                 rvec **f)
1798 {
1799     int  i, j, nh;
1800
1801     nh = state->nhchainlength;
1802
1803     if (DDMASTER(dd))
1804     {
1805         for (i = 0; i < efptNR; i++)
1806         {
1807             state_local->lambda[i] = state->lambda[i];
1808         }
1809         state_local->fep_state = state->fep_state;
1810         state_local->veta      = state->veta;
1811         state_local->vol0      = state->vol0;
1812         copy_mat(state->box, state_local->box);
1813         copy_mat(state->box_rel, state_local->box_rel);
1814         copy_mat(state->boxv, state_local->boxv);
1815         copy_mat(state->svir_prev, state_local->svir_prev);
1816         copy_mat(state->fvir_prev, state_local->fvir_prev);
1817         for (i = 0; i < state_local->ngtc; i++)
1818         {
1819             for (j = 0; j < nh; j++)
1820             {
1821                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1822                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1823             }
1824             state_local->therm_integral[i] = state->therm_integral[i];
1825         }
1826         for (i = 0; i < state_local->nnhpres; i++)
1827         {
1828             for (j = 0; j < nh; j++)
1829             {
1830                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1831                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1832             }
1833         }
1834     }
1835     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1836     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1837     dd_bcast(dd, sizeof(real), &state_local->veta);
1838     dd_bcast(dd, sizeof(real), &state_local->vol0);
1839     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1840     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1841     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1842     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1843     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1844     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1845     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1846     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1847     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1848     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1849
1850     if (dd->nat_home > state_local->nalloc)
1851     {
1852         dd_realloc_state(state_local, f, dd->nat_home);
1853     }
1854     for (i = 0; i < estNR; i++)
1855     {
1856         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1857         {
1858             switch (i)
1859             {
1860                 case estX:
1861                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1862                     break;
1863                 case estV:
1864                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1865                     break;
1866                 case estSDX:
1867                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1868                     break;
1869                 case estCGP:
1870                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1871                     break;
1872                 case estLD_RNG:
1873                     if (state->nrngi == 1)
1874                     {
1875                         dd_bcastc(dd,
1876                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
1877                                   state->ld_rng, state_local->ld_rng);
1878                     }
1879                     else
1880                     {
1881                         dd_scatter(dd,
1882                                    state_local->nrng*sizeof(state_local->ld_rng[0]),
1883                                    state->ld_rng, state_local->ld_rng);
1884                     }
1885                     break;
1886                 case estLD_RNGI:
1887                     if (state->nrngi == 1)
1888                     {
1889                         dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
1890                                   state->ld_rngi, state_local->ld_rngi);
1891                     }
1892                     else
1893                     {
1894                         dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
1895                                    state->ld_rngi, state_local->ld_rngi);
1896                     }
1897                     break;
1898                 case estDISRE_INITF:
1899                 case estDISRE_RM3TAV:
1900                 case estORIRE_INITF:
1901                 case estORIRE_DTAV:
1902                     /* Not implemented yet */
1903                     break;
1904                 default:
1905                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1906             }
1907         }
1908     }
1909 }
1910
1911 static char dim2char(int dim)
1912 {
1913     char c = '?';
1914
1915     switch (dim)
1916     {
1917         case XX: c = 'X'; break;
1918         case YY: c = 'Y'; break;
1919         case ZZ: c = 'Z'; break;
1920         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1921     }
1922
1923     return c;
1924 }
1925
1926 static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step,
1927                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1928 {
1929     rvec   grid_s[2], *grid_r = NULL, cx, r;
1930     char   fname[STRLEN], format[STRLEN], buf[22];
1931     FILE  *out;
1932     int    a, i, d, z, y, x;
1933     matrix tric;
1934     real   vol;
1935
1936     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1937     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1938
1939     if (DDMASTER(dd))
1940     {
1941         snew(grid_r, 2*dd->nnodes);
1942     }
1943
1944     dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
1945
1946     if (DDMASTER(dd))
1947     {
1948         for (d = 0; d < DIM; d++)
1949         {
1950             for (i = 0; i < DIM; i++)
1951             {
1952                 if (d == i)
1953                 {
1954                     tric[d][i] = 1;
1955                 }
1956                 else
1957                 {
1958                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1959                     {
1960                         tric[d][i] = box[i][d]/box[i][i];
1961                     }
1962                     else
1963                     {
1964                         tric[d][i] = 0;
1965                     }
1966                 }
1967             }
1968         }
1969         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1970         sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
1971         out = gmx_fio_fopen(fname, "w");
1972         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1973         a = 1;
1974         for (i = 0; i < dd->nnodes; i++)
1975         {
1976             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1977             for (d = 0; d < DIM; d++)
1978             {
1979                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1980             }
1981             for (z = 0; z < 2; z++)
1982             {
1983                 for (y = 0; y < 2; y++)
1984                 {
1985                     for (x = 0; x < 2; x++)
1986                     {
1987                         cx[XX] = grid_r[i*2+x][XX];
1988                         cx[YY] = grid_r[i*2+y][YY];
1989                         cx[ZZ] = grid_r[i*2+z][ZZ];
1990                         mvmul(tric, cx, r);
1991                         fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
1992                                 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
1993                     }
1994                 }
1995             }
1996             for (d = 0; d < DIM; d++)
1997             {
1998                 for (x = 0; x < 4; x++)
1999                 {
2000                     switch (d)
2001                     {
2002                         case 0: y = 1 + i*8 + 2*x; break;
2003                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
2004                         case 2: y = 1 + i*8 + x; break;
2005                     }
2006                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
2007                 }
2008             }
2009         }
2010         gmx_fio_fclose(out);
2011         sfree(grid_r);
2012     }
2013 }
2014
2015 void write_dd_pdb(const char *fn, gmx_large_int_t step, const char *title,
2016                   gmx_mtop_t *mtop, t_commrec *cr,
2017                   int natoms, rvec x[], matrix box)
2018 {
2019     char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
2020     FILE         *out;
2021     int           i, ii, resnr, c;
2022     char         *atomname, *resname;
2023     real          b;
2024     gmx_domdec_t *dd;
2025
2026     dd = cr->dd;
2027     if (natoms == -1)
2028     {
2029         natoms = dd->comm->nat[ddnatVSITE];
2030     }
2031
2032     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
2033
2034     sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
2035     sprintf(format4, "%s%s\n", get_pdbformat4(), "%6.2f%6.2f");
2036
2037     out = gmx_fio_fopen(fname, "w");
2038
2039     fprintf(out, "TITLE     %s\n", title);
2040     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2041     for (i = 0; i < natoms; i++)
2042     {
2043         ii = dd->gatindex[i];
2044         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
2045         if (i < dd->comm->nat[ddnatZONE])
2046         {
2047             c = 0;
2048             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2049             {
2050                 c++;
2051             }
2052             b = c;
2053         }
2054         else if (i < dd->comm->nat[ddnatVSITE])
2055         {
2056             b = dd->comm->zones.n;
2057         }
2058         else
2059         {
2060             b = dd->comm->zones.n + 1;
2061         }
2062         fprintf(out, strlen(atomname) < 4 ? format : format4,
2063                 "ATOM", (ii+1)%100000,
2064                 atomname, resname, ' ', resnr%10000, ' ',
2065                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
2066     }
2067     fprintf(out, "TER\n");
2068
2069     gmx_fio_fclose(out);
2070 }
2071
2072 real dd_cutoff_mbody(gmx_domdec_t *dd)
2073 {
2074     gmx_domdec_comm_t *comm;
2075     int                di;
2076     real               r;
2077
2078     comm = dd->comm;
2079
2080     r = -1;
2081     if (comm->bInterCGBondeds)
2082     {
2083         if (comm->cutoff_mbody > 0)
2084         {
2085             r = comm->cutoff_mbody;
2086         }
2087         else
2088         {
2089             /* cutoff_mbody=0 means we do not have DLB */
2090             r = comm->cellsize_min[dd->dim[0]];
2091             for (di = 1; di < dd->ndim; di++)
2092             {
2093                 r = min(r, comm->cellsize_min[dd->dim[di]]);
2094             }
2095             if (comm->bBondComm)
2096             {
2097                 r = max(r, comm->cutoff_mbody);
2098             }
2099             else
2100             {
2101                 r = min(r, comm->cutoff);
2102             }
2103         }
2104     }
2105
2106     return r;
2107 }
2108
2109 real dd_cutoff_twobody(gmx_domdec_t *dd)
2110 {
2111     real r_mb;
2112
2113     r_mb = dd_cutoff_mbody(dd);
2114
2115     return max(dd->comm->cutoff, r_mb);
2116 }
2117
2118
2119 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
2120 {
2121     int nc, ntot;
2122
2123     nc   = dd->nc[dd->comm->cartpmedim];
2124     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2125     copy_ivec(coord, coord_pme);
2126     coord_pme[dd->comm->cartpmedim] =
2127         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2128 }
2129
2130 static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
2131 {
2132     /* Here we assign a PME node to communicate with this DD node
2133      * by assuming that the major index of both is x.
2134      * We add cr->npmenodes/2 to obtain an even distribution.
2135      */
2136     return (ddindex*npme + npme/2)/ndd;
2137 }
2138
2139 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
2140 {
2141     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
2142 }
2143
2144 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
2145 {
2146     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
2147 }
2148
2149 static int *dd_pmenodes(t_commrec *cr)
2150 {
2151     int *pmenodes;
2152     int  n, i, p0, p1;
2153
2154     snew(pmenodes, cr->npmenodes);
2155     n = 0;
2156     for (i = 0; i < cr->dd->nnodes; i++)
2157     {
2158         p0 = cr_ddindex2pmeindex(cr, i);
2159         p1 = cr_ddindex2pmeindex(cr, i+1);
2160         if (i+1 == cr->dd->nnodes || p1 > p0)
2161         {
2162             if (debug)
2163             {
2164                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
2165             }
2166             pmenodes[n] = i + 1 + n;
2167             n++;
2168         }
2169     }
2170
2171     return pmenodes;
2172 }
2173
2174 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
2175 {
2176     gmx_domdec_t *dd;
2177     ivec          coords, coords_pme, nc;
2178     int           slab;
2179
2180     dd = cr->dd;
2181     /*
2182        if (dd->comm->bCartesian) {
2183        gmx_ddindex2xyz(dd->nc,ddindex,coords);
2184        dd_coords2pmecoords(dd,coords,coords_pme);
2185        copy_ivec(dd->ntot,nc);
2186        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2187        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2188
2189        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2190        } else {
2191        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2192        }
2193      */
2194     coords[XX] = x;
2195     coords[YY] = y;
2196     coords[ZZ] = z;
2197     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
2198
2199     return slab;
2200 }
2201
2202 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
2203 {
2204     gmx_domdec_comm_t *comm;
2205     ivec               coords;
2206     int                ddindex, nodeid = -1;
2207
2208     comm = cr->dd->comm;
2209
2210     coords[XX] = x;
2211     coords[YY] = y;
2212     coords[ZZ] = z;
2213     if (comm->bCartesianPP_PME)
2214     {
2215 #ifdef GMX_MPI
2216         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
2217 #endif
2218     }
2219     else
2220     {
2221         ddindex = dd_index(cr->dd->nc, coords);
2222         if (comm->bCartesianPP)
2223         {
2224             nodeid = comm->ddindex2simnodeid[ddindex];
2225         }
2226         else
2227         {
2228             if (comm->pmenodes)
2229             {
2230                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
2231             }
2232             else
2233             {
2234                 nodeid = ddindex;
2235             }
2236         }
2237     }
2238
2239     return nodeid;
2240 }
2241
2242 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
2243 {
2244     gmx_domdec_t      *dd;
2245     gmx_domdec_comm_t *comm;
2246     ivec               coord, coord_pme;
2247     int                i;
2248     int                pmenode = -1;
2249
2250     dd   = cr->dd;
2251     comm = dd->comm;
2252
2253     /* This assumes a uniform x domain decomposition grid cell size */
2254     if (comm->bCartesianPP_PME)
2255     {
2256 #ifdef GMX_MPI
2257         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2258         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2259         {
2260             /* This is a PP node */
2261             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2262             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2263         }
2264 #endif
2265     }
2266     else if (comm->bCartesianPP)
2267     {
2268         if (sim_nodeid < dd->nnodes)
2269         {
2270             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2271         }
2272     }
2273     else
2274     {
2275         /* This assumes DD cells with identical x coordinates
2276          * are numbered sequentially.
2277          */
2278         if (dd->comm->pmenodes == NULL)
2279         {
2280             if (sim_nodeid < dd->nnodes)
2281             {
2282                 /* The DD index equals the nodeid */
2283                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2284             }
2285         }
2286         else
2287         {
2288             i = 0;
2289             while (sim_nodeid > dd->comm->pmenodes[i])
2290             {
2291                 i++;
2292             }
2293             if (sim_nodeid < dd->comm->pmenodes[i])
2294             {
2295                 pmenode = dd->comm->pmenodes[i];
2296             }
2297         }
2298     }
2299
2300     return pmenode;
2301 }
2302
2303 gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
2304 {
2305     gmx_bool bPMEOnlyNode;
2306
2307     if (DOMAINDECOMP(cr))
2308     {
2309         bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
2310     }
2311     else
2312     {
2313         bPMEOnlyNode = FALSE;
2314     }
2315
2316     return bPMEOnlyNode;
2317 }
2318
2319 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2320                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2321 {
2322     gmx_domdec_t *dd;
2323     int           x, y, z;
2324     ivec          coord, coord_pme;
2325
2326     dd = cr->dd;
2327
2328     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2329
2330     *nmy_ddnodes = 0;
2331     for (x = 0; x < dd->nc[XX]; x++)
2332     {
2333         for (y = 0; y < dd->nc[YY]; y++)
2334         {
2335             for (z = 0; z < dd->nc[ZZ]; z++)
2336             {
2337                 if (dd->comm->bCartesianPP_PME)
2338                 {
2339                     coord[XX] = x;
2340                     coord[YY] = y;
2341                     coord[ZZ] = z;
2342                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2343                     if (dd->ci[XX] == coord_pme[XX] &&
2344                         dd->ci[YY] == coord_pme[YY] &&
2345                         dd->ci[ZZ] == coord_pme[ZZ])
2346                     {
2347                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2348                     }
2349                 }
2350                 else
2351                 {
2352                     /* The slab corresponds to the nodeid in the PME group */
2353                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2354                     {
2355                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2356                     }
2357                 }
2358             }
2359         }
2360     }
2361
2362     /* The last PP-only node is the peer node */
2363     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2364
2365     if (debug)
2366     {
2367         fprintf(debug, "Receive coordinates from PP nodes:");
2368         for (x = 0; x < *nmy_ddnodes; x++)
2369         {
2370             fprintf(debug, " %d", (*my_ddnodes)[x]);
2371         }
2372         fprintf(debug, "\n");
2373     }
2374 }
2375
2376 static gmx_bool receive_vir_ener(t_commrec *cr)
2377 {
2378     gmx_domdec_comm_t *comm;
2379     int                pmenode, coords[DIM], rank;
2380     gmx_bool           bReceive;
2381
2382     bReceive = TRUE;
2383     if (cr->npmenodes < cr->dd->nnodes)
2384     {
2385         comm = cr->dd->comm;
2386         if (comm->bCartesianPP_PME)
2387         {
2388             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2389 #ifdef GMX_MPI
2390             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2391             coords[comm->cartpmedim]++;
2392             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2393             {
2394                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2395                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2396                 {
2397                     /* This is not the last PP node for pmenode */
2398                     bReceive = FALSE;
2399                 }
2400             }
2401 #endif
2402         }
2403         else
2404         {
2405             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2406             if (cr->sim_nodeid+1 < cr->nnodes &&
2407                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2408             {
2409                 /* This is not the last PP node for pmenode */
2410                 bReceive = FALSE;
2411             }
2412         }
2413     }
2414
2415     return bReceive;
2416 }
2417
2418 static void set_zones_ncg_home(gmx_domdec_t *dd)
2419 {
2420     gmx_domdec_zones_t *zones;
2421     int                 i;
2422
2423     zones = &dd->comm->zones;
2424
2425     zones->cg_range[0] = 0;
2426     for (i = 1; i < zones->n+1; i++)
2427     {
2428         zones->cg_range[i] = dd->ncg_home;
2429     }
2430 }
2431
2432 static void rebuild_cgindex(gmx_domdec_t *dd,
2433                             const int *gcgs_index, t_state *state)
2434 {
2435     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2436
2437     ind        = state->cg_gl;
2438     dd_cg_gl   = dd->index_gl;
2439     cgindex    = dd->cgindex;
2440     nat        = 0;
2441     cgindex[0] = nat;
2442     for (i = 0; i < state->ncg_gl; i++)
2443     {
2444         cgindex[i]  = nat;
2445         cg_gl       = ind[i];
2446         dd_cg_gl[i] = cg_gl;
2447         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2448     }
2449     cgindex[i] = nat;
2450
2451     dd->ncg_home = state->ncg_gl;
2452     dd->nat_home = nat;
2453
2454     set_zones_ncg_home(dd);
2455 }
2456
2457 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2458 {
2459     while (cg >= cginfo_mb->cg_end)
2460     {
2461         cginfo_mb++;
2462     }
2463
2464     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2465 }
2466
2467 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2468                           t_forcerec *fr, char *bLocalCG)
2469 {
2470     cginfo_mb_t *cginfo_mb;
2471     int         *cginfo;
2472     int          cg;
2473
2474     if (fr != NULL)
2475     {
2476         cginfo_mb = fr->cginfo_mb;
2477         cginfo    = fr->cginfo;
2478
2479         for (cg = cg0; cg < cg1; cg++)
2480         {
2481             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2482         }
2483     }
2484
2485     if (bLocalCG != NULL)
2486     {
2487         for (cg = cg0; cg < cg1; cg++)
2488         {
2489             bLocalCG[index_gl[cg]] = TRUE;
2490         }
2491     }
2492 }
2493
2494 static void make_dd_indices(gmx_domdec_t *dd,
2495                             const int *gcgs_index, int cg_start)
2496 {
2497     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2498     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2499     gmx_ga2la_t *ga2la;
2500     char        *bLocalCG;
2501     gmx_bool     bCGs;
2502
2503     bLocalCG = dd->comm->bLocalCG;
2504
2505     if (dd->nat_tot > dd->gatindex_nalloc)
2506     {
2507         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2508         srenew(dd->gatindex, dd->gatindex_nalloc);
2509     }
2510
2511     nzone      = dd->comm->zones.n;
2512     zone2cg    = dd->comm->zones.cg_range;
2513     zone_ncg1  = dd->comm->zone_ncg1;
2514     index_gl   = dd->index_gl;
2515     gatindex   = dd->gatindex;
2516     bCGs       = dd->comm->bCGs;
2517
2518     if (zone2cg[1] != dd->ncg_home)
2519     {
2520         gmx_incons("dd->ncg_zone is not up to date");
2521     }
2522
2523     /* Make the local to global and global to local atom index */
2524     a = dd->cgindex[cg_start];
2525     for (zone = 0; zone < nzone; zone++)
2526     {
2527         if (zone == 0)
2528         {
2529             cg0 = cg_start;
2530         }
2531         else
2532         {
2533             cg0 = zone2cg[zone];
2534         }
2535         cg1    = zone2cg[zone+1];
2536         cg1_p1 = cg0 + zone_ncg1[zone];
2537
2538         for (cg = cg0; cg < cg1; cg++)
2539         {
2540             zone1 = zone;
2541             if (cg >= cg1_p1)
2542             {
2543                 /* Signal that this cg is from more than one pulse away */
2544                 zone1 += nzone;
2545             }
2546             cg_gl = index_gl[cg];
2547             if (bCGs)
2548             {
2549                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2550                 {
2551                     gatindex[a] = a_gl;
2552                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2553                     a++;
2554                 }
2555             }
2556             else
2557             {
2558                 gatindex[a] = cg_gl;
2559                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2560                 a++;
2561             }
2562         }
2563     }
2564 }
2565
2566 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2567                           const char *where)
2568 {
2569     int ncg, i, ngl, nerr;
2570
2571     nerr = 0;
2572     if (bLocalCG == NULL)
2573     {
2574         return nerr;
2575     }
2576     for (i = 0; i < dd->ncg_tot; i++)
2577     {
2578         if (!bLocalCG[dd->index_gl[i]])
2579         {
2580             fprintf(stderr,
2581                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2582             nerr++;
2583         }
2584     }
2585     ngl = 0;
2586     for (i = 0; i < ncg_sys; i++)
2587     {
2588         if (bLocalCG[i])
2589         {
2590             ngl++;
2591         }
2592     }
2593     if (ngl != dd->ncg_tot)
2594     {
2595         fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2596         nerr++;
2597     }
2598
2599     return nerr;
2600 }
2601
2602 static void check_index_consistency(gmx_domdec_t *dd,
2603                                     int natoms_sys, int ncg_sys,
2604                                     const char *where)
2605 {
2606     int   nerr, ngl, i, a, cell;
2607     int  *have;
2608
2609     nerr = 0;
2610
2611     if (dd->comm->DD_debug > 1)
2612     {
2613         snew(have, natoms_sys);
2614         for (a = 0; a < dd->nat_tot; a++)
2615         {
2616             if (have[dd->gatindex[a]] > 0)
2617             {
2618                 fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2619             }
2620             else
2621             {
2622                 have[dd->gatindex[a]] = a + 1;
2623             }
2624         }
2625         sfree(have);
2626     }
2627
2628     snew(have, dd->nat_tot);
2629
2630     ngl  = 0;
2631     for (i = 0; i < natoms_sys; i++)
2632     {
2633         if (ga2la_get(dd->ga2la, i, &a, &cell))
2634         {
2635             if (a >= dd->nat_tot)
2636             {
2637                 fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2638                 nerr++;
2639             }
2640             else
2641             {
2642                 have[a] = 1;
2643                 if (dd->gatindex[a] != i)
2644                 {
2645                     fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2646                     nerr++;
2647                 }
2648             }
2649             ngl++;
2650         }
2651     }
2652     if (ngl != dd->nat_tot)
2653     {
2654         fprintf(stderr,
2655                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2656                 dd->rank, where, ngl, dd->nat_tot);
2657     }
2658     for (a = 0; a < dd->nat_tot; a++)
2659     {
2660         if (have[a] == 0)
2661         {
2662             fprintf(stderr,
2663                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2664                     dd->rank, where, a+1, dd->gatindex[a]+1);
2665         }
2666     }
2667     sfree(have);
2668
2669     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2670
2671     if (nerr > 0)
2672     {
2673         gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies",
2674                   dd->rank, where, nerr);
2675     }
2676 }
2677
2678 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2679 {
2680     int   i;
2681     char *bLocalCG;
2682
2683     if (a_start == 0)
2684     {
2685         /* Clear the whole list without searching */
2686         ga2la_clear(dd->ga2la);
2687     }
2688     else
2689     {
2690         for (i = a_start; i < dd->nat_tot; i++)
2691         {
2692             ga2la_del(dd->ga2la, dd->gatindex[i]);
2693         }
2694     }
2695
2696     bLocalCG = dd->comm->bLocalCG;
2697     if (bLocalCG)
2698     {
2699         for (i = cg_start; i < dd->ncg_tot; i++)
2700         {
2701             bLocalCG[dd->index_gl[i]] = FALSE;
2702         }
2703     }
2704
2705     dd_clear_local_vsite_indices(dd);
2706
2707     if (dd->constraints)
2708     {
2709         dd_clear_local_constraint_indices(dd);
2710     }
2711 }
2712
2713 /* This function should be used for moving the domain boudaries during DLB,
2714  * for obtaining the minimum cell size. It checks the initially set limit
2715  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2716  * and, possibly, a longer cut-off limit set for PME load balancing.
2717  */
2718 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2719 {
2720     real cellsize_min;
2721
2722     cellsize_min = comm->cellsize_min[dim];
2723
2724     if (!comm->bVacDLBNoLimit && comm->bPMELoadBalDLBLimits)
2725     {
2726         cellsize_min = max(cellsize_min,
2727                            comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2728     }
2729
2730     return cellsize_min;
2731 }
2732
2733 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2734                             int dim_ind)
2735 {
2736     real grid_jump_limit;
2737
2738     /* The distance between the boundaries of cells at distance
2739      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2740      * and by the fact that cells should not be shifted by more than
2741      * half their size, such that cg's only shift by one cell
2742      * at redecomposition.
2743      */
2744     grid_jump_limit = comm->cellsize_limit;
2745     if (!comm->bVacDLBNoLimit)
2746     {
2747         if (comm->bPMELoadBalDLBLimits)
2748         {
2749             cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
2750         }
2751         grid_jump_limit = max(grid_jump_limit,
2752                               cutoff/comm->cd[dim_ind].np);
2753     }
2754
2755     return grid_jump_limit;
2756 }
2757
2758 static gmx_bool check_grid_jump(gmx_large_int_t step,
2759                                 gmx_domdec_t   *dd,
2760                                 real            cutoff,
2761                                 gmx_ddbox_t    *ddbox,
2762                                 gmx_bool        bFatal)
2763 {
2764     gmx_domdec_comm_t *comm;
2765     int                d, dim;
2766     real               limit, bfac;
2767     gmx_bool           bInvalid;
2768
2769     bInvalid = FALSE;
2770
2771     comm = dd->comm;
2772
2773     for (d = 1; d < dd->ndim; d++)
2774     {
2775         dim   = dd->dim[d];
2776         limit = grid_jump_limit(comm, cutoff, d);
2777         bfac  = ddbox->box_size[dim];
2778         if (ddbox->tric_dir[dim])
2779         {
2780             bfac *= ddbox->skew_fac[dim];
2781         }
2782         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2783                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2784         {
2785             bInvalid = TRUE;
2786
2787             if (bFatal)
2788             {
2789                 char buf[22];
2790
2791                 /* This error should never be triggered under normal
2792                  * circumstances, but you never know ...
2793                  */
2794                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2795                           gmx_step_str(step, buf),
2796                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2797             }
2798         }
2799     }
2800
2801     return bInvalid;
2802 }
2803
2804 static int dd_load_count(gmx_domdec_comm_t *comm)
2805 {
2806     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2807 }
2808
2809 static float dd_force_load(gmx_domdec_comm_t *comm)
2810 {
2811     float load;
2812
2813     if (comm->eFlop)
2814     {
2815         load = comm->flop;
2816         if (comm->eFlop > 1)
2817         {
2818             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2819         }
2820     }
2821     else
2822     {
2823         load = comm->cycl[ddCyclF];
2824         if (comm->cycl_n[ddCyclF] > 1)
2825         {
2826             /* Subtract the maximum of the last n cycle counts
2827              * to get rid of possible high counts due to other soures,
2828              * for instance system activity, that would otherwise
2829              * affect the dynamic load balancing.
2830              */
2831             load -= comm->cycl_max[ddCyclF];
2832         }
2833     }
2834
2835     return load;
2836 }
2837
2838 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2839 {
2840     gmx_domdec_comm_t *comm;
2841     int                i;
2842
2843     comm = dd->comm;
2844
2845     snew(*dim_f, dd->nc[dim]+1);
2846     (*dim_f)[0] = 0;
2847     for (i = 1; i < dd->nc[dim]; i++)
2848     {
2849         if (comm->slb_frac[dim])
2850         {
2851             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2852         }
2853         else
2854         {
2855             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2856         }
2857     }
2858     (*dim_f)[dd->nc[dim]] = 1;
2859 }
2860
2861 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2862 {
2863     int  pmeindex, slab, nso, i;
2864     ivec xyz;
2865
2866     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2867     {
2868         ddpme->dim = YY;
2869     }
2870     else
2871     {
2872         ddpme->dim = dimind;
2873     }
2874     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2875
2876     ddpme->nslab = (ddpme->dim == 0 ?
2877                     dd->comm->npmenodes_x :
2878                     dd->comm->npmenodes_y);
2879
2880     if (ddpme->nslab <= 1)
2881     {
2882         return;
2883     }
2884
2885     nso = dd->comm->npmenodes/ddpme->nslab;
2886     /* Determine for each PME slab the PP location range for dimension dim */
2887     snew(ddpme->pp_min, ddpme->nslab);
2888     snew(ddpme->pp_max, ddpme->nslab);
2889     for (slab = 0; slab < ddpme->nslab; slab++)
2890     {
2891         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2892         ddpme->pp_max[slab] = 0;
2893     }
2894     for (i = 0; i < dd->nnodes; i++)
2895     {
2896         ddindex2xyz(dd->nc, i, xyz);
2897         /* For y only use our y/z slab.
2898          * This assumes that the PME x grid size matches the DD grid size.
2899          */
2900         if (dimind == 0 || xyz[XX] == dd->ci[XX])
2901         {
2902             pmeindex = ddindex2pmeindex(dd, i);
2903             if (dimind == 0)
2904             {
2905                 slab = pmeindex/nso;
2906             }
2907             else
2908             {
2909                 slab = pmeindex % ddpme->nslab;
2910             }
2911             ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
2912             ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
2913         }
2914     }
2915
2916     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2917 }
2918
2919 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2920 {
2921     if (dd->comm->ddpme[0].dim == XX)
2922     {
2923         return dd->comm->ddpme[0].maxshift;
2924     }
2925     else
2926     {
2927         return 0;
2928     }
2929 }
2930
2931 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2932 {
2933     if (dd->comm->ddpme[0].dim == YY)
2934     {
2935         return dd->comm->ddpme[0].maxshift;
2936     }
2937     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2938     {
2939         return dd->comm->ddpme[1].maxshift;
2940     }
2941     else
2942     {
2943         return 0;
2944     }
2945 }
2946
2947 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2948                              gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
2949 {
2950     gmx_domdec_comm_t *comm;
2951     int                nc, ns, s;
2952     int               *xmin, *xmax;
2953     real               range, pme_boundary;
2954     int                sh;
2955
2956     comm = dd->comm;
2957     nc   = dd->nc[ddpme->dim];
2958     ns   = ddpme->nslab;
2959
2960     if (!ddpme->dim_match)
2961     {
2962         /* PP decomposition is not along dim: the worst situation */
2963         sh = ns/2;
2964     }
2965     else if (ns <= 3 || (bUniform && ns == nc))
2966     {
2967         /* The optimal situation */
2968         sh = 1;
2969     }
2970     else
2971     {
2972         /* We need to check for all pme nodes which nodes they
2973          * could possibly need to communicate with.
2974          */
2975         xmin = ddpme->pp_min;
2976         xmax = ddpme->pp_max;
2977         /* Allow for atoms to be maximally 2/3 times the cut-off
2978          * out of their DD cell. This is a reasonable balance between
2979          * between performance and support for most charge-group/cut-off
2980          * combinations.
2981          */
2982         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
2983         /* Avoid extra communication when we are exactly at a boundary */
2984         range *= 0.999;
2985
2986         sh = 1;
2987         for (s = 0; s < ns; s++)
2988         {
2989             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
2990             pme_boundary = (real)s/ns;
2991             while (sh+1 < ns &&
2992                    ((s-(sh+1) >= 0 &&
2993                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
2994                     (s-(sh+1) <  0 &&
2995                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
2996             {
2997                 sh++;
2998             }
2999             pme_boundary = (real)(s+1)/ns;
3000             while (sh+1 < ns &&
3001                    ((s+(sh+1) <  ns &&
3002                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
3003                     (s+(sh+1) >= ns &&
3004                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
3005             {
3006                 sh++;
3007             }
3008         }
3009     }
3010
3011     ddpme->maxshift = sh;
3012
3013     if (debug)
3014     {
3015         fprintf(debug, "PME slab communication range for dim %d is %d\n",
3016                 ddpme->dim, ddpme->maxshift);
3017     }
3018 }
3019
3020 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3021 {
3022     int d, dim;
3023
3024     for (d = 0; d < dd->ndim; d++)
3025     {
3026         dim = dd->dim[d];
3027         if (dim < ddbox->nboundeddim &&
3028             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3029             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3030         {
3031             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3032                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3033                       dd->nc[dim], dd->comm->cellsize_limit);
3034         }
3035     }
3036 }
3037
3038 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
3039                                   gmx_bool bMaster, ivec npulse)
3040 {
3041     gmx_domdec_comm_t *comm;
3042     int                d, j;
3043     rvec               cellsize_min;
3044     real              *cell_x, cell_dx, cellsize;
3045
3046     comm = dd->comm;
3047
3048     for (d = 0; d < DIM; d++)
3049     {
3050         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3051         npulse[d]       = 1;
3052         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3053         {
3054             /* Uniform grid */
3055             cell_dx = ddbox->box_size[d]/dd->nc[d];
3056             if (bMaster)
3057             {
3058                 for (j = 0; j < dd->nc[d]+1; j++)
3059                 {
3060                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3061                 }
3062             }
3063             else
3064             {
3065                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3066                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3067             }
3068             cellsize = cell_dx*ddbox->skew_fac[d];
3069             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3070             {
3071                 npulse[d]++;
3072             }
3073             cellsize_min[d] = cellsize;
3074         }
3075         else
3076         {
3077             /* Statically load balanced grid */
3078             /* Also when we are not doing a master distribution we determine
3079              * all cell borders in a loop to obtain identical values
3080              * to the master distribution case and to determine npulse.
3081              */
3082             if (bMaster)
3083             {
3084                 cell_x = dd->ma->cell_x[d];
3085             }
3086             else
3087             {
3088                 snew(cell_x, dd->nc[d]+1);
3089             }
3090             cell_x[0] = ddbox->box0[d];
3091             for (j = 0; j < dd->nc[d]; j++)
3092             {
3093                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
3094                 cell_x[j+1] = cell_x[j] + cell_dx;
3095                 cellsize    = cell_dx*ddbox->skew_fac[d];
3096                 while (cellsize*npulse[d] < comm->cutoff &&
3097                        npulse[d] < dd->nc[d]-1)
3098                 {
3099                     npulse[d]++;
3100                 }
3101                 cellsize_min[d] = min(cellsize_min[d], cellsize);
3102             }
3103             if (!bMaster)
3104             {
3105                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3106                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3107                 sfree(cell_x);
3108             }
3109         }
3110         /* The following limitation is to avoid that a cell would receive
3111          * some of its own home charge groups back over the periodic boundary.
3112          * Double charge groups cause trouble with the global indices.
3113          */
3114         if (d < ddbox->npbcdim &&
3115             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3116         {
3117             gmx_fatal_collective(FARGS, NULL, dd,
3118                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3119                                  dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
3120                                  comm->cutoff,
3121                                  dd->nc[d], dd->nc[d],
3122                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
3123         }
3124     }
3125
3126     if (!comm->bDynLoadBal)
3127     {
3128         copy_rvec(cellsize_min, comm->cellsize_min);
3129     }
3130
3131     for (d = 0; d < comm->npmedecompdim; d++)
3132     {
3133         set_pme_maxshift(dd, &comm->ddpme[d],
3134                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
3135                          comm->ddpme[d].slb_dim_f);
3136     }
3137 }
3138
3139
3140 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3141                                                   int d, int dim, gmx_domdec_root_t *root,
3142                                                   gmx_ddbox_t *ddbox,
3143                                                   gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[])
3144 {
3145     gmx_domdec_comm_t *comm;
3146     int                ncd, i, j, nmin, nmin_old;
3147     gmx_bool           bLimLo, bLimHi;
3148     real              *cell_size;
3149     real               fac, halfway, cellsize_limit_f_i, region_size;
3150     gmx_bool           bPBC, bLastHi = FALSE;
3151     int                nrange[] = {range[0], range[1]};
3152
3153     region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
3154
3155     comm = dd->comm;
3156
3157     ncd = dd->nc[dim];
3158
3159     bPBC = (dim < ddbox->npbcdim);
3160
3161     cell_size = root->buf_ncd;
3162
3163     if (debug)
3164     {
3165         fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
3166     }
3167
3168     /* First we need to check if the scaling does not make cells
3169      * smaller than the smallest allowed size.
3170      * We need to do this iteratively, since if a cell is too small,
3171      * it needs to be enlarged, which makes all the other cells smaller,
3172      * which could in turn make another cell smaller than allowed.
3173      */
3174     for (i = range[0]; i < range[1]; i++)
3175     {
3176         root->bCellMin[i] = FALSE;
3177     }
3178     nmin = 0;
3179     do
3180     {
3181         nmin_old = nmin;
3182         /* We need the total for normalization */
3183         fac = 0;
3184         for (i = range[0]; i < range[1]; i++)
3185         {
3186             if (root->bCellMin[i] == FALSE)
3187             {
3188                 fac += cell_size[i];
3189             }
3190         }
3191         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3192         /* Determine the cell boundaries */
3193         for (i = range[0]; i < range[1]; i++)
3194         {
3195             if (root->bCellMin[i] == FALSE)
3196             {
3197                 cell_size[i] *= fac;
3198                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3199                 {
3200                     cellsize_limit_f_i = 0;
3201                 }
3202                 else
3203                 {
3204                     cellsize_limit_f_i = cellsize_limit_f;
3205                 }
3206                 if (cell_size[i] < cellsize_limit_f_i)
3207                 {
3208                     root->bCellMin[i] = TRUE;
3209                     cell_size[i]      = cellsize_limit_f_i;
3210                     nmin++;
3211                 }
3212             }
3213             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3214         }
3215     }
3216     while (nmin > nmin_old);
3217
3218     i            = range[1]-1;
3219     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3220     /* For this check we should not use DD_CELL_MARGIN,
3221      * but a slightly smaller factor,
3222      * since rounding could get use below the limit.
3223      */
3224     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3225     {
3226         char buf[22];
3227         gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3228                   gmx_step_str(step, buf),
3229                   dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3230                   ncd, comm->cellsize_min[dim]);
3231     }
3232
3233     root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
3234
3235     if (!bUniform)
3236     {
3237         /* Check if the boundary did not displace more than halfway
3238          * each of the cells it bounds, as this could cause problems,
3239          * especially when the differences between cell sizes are large.
3240          * If changes are applied, they will not make cells smaller
3241          * than the cut-off, as we check all the boundaries which
3242          * might be affected by a change and if the old state was ok,
3243          * the cells will at most be shrunk back to their old size.
3244          */
3245         for (i = range[0]+1; i < range[1]; i++)
3246         {
3247             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3248             if (root->cell_f[i] < halfway)
3249             {
3250                 root->cell_f[i] = halfway;
3251                 /* Check if the change also causes shifts of the next boundaries */
3252                 for (j = i+1; j < range[1]; j++)
3253                 {
3254                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3255                     {
3256                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3257                     }
3258                 }
3259             }
3260             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3261             if (root->cell_f[i] > halfway)
3262             {
3263                 root->cell_f[i] = halfway;
3264                 /* Check if the change also causes shifts of the next boundaries */
3265                 for (j = i-1; j >= range[0]+1; j--)
3266                 {
3267                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3268                     {
3269                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3270                     }
3271                 }
3272             }
3273         }
3274     }
3275
3276     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3277     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3278      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3279      * for a and b nrange is used */
3280     if (d > 0)
3281     {
3282         /* Take care of the staggering of the cell boundaries */
3283         if (bUniform)
3284         {
3285             for (i = range[0]; i < range[1]; i++)
3286             {
3287                 root->cell_f_max0[i] = root->cell_f[i];
3288                 root->cell_f_min1[i] = root->cell_f[i+1];
3289             }
3290         }
3291         else
3292         {
3293             for (i = range[0]+1; i < range[1]; i++)
3294             {
3295                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3296                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3297                 if (bLimLo && bLimHi)
3298                 {
3299                     /* Both limits violated, try the best we can */
3300                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3301                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3302                     nrange[0]       = range[0];
3303                     nrange[1]       = i;
3304                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3305
3306                     nrange[0] = i;
3307                     nrange[1] = range[1];
3308                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3309
3310                     return;
3311                 }
3312                 else if (bLimLo)
3313                 {
3314                     /* root->cell_f[i] = root->bound_min[i]; */
3315                     nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3316                     bLastHi   = FALSE;
3317                 }
3318                 else if (bLimHi && !bLastHi)
3319                 {
3320                     bLastHi = TRUE;
3321                     if (nrange[1] < range[1])   /* found a LimLo before */
3322                     {
3323                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3324                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3325                         nrange[0] = nrange[1];
3326                     }
3327                     root->cell_f[i] = root->bound_max[i];
3328                     nrange[1]       = i;
3329                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3330                     nrange[0] = i;
3331                     nrange[1] = range[1];
3332                 }
3333             }
3334             if (nrange[1] < range[1])   /* found last a LimLo */
3335             {
3336                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3337                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3338                 nrange[0] = nrange[1];
3339                 nrange[1] = range[1];
3340                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3341             }
3342             else if (nrange[0] > range[0]) /* found at least one LimHi */
3343             {
3344                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3345             }
3346         }
3347     }
3348 }
3349
3350
3351 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3352                                        int d, int dim, gmx_domdec_root_t *root,
3353                                        gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3354                                        gmx_bool bUniform, gmx_large_int_t step)
3355 {
3356     gmx_domdec_comm_t *comm;
3357     int                ncd, d1, i, j, pos;
3358     real              *cell_size;
3359     real               load_aver, load_i, imbalance, change, change_max, sc;
3360     real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3361     real               change_limit;
3362     real               relax = 0.5;
3363     gmx_bool           bPBC;
3364     int                range[] = { 0, 0 };
3365
3366     comm = dd->comm;
3367
3368     /* Convert the maximum change from the input percentage to a fraction */
3369     change_limit = comm->dlb_scale_lim*0.01;
3370
3371     ncd = dd->nc[dim];
3372
3373     bPBC = (dim < ddbox->npbcdim);
3374
3375     cell_size = root->buf_ncd;
3376
3377     /* Store the original boundaries */
3378     for (i = 0; i < ncd+1; i++)
3379     {
3380         root->old_cell_f[i] = root->cell_f[i];
3381     }
3382     if (bUniform)
3383     {
3384         for (i = 0; i < ncd; i++)
3385         {
3386             cell_size[i] = 1.0/ncd;
3387         }
3388     }
3389     else if (dd_load_count(comm))
3390     {
3391         load_aver  = comm->load[d].sum_m/ncd;
3392         change_max = 0;
3393         for (i = 0; i < ncd; i++)
3394         {
3395             /* Determine the relative imbalance of cell i */
3396             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3397             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3398             /* Determine the change of the cell size using underrelaxation */
3399             change     = -relax*imbalance;
3400             change_max = max(change_max, max(change, -change));
3401         }
3402         /* Limit the amount of scaling.
3403          * We need to use the same rescaling for all cells in one row,
3404          * otherwise the load balancing might not converge.
3405          */
3406         sc = relax;
3407         if (change_max > change_limit)
3408         {
3409             sc *= change_limit/change_max;
3410         }
3411         for (i = 0; i < ncd; i++)
3412         {
3413             /* Determine the relative imbalance of cell i */
3414             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3415             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3416             /* Determine the change of the cell size using underrelaxation */
3417             change       = -sc*imbalance;
3418             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3419         }
3420     }
3421
3422     cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3423     cellsize_limit_f *= DD_CELL_MARGIN;
3424     dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3425     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3426     if (ddbox->tric_dir[dim])
3427     {
3428         cellsize_limit_f /= ddbox->skew_fac[dim];
3429         dist_min_f       /= ddbox->skew_fac[dim];
3430     }
3431     if (bDynamicBox && d > 0)
3432     {
3433         dist_min_f *= DD_PRES_SCALE_MARGIN;
3434     }
3435     if (d > 0 && !bUniform)
3436     {
3437         /* Make sure that the grid is not shifted too much */
3438         for (i = 1; i < ncd; i++)
3439         {
3440             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3441             {
3442                 gmx_incons("Inconsistent DD boundary staggering limits!");
3443             }
3444             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3445             space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3446             if (space > 0)
3447             {
3448                 root->bound_min[i] += 0.5*space;
3449             }
3450             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3451             space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3452             if (space < 0)
3453             {
3454                 root->bound_max[i] += 0.5*space;
3455             }
3456             if (debug)
3457             {
3458                 fprintf(debug,
3459                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3460                         d, i,
3461                         root->cell_f_max0[i-1] + dist_min_f,
3462                         root->bound_min[i], root->cell_f[i], root->bound_max[i],
3463                         root->cell_f_min1[i] - dist_min_f);
3464             }
3465         }
3466     }
3467     range[1]          = ncd;
3468     root->cell_f[0]   = 0;
3469     root->cell_f[ncd] = 1;
3470     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3471
3472
3473     /* After the checks above, the cells should obey the cut-off
3474      * restrictions, but it does not hurt to check.
3475      */
3476     for (i = 0; i < ncd; i++)
3477     {
3478         if (debug)
3479         {
3480             fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
3481                     dim, i, root->cell_f[i], root->cell_f[i+1]);
3482         }
3483
3484         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3485             root->cell_f[i+1] - root->cell_f[i] <
3486             cellsize_limit_f/DD_CELL_MARGIN)
3487         {
3488             char buf[22];
3489             fprintf(stderr,
3490                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3491                     gmx_step_str(step, buf), dim2char(dim), i,
3492                     (root->cell_f[i+1] - root->cell_f[i])
3493                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3494         }
3495     }
3496
3497     pos = ncd + 1;
3498     /* Store the cell boundaries of the lower dimensions at the end */
3499     for (d1 = 0; d1 < d; d1++)
3500     {
3501         root->cell_f[pos++] = comm->cell_f0[d1];
3502         root->cell_f[pos++] = comm->cell_f1[d1];
3503     }
3504
3505     if (d < comm->npmedecompdim)
3506     {
3507         /* The master determines the maximum shift for
3508          * the coordinate communication between separate PME nodes.
3509          */
3510         set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3511     }
3512     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3513     if (d >= 1)
3514     {
3515         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3516     }
3517 }
3518
3519 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3520                                              gmx_ddbox_t *ddbox, int dimind)
3521 {
3522     gmx_domdec_comm_t *comm;
3523     int                dim;
3524
3525     comm = dd->comm;
3526
3527     /* Set the cell dimensions */
3528     dim                = dd->dim[dimind];
3529     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3530     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3531     if (dim >= ddbox->nboundeddim)
3532     {
3533         comm->cell_x0[dim] += ddbox->box0[dim];
3534         comm->cell_x1[dim] += ddbox->box0[dim];
3535     }
3536 }
3537
3538 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3539                                          int d, int dim, real *cell_f_row,
3540                                          gmx_ddbox_t *ddbox)
3541 {
3542     gmx_domdec_comm_t *comm;
3543     int                d1, dim1, pos;
3544
3545     comm = dd->comm;
3546
3547 #ifdef GMX_MPI
3548     /* Each node would only need to know two fractions,
3549      * but it is probably cheaper to broadcast the whole array.
3550      */
3551     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3552               0, comm->mpi_comm_load[d]);
3553 #endif
3554     /* Copy the fractions for this dimension from the buffer */
3555     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3556     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3557     /* The whole array was communicated, so set the buffer position */
3558     pos = dd->nc[dim] + 1;
3559     for (d1 = 0; d1 <= d; d1++)
3560     {
3561         if (d1 < d)
3562         {
3563             /* Copy the cell fractions of the lower dimensions */
3564             comm->cell_f0[d1] = cell_f_row[pos++];
3565             comm->cell_f1[d1] = cell_f_row[pos++];
3566         }
3567         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3568     }
3569     /* Convert the communicated shift from float to int */
3570     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3571     if (d >= 1)
3572     {
3573         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3574     }
3575 }
3576
3577 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3578                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3579                                          gmx_bool bUniform, gmx_large_int_t step)
3580 {
3581     gmx_domdec_comm_t *comm;
3582     int                d, dim, d1;
3583     gmx_bool           bRowMember, bRowRoot;
3584     real              *cell_f_row;
3585
3586     comm = dd->comm;
3587
3588     for (d = 0; d < dd->ndim; d++)
3589     {
3590         dim        = dd->dim[d];
3591         bRowMember = TRUE;
3592         bRowRoot   = TRUE;
3593         for (d1 = d; d1 < dd->ndim; d1++)
3594         {
3595             if (dd->ci[dd->dim[d1]] > 0)
3596             {
3597                 if (d1 > d)
3598                 {
3599                     bRowMember = FALSE;
3600                 }
3601                 bRowRoot = FALSE;
3602             }
3603         }
3604         if (bRowMember)
3605         {
3606             if (bRowRoot)
3607             {
3608                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3609                                            ddbox, bDynamicBox, bUniform, step);
3610                 cell_f_row = comm->root[d]->cell_f;
3611             }
3612             else
3613             {
3614                 cell_f_row = comm->cell_f_row;
3615             }
3616             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3617         }
3618     }
3619 }
3620
3621 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3622 {
3623     int d;
3624
3625     /* This function assumes the box is static and should therefore
3626      * not be called when the box has changed since the last
3627      * call to dd_partition_system.
3628      */
3629     for (d = 0; d < dd->ndim; d++)
3630     {
3631         relative_to_absolute_cell_bounds(dd, ddbox, d);
3632     }
3633 }
3634
3635
3636
3637 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3638                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3639                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3640                                   gmx_wallcycle_t wcycle)
3641 {
3642     gmx_domdec_comm_t *comm;
3643     int                dim;
3644
3645     comm = dd->comm;
3646
3647     if (bDoDLB)
3648     {
3649         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3650         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3651         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3652     }
3653     else if (bDynamicBox)
3654     {
3655         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3656     }
3657
3658     /* Set the dimensions for which no DD is used */
3659     for (dim = 0; dim < DIM; dim++)
3660     {
3661         if (dd->nc[dim] == 1)
3662         {
3663             comm->cell_x0[dim] = 0;
3664             comm->cell_x1[dim] = ddbox->box_size[dim];
3665             if (dim >= ddbox->nboundeddim)
3666             {
3667                 comm->cell_x0[dim] += ddbox->box0[dim];
3668                 comm->cell_x1[dim] += ddbox->box0[dim];
3669             }
3670         }
3671     }
3672 }
3673
3674 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3675 {
3676     int                    d, np, i;
3677     gmx_domdec_comm_dim_t *cd;
3678
3679     for (d = 0; d < dd->ndim; d++)
3680     {
3681         cd = &dd->comm->cd[d];
3682         np = npulse[dd->dim[d]];
3683         if (np > cd->np_nalloc)
3684         {
3685             if (debug)
3686             {
3687                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3688                         dim2char(dd->dim[d]), np);
3689             }
3690             if (DDMASTER(dd) && cd->np_nalloc > 0)
3691             {
3692                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3693             }
3694             srenew(cd->ind, np);
3695             for (i = cd->np_nalloc; i < np; i++)
3696             {
3697                 cd->ind[i].index  = NULL;
3698                 cd->ind[i].nalloc = 0;
3699             }
3700             cd->np_nalloc = np;
3701         }
3702         cd->np = np;
3703     }
3704 }
3705
3706
3707 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3708                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3709                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3710                               gmx_wallcycle_t wcycle)
3711 {
3712     gmx_domdec_comm_t *comm;
3713     int                d;
3714     ivec               npulse;
3715
3716     comm = dd->comm;
3717
3718     /* Copy the old cell boundaries for the cg displacement check */
3719     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3720     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3721
3722     if (comm->bDynLoadBal)
3723     {
3724         if (DDMASTER(dd))
3725         {
3726             check_box_size(dd, ddbox);
3727         }
3728         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3729     }
3730     else
3731     {
3732         set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse);
3733         realloc_comm_ind(dd, npulse);
3734     }
3735
3736     if (debug)
3737     {
3738         for (d = 0; d < DIM; d++)
3739         {
3740             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3741                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3742         }
3743     }
3744 }
3745
3746 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3747                                   gmx_ddbox_t *ddbox,
3748                                   rvec cell_ns_x0, rvec cell_ns_x1,
3749                                   gmx_large_int_t step)
3750 {
3751     gmx_domdec_comm_t *comm;
3752     int                dim_ind, dim;
3753
3754     comm = dd->comm;
3755
3756     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3757     {
3758         dim = dd->dim[dim_ind];
3759
3760         /* Without PBC we don't have restrictions on the outer cells */
3761         if (!(dim >= ddbox->npbcdim &&
3762               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3763             comm->bDynLoadBal &&
3764             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3765             comm->cellsize_min[dim])
3766         {
3767             char buf[22];
3768             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3769                       gmx_step_str(step, buf), dim2char(dim),
3770                       comm->cell_x1[dim] - comm->cell_x0[dim],
3771                       ddbox->skew_fac[dim],
3772                       dd->comm->cellsize_min[dim],
3773                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3774         }
3775     }
3776
3777     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3778     {
3779         /* Communicate the boundaries and update cell_ns_x0/1 */
3780         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3781         if (dd->bGridJump && dd->ndim > 1)
3782         {
3783             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3784         }
3785     }
3786 }
3787
3788 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3789 {
3790     if (YY < npbcdim)
3791     {
3792         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3793     }
3794     else
3795     {
3796         tcm[YY][XX] = 0;
3797     }
3798     if (ZZ < npbcdim)
3799     {
3800         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3801         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3802     }
3803     else
3804     {
3805         tcm[ZZ][XX] = 0;
3806         tcm[ZZ][YY] = 0;
3807     }
3808 }
3809
3810 static void check_screw_box(matrix box)
3811 {
3812     /* Mathematical limitation */
3813     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3814     {
3815         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3816     }
3817
3818     /* Limitation due to the asymmetry of the eighth shell method */
3819     if (box[ZZ][YY] != 0)
3820     {
3821         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3822     }
3823 }
3824
3825 static void distribute_cg(FILE *fplog, gmx_large_int_t step,
3826                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3827                           gmx_domdec_t *dd)
3828 {
3829     gmx_domdec_master_t *ma;
3830     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3831     int                  i, icg, j, k, k0, k1, d, npbcdim;
3832     matrix               tcm;
3833     rvec                 box_size, cg_cm;
3834     ivec                 ind;
3835     real                 nrcg, inv_ncg, pos_d;
3836     atom_id             *cgindex;
3837     gmx_bool             bUnbounded, bScrew;
3838
3839     ma = dd->ma;
3840
3841     if (tmp_ind == NULL)
3842     {
3843         snew(tmp_nalloc, dd->nnodes);
3844         snew(tmp_ind, dd->nnodes);
3845         for (i = 0; i < dd->nnodes; i++)
3846         {
3847             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3848             snew(tmp_ind[i], tmp_nalloc[i]);
3849         }
3850     }
3851
3852     /* Clear the count */
3853     for (i = 0; i < dd->nnodes; i++)
3854     {
3855         ma->ncg[i] = 0;
3856         ma->nat[i] = 0;
3857     }
3858
3859     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3860
3861     cgindex = cgs->index;
3862
3863     /* Compute the center of geometry for all charge groups */
3864     for (icg = 0; icg < cgs->nr; icg++)
3865     {
3866         k0      = cgindex[icg];
3867         k1      = cgindex[icg+1];
3868         nrcg    = k1 - k0;
3869         if (nrcg == 1)
3870         {
3871             copy_rvec(pos[k0], cg_cm);
3872         }
3873         else
3874         {
3875             inv_ncg = 1.0/nrcg;
3876
3877             clear_rvec(cg_cm);
3878             for (k = k0; (k < k1); k++)
3879             {
3880                 rvec_inc(cg_cm, pos[k]);
3881             }
3882             for (d = 0; (d < DIM); d++)
3883             {
3884                 cg_cm[d] *= inv_ncg;
3885             }
3886         }
3887         /* Put the charge group in the box and determine the cell index */
3888         for (d = DIM-1; d >= 0; d--)
3889         {
3890             pos_d = cg_cm[d];
3891             if (d < dd->npbcdim)
3892             {
3893                 bScrew = (dd->bScrewPBC && d == XX);
3894                 if (tric_dir[d] && dd->nc[d] > 1)
3895                 {
3896                     /* Use triclinic coordintates for this dimension */
3897                     for (j = d+1; j < DIM; j++)
3898                     {
3899                         pos_d += cg_cm[j]*tcm[j][d];
3900                     }
3901                 }
3902                 while (pos_d >= box[d][d])
3903                 {
3904                     pos_d -= box[d][d];
3905                     rvec_dec(cg_cm, box[d]);
3906                     if (bScrew)
3907                     {
3908                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3909                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3910                     }
3911                     for (k = k0; (k < k1); k++)
3912                     {
3913                         rvec_dec(pos[k], box[d]);
3914                         if (bScrew)
3915                         {
3916                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3917                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3918                         }
3919                     }
3920                 }
3921                 while (pos_d < 0)
3922                 {
3923                     pos_d += box[d][d];
3924                     rvec_inc(cg_cm, box[d]);
3925                     if (bScrew)
3926                     {
3927                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3928                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3929                     }
3930                     for (k = k0; (k < k1); k++)
3931                     {
3932                         rvec_inc(pos[k], box[d]);
3933                         if (bScrew)
3934                         {
3935                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3936                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3937                         }
3938                     }
3939                 }
3940             }
3941             /* This could be done more efficiently */
3942             ind[d] = 0;
3943             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3944             {
3945                 ind[d]++;
3946             }
3947         }
3948         i = dd_index(dd->nc, ind);
3949         if (ma->ncg[i] == tmp_nalloc[i])
3950         {
3951             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3952             srenew(tmp_ind[i], tmp_nalloc[i]);
3953         }
3954         tmp_ind[i][ma->ncg[i]] = icg;
3955         ma->ncg[i]++;
3956         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3957     }
3958
3959     k1 = 0;
3960     for (i = 0; i < dd->nnodes; i++)
3961     {
3962         ma->index[i] = k1;
3963         for (k = 0; k < ma->ncg[i]; k++)
3964         {
3965             ma->cg[k1++] = tmp_ind[i][k];
3966         }
3967     }
3968     ma->index[dd->nnodes] = k1;
3969
3970     for (i = 0; i < dd->nnodes; i++)
3971     {
3972         sfree(tmp_ind[i]);
3973     }
3974     sfree(tmp_ind);
3975     sfree(tmp_nalloc);
3976
3977     if (fplog)
3978     {
3979         char buf[22];
3980         fprintf(fplog, "Charge group distribution at step %s:",
3981                 gmx_step_str(step, buf));
3982         for (i = 0; i < dd->nnodes; i++)
3983         {
3984             fprintf(fplog, " %d", ma->ncg[i]);
3985         }
3986         fprintf(fplog, "\n");
3987     }
3988 }
3989
3990 static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd,
3991                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
3992                                 rvec pos[])
3993 {
3994     gmx_domdec_master_t *ma = NULL;
3995     ivec                 npulse;
3996     int                  i, cg_gl;
3997     int                 *ibuf, buf2[2] = { 0, 0 };
3998     gmx_bool             bMaster = DDMASTER(dd);
3999     if (bMaster)
4000     {
4001         ma = dd->ma;
4002
4003         if (dd->bScrewPBC)
4004         {
4005             check_screw_box(box);
4006         }
4007
4008         set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse);
4009
4010         distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
4011         for (i = 0; i < dd->nnodes; i++)
4012         {
4013             ma->ibuf[2*i]   = ma->ncg[i];
4014             ma->ibuf[2*i+1] = ma->nat[i];
4015         }
4016         ibuf = ma->ibuf;
4017     }
4018     else
4019     {
4020         ibuf = NULL;
4021     }
4022     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
4023
4024     dd->ncg_home = buf2[0];
4025     dd->nat_home = buf2[1];
4026     dd->ncg_tot  = dd->ncg_home;
4027     dd->nat_tot  = dd->nat_home;
4028     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4029     {
4030         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4031         srenew(dd->index_gl, dd->cg_nalloc);
4032         srenew(dd->cgindex, dd->cg_nalloc+1);
4033     }
4034     if (bMaster)
4035     {
4036         for (i = 0; i < dd->nnodes; i++)
4037         {
4038             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
4039             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4040         }
4041     }
4042
4043     dd_scatterv(dd,
4044                 DDMASTER(dd) ? ma->ibuf : NULL,
4045                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
4046                 DDMASTER(dd) ? ma->cg : NULL,
4047                 dd->ncg_home*sizeof(int), dd->index_gl);
4048
4049     /* Determine the home charge group sizes */
4050     dd->cgindex[0] = 0;
4051     for (i = 0; i < dd->ncg_home; i++)
4052     {
4053         cg_gl            = dd->index_gl[i];
4054         dd->cgindex[i+1] =
4055             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4056     }
4057
4058     if (debug)
4059     {
4060         fprintf(debug, "Home charge groups:\n");
4061         for (i = 0; i < dd->ncg_home; i++)
4062         {
4063             fprintf(debug, " %d", dd->index_gl[i]);
4064             if (i % 10 == 9)
4065             {
4066                 fprintf(debug, "\n");
4067             }
4068         }
4069         fprintf(debug, "\n");
4070     }
4071 }
4072
4073 static int compact_and_copy_vec_at(int ncg, int *move,
4074                                    int *cgindex,
4075                                    int nvec, int vec,
4076                                    rvec *src, gmx_domdec_comm_t *comm,
4077                                    gmx_bool bCompact)
4078 {
4079     int m, icg, i, i0, i1, nrcg;
4080     int home_pos;
4081     int pos_vec[DIM*2];
4082
4083     home_pos = 0;
4084
4085     for (m = 0; m < DIM*2; m++)
4086     {
4087         pos_vec[m] = 0;
4088     }
4089
4090     i0 = 0;
4091     for (icg = 0; icg < ncg; icg++)
4092     {
4093         i1 = cgindex[icg+1];
4094         m  = move[icg];
4095         if (m == -1)
4096         {
4097             if (bCompact)
4098             {
4099                 /* Compact the home array in place */
4100                 for (i = i0; i < i1; i++)
4101                 {
4102                     copy_rvec(src[i], src[home_pos++]);
4103                 }
4104             }
4105         }
4106         else
4107         {
4108             /* Copy to the communication buffer */
4109             nrcg        = i1 - i0;
4110             pos_vec[m] += 1 + vec*nrcg;
4111             for (i = i0; i < i1; i++)
4112             {
4113                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
4114             }
4115             pos_vec[m] += (nvec - vec - 1)*nrcg;
4116         }
4117         if (!bCompact)
4118         {
4119             home_pos += i1 - i0;
4120         }
4121         i0 = i1;
4122     }
4123
4124     return home_pos;
4125 }
4126
4127 static int compact_and_copy_vec_cg(int ncg, int *move,
4128                                    int *cgindex,
4129                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
4130                                    gmx_bool bCompact)
4131 {
4132     int m, icg, i0, i1, nrcg;
4133     int home_pos;
4134     int pos_vec[DIM*2];
4135
4136     home_pos = 0;
4137
4138     for (m = 0; m < DIM*2; m++)
4139     {
4140         pos_vec[m] = 0;
4141     }
4142
4143     i0 = 0;
4144     for (icg = 0; icg < ncg; icg++)
4145     {
4146         i1 = cgindex[icg+1];
4147         m  = move[icg];
4148         if (m == -1)
4149         {
4150             if (bCompact)
4151             {
4152                 /* Compact the home array in place */
4153                 copy_rvec(src[icg], src[home_pos++]);
4154             }
4155         }
4156         else
4157         {
4158             nrcg = i1 - i0;
4159             /* Copy to the communication buffer */
4160             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
4161             pos_vec[m] += 1 + nrcg*nvec;
4162         }
4163         i0 = i1;
4164     }
4165     if (!bCompact)
4166     {
4167         home_pos = ncg;
4168     }
4169
4170     return home_pos;
4171 }
4172
4173 static int compact_ind(int ncg, int *move,
4174                        int *index_gl, int *cgindex,
4175                        int *gatindex,
4176                        gmx_ga2la_t ga2la, char *bLocalCG,
4177                        int *cginfo)
4178 {
4179     int cg, nat, a0, a1, a, a_gl;
4180     int home_pos;
4181
4182     home_pos = 0;
4183     nat      = 0;
4184     for (cg = 0; cg < ncg; cg++)
4185     {
4186         a0 = cgindex[cg];
4187         a1 = cgindex[cg+1];
4188         if (move[cg] == -1)
4189         {
4190             /* Compact the home arrays in place.
4191              * Anything that can be done here avoids access to global arrays.
4192              */
4193             cgindex[home_pos] = nat;
4194             for (a = a0; a < a1; a++)
4195             {
4196                 a_gl          = gatindex[a];
4197                 gatindex[nat] = a_gl;
4198                 /* The cell number stays 0, so we don't need to set it */
4199                 ga2la_change_la(ga2la, a_gl, nat);
4200                 nat++;
4201             }
4202             index_gl[home_pos] = index_gl[cg];
4203             cginfo[home_pos]   = cginfo[cg];
4204             /* The charge group remains local, so bLocalCG does not change */
4205             home_pos++;
4206         }
4207         else
4208         {
4209             /* Clear the global indices */
4210             for (a = a0; a < a1; a++)
4211             {
4212                 ga2la_del(ga2la, gatindex[a]);
4213             }
4214             if (bLocalCG)
4215             {
4216                 bLocalCG[index_gl[cg]] = FALSE;
4217             }
4218         }
4219     }
4220     cgindex[home_pos] = nat;
4221
4222     return home_pos;
4223 }
4224
4225 static void clear_and_mark_ind(int ncg, int *move,
4226                                int *index_gl, int *cgindex, int *gatindex,
4227                                gmx_ga2la_t ga2la, char *bLocalCG,
4228                                int *cell_index)
4229 {
4230     int cg, a0, a1, a;
4231
4232     for (cg = 0; cg < ncg; cg++)
4233     {
4234         if (move[cg] >= 0)
4235         {
4236             a0 = cgindex[cg];
4237             a1 = cgindex[cg+1];
4238             /* Clear the global indices */
4239             for (a = a0; a < a1; a++)
4240             {
4241                 ga2la_del(ga2la, gatindex[a]);
4242             }
4243             if (bLocalCG)
4244             {
4245                 bLocalCG[index_gl[cg]] = FALSE;
4246             }
4247             /* Signal that this cg has moved using the ns cell index.
4248              * Here we set it to -1. fill_grid will change it
4249              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4250              */
4251             cell_index[cg] = -1;
4252         }
4253     }
4254 }
4255
4256 static void print_cg_move(FILE *fplog,
4257                           gmx_domdec_t *dd,
4258                           gmx_large_int_t step, int cg, int dim, int dir,
4259                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4260                           rvec cm_old, rvec cm_new, real pos_d)
4261 {
4262     gmx_domdec_comm_t *comm;
4263     char               buf[22];
4264
4265     comm = dd->comm;
4266
4267     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4268     if (bHaveLimitdAndCMOld)
4269     {
4270         fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4271                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4272     }
4273     else
4274     {
4275         fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4276                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4277     }
4278     fprintf(fplog, "distance out of cell %f\n",
4279             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4280     if (bHaveLimitdAndCMOld)
4281     {
4282         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4283                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4284     }
4285     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4286             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4287     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4288             dim2char(dim),
4289             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4290     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4291             dim2char(dim),
4292             comm->cell_x0[dim], comm->cell_x1[dim]);
4293 }
4294
4295 static void cg_move_error(FILE *fplog,
4296                           gmx_domdec_t *dd,
4297                           gmx_large_int_t step, int cg, int dim, int dir,
4298                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4299                           rvec cm_old, rvec cm_new, real pos_d)
4300 {
4301     if (fplog)
4302     {
4303         print_cg_move(fplog, dd, step, cg, dim, dir,
4304                       bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4305     }
4306     print_cg_move(stderr, dd, step, cg, dim, dir,
4307                   bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4308     gmx_fatal(FARGS,
4309               "A charge group moved too far between two domain decomposition steps\n"
4310               "This usually means that your system is not well equilibrated");
4311 }
4312
4313 static void rotate_state_atom(t_state *state, int a)
4314 {
4315     int est;
4316
4317     for (est = 0; est < estNR; est++)
4318     {
4319         if (EST_DISTR(est) && (state->flags & (1<<est)))
4320         {
4321             switch (est)
4322             {
4323                 case estX:
4324                     /* Rotate the complete state; for a rectangular box only */
4325                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4326                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4327                     break;
4328                 case estV:
4329                     state->v[a][YY] = -state->v[a][YY];
4330                     state->v[a][ZZ] = -state->v[a][ZZ];
4331                     break;
4332                 case estSDX:
4333                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4334                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4335                     break;
4336                 case estCGP:
4337                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4338                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4339                     break;
4340                 case estDISRE_INITF:
4341                 case estDISRE_RM3TAV:
4342                 case estORIRE_INITF:
4343                 case estORIRE_DTAV:
4344                     /* These are distances, so not affected by rotation */
4345                     break;
4346                 default:
4347                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4348             }
4349         }
4350     }
4351 }
4352
4353 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4354 {
4355     if (natoms > comm->moved_nalloc)
4356     {
4357         /* Contents should be preserved here */
4358         comm->moved_nalloc = over_alloc_dd(natoms);
4359         srenew(comm->moved, comm->moved_nalloc);
4360     }
4361
4362     return comm->moved;
4363 }
4364
4365 static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
4366                          gmx_domdec_t *dd,
4367                          t_state *state,
4368                          ivec tric_dir, matrix tcm,
4369                          rvec cell_x0, rvec cell_x1,
4370                          rvec limitd, rvec limit0, rvec limit1,
4371                          const int *cgindex,
4372                          int cg_start, int cg_end,
4373                          rvec *cg_cm,
4374                          int *move)
4375 {
4376     int      npbcdim;
4377     int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4378     int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4379     int      flag;
4380     gmx_bool bScrew;
4381     ivec     dev;
4382     real     inv_ncg, pos_d;
4383     rvec     cm_new;
4384
4385     npbcdim = dd->npbcdim;
4386
4387     for (cg = cg_start; cg < cg_end; cg++)
4388     {
4389         k0   = cgindex[cg];
4390         k1   = cgindex[cg+1];
4391         nrcg = k1 - k0;
4392         if (nrcg == 1)
4393         {
4394             copy_rvec(state->x[k0], cm_new);
4395         }
4396         else
4397         {
4398             inv_ncg = 1.0/nrcg;
4399
4400             clear_rvec(cm_new);
4401             for (k = k0; (k < k1); k++)
4402             {
4403                 rvec_inc(cm_new, state->x[k]);
4404             }
4405             for (d = 0; (d < DIM); d++)
4406             {
4407                 cm_new[d] = inv_ncg*cm_new[d];
4408             }
4409         }
4410
4411         clear_ivec(dev);
4412         /* Do pbc and check DD cell boundary crossings */
4413         for (d = DIM-1; d >= 0; d--)
4414         {
4415             if (dd->nc[d] > 1)
4416             {
4417                 bScrew = (dd->bScrewPBC && d == XX);
4418                 /* Determine the location of this cg in lattice coordinates */
4419                 pos_d = cm_new[d];
4420                 if (tric_dir[d])
4421                 {
4422                     for (d2 = d+1; d2 < DIM; d2++)
4423                     {
4424                         pos_d += cm_new[d2]*tcm[d2][d];
4425                     }
4426                 }
4427                 /* Put the charge group in the triclinic unit-cell */
4428                 if (pos_d >= cell_x1[d])
4429                 {
4430                     if (pos_d >= limit1[d])
4431                     {
4432                         cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
4433                                       cg_cm[cg], cm_new, pos_d);
4434                     }
4435                     dev[d] = 1;
4436                     if (dd->ci[d] == dd->nc[d] - 1)
4437                     {
4438                         rvec_dec(cm_new, state->box[d]);
4439                         if (bScrew)
4440                         {
4441                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4442                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4443                         }
4444                         for (k = k0; (k < k1); k++)
4445                         {
4446                             rvec_dec(state->x[k], state->box[d]);
4447                             if (bScrew)
4448                             {
4449                                 rotate_state_atom(state, k);
4450                             }
4451                         }
4452                     }
4453                 }
4454                 else if (pos_d < cell_x0[d])
4455                 {
4456                     if (pos_d < limit0[d])
4457                     {
4458                         cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
4459                                       cg_cm[cg], cm_new, pos_d);
4460                     }
4461                     dev[d] = -1;
4462                     if (dd->ci[d] == 0)
4463                     {
4464                         rvec_inc(cm_new, state->box[d]);
4465                         if (bScrew)
4466                         {
4467                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4468                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4469                         }
4470                         for (k = k0; (k < k1); k++)
4471                         {
4472                             rvec_inc(state->x[k], state->box[d]);
4473                             if (bScrew)
4474                             {
4475                                 rotate_state_atom(state, k);
4476                             }
4477                         }
4478                     }
4479                 }
4480             }
4481             else if (d < npbcdim)
4482             {
4483                 /* Put the charge group in the rectangular unit-cell */
4484                 while (cm_new[d] >= state->box[d][d])
4485                 {
4486                     rvec_dec(cm_new, state->box[d]);
4487                     for (k = k0; (k < k1); k++)
4488                     {
4489                         rvec_dec(state->x[k], state->box[d]);
4490                     }
4491                 }
4492                 while (cm_new[d] < 0)
4493                 {
4494                     rvec_inc(cm_new, state->box[d]);
4495                     for (k = k0; (k < k1); k++)
4496                     {
4497                         rvec_inc(state->x[k], state->box[d]);
4498                     }
4499                 }
4500             }
4501         }
4502
4503         copy_rvec(cm_new, cg_cm[cg]);
4504
4505         /* Determine where this cg should go */
4506         flag = 0;
4507         mc   = -1;
4508         for (d = 0; d < dd->ndim; d++)
4509         {
4510             dim = dd->dim[d];
4511             if (dev[dim] == 1)
4512             {
4513                 flag |= DD_FLAG_FW(d);
4514                 if (mc == -1)
4515                 {
4516                     mc = d*2;
4517                 }
4518             }
4519             else if (dev[dim] == -1)
4520             {
4521                 flag |= DD_FLAG_BW(d);
4522                 if (mc == -1)
4523                 {
4524                     if (dd->nc[dim] > 2)
4525                     {
4526                         mc = d*2 + 1;
4527                     }
4528                     else
4529                     {
4530                         mc = d*2;
4531                     }
4532                 }
4533             }
4534         }
4535         /* Temporarily store the flag in move */
4536         move[cg] = mc + flag;
4537     }
4538 }
4539
4540 static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step,
4541                                gmx_domdec_t *dd, ivec tric_dir,
4542                                t_state *state, rvec **f,
4543                                t_forcerec *fr, t_mdatoms *md,
4544                                gmx_bool bCompact,
4545                                t_nrnb *nrnb,
4546                                int *ncg_stay_home,
4547                                int *ncg_moved)
4548 {
4549     int               *move;
4550     int                npbcdim;
4551     int                ncg[DIM*2], nat[DIM*2];
4552     int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4553     int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4554     int                sbuf[2], rbuf[2];
4555     int                home_pos_cg, home_pos_at, buf_pos;
4556     int                flag;
4557     gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4558     gmx_bool           bScrew;
4559     ivec               dev;
4560     real               inv_ncg, pos_d;
4561     matrix             tcm;
4562     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
4563     atom_id           *cgindex;
4564     cginfo_mb_t       *cginfo_mb;
4565     gmx_domdec_comm_t *comm;
4566     int               *moved;
4567     int                nthread, thread;
4568
4569     if (dd->bScrewPBC)
4570     {
4571         check_screw_box(state->box);
4572     }
4573
4574     comm  = dd->comm;
4575     if (fr->cutoff_scheme == ecutsGROUP)
4576     {
4577         cg_cm = fr->cg_cm;
4578     }
4579
4580     for (i = 0; i < estNR; i++)
4581     {
4582         if (EST_DISTR(i))
4583         {
4584             switch (i)
4585             {
4586                 case estX: /* Always present */ break;
4587                 case estV:   bV   = (state->flags & (1<<i)); break;
4588                 case estSDX: bSDX = (state->flags & (1<<i)); break;
4589                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4590                 case estLD_RNG:
4591                 case estLD_RNGI:
4592                 case estDISRE_INITF:
4593                 case estDISRE_RM3TAV:
4594                 case estORIRE_INITF:
4595                 case estORIRE_DTAV:
4596                     /* No processing required */
4597                     break;
4598                 default:
4599                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4600             }
4601         }
4602     }
4603
4604     if (dd->ncg_tot > comm->nalloc_int)
4605     {
4606         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4607         srenew(comm->buf_int, comm->nalloc_int);
4608     }
4609     move = comm->buf_int;
4610
4611     /* Clear the count */
4612     for (c = 0; c < dd->ndim*2; c++)
4613     {
4614         ncg[c] = 0;
4615         nat[c] = 0;
4616     }
4617
4618     npbcdim = dd->npbcdim;
4619
4620     for (d = 0; (d < DIM); d++)
4621     {
4622         limitd[d] = dd->comm->cellsize_min[d];
4623         if (d >= npbcdim && dd->ci[d] == 0)
4624         {
4625             cell_x0[d] = -GMX_FLOAT_MAX;
4626         }
4627         else
4628         {
4629             cell_x0[d] = comm->cell_x0[d];
4630         }
4631         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4632         {
4633             cell_x1[d] = GMX_FLOAT_MAX;
4634         }
4635         else
4636         {
4637             cell_x1[d] = comm->cell_x1[d];
4638         }
4639         if (d < npbcdim)
4640         {
4641             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4642             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4643         }
4644         else
4645         {
4646             /* We check after communication if a charge group moved
4647              * more than one cell. Set the pre-comm check limit to float_max.
4648              */
4649             limit0[d] = -GMX_FLOAT_MAX;
4650             limit1[d] =  GMX_FLOAT_MAX;
4651         }
4652     }
4653
4654     make_tric_corr_matrix(npbcdim, state->box, tcm);
4655
4656     cgindex = dd->cgindex;
4657
4658     nthread = gmx_omp_nthreads_get(emntDomdec);
4659
4660     /* Compute the center of geometry for all home charge groups
4661      * and put them in the box and determine where they should go.
4662      */
4663 #pragma omp parallel for num_threads(nthread) schedule(static)
4664     for (thread = 0; thread < nthread; thread++)
4665     {
4666         calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4667                      cell_x0, cell_x1, limitd, limit0, limit1,
4668                      cgindex,
4669                      ( thread   *dd->ncg_home)/nthread,
4670                      ((thread+1)*dd->ncg_home)/nthread,
4671                      fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4672                      move);
4673     }
4674
4675     for (cg = 0; cg < dd->ncg_home; cg++)
4676     {
4677         if (move[cg] >= 0)
4678         {
4679             mc       = move[cg];
4680             flag     = mc & ~DD_FLAG_NRCG;
4681             mc       = mc & DD_FLAG_NRCG;
4682             move[cg] = mc;
4683
4684             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4685             {
4686                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4687                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4688             }
4689             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4690             /* We store the cg size in the lower 16 bits
4691              * and the place where the charge group should go
4692              * in the next 6 bits. This saves some communication volume.
4693              */
4694             nrcg = cgindex[cg+1] - cgindex[cg];
4695             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4696             ncg[mc] += 1;
4697             nat[mc] += nrcg;
4698         }
4699     }
4700
4701     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4702     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4703
4704     *ncg_moved = 0;
4705     for (i = 0; i < dd->ndim*2; i++)
4706     {
4707         *ncg_moved += ncg[i];
4708     }
4709
4710     nvec = 1;
4711     if (bV)
4712     {
4713         nvec++;
4714     }
4715     if (bSDX)
4716     {
4717         nvec++;
4718     }
4719     if (bCGP)
4720     {
4721         nvec++;
4722     }
4723
4724     /* Make sure the communication buffers are large enough */
4725     for (mc = 0; mc < dd->ndim*2; mc++)
4726     {
4727         nvr = ncg[mc] + nat[mc]*nvec;
4728         if (nvr > comm->cgcm_state_nalloc[mc])
4729         {
4730             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4731             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4732         }
4733     }
4734
4735     switch (fr->cutoff_scheme)
4736     {
4737         case ecutsGROUP:
4738             /* Recalculating cg_cm might be cheaper than communicating,
4739              * but that could give rise to rounding issues.
4740              */
4741             home_pos_cg =
4742                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4743                                         nvec, cg_cm, comm, bCompact);
4744             break;
4745         case ecutsVERLET:
4746             /* Without charge groups we send the moved atom coordinates
4747              * over twice. This is so the code below can be used without
4748              * many conditionals for both for with and without charge groups.
4749              */
4750             home_pos_cg =
4751                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4752                                         nvec, state->x, comm, FALSE);
4753             if (bCompact)
4754             {
4755                 home_pos_cg -= *ncg_moved;
4756             }
4757             break;
4758         default:
4759             gmx_incons("unimplemented");
4760             home_pos_cg = 0;
4761     }
4762
4763     vec         = 0;
4764     home_pos_at =
4765         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4766                                 nvec, vec++, state->x, comm, bCompact);
4767     if (bV)
4768     {
4769         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4770                                 nvec, vec++, state->v, comm, bCompact);
4771     }
4772     if (bSDX)
4773     {
4774         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4775                                 nvec, vec++, state->sd_X, comm, bCompact);
4776     }
4777     if (bCGP)
4778     {
4779         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4780                                 nvec, vec++, state->cg_p, comm, bCompact);
4781     }
4782
4783     if (bCompact)
4784     {
4785         compact_ind(dd->ncg_home, move,
4786                     dd->index_gl, dd->cgindex, dd->gatindex,
4787                     dd->ga2la, comm->bLocalCG,
4788                     fr->cginfo);
4789     }
4790     else
4791     {
4792         if (fr->cutoff_scheme == ecutsVERLET)
4793         {
4794             moved = get_moved(comm, dd->ncg_home);
4795
4796             for (k = 0; k < dd->ncg_home; k++)
4797             {
4798                 moved[k] = 0;
4799             }
4800         }
4801         else
4802         {
4803             moved = fr->ns.grid->cell_index;
4804         }
4805
4806         clear_and_mark_ind(dd->ncg_home, move,
4807                            dd->index_gl, dd->cgindex, dd->gatindex,
4808                            dd->ga2la, comm->bLocalCG,
4809                            moved);
4810     }
4811
4812     cginfo_mb = fr->cginfo_mb;
4813
4814     *ncg_stay_home = home_pos_cg;
4815     for (d = 0; d < dd->ndim; d++)
4816     {
4817         dim      = dd->dim[d];
4818         ncg_recv = 0;
4819         nat_recv = 0;
4820         nvr      = 0;
4821         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4822         {
4823             cdd = d*2 + dir;
4824             /* Communicate the cg and atom counts */
4825             sbuf[0] = ncg[cdd];
4826             sbuf[1] = nat[cdd];
4827             if (debug)
4828             {
4829                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4830                         d, dir, sbuf[0], sbuf[1]);
4831             }
4832             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4833
4834             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4835             {
4836                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4837                 srenew(comm->buf_int, comm->nalloc_int);
4838             }
4839
4840             /* Communicate the charge group indices, sizes and flags */
4841             dd_sendrecv_int(dd, d, dir,
4842                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4843                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4844
4845             nvs = ncg[cdd] + nat[cdd]*nvec;
4846             i   = rbuf[0]  + rbuf[1] *nvec;
4847             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4848
4849             /* Communicate cgcm and state */
4850             dd_sendrecv_rvec(dd, d, dir,
4851                              comm->cgcm_state[cdd], nvs,
4852                              comm->vbuf.v+nvr, i);
4853             ncg_recv += rbuf[0];
4854             nat_recv += rbuf[1];
4855             nvr      += i;
4856         }
4857
4858         /* Process the received charge groups */
4859         buf_pos = 0;
4860         for (cg = 0; cg < ncg_recv; cg++)
4861         {
4862             flag = comm->buf_int[cg*DD_CGIBS+1];
4863
4864             if (dim >= npbcdim && dd->nc[dim] > 2)
4865             {
4866                 /* No pbc in this dim and more than one domain boundary.
4867                  * We do a separate check if a charge group didn't move too far.
4868                  */
4869                 if (((flag & DD_FLAG_FW(d)) &&
4870                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4871                     ((flag & DD_FLAG_BW(d)) &&
4872                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4873                 {
4874                     cg_move_error(fplog, dd, step, cg, dim,
4875                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4876                                   FALSE, 0,
4877                                   comm->vbuf.v[buf_pos],
4878                                   comm->vbuf.v[buf_pos],
4879                                   comm->vbuf.v[buf_pos][dim]);
4880                 }
4881             }
4882
4883             mc = -1;
4884             if (d < dd->ndim-1)
4885             {
4886                 /* Check which direction this cg should go */
4887                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4888                 {
4889                     if (dd->bGridJump)
4890                     {
4891                         /* The cell boundaries for dimension d2 are not equal
4892                          * for each cell row of the lower dimension(s),
4893                          * therefore we might need to redetermine where
4894                          * this cg should go.
4895                          */
4896                         dim2 = dd->dim[d2];
4897                         /* If this cg crosses the box boundary in dimension d2
4898                          * we can use the communicated flag, so we do not
4899                          * have to worry about pbc.
4900                          */
4901                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4902                                (flag & DD_FLAG_FW(d2))) ||
4903                               (dd->ci[dim2] == 0 &&
4904                                (flag & DD_FLAG_BW(d2)))))
4905                         {
4906                             /* Clear the two flags for this dimension */
4907                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4908                             /* Determine the location of this cg
4909                              * in lattice coordinates
4910                              */
4911                             pos_d = comm->vbuf.v[buf_pos][dim2];
4912                             if (tric_dir[dim2])
4913                             {
4914                                 for (d3 = dim2+1; d3 < DIM; d3++)
4915                                 {
4916                                     pos_d +=
4917                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4918                                 }
4919                             }
4920                             /* Check of we are not at the box edge.
4921                              * pbc is only handled in the first step above,
4922                              * but this check could move over pbc while
4923                              * the first step did not due to different rounding.
4924                              */
4925                             if (pos_d >= cell_x1[dim2] &&
4926                                 dd->ci[dim2] != dd->nc[dim2]-1)
4927                             {
4928                                 flag |= DD_FLAG_FW(d2);
4929                             }
4930                             else if (pos_d < cell_x0[dim2] &&
4931                                      dd->ci[dim2] != 0)
4932                             {
4933                                 flag |= DD_FLAG_BW(d2);
4934                             }
4935                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4936                         }
4937                     }
4938                     /* Set to which neighboring cell this cg should go */
4939                     if (flag & DD_FLAG_FW(d2))
4940                     {
4941                         mc = d2*2;
4942                     }
4943                     else if (flag & DD_FLAG_BW(d2))
4944                     {
4945                         if (dd->nc[dd->dim[d2]] > 2)
4946                         {
4947                             mc = d2*2+1;
4948                         }
4949                         else
4950                         {
4951                             mc = d2*2;
4952                         }
4953                     }
4954                 }
4955             }
4956
4957             nrcg = flag & DD_FLAG_NRCG;
4958             if (mc == -1)
4959             {
4960                 if (home_pos_cg+1 > dd->cg_nalloc)
4961                 {
4962                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4963                     srenew(dd->index_gl, dd->cg_nalloc);
4964                     srenew(dd->cgindex, dd->cg_nalloc+1);
4965                 }
4966                 /* Set the global charge group index and size */
4967                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
4968                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4969                 /* Copy the state from the buffer */
4970                 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
4971                 if (fr->cutoff_scheme == ecutsGROUP)
4972                 {
4973                     cg_cm = fr->cg_cm;
4974                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4975                 }
4976                 buf_pos++;
4977
4978                 /* Set the cginfo */
4979                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4980                                                    dd->index_gl[home_pos_cg]);
4981                 if (comm->bLocalCG)
4982                 {
4983                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
4984                 }
4985
4986                 if (home_pos_at+nrcg > state->nalloc)
4987                 {
4988                     dd_realloc_state(state, f, home_pos_at+nrcg);
4989                 }
4990                 for (i = 0; i < nrcg; i++)
4991                 {
4992                     copy_rvec(comm->vbuf.v[buf_pos++],
4993                               state->x[home_pos_at+i]);
4994                 }
4995                 if (bV)
4996                 {
4997                     for (i = 0; i < nrcg; i++)
4998                     {
4999                         copy_rvec(comm->vbuf.v[buf_pos++],
5000                                   state->v[home_pos_at+i]);
5001                     }
5002                 }
5003                 if (bSDX)
5004                 {
5005                     for (i = 0; i < nrcg; i++)
5006                     {
5007                         copy_rvec(comm->vbuf.v[buf_pos++],
5008                                   state->sd_X[home_pos_at+i]);
5009                     }
5010                 }
5011                 if (bCGP)
5012                 {
5013                     for (i = 0; i < nrcg; i++)
5014                     {
5015                         copy_rvec(comm->vbuf.v[buf_pos++],
5016                                   state->cg_p[home_pos_at+i]);
5017                     }
5018                 }
5019                 home_pos_cg += 1;
5020                 home_pos_at += nrcg;
5021             }
5022             else
5023             {
5024                 /* Reallocate the buffers if necessary  */
5025                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
5026                 {
5027                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
5028                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
5029                 }
5030                 nvr = ncg[mc] + nat[mc]*nvec;
5031                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
5032                 {
5033                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
5034                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
5035                 }
5036                 /* Copy from the receive to the send buffers */
5037                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
5038                        comm->buf_int + cg*DD_CGIBS,
5039                        DD_CGIBS*sizeof(int));
5040                 memcpy(comm->cgcm_state[mc][nvr],
5041                        comm->vbuf.v[buf_pos],
5042                        (1+nrcg*nvec)*sizeof(rvec));
5043                 buf_pos += 1 + nrcg*nvec;
5044                 ncg[mc] += 1;
5045                 nat[mc] += nrcg;
5046             }
5047         }
5048     }
5049
5050     /* With sorting (!bCompact) the indices are now only partially up to date
5051      * and ncg_home and nat_home are not the real count, since there are
5052      * "holes" in the arrays for the charge groups that moved to neighbors.
5053      */
5054     if (fr->cutoff_scheme == ecutsVERLET)
5055     {
5056         moved = get_moved(comm, home_pos_cg);
5057
5058         for (i = dd->ncg_home; i < home_pos_cg; i++)
5059         {
5060             moved[i] = 0;
5061         }
5062     }
5063     dd->ncg_home = home_pos_cg;
5064     dd->nat_home = home_pos_at;
5065
5066     if (debug)
5067     {
5068         fprintf(debug,
5069                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5070                 *ncg_moved, dd->ncg_home-*ncg_moved);
5071
5072     }
5073 }
5074
5075 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
5076 {
5077     dd->comm->cycl[ddCycl] += cycles;
5078     dd->comm->cycl_n[ddCycl]++;
5079     if (cycles > dd->comm->cycl_max[ddCycl])
5080     {
5081         dd->comm->cycl_max[ddCycl] = cycles;
5082     }
5083 }
5084
5085 static double force_flop_count(t_nrnb *nrnb)
5086 {
5087     int         i;
5088     double      sum;
5089     const char *name;
5090
5091     sum = 0;
5092     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
5093     {
5094         /* To get closer to the real timings, we half the count
5095          * for the normal loops and again half it for water loops.
5096          */
5097         name = nrnb_str(i);
5098         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5099         {
5100             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5101         }
5102         else
5103         {
5104             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5105         }
5106     }
5107     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
5108     {
5109         name = nrnb_str(i);
5110         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5111         {
5112             sum += nrnb->n[i]*cost_nrnb(i);
5113         }
5114     }
5115     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
5116     {
5117         sum += nrnb->n[i]*cost_nrnb(i);
5118     }
5119
5120     return sum;
5121 }
5122
5123 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
5124 {
5125     if (dd->comm->eFlop)
5126     {
5127         dd->comm->flop -= force_flop_count(nrnb);
5128     }
5129 }
5130 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
5131 {
5132     if (dd->comm->eFlop)
5133     {
5134         dd->comm->flop += force_flop_count(nrnb);
5135         dd->comm->flop_n++;
5136     }
5137 }
5138
5139 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5140 {
5141     int i;
5142
5143     for (i = 0; i < ddCyclNr; i++)
5144     {
5145         dd->comm->cycl[i]     = 0;
5146         dd->comm->cycl_n[i]   = 0;
5147         dd->comm->cycl_max[i] = 0;
5148     }
5149     dd->comm->flop   = 0;
5150     dd->comm->flop_n = 0;
5151 }
5152
5153 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
5154 {
5155     gmx_domdec_comm_t *comm;
5156     gmx_domdec_load_t *load;
5157     gmx_domdec_root_t *root = NULL;
5158     int                d, dim, cid, i, pos;
5159     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
5160     gmx_bool           bSepPME;
5161
5162     if (debug)
5163     {
5164         fprintf(debug, "get_load_distribution start\n");
5165     }
5166
5167     wallcycle_start(wcycle, ewcDDCOMMLOAD);
5168
5169     comm = dd->comm;
5170
5171     bSepPME = (dd->pme_nodeid >= 0);
5172
5173     for (d = dd->ndim-1; d >= 0; d--)
5174     {
5175         dim = dd->dim[d];
5176         /* Check if we participate in the communication in this dimension */
5177         if (d == dd->ndim-1 ||
5178             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5179         {
5180             load = &comm->load[d];
5181             if (dd->bGridJump)
5182             {
5183                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5184             }
5185             pos = 0;
5186             if (d == dd->ndim-1)
5187             {
5188                 sbuf[pos++] = dd_force_load(comm);
5189                 sbuf[pos++] = sbuf[0];
5190                 if (dd->bGridJump)
5191                 {
5192                     sbuf[pos++] = sbuf[0];
5193                     sbuf[pos++] = cell_frac;
5194                     if (d > 0)
5195                     {
5196                         sbuf[pos++] = comm->cell_f_max0[d];
5197                         sbuf[pos++] = comm->cell_f_min1[d];
5198                     }
5199                 }
5200                 if (bSepPME)
5201                 {
5202                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5203                     sbuf[pos++] = comm->cycl[ddCyclPME];
5204                 }
5205             }
5206             else
5207             {
5208                 sbuf[pos++] = comm->load[d+1].sum;
5209                 sbuf[pos++] = comm->load[d+1].max;
5210                 if (dd->bGridJump)
5211                 {
5212                     sbuf[pos++] = comm->load[d+1].sum_m;
5213                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5214                     sbuf[pos++] = comm->load[d+1].flags;
5215                     if (d > 0)
5216                     {
5217                         sbuf[pos++] = comm->cell_f_max0[d];
5218                         sbuf[pos++] = comm->cell_f_min1[d];
5219                     }
5220                 }
5221                 if (bSepPME)
5222                 {
5223                     sbuf[pos++] = comm->load[d+1].mdf;
5224                     sbuf[pos++] = comm->load[d+1].pme;
5225                 }
5226             }
5227             load->nload = pos;
5228             /* Communicate a row in DD direction d.
5229              * The communicators are setup such that the root always has rank 0.
5230              */
5231 #ifdef GMX_MPI
5232             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5233                        load->load, load->nload*sizeof(float), MPI_BYTE,
5234                        0, comm->mpi_comm_load[d]);
5235 #endif
5236             if (dd->ci[dim] == dd->master_ci[dim])
5237             {
5238                 /* We are the root, process this row */
5239                 if (comm->bDynLoadBal)
5240                 {
5241                     root = comm->root[d];
5242                 }
5243                 load->sum      = 0;
5244                 load->max      = 0;
5245                 load->sum_m    = 0;
5246                 load->cvol_min = 1;
5247                 load->flags    = 0;
5248                 load->mdf      = 0;
5249                 load->pme      = 0;
5250                 pos            = 0;
5251                 for (i = 0; i < dd->nc[dim]; i++)
5252                 {
5253                     load->sum += load->load[pos++];
5254                     load->max  = max(load->max, load->load[pos]);
5255                     pos++;
5256                     if (dd->bGridJump)
5257                     {
5258                         if (root->bLimited)
5259                         {
5260                             /* This direction could not be load balanced properly,
5261                              * therefore we need to use the maximum iso the average load.
5262                              */
5263                             load->sum_m = max(load->sum_m, load->load[pos]);
5264                         }
5265                         else
5266                         {
5267                             load->sum_m += load->load[pos];
5268                         }
5269                         pos++;
5270                         load->cvol_min = min(load->cvol_min, load->load[pos]);
5271                         pos++;
5272                         if (d < dd->ndim-1)
5273                         {
5274                             load->flags = (int)(load->load[pos++] + 0.5);
5275                         }
5276                         if (d > 0)
5277                         {
5278                             root->cell_f_max0[i] = load->load[pos++];
5279                             root->cell_f_min1[i] = load->load[pos++];
5280                         }
5281                     }
5282                     if (bSepPME)
5283                     {
5284                         load->mdf = max(load->mdf, load->load[pos]);
5285                         pos++;
5286                         load->pme = max(load->pme, load->load[pos]);
5287                         pos++;
5288                     }
5289                 }
5290                 if (comm->bDynLoadBal && root->bLimited)
5291                 {
5292                     load->sum_m *= dd->nc[dim];
5293                     load->flags |= (1<<d);
5294                 }
5295             }
5296         }
5297     }
5298
5299     if (DDMASTER(dd))
5300     {
5301         comm->nload      += dd_load_count(comm);
5302         comm->load_step  += comm->cycl[ddCyclStep];
5303         comm->load_sum   += comm->load[0].sum;
5304         comm->load_max   += comm->load[0].max;
5305         if (comm->bDynLoadBal)
5306         {
5307             for (d = 0; d < dd->ndim; d++)
5308             {
5309                 if (comm->load[0].flags & (1<<d))
5310                 {
5311                     comm->load_lim[d]++;
5312                 }
5313             }
5314         }
5315         if (bSepPME)
5316         {
5317             comm->load_mdf += comm->load[0].mdf;
5318             comm->load_pme += comm->load[0].pme;
5319         }
5320     }
5321
5322     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5323
5324     if (debug)
5325     {
5326         fprintf(debug, "get_load_distribution finished\n");
5327     }
5328 }
5329
5330 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5331 {
5332     /* Return the relative performance loss on the total run time
5333      * due to the force calculation load imbalance.
5334      */
5335     if (dd->comm->nload > 0)
5336     {
5337         return
5338             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5339             (dd->comm->load_step*dd->nnodes);
5340     }
5341     else
5342     {
5343         return 0;
5344     }
5345 }
5346
5347 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5348 {
5349     char               buf[STRLEN];
5350     int                npp, npme, nnodes, d, limp;
5351     float              imbal, pme_f_ratio, lossf, lossp = 0;
5352     gmx_bool           bLim;
5353     gmx_domdec_comm_t *comm;
5354
5355     comm = dd->comm;
5356     if (DDMASTER(dd) && comm->nload > 0)
5357     {
5358         npp    = dd->nnodes;
5359         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5360         nnodes = npp + npme;
5361         imbal  = comm->load_max*npp/comm->load_sum - 1;
5362         lossf  = dd_force_imb_perf_loss(dd);
5363         sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5364         fprintf(fplog, "%s", buf);
5365         fprintf(stderr, "\n");
5366         fprintf(stderr, "%s", buf);
5367         sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5368         fprintf(fplog, "%s", buf);
5369         fprintf(stderr, "%s", buf);
5370         bLim = FALSE;
5371         if (comm->bDynLoadBal)
5372         {
5373             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5374             for (d = 0; d < dd->ndim; d++)
5375             {
5376                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5377                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5378                 if (limp >= 50)
5379                 {
5380                     bLim = TRUE;
5381                 }
5382             }
5383             sprintf(buf+strlen(buf), "\n");
5384             fprintf(fplog, "%s", buf);
5385             fprintf(stderr, "%s", buf);
5386         }
5387         if (npme > 0)
5388         {
5389             pme_f_ratio = comm->load_pme/comm->load_mdf;
5390             lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
5391             if (lossp <= 0)
5392             {
5393                 lossp *= (float)npme/(float)nnodes;
5394             }
5395             else
5396             {
5397                 lossp *= (float)npp/(float)nnodes;
5398             }
5399             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5400             fprintf(fplog, "%s", buf);
5401             fprintf(stderr, "%s", buf);
5402             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5403             fprintf(fplog, "%s", buf);
5404             fprintf(stderr, "%s", buf);
5405         }
5406         fprintf(fplog, "\n");
5407         fprintf(stderr, "\n");
5408
5409         if (lossf >= DD_PERF_LOSS)
5410         {
5411             sprintf(buf,
5412                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5413                     "      in the domain decomposition.\n", lossf*100);
5414             if (!comm->bDynLoadBal)
5415             {
5416                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5417             }
5418             else if (bLim)
5419             {
5420                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5421             }
5422             fprintf(fplog, "%s\n", buf);
5423             fprintf(stderr, "%s\n", buf);
5424         }
5425         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5426         {
5427             sprintf(buf,
5428                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5429                     "      had %s work to do than the PP nodes.\n"
5430                     "      You might want to %s the number of PME nodes\n"
5431                     "      or %s the cut-off and the grid spacing.\n",
5432                     fabs(lossp*100),
5433                     (lossp < 0) ? "less"     : "more",
5434                     (lossp < 0) ? "decrease" : "increase",
5435                     (lossp < 0) ? "decrease" : "increase");
5436             fprintf(fplog, "%s\n", buf);
5437             fprintf(stderr, "%s\n", buf);
5438         }
5439     }
5440 }
5441
5442 static float dd_vol_min(gmx_domdec_t *dd)
5443 {
5444     return dd->comm->load[0].cvol_min*dd->nnodes;
5445 }
5446
5447 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5448 {
5449     return dd->comm->load[0].flags;
5450 }
5451
5452 static float dd_f_imbal(gmx_domdec_t *dd)
5453 {
5454     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5455 }
5456
5457 float dd_pme_f_ratio(gmx_domdec_t *dd)
5458 {
5459     if (dd->comm->cycl_n[ddCyclPME] > 0)
5460     {
5461         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5462     }
5463     else
5464     {
5465         return -1.0;
5466     }
5467 }
5468
5469 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step)
5470 {
5471     int  flags, d;
5472     char buf[22];
5473
5474     flags = dd_load_flags(dd);
5475     if (flags)
5476     {
5477         fprintf(fplog,
5478                 "DD  load balancing is limited by minimum cell size in dimension");
5479         for (d = 0; d < dd->ndim; d++)
5480         {
5481             if (flags & (1<<d))
5482             {
5483                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5484             }
5485         }
5486         fprintf(fplog, "\n");
5487     }
5488     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5489     if (dd->comm->bDynLoadBal)
5490     {
5491         fprintf(fplog, "  vol min/aver %5.3f%c",
5492                 dd_vol_min(dd), flags ? '!' : ' ');
5493     }
5494     fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5495     if (dd->comm->cycl_n[ddCyclPME])
5496     {
5497         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5498     }
5499     fprintf(fplog, "\n\n");
5500 }
5501
5502 static void dd_print_load_verbose(gmx_domdec_t *dd)
5503 {
5504     if (dd->comm->bDynLoadBal)
5505     {
5506         fprintf(stderr, "vol %4.2f%c ",
5507                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5508     }
5509     fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5510     if (dd->comm->cycl_n[ddCyclPME])
5511     {
5512         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5513     }
5514 }
5515
5516 #ifdef GMX_MPI
5517 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5518 {
5519     MPI_Comm           c_row;
5520     int                dim, i, rank;
5521     ivec               loc_c;
5522     gmx_domdec_root_t *root;
5523     gmx_bool           bPartOfGroup = FALSE;
5524
5525     dim = dd->dim[dim_ind];
5526     copy_ivec(loc, loc_c);
5527     for (i = 0; i < dd->nc[dim]; i++)
5528     {
5529         loc_c[dim] = i;
5530         rank       = dd_index(dd->nc, loc_c);
5531         if (rank == dd->rank)
5532         {
5533             /* This process is part of the group */
5534             bPartOfGroup = TRUE;
5535         }
5536     }
5537     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5538                    &c_row);
5539     if (bPartOfGroup)
5540     {
5541         dd->comm->mpi_comm_load[dim_ind] = c_row;
5542         if (dd->comm->eDLB != edlbNO)
5543         {
5544             if (dd->ci[dim] == dd->master_ci[dim])
5545             {
5546                 /* This is the root process of this row */
5547                 snew(dd->comm->root[dim_ind], 1);
5548                 root = dd->comm->root[dim_ind];
5549                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5550                 snew(root->old_cell_f, dd->nc[dim]+1);
5551                 snew(root->bCellMin, dd->nc[dim]);
5552                 if (dim_ind > 0)
5553                 {
5554                     snew(root->cell_f_max0, dd->nc[dim]);
5555                     snew(root->cell_f_min1, dd->nc[dim]);
5556                     snew(root->bound_min, dd->nc[dim]);
5557                     snew(root->bound_max, dd->nc[dim]);
5558                 }
5559                 snew(root->buf_ncd, dd->nc[dim]);
5560             }
5561             else
5562             {
5563                 /* This is not a root process, we only need to receive cell_f */
5564                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5565             }
5566         }
5567         if (dd->ci[dim] == dd->master_ci[dim])
5568         {
5569             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5570         }
5571     }
5572 }
5573 #endif
5574
5575 static void make_load_communicators(gmx_domdec_t *dd)
5576 {
5577 #ifdef GMX_MPI
5578     int  dim0, dim1, i, j;
5579     ivec loc;
5580
5581     if (debug)
5582     {
5583         fprintf(debug, "Making load communicators\n");
5584     }
5585
5586     snew(dd->comm->load, dd->ndim);
5587     snew(dd->comm->mpi_comm_load, dd->ndim);
5588
5589     clear_ivec(loc);
5590     make_load_communicator(dd, 0, loc);
5591     if (dd->ndim > 1)
5592     {
5593         dim0 = dd->dim[0];
5594         for (i = 0; i < dd->nc[dim0]; i++)
5595         {
5596             loc[dim0] = i;
5597             make_load_communicator(dd, 1, loc);
5598         }
5599     }
5600     if (dd->ndim > 2)
5601     {
5602         dim0 = dd->dim[0];
5603         for (i = 0; i < dd->nc[dim0]; i++)
5604         {
5605             loc[dim0] = i;
5606             dim1      = dd->dim[1];
5607             for (j = 0; j < dd->nc[dim1]; j++)
5608             {
5609                 loc[dim1] = j;
5610                 make_load_communicator(dd, 2, loc);
5611             }
5612         }
5613     }
5614
5615     if (debug)
5616     {
5617         fprintf(debug, "Finished making load communicators\n");
5618     }
5619 #endif
5620 }
5621
5622 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5623 {
5624     gmx_bool                bZYX;
5625     int                     d, dim, i, j, m;
5626     ivec                    tmp, s;
5627     int                     nzone, nzonep;
5628     ivec                    dd_zp[DD_MAXIZONE];
5629     gmx_domdec_zones_t     *zones;
5630     gmx_domdec_ns_ranges_t *izone;
5631
5632     for (d = 0; d < dd->ndim; d++)
5633     {
5634         dim = dd->dim[d];
5635         copy_ivec(dd->ci, tmp);
5636         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5637         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5638         copy_ivec(dd->ci, tmp);
5639         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5640         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5641         if (debug)
5642         {
5643             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5644                     dd->rank, dim,
5645                     dd->neighbor[d][0],
5646                     dd->neighbor[d][1]);
5647         }
5648     }
5649
5650     if (fplog)
5651     {
5652         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5653                 dd->ndim,
5654                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5655                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5656     }
5657     switch (dd->ndim)
5658     {
5659         case 3:
5660             nzone  = dd_z3n;
5661             nzonep = dd_zp3n;
5662             for (i = 0; i < nzonep; i++)
5663             {
5664                 copy_ivec(dd_zp3[i], dd_zp[i]);
5665             }
5666             break;
5667         case 2:
5668             nzone  = dd_z2n;
5669             nzonep = dd_zp2n;
5670             for (i = 0; i < nzonep; i++)
5671             {
5672                 copy_ivec(dd_zp2[i], dd_zp[i]);
5673             }
5674             break;
5675         case 1:
5676             nzone  = dd_z1n;
5677             nzonep = dd_zp1n;
5678             for (i = 0; i < nzonep; i++)
5679             {
5680                 copy_ivec(dd_zp1[i], dd_zp[i]);
5681             }
5682             break;
5683         default:
5684             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5685             nzone  = 0;
5686             nzonep = 0;
5687     }
5688
5689     zones = &dd->comm->zones;
5690
5691     for (i = 0; i < nzone; i++)
5692     {
5693         m = 0;
5694         clear_ivec(zones->shift[i]);
5695         for (d = 0; d < dd->ndim; d++)
5696         {
5697             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5698         }
5699     }
5700
5701     zones->n = nzone;
5702     for (i = 0; i < nzone; i++)
5703     {
5704         for (d = 0; d < DIM; d++)
5705         {
5706             s[d] = dd->ci[d] - zones->shift[i][d];
5707             if (s[d] < 0)
5708             {
5709                 s[d] += dd->nc[d];
5710             }
5711             else if (s[d] >= dd->nc[d])
5712             {
5713                 s[d] -= dd->nc[d];
5714             }
5715         }
5716     }
5717     zones->nizone = nzonep;
5718     for (i = 0; i < zones->nizone; i++)
5719     {
5720         if (dd_zp[i][0] != i)
5721         {
5722             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5723         }
5724         izone     = &zones->izone[i];
5725         izone->j0 = dd_zp[i][1];
5726         izone->j1 = dd_zp[i][2];
5727         for (dim = 0; dim < DIM; dim++)
5728         {
5729             if (dd->nc[dim] == 1)
5730             {
5731                 /* All shifts should be allowed */
5732                 izone->shift0[dim] = -1;
5733                 izone->shift1[dim] = 1;
5734             }
5735             else
5736             {
5737                 /*
5738                    izone->shift0[d] = 0;
5739                    izone->shift1[d] = 0;
5740                    for(j=izone->j0; j<izone->j1; j++) {
5741                    if (dd->shift[j][d] > dd->shift[i][d])
5742                    izone->shift0[d] = -1;
5743                    if (dd->shift[j][d] < dd->shift[i][d])
5744                    izone->shift1[d] = 1;
5745                    }
5746                  */
5747
5748                 int shift_diff;
5749
5750                 /* Assume the shift are not more than 1 cell */
5751                 izone->shift0[dim] = 1;
5752                 izone->shift1[dim] = -1;
5753                 for (j = izone->j0; j < izone->j1; j++)
5754                 {
5755                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5756                     if (shift_diff < izone->shift0[dim])
5757                     {
5758                         izone->shift0[dim] = shift_diff;
5759                     }
5760                     if (shift_diff > izone->shift1[dim])
5761                     {
5762                         izone->shift1[dim] = shift_diff;
5763                     }
5764                 }
5765             }
5766         }
5767     }
5768
5769     if (dd->comm->eDLB != edlbNO)
5770     {
5771         snew(dd->comm->root, dd->ndim);
5772     }
5773
5774     if (dd->comm->bRecordLoad)
5775     {
5776         make_load_communicators(dd);
5777     }
5778 }
5779
5780 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int reorder)
5781 {
5782     gmx_domdec_t      *dd;
5783     gmx_domdec_comm_t *comm;
5784     int                i, rank, *buf;
5785     ivec               periods;
5786 #ifdef GMX_MPI
5787     MPI_Comm           comm_cart;
5788 #endif
5789
5790     dd   = cr->dd;
5791     comm = dd->comm;
5792
5793 #ifdef GMX_MPI
5794     if (comm->bCartesianPP)
5795     {
5796         /* Set up cartesian communication for the particle-particle part */
5797         if (fplog)
5798         {
5799             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5800                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5801         }
5802
5803         for (i = 0; i < DIM; i++)
5804         {
5805             periods[i] = TRUE;
5806         }
5807         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5808                         &comm_cart);
5809         /* We overwrite the old communicator with the new cartesian one */
5810         cr->mpi_comm_mygroup = comm_cart;
5811     }
5812
5813     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5814     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5815
5816     if (comm->bCartesianPP_PME)
5817     {
5818         /* Since we want to use the original cartesian setup for sim,
5819          * and not the one after split, we need to make an index.
5820          */
5821         snew(comm->ddindex2ddnodeid, dd->nnodes);
5822         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5823         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5824         /* Get the rank of the DD master,
5825          * above we made sure that the master node is a PP node.
5826          */
5827         if (MASTER(cr))
5828         {
5829             rank = dd->rank;
5830         }
5831         else
5832         {
5833             rank = 0;
5834         }
5835         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5836     }
5837     else if (comm->bCartesianPP)
5838     {
5839         if (cr->npmenodes == 0)
5840         {
5841             /* The PP communicator is also
5842              * the communicator for this simulation
5843              */
5844             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5845         }
5846         cr->nodeid = dd->rank;
5847
5848         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5849
5850         /* We need to make an index to go from the coordinates
5851          * to the nodeid of this simulation.
5852          */
5853         snew(comm->ddindex2simnodeid, dd->nnodes);
5854         snew(buf, dd->nnodes);
5855         if (cr->duty & DUTY_PP)
5856         {
5857             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5858         }
5859         /* Communicate the ddindex to simulation nodeid index */
5860         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5861                       cr->mpi_comm_mysim);
5862         sfree(buf);
5863
5864         /* Determine the master coordinates and rank.
5865          * The DD master should be the same node as the master of this sim.
5866          */
5867         for (i = 0; i < dd->nnodes; i++)
5868         {
5869             if (comm->ddindex2simnodeid[i] == 0)
5870             {
5871                 ddindex2xyz(dd->nc, i, dd->master_ci);
5872                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5873             }
5874         }
5875         if (debug)
5876         {
5877             fprintf(debug, "The master rank is %d\n", dd->masterrank);
5878         }
5879     }
5880     else
5881     {
5882         /* No Cartesian communicators */
5883         /* We use the rank in dd->comm->all as DD index */
5884         ddindex2xyz(dd->nc, dd->rank, dd->ci);
5885         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5886         dd->masterrank = 0;
5887         clear_ivec(dd->master_ci);
5888     }
5889 #endif
5890
5891     if (fplog)
5892     {
5893         fprintf(fplog,
5894                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5895                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5896     }
5897     if (debug)
5898     {
5899         fprintf(debug,
5900                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5901                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5902     }
5903 }
5904
5905 static void receive_ddindex2simnodeid(t_commrec *cr)
5906 {
5907     gmx_domdec_t      *dd;
5908
5909     gmx_domdec_comm_t *comm;
5910     int               *buf;
5911
5912     dd   = cr->dd;
5913     comm = dd->comm;
5914
5915 #ifdef GMX_MPI
5916     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5917     {
5918         snew(comm->ddindex2simnodeid, dd->nnodes);
5919         snew(buf, dd->nnodes);
5920         if (cr->duty & DUTY_PP)
5921         {
5922             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5923         }
5924 #ifdef GMX_MPI
5925         /* Communicate the ddindex to simulation nodeid index */
5926         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5927                       cr->mpi_comm_mysim);
5928 #endif
5929         sfree(buf);
5930     }
5931 #endif
5932 }
5933
5934 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5935                                                      int ncg, int natoms)
5936 {
5937     gmx_domdec_master_t *ma;
5938     int                  i;
5939
5940     snew(ma, 1);
5941
5942     snew(ma->ncg, dd->nnodes);
5943     snew(ma->index, dd->nnodes+1);
5944     snew(ma->cg, ncg);
5945     snew(ma->nat, dd->nnodes);
5946     snew(ma->ibuf, dd->nnodes*2);
5947     snew(ma->cell_x, DIM);
5948     for (i = 0; i < DIM; i++)
5949     {
5950         snew(ma->cell_x[i], dd->nc[i]+1);
5951     }
5952
5953     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5954     {
5955         ma->vbuf = NULL;
5956     }
5957     else
5958     {
5959         snew(ma->vbuf, natoms);
5960     }
5961
5962     return ma;
5963 }
5964
5965 static void split_communicator(FILE *fplog, t_commrec *cr, int dd_node_order,
5966                                int reorder)
5967 {
5968     gmx_domdec_t      *dd;
5969     gmx_domdec_comm_t *comm;
5970     int                i, rank;
5971     gmx_bool           bDiv[DIM];
5972     ivec               periods;
5973 #ifdef GMX_MPI
5974     MPI_Comm           comm_cart;
5975 #endif
5976
5977     dd   = cr->dd;
5978     comm = dd->comm;
5979
5980     if (comm->bCartesianPP)
5981     {
5982         for (i = 1; i < DIM; i++)
5983         {
5984             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
5985         }
5986         if (bDiv[YY] || bDiv[ZZ])
5987         {
5988             comm->bCartesianPP_PME = TRUE;
5989             /* If we have 2D PME decomposition, which is always in x+y,
5990              * we stack the PME only nodes in z.
5991              * Otherwise we choose the direction that provides the thinnest slab
5992              * of PME only nodes as this will have the least effect
5993              * on the PP communication.
5994              * But for the PME communication the opposite might be better.
5995              */
5996             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
5997                              !bDiv[YY] ||
5998                              dd->nc[YY] > dd->nc[ZZ]))
5999             {
6000                 comm->cartpmedim = ZZ;
6001             }
6002             else
6003             {
6004                 comm->cartpmedim = YY;
6005             }
6006             comm->ntot[comm->cartpmedim]
6007                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
6008         }
6009         else if (fplog)
6010         {
6011             fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
6012             fprintf(fplog,
6013                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
6014         }
6015     }
6016
6017 #ifdef GMX_MPI
6018     if (comm->bCartesianPP_PME)
6019     {
6020         if (fplog)
6021         {
6022             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
6023         }
6024
6025         for (i = 0; i < DIM; i++)
6026         {
6027             periods[i] = TRUE;
6028         }
6029         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
6030                         &comm_cart);
6031
6032         MPI_Comm_rank(comm_cart, &rank);
6033         if (MASTERNODE(cr) && rank != 0)
6034         {
6035             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
6036         }
6037
6038         /* With this assigment we loose the link to the original communicator
6039          * which will usually be MPI_COMM_WORLD, unless have multisim.
6040          */
6041         cr->mpi_comm_mysim = comm_cart;
6042         cr->sim_nodeid     = rank;
6043
6044         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
6045
6046         if (fplog)
6047         {
6048             fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
6049                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6050         }
6051
6052         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
6053         {
6054             cr->duty = DUTY_PP;
6055         }
6056         if (cr->npmenodes == 0 ||
6057             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
6058         {
6059             cr->duty = DUTY_PME;
6060         }
6061
6062         /* Split the sim communicator into PP and PME only nodes */
6063         MPI_Comm_split(cr->mpi_comm_mysim,
6064                        cr->duty,
6065                        dd_index(comm->ntot, dd->ci),
6066                        &cr->mpi_comm_mygroup);
6067     }
6068     else
6069     {
6070         switch (dd_node_order)
6071         {
6072             case ddnoPP_PME:
6073                 if (fplog)
6074                 {
6075                     fprintf(fplog, "Order of the nodes: PP first, PME last\n");
6076                 }
6077                 break;
6078             case ddnoINTERLEAVE:
6079                 /* Interleave the PP-only and PME-only nodes,
6080                  * as on clusters with dual-core machines this will double
6081                  * the communication bandwidth of the PME processes
6082                  * and thus speed up the PP <-> PME and inter PME communication.
6083                  */
6084                 if (fplog)
6085                 {
6086                     fprintf(fplog, "Interleaving PP and PME nodes\n");
6087                 }
6088                 comm->pmenodes = dd_pmenodes(cr);
6089                 break;
6090             case ddnoCARTESIAN:
6091                 break;
6092             default:
6093                 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
6094         }
6095
6096         if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6097         {
6098             cr->duty = DUTY_PME;
6099         }
6100         else
6101         {
6102             cr->duty = DUTY_PP;
6103         }
6104
6105         /* Split the sim communicator into PP and PME only nodes */
6106         MPI_Comm_split(cr->mpi_comm_mysim,
6107                        cr->duty,
6108                        cr->nodeid,
6109                        &cr->mpi_comm_mygroup);
6110         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6111     }
6112 #endif
6113
6114     if (fplog)
6115     {
6116         fprintf(fplog, "This is a %s only node\n\n",
6117                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6118     }
6119 }
6120
6121 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6122 {
6123     gmx_domdec_t      *dd;
6124     gmx_domdec_comm_t *comm;
6125     int                CartReorder;
6126
6127     dd   = cr->dd;
6128     comm = dd->comm;
6129
6130     copy_ivec(dd->nc, comm->ntot);
6131
6132     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6133     comm->bCartesianPP_PME = FALSE;
6134
6135     /* Reorder the nodes by default. This might change the MPI ranks.
6136      * Real reordering is only supported on very few architectures,
6137      * Blue Gene is one of them.
6138      */
6139     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6140
6141     if (cr->npmenodes > 0)
6142     {
6143         /* Split the communicator into a PP and PME part */
6144         split_communicator(fplog, cr, dd_node_order, CartReorder);
6145         if (comm->bCartesianPP_PME)
6146         {
6147             /* We (possibly) reordered the nodes in split_communicator,
6148              * so it is no longer required in make_pp_communicator.
6149              */
6150             CartReorder = FALSE;
6151         }
6152     }
6153     else
6154     {
6155         /* All nodes do PP and PME */
6156 #ifdef GMX_MPI
6157         /* We do not require separate communicators */
6158         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6159 #endif
6160     }
6161
6162     if (cr->duty & DUTY_PP)
6163     {
6164         /* Copy or make a new PP communicator */
6165         make_pp_communicator(fplog, cr, CartReorder);
6166     }
6167     else
6168     {
6169         receive_ddindex2simnodeid(cr);
6170     }
6171
6172     if (!(cr->duty & DUTY_PME))
6173     {
6174         /* Set up the commnuication to our PME node */
6175         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6176         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6177         if (debug)
6178         {
6179             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6180                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6181         }
6182     }
6183     else
6184     {
6185         dd->pme_nodeid = -1;
6186     }
6187
6188     if (DDMASTER(dd))
6189     {
6190         dd->ma = init_gmx_domdec_master_t(dd,
6191                                           comm->cgs_gl.nr,
6192                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6193     }
6194 }
6195
6196 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6197 {
6198     real  *slb_frac, tot;
6199     int    i, n;
6200     double dbl;
6201
6202     slb_frac = NULL;
6203     if (nc > 1 && size_string != NULL)
6204     {
6205         if (fplog)
6206         {
6207             fprintf(fplog, "Using static load balancing for the %s direction\n",
6208                     dir);
6209         }
6210         snew(slb_frac, nc);
6211         tot = 0;
6212         for (i = 0; i < nc; i++)
6213         {
6214             dbl = 0;
6215             sscanf(size_string, "%lf%n", &dbl, &n);
6216             if (dbl == 0)
6217             {
6218                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6219             }
6220             slb_frac[i]  = dbl;
6221             size_string += n;
6222             tot         += slb_frac[i];
6223         }
6224         /* Normalize */
6225         if (fplog)
6226         {
6227             fprintf(fplog, "Relative cell sizes:");
6228         }
6229         for (i = 0; i < nc; i++)
6230         {
6231             slb_frac[i] /= tot;
6232             if (fplog)
6233             {
6234                 fprintf(fplog, " %5.3f", slb_frac[i]);
6235             }
6236         }
6237         if (fplog)
6238         {
6239             fprintf(fplog, "\n");
6240         }
6241     }
6242
6243     return slb_frac;
6244 }
6245
6246 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6247 {
6248     int                  n, nmol, ftype;
6249     gmx_mtop_ilistloop_t iloop;
6250     t_ilist             *il;
6251
6252     n     = 0;
6253     iloop = gmx_mtop_ilistloop_init(mtop);
6254     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6255     {
6256         for (ftype = 0; ftype < F_NRE; ftype++)
6257         {
6258             if ((interaction_function[ftype].flags & IF_BOND) &&
6259                 NRAL(ftype) >  2)
6260             {
6261                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6262             }
6263         }
6264     }
6265
6266     return n;
6267 }
6268
6269 static int dd_nst_env(FILE *fplog, const char *env_var, int def)
6270 {
6271     char *val;
6272     int   nst;
6273
6274     nst = def;
6275     val = getenv(env_var);
6276     if (val)
6277     {
6278         if (sscanf(val, "%d", &nst) <= 0)
6279         {
6280             nst = 1;
6281         }
6282         if (fplog)
6283         {
6284             fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6285                     env_var, val, nst);
6286         }
6287     }
6288
6289     return nst;
6290 }
6291
6292 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6293 {
6294     if (MASTER(cr))
6295     {
6296         fprintf(stderr, "\n%s\n", warn_string);
6297     }
6298     if (fplog)
6299     {
6300         fprintf(fplog, "\n%s\n", warn_string);
6301     }
6302 }
6303
6304 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6305                                   t_inputrec *ir, FILE *fplog)
6306 {
6307     if (ir->ePBC == epbcSCREW &&
6308         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6309     {
6310         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6311     }
6312
6313     if (ir->ns_type == ensSIMPLE)
6314     {
6315         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6316     }
6317
6318     if (ir->nstlist == 0)
6319     {
6320         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6321     }
6322
6323     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6324     {
6325         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6326     }
6327 }
6328
6329 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6330 {
6331     int  di, d;
6332     real r;
6333
6334     r = ddbox->box_size[XX];
6335     for (di = 0; di < dd->ndim; di++)
6336     {
6337         d = dd->dim[di];
6338         /* Check using the initial average cell size */
6339         r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6340     }
6341
6342     return r;
6343 }
6344
6345 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6346                              const char *dlb_opt, gmx_bool bRecordLoad,
6347                              unsigned long Flags, t_inputrec *ir)
6348 {
6349     gmx_domdec_t *dd;
6350     int           eDLB = -1;
6351     char          buf[STRLEN];
6352
6353     switch (dlb_opt[0])
6354     {
6355         case 'a': eDLB = edlbAUTO; break;
6356         case 'n': eDLB = edlbNO;   break;
6357         case 'y': eDLB = edlbYES;  break;
6358         default: gmx_incons("Unknown dlb_opt");
6359     }
6360
6361     if (Flags & MD_RERUN)
6362     {
6363         return edlbNO;
6364     }
6365
6366     if (!EI_DYNAMICS(ir->eI))
6367     {
6368         if (eDLB == edlbYES)
6369         {
6370             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6371             dd_warning(cr, fplog, buf);
6372         }
6373
6374         return edlbNO;
6375     }
6376
6377     if (!bRecordLoad)
6378     {
6379         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6380
6381         return edlbNO;
6382     }
6383
6384     if (Flags & MD_REPRODUCIBLE)
6385     {
6386         switch (eDLB)
6387         {
6388             case edlbNO:
6389                 break;
6390             case edlbAUTO:
6391                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6392                 eDLB = edlbNO;
6393                 break;
6394             case edlbYES:
6395                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6396                 break;
6397             default:
6398                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
6399                 break;
6400         }
6401     }
6402
6403     return eDLB;
6404 }
6405
6406 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6407 {
6408     int dim;
6409
6410     dd->ndim = 0;
6411     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6412     {
6413         /* Decomposition order z,y,x */
6414         if (fplog)
6415         {
6416             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6417         }
6418         for (dim = DIM-1; dim >= 0; dim--)
6419         {
6420             if (dd->nc[dim] > 1)
6421             {
6422                 dd->dim[dd->ndim++] = dim;
6423             }
6424         }
6425     }
6426     else
6427     {
6428         /* Decomposition order x,y,z */
6429         for (dim = 0; dim < DIM; dim++)
6430         {
6431             if (dd->nc[dim] > 1)
6432             {
6433                 dd->dim[dd->ndim++] = dim;
6434             }
6435         }
6436     }
6437 }
6438
6439 static gmx_domdec_comm_t *init_dd_comm()
6440 {
6441     gmx_domdec_comm_t *comm;
6442     int                i;
6443
6444     snew(comm, 1);
6445     snew(comm->cggl_flag, DIM*2);
6446     snew(comm->cgcm_state, DIM*2);
6447     for (i = 0; i < DIM*2; i++)
6448     {
6449         comm->cggl_flag_nalloc[i]  = 0;
6450         comm->cgcm_state_nalloc[i] = 0;
6451     }
6452
6453     comm->nalloc_int = 0;
6454     comm->buf_int    = NULL;
6455
6456     vec_rvec_init(&comm->vbuf);
6457
6458     comm->n_load_have    = 0;
6459     comm->n_load_collect = 0;
6460
6461     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6462     {
6463         comm->sum_nat[i] = 0;
6464     }
6465     comm->ndecomp   = 0;
6466     comm->nload     = 0;
6467     comm->load_step = 0;
6468     comm->load_sum  = 0;
6469     comm->load_max  = 0;
6470     clear_ivec(comm->load_lim);
6471     comm->load_mdf  = 0;
6472     comm->load_pme  = 0;
6473
6474     return comm;
6475 }
6476
6477 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6478                                         unsigned long Flags,
6479                                         ivec nc,
6480                                         real comm_distance_min, real rconstr,
6481                                         const char *dlb_opt, real dlb_scale,
6482                                         const char *sizex, const char *sizey, const char *sizez,
6483                                         gmx_mtop_t *mtop, t_inputrec *ir,
6484                                         matrix box, rvec *x,
6485                                         gmx_ddbox_t *ddbox,
6486                                         int *npme_x, int *npme_y)
6487 {
6488     gmx_domdec_t      *dd;
6489     gmx_domdec_comm_t *comm;
6490     int                recload;
6491     int                d, i, j;
6492     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6493     gmx_bool           bC;
6494     char               buf[STRLEN];
6495
6496     if (fplog)
6497     {
6498         fprintf(fplog,
6499                 "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
6500     }
6501
6502     snew(dd, 1);
6503
6504     dd->comm = init_dd_comm();
6505     comm     = dd->comm;
6506     snew(comm->cggl_flag, DIM*2);
6507     snew(comm->cgcm_state, DIM*2);
6508
6509     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6510     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6511
6512     dd->bSendRecv2      = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0);
6513     comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10);
6514     comm->eFlop         = dd_nst_env(fplog, "GMX_DLB_FLOP", 0);
6515     recload             = dd_nst_env(fplog, "GMX_DD_LOAD", 1);
6516     comm->nstSortCG     = dd_nst_env(fplog, "GMX_DD_SORT", 1);
6517     comm->nstDDDump     = dd_nst_env(fplog, "GMX_DD_DUMP", 0);
6518     comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0);
6519     comm->DD_debug      = dd_nst_env(fplog, "GMX_DD_DEBUG", 0);
6520
6521     dd->pme_recv_f_alloc = 0;
6522     dd->pme_recv_f_buf   = NULL;
6523
6524     if (dd->bSendRecv2 && fplog)
6525     {
6526         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6527     }
6528     if (comm->eFlop)
6529     {
6530         if (fplog)
6531         {
6532             fprintf(fplog, "Will load balance based on FLOP count\n");
6533         }
6534         if (comm->eFlop > 1)
6535         {
6536             srand(1+cr->nodeid);
6537         }
6538         comm->bRecordLoad = TRUE;
6539     }
6540     else
6541     {
6542         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6543
6544     }
6545
6546     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6547
6548     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6549     if (fplog)
6550     {
6551         fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
6552     }
6553     dd->bGridJump              = comm->bDynLoadBal;
6554     comm->bPMELoadBalDLBLimits = FALSE;
6555
6556     if (comm->nstSortCG)
6557     {
6558         if (fplog)
6559         {
6560             if (comm->nstSortCG == 1)
6561             {
6562                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6563             }
6564             else
6565             {
6566                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6567                         comm->nstSortCG);
6568             }
6569         }
6570         snew(comm->sort, 1);
6571     }
6572     else
6573     {
6574         if (fplog)
6575         {
6576             fprintf(fplog, "Will not sort the charge groups\n");
6577         }
6578     }
6579
6580     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6581
6582     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6583     if (comm->bInterCGBondeds)
6584     {
6585         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6586     }
6587     else
6588     {
6589         comm->bInterCGMultiBody = FALSE;
6590     }
6591
6592     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6593     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6594
6595     if (ir->rlistlong == 0)
6596     {
6597         /* Set the cut-off to some very large value,
6598          * so we don't need if statements everywhere in the code.
6599          * We use sqrt, since the cut-off is squared in some places.
6600          */
6601         comm->cutoff   = GMX_CUTOFF_INF;
6602     }
6603     else
6604     {
6605         comm->cutoff   = ir->rlistlong;
6606     }
6607     comm->cutoff_mbody = 0;
6608
6609     comm->cellsize_limit = 0;
6610     comm->bBondComm      = FALSE;
6611
6612     if (comm->bInterCGBondeds)
6613     {
6614         if (comm_distance_min > 0)
6615         {
6616             comm->cutoff_mbody = comm_distance_min;
6617             if (Flags & MD_DDBONDCOMM)
6618             {
6619                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6620             }
6621             else
6622             {
6623                 comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
6624             }
6625             r_bonded_limit = comm->cutoff_mbody;
6626         }
6627         else if (ir->bPeriodicMols)
6628         {
6629             /* Can not easily determine the required cut-off */
6630             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6631             comm->cutoff_mbody = comm->cutoff/2;
6632             r_bonded_limit     = comm->cutoff_mbody;
6633         }
6634         else
6635         {
6636             if (MASTER(cr))
6637             {
6638                 dd_bonded_cg_distance(fplog, dd, mtop, ir, x, box,
6639                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6640             }
6641             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6642             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6643
6644             /* We use an initial margin of 10% for the minimum cell size,
6645              * except when we are just below the non-bonded cut-off.
6646              */
6647             if (Flags & MD_DDBONDCOMM)
6648             {
6649                 if (max(r_2b, r_mb) > comm->cutoff)
6650                 {
6651                     r_bonded        = max(r_2b, r_mb);
6652                     r_bonded_limit  = 1.1*r_bonded;
6653                     comm->bBondComm = TRUE;
6654                 }
6655                 else
6656                 {
6657                     r_bonded       = r_mb;
6658                     r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
6659                 }
6660                 /* We determine cutoff_mbody later */
6661             }
6662             else
6663             {
6664                 /* No special bonded communication,
6665                  * simply increase the DD cut-off.
6666                  */
6667                 r_bonded_limit     = 1.1*max(r_2b, r_mb);
6668                 comm->cutoff_mbody = r_bonded_limit;
6669                 comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
6670             }
6671         }
6672         comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
6673         if (fplog)
6674         {
6675             fprintf(fplog,
6676                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6677                     comm->cellsize_limit);
6678         }
6679     }
6680
6681     if (dd->bInterCGcons && rconstr <= 0)
6682     {
6683         /* There is a cell size limit due to the constraints (P-LINCS) */
6684         rconstr = constr_r_max(fplog, mtop, ir);
6685         if (fplog)
6686         {
6687             fprintf(fplog,
6688                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6689                     rconstr);
6690             if (rconstr > comm->cellsize_limit)
6691             {
6692                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6693             }
6694         }
6695     }
6696     else if (rconstr > 0 && fplog)
6697     {
6698         /* Here we do not check for dd->bInterCGcons,
6699          * because one can also set a cell size limit for virtual sites only
6700          * and at this point we don't know yet if there are intercg v-sites.
6701          */
6702         fprintf(fplog,
6703                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6704                 rconstr);
6705     }
6706     comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
6707
6708     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6709
6710     if (nc[XX] > 0)
6711     {
6712         copy_ivec(nc, dd->nc);
6713         set_dd_dim(fplog, dd);
6714         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6715
6716         if (cr->npmenodes == -1)
6717         {
6718             cr->npmenodes = 0;
6719         }
6720         acs = average_cellsize_min(dd, ddbox);
6721         if (acs < comm->cellsize_limit)
6722         {
6723             if (fplog)
6724             {
6725                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6726             }
6727             gmx_fatal_collective(FARGS, cr, NULL,
6728                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6729                                  acs, comm->cellsize_limit);
6730         }
6731     }
6732     else
6733     {
6734         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6735
6736         /* We need to choose the optimal DD grid and possibly PME nodes */
6737         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6738                                comm->eDLB != edlbNO, dlb_scale,
6739                                comm->cellsize_limit, comm->cutoff,
6740                                comm->bInterCGBondeds, comm->bInterCGMultiBody);
6741
6742         if (dd->nc[XX] == 0)
6743         {
6744             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6745             sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
6746                     !bC ? "-rdd" : "-rcon",
6747                     comm->eDLB != edlbNO ? " or -dds" : "",
6748                     bC ? " or your LINCS settings" : "");
6749
6750             gmx_fatal_collective(FARGS, cr, NULL,
6751                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6752                                  "%s\n"
6753                                  "Look in the log file for details on the domain decomposition",
6754                                  cr->nnodes-cr->npmenodes, limit, buf);
6755         }
6756         set_dd_dim(fplog, dd);
6757     }
6758
6759     if (fplog)
6760     {
6761         fprintf(fplog,
6762                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6763                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6764     }
6765
6766     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6767     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6768     {
6769         gmx_fatal_collective(FARGS, cr, NULL,
6770                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6771                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6772     }
6773     if (cr->npmenodes > dd->nnodes)
6774     {
6775         gmx_fatal_collective(FARGS, cr, NULL,
6776                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6777     }
6778     if (cr->npmenodes > 0)
6779     {
6780         comm->npmenodes = cr->npmenodes;
6781     }
6782     else
6783     {
6784         comm->npmenodes = dd->nnodes;
6785     }
6786
6787     if (EEL_PME(ir->coulombtype))
6788     {
6789         /* The following choices should match those
6790          * in comm_cost_est in domdec_setup.c.
6791          * Note that here the checks have to take into account
6792          * that the decomposition might occur in a different order than xyz
6793          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6794          * in which case they will not match those in comm_cost_est,
6795          * but since that is mainly for testing purposes that's fine.
6796          */
6797         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6798             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6799             getenv("GMX_PMEONEDD") == NULL)
6800         {
6801             comm->npmedecompdim = 2;
6802             comm->npmenodes_x   = dd->nc[XX];
6803             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6804         }
6805         else
6806         {
6807             /* In case nc is 1 in both x and y we could still choose to
6808              * decompose pme in y instead of x, but we use x for simplicity.
6809              */
6810             comm->npmedecompdim = 1;
6811             if (dd->dim[0] == YY)
6812             {
6813                 comm->npmenodes_x = 1;
6814                 comm->npmenodes_y = comm->npmenodes;
6815             }
6816             else
6817             {
6818                 comm->npmenodes_x = comm->npmenodes;
6819                 comm->npmenodes_y = 1;
6820             }
6821         }
6822         if (fplog)
6823         {
6824             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6825                     comm->npmenodes_x, comm->npmenodes_y, 1);
6826         }
6827     }
6828     else
6829     {
6830         comm->npmedecompdim = 0;
6831         comm->npmenodes_x   = 0;
6832         comm->npmenodes_y   = 0;
6833     }
6834
6835     /* Technically we don't need both of these,
6836      * but it simplifies code not having to recalculate it.
6837      */
6838     *npme_x = comm->npmenodes_x;
6839     *npme_y = comm->npmenodes_y;
6840
6841     snew(comm->slb_frac, DIM);
6842     if (comm->eDLB == edlbNO)
6843     {
6844         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6845         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6846         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6847     }
6848
6849     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6850     {
6851         if (comm->bBondComm || comm->eDLB != edlbNO)
6852         {
6853             /* Set the bonded communication distance to halfway
6854              * the minimum and the maximum,
6855              * since the extra communication cost is nearly zero.
6856              */
6857             acs                = average_cellsize_min(dd, ddbox);
6858             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6859             if (comm->eDLB != edlbNO)
6860             {
6861                 /* Check if this does not limit the scaling */
6862                 comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
6863             }
6864             if (!comm->bBondComm)
6865             {
6866                 /* Without bBondComm do not go beyond the n.b. cut-off */
6867                 comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
6868                 if (comm->cellsize_limit >= comm->cutoff)
6869                 {
6870                     /* We don't loose a lot of efficieny
6871                      * when increasing it to the n.b. cut-off.
6872                      * It can even be slightly faster, because we need
6873                      * less checks for the communication setup.
6874                      */
6875                     comm->cutoff_mbody = comm->cutoff;
6876                 }
6877             }
6878             /* Check if we did not end up below our original limit */
6879             comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
6880
6881             if (comm->cutoff_mbody > comm->cellsize_limit)
6882             {
6883                 comm->cellsize_limit = comm->cutoff_mbody;
6884             }
6885         }
6886         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6887     }
6888
6889     if (debug)
6890     {
6891         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6892                 "cellsize limit %f\n",
6893                 comm->bBondComm, comm->cellsize_limit);
6894     }
6895
6896     if (MASTER(cr))
6897     {
6898         check_dd_restrictions(cr, dd, ir, fplog);
6899     }
6900
6901     comm->partition_step = INT_MIN;
6902     dd->ddp_count        = 0;
6903
6904     clear_dd_cycle_counts(dd);
6905
6906     return dd;
6907 }
6908
6909 static void set_dlb_limits(gmx_domdec_t *dd)
6910
6911 {
6912     int d;
6913
6914     for (d = 0; d < dd->ndim; d++)
6915     {
6916         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
6917         dd->comm->cellsize_min[dd->dim[d]] =
6918             dd->comm->cellsize_min_dlb[dd->dim[d]];
6919     }
6920 }
6921
6922
6923 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
6924 {
6925     gmx_domdec_t      *dd;
6926     gmx_domdec_comm_t *comm;
6927     real               cellsize_min;
6928     int                d, nc, i;
6929     char               buf[STRLEN];
6930
6931     dd   = cr->dd;
6932     comm = dd->comm;
6933
6934     if (fplog)
6935     {
6936         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
6937     }
6938
6939     cellsize_min = comm->cellsize_min[dd->dim[0]];
6940     for (d = 1; d < dd->ndim; d++)
6941     {
6942         cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6943     }
6944
6945     if (cellsize_min < comm->cellsize_limit*1.05)
6946     {
6947         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6948
6949         /* Change DLB from "auto" to "no". */
6950         comm->eDLB = edlbNO;
6951
6952         return;
6953     }
6954
6955     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
6956     comm->bDynLoadBal = TRUE;
6957     dd->bGridJump     = TRUE;
6958
6959     set_dlb_limits(dd);
6960
6961     /* We can set the required cell size info here,
6962      * so we do not need to communicate this.
6963      * The grid is completely uniform.
6964      */
6965     for (d = 0; d < dd->ndim; d++)
6966     {
6967         if (comm->root[d])
6968         {
6969             comm->load[d].sum_m = comm->load[d].sum;
6970
6971             nc = dd->nc[dd->dim[d]];
6972             for (i = 0; i < nc; i++)
6973             {
6974                 comm->root[d]->cell_f[i]    = i/(real)nc;
6975                 if (d > 0)
6976                 {
6977                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6978                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6979                 }
6980             }
6981             comm->root[d]->cell_f[nc] = 1.0;
6982         }
6983     }
6984 }
6985
6986 static char *init_bLocalCG(gmx_mtop_t *mtop)
6987 {
6988     int   ncg, cg;
6989     char *bLocalCG;
6990
6991     ncg = ncg_mtop(mtop);
6992     snew(bLocalCG, ncg);
6993     for (cg = 0; cg < ncg; cg++)
6994     {
6995         bLocalCG[cg] = FALSE;
6996     }
6997
6998     return bLocalCG;
6999 }
7000
7001 void dd_init_bondeds(FILE *fplog,
7002                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
7003                      gmx_vsite_t *vsite, gmx_constr_t constr,
7004                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
7005 {
7006     gmx_domdec_comm_t *comm;
7007     gmx_bool           bBondComm;
7008     int                d;
7009
7010     dd_make_reverse_top(fplog, dd, mtop, vsite, constr, ir, bBCheck);
7011
7012     comm = dd->comm;
7013
7014     if (comm->bBondComm)
7015     {
7016         /* Communicate atoms beyond the cut-off for bonded interactions */
7017         comm = dd->comm;
7018
7019         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
7020
7021         comm->bLocalCG = init_bLocalCG(mtop);
7022     }
7023     else
7024     {
7025         /* Only communicate atoms based on cut-off */
7026         comm->cglink   = NULL;
7027         comm->bLocalCG = NULL;
7028     }
7029 }
7030
7031 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
7032                               t_inputrec *ir,
7033                               gmx_bool bDynLoadBal, real dlb_scale,
7034                               gmx_ddbox_t *ddbox)
7035 {
7036     gmx_domdec_comm_t *comm;
7037     int                d;
7038     ivec               np;
7039     real               limit, shrink;
7040     char               buf[64];
7041
7042     if (fplog == NULL)
7043     {
7044         return;
7045     }
7046
7047     comm = dd->comm;
7048
7049     if (bDynLoadBal)
7050     {
7051         fprintf(fplog, "The maximum number of communication pulses is:");
7052         for (d = 0; d < dd->ndim; d++)
7053         {
7054             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
7055         }
7056         fprintf(fplog, "\n");
7057         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
7058         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
7059         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
7060         for (d = 0; d < DIM; d++)
7061         {
7062             if (dd->nc[d] > 1)
7063             {
7064                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7065                 {
7066                     shrink = 0;
7067                 }
7068                 else
7069                 {
7070                     shrink =
7071                         comm->cellsize_min_dlb[d]/
7072                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7073                 }
7074                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
7075             }
7076         }
7077         fprintf(fplog, "\n");
7078     }
7079     else
7080     {
7081         set_dd_cell_sizes_slb(dd, ddbox, FALSE, np);
7082         fprintf(fplog, "The initial number of communication pulses is:");
7083         for (d = 0; d < dd->ndim; d++)
7084         {
7085             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
7086         }
7087         fprintf(fplog, "\n");
7088         fprintf(fplog, "The initial domain decomposition cell size is:");
7089         for (d = 0; d < DIM; d++)
7090         {
7091             if (dd->nc[d] > 1)
7092             {
7093                 fprintf(fplog, " %c %.2f nm",
7094                         dim2char(d), dd->comm->cellsize_min[d]);
7095             }
7096         }
7097         fprintf(fplog, "\n\n");
7098     }
7099
7100     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7101     {
7102         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7103         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7104                 "non-bonded interactions", "", comm->cutoff);
7105
7106         if (bDynLoadBal)
7107         {
7108             limit = dd->comm->cellsize_limit;
7109         }
7110         else
7111         {
7112             if (dynamic_dd_box(ddbox, ir))
7113             {
7114                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7115             }
7116             limit = dd->comm->cellsize_min[XX];
7117             for (d = 1; d < DIM; d++)
7118             {
7119                 limit = min(limit, dd->comm->cellsize_min[d]);
7120             }
7121         }
7122
7123         if (comm->bInterCGBondeds)
7124         {
7125             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7126                     "two-body bonded interactions", "(-rdd)",
7127                     max(comm->cutoff, comm->cutoff_mbody));
7128             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7129                     "multi-body bonded interactions", "(-rdd)",
7130                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
7131         }
7132         if (dd->vsite_comm)
7133         {
7134             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7135                     "virtual site constructions", "(-rcon)", limit);
7136         }
7137         if (dd->constraint_comm)
7138         {
7139             sprintf(buf, "atoms separated by up to %d constraints",
7140                     1+ir->nProjOrder);
7141             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7142                     buf, "(-rcon)", limit);
7143         }
7144         fprintf(fplog, "\n");
7145     }
7146
7147     fflush(fplog);
7148 }
7149
7150 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7151                                 real               dlb_scale,
7152                                 const t_inputrec  *ir,
7153                                 const gmx_ddbox_t *ddbox)
7154 {
7155     gmx_domdec_comm_t *comm;
7156     int                d, dim, npulse, npulse_d_max, npulse_d;
7157     gmx_bool           bNoCutOff;
7158
7159     comm = dd->comm;
7160
7161     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7162
7163     /* Determine the maximum number of comm. pulses in one dimension */
7164
7165     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7166
7167     /* Determine the maximum required number of grid pulses */
7168     if (comm->cellsize_limit >= comm->cutoff)
7169     {
7170         /* Only a single pulse is required */
7171         npulse = 1;
7172     }
7173     else if (!bNoCutOff && comm->cellsize_limit > 0)
7174     {
7175         /* We round down slightly here to avoid overhead due to the latency
7176          * of extra communication calls when the cut-off
7177          * would be only slightly longer than the cell size.
7178          * Later cellsize_limit is redetermined,
7179          * so we can not miss interactions due to this rounding.
7180          */
7181         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7182     }
7183     else
7184     {
7185         /* There is no cell size limit */
7186         npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7187     }
7188
7189     if (!bNoCutOff && npulse > 1)
7190     {
7191         /* See if we can do with less pulses, based on dlb_scale */
7192         npulse_d_max = 0;
7193         for (d = 0; d < dd->ndim; d++)
7194         {
7195             dim      = dd->dim[d];
7196             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7197                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7198             npulse_d_max = max(npulse_d_max, npulse_d);
7199         }
7200         npulse = min(npulse, npulse_d_max);
7201     }
7202
7203     /* This env var can override npulse */
7204     d = dd_nst_env(debug, "GMX_DD_NPULSE", 0);
7205     if (d > 0)
7206     {
7207         npulse = d;
7208     }
7209
7210     comm->maxpulse       = 1;
7211     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7212     for (d = 0; d < dd->ndim; d++)
7213     {
7214         comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
7215         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7216         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7217         comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
7218         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7219         {
7220             comm->bVacDLBNoLimit = FALSE;
7221         }
7222     }
7223
7224     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7225     if (!comm->bVacDLBNoLimit)
7226     {
7227         comm->cellsize_limit = max(comm->cellsize_limit,
7228                                    comm->cutoff/comm->maxpulse);
7229     }
7230     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7231     /* Set the minimum cell size for each DD dimension */
7232     for (d = 0; d < dd->ndim; d++)
7233     {
7234         if (comm->bVacDLBNoLimit ||
7235             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7236         {
7237             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7238         }
7239         else
7240         {
7241             comm->cellsize_min_dlb[dd->dim[d]] =
7242                 comm->cutoff/comm->cd[d].np_dlb;
7243         }
7244     }
7245     if (comm->cutoff_mbody <= 0)
7246     {
7247         comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
7248     }
7249     if (comm->bDynLoadBal)
7250     {
7251         set_dlb_limits(dd);
7252     }
7253 }
7254
7255 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7256 {
7257     /* If each molecule is a single charge group
7258      * or we use domain decomposition for each periodic dimension,
7259      * we do not need to take pbc into account for the bonded interactions.
7260      */
7261     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7262             !(dd->nc[XX] > 1 &&
7263               dd->nc[YY] > 1 &&
7264               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7265 }
7266
7267 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7268                        t_inputrec *ir, t_forcerec *fr,
7269                        gmx_ddbox_t *ddbox)
7270 {
7271     gmx_domdec_comm_t *comm;
7272     int                natoms_tot;
7273     real               vol_frac;
7274
7275     comm = dd->comm;
7276
7277     /* Initialize the thread data.
7278      * This can not be done in init_domain_decomposition,
7279      * as the numbers of threads is determined later.
7280      */
7281     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7282     if (comm->nth > 1)
7283     {
7284         snew(comm->dth, comm->nth);
7285     }
7286
7287     if (EEL_PME(ir->coulombtype))
7288     {
7289         init_ddpme(dd, &comm->ddpme[0], 0);
7290         if (comm->npmedecompdim >= 2)
7291         {
7292             init_ddpme(dd, &comm->ddpme[1], 1);
7293         }
7294     }
7295     else
7296     {
7297         comm->npmenodes = 0;
7298         if (dd->pme_nodeid >= 0)
7299         {
7300             gmx_fatal_collective(FARGS, NULL, dd,
7301                                  "Can not have separate PME nodes without PME electrostatics");
7302         }
7303     }
7304
7305     if (debug)
7306     {
7307         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7308     }
7309     if (comm->eDLB != edlbNO)
7310     {
7311         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7312     }
7313
7314     print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
7315     if (comm->eDLB == edlbAUTO)
7316     {
7317         if (fplog)
7318         {
7319             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7320         }
7321         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7322     }
7323
7324     if (ir->ePBC == epbcNONE)
7325     {
7326         vol_frac = 1 - 1/(double)dd->nnodes;
7327     }
7328     else
7329     {
7330         vol_frac =
7331             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7332     }
7333     if (debug)
7334     {
7335         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7336     }
7337     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7338
7339     dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
7340 }
7341
7342 static gmx_bool test_dd_cutoff(t_commrec *cr,
7343                                t_state *state, t_inputrec *ir,
7344                                real cutoff_req)
7345 {
7346     gmx_domdec_t *dd;
7347     gmx_ddbox_t   ddbox;
7348     int           d, dim, np;
7349     real          inv_cell_size;
7350     int           LocallyLimited;
7351
7352     dd = cr->dd;
7353
7354     set_ddbox(dd, FALSE, cr, ir, state->box,
7355               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7356
7357     LocallyLimited = 0;
7358
7359     for (d = 0; d < dd->ndim; d++)
7360     {
7361         dim = dd->dim[d];
7362
7363         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7364         if (dynamic_dd_box(&ddbox, ir))
7365         {
7366             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7367         }
7368
7369         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7370
7371         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7372             dd->comm->cd[d].np_dlb > 0)
7373         {
7374             if (np > dd->comm->cd[d].np_dlb)
7375             {
7376                 return FALSE;
7377             }
7378
7379             /* If a current local cell size is smaller than the requested
7380              * cut-off, we could still fix it, but this gets very complicated.
7381              * Without fixing here, we might actually need more checks.
7382              */
7383             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7384             {
7385                 LocallyLimited = 1;
7386             }
7387         }
7388     }
7389
7390     if (dd->comm->eDLB != edlbNO)
7391     {
7392         /* If DLB is not active yet, we don't need to check the grid jumps.
7393          * Actually we shouldn't, because then the grid jump data is not set.
7394          */
7395         if (dd->comm->bDynLoadBal &&
7396             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7397         {
7398             LocallyLimited = 1;
7399         }
7400
7401         gmx_sumi(1, &LocallyLimited, cr);
7402
7403         if (LocallyLimited > 0)
7404         {
7405             return FALSE;
7406         }
7407     }
7408
7409     return TRUE;
7410 }
7411
7412 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7413                           real cutoff_req)
7414 {
7415     gmx_bool bCutoffAllowed;
7416
7417     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7418
7419     if (bCutoffAllowed)
7420     {
7421         cr->dd->comm->cutoff = cutoff_req;
7422     }
7423
7424     return bCutoffAllowed;
7425 }
7426
7427 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7428 {
7429     gmx_domdec_comm_t *comm;
7430
7431     comm = cr->dd->comm;
7432
7433     /* Turn on the DLB limiting (might have been on already) */
7434     comm->bPMELoadBalDLBLimits = TRUE;
7435
7436     /* Change the cut-off limit */
7437     comm->PMELoadBal_max_cutoff = comm->cutoff;
7438 }
7439
7440 static void merge_cg_buffers(int ncell,
7441                              gmx_domdec_comm_dim_t *cd, int pulse,
7442                              int  *ncg_cell,
7443                              int  *index_gl, int  *recv_i,
7444                              rvec *cg_cm,    rvec *recv_vr,
7445                              int *cgindex,
7446                              cginfo_mb_t *cginfo_mb, int *cginfo)
7447 {
7448     gmx_domdec_ind_t *ind, *ind_p;
7449     int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
7450     int               shift, shift_at;
7451
7452     ind = &cd->ind[pulse];
7453
7454     /* First correct the already stored data */
7455     shift = ind->nrecv[ncell];
7456     for (cell = ncell-1; cell >= 0; cell--)
7457     {
7458         shift -= ind->nrecv[cell];
7459         if (shift > 0)
7460         {
7461             /* Move the cg's present from previous grid pulses */
7462             cg0                = ncg_cell[ncell+cell];
7463             cg1                = ncg_cell[ncell+cell+1];
7464             cgindex[cg1+shift] = cgindex[cg1];
7465             for (cg = cg1-1; cg >= cg0; cg--)
7466             {
7467                 index_gl[cg+shift] = index_gl[cg];
7468                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7469                 cgindex[cg+shift] = cgindex[cg];
7470                 cginfo[cg+shift]  = cginfo[cg];
7471             }
7472             /* Correct the already stored send indices for the shift */
7473             for (p = 1; p <= pulse; p++)
7474             {
7475                 ind_p = &cd->ind[p];
7476                 cg0   = 0;
7477                 for (c = 0; c < cell; c++)
7478                 {
7479                     cg0 += ind_p->nsend[c];
7480                 }
7481                 cg1 = cg0 + ind_p->nsend[cell];
7482                 for (cg = cg0; cg < cg1; cg++)
7483                 {
7484                     ind_p->index[cg] += shift;
7485                 }
7486             }
7487         }
7488     }
7489
7490     /* Merge in the communicated buffers */
7491     shift    = 0;
7492     shift_at = 0;
7493     cg0      = 0;
7494     for (cell = 0; cell < ncell; cell++)
7495     {
7496         cg1 = ncg_cell[ncell+cell+1] + shift;
7497         if (shift_at > 0)
7498         {
7499             /* Correct the old cg indices */
7500             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7501             {
7502                 cgindex[cg+1] += shift_at;
7503             }
7504         }
7505         for (cg = 0; cg < ind->nrecv[cell]; cg++)
7506         {
7507             /* Copy this charge group from the buffer */
7508             index_gl[cg1] = recv_i[cg0];
7509             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7510             /* Add it to the cgindex */
7511             cg_gl          = index_gl[cg1];
7512             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
7513             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
7514             cgindex[cg1+1] = cgindex[cg1] + nat;
7515             cg0++;
7516             cg1++;
7517             shift_at += nat;
7518         }
7519         shift                 += ind->nrecv[cell];
7520         ncg_cell[ncell+cell+1] = cg1;
7521     }
7522 }
7523
7524 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7525                                int nzone, int cg0, const int *cgindex)
7526 {
7527     int cg, zone, p;
7528
7529     /* Store the atom block boundaries for easy copying of communication buffers
7530      */
7531     cg = cg0;
7532     for (zone = 0; zone < nzone; zone++)
7533     {
7534         for (p = 0; p < cd->np; p++)
7535         {
7536             cd->ind[p].cell2at0[zone] = cgindex[cg];
7537             cg += cd->ind[p].nrecv[zone];
7538             cd->ind[p].cell2at1[zone] = cgindex[cg];
7539         }
7540     }
7541 }
7542
7543 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7544 {
7545     int      i;
7546     gmx_bool bMiss;
7547
7548     bMiss = FALSE;
7549     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7550     {
7551         if (!bLocalCG[link->a[i]])
7552         {
7553             bMiss = TRUE;
7554         }
7555     }
7556
7557     return bMiss;
7558 }
7559
7560 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7561 typedef struct {
7562     real c[DIM][4]; /* the corners for the non-bonded communication */
7563     real cr0;       /* corner for rounding */
7564     real cr1[4];    /* corners for rounding */
7565     real bc[DIM];   /* corners for bounded communication */
7566     real bcr1;      /* corner for rounding for bonded communication */
7567 } dd_corners_t;
7568
7569 /* Determine the corners of the domain(s) we are communicating with */
7570 static void
7571 set_dd_corners(const gmx_domdec_t *dd,
7572                int dim0, int dim1, int dim2,
7573                gmx_bool bDistMB,
7574                dd_corners_t *c)
7575 {
7576     const gmx_domdec_comm_t  *comm;
7577     const gmx_domdec_zones_t *zones;
7578     int i, j;
7579
7580     comm = dd->comm;
7581
7582     zones = &comm->zones;
7583
7584     /* Keep the compiler happy */
7585     c->cr0  = 0;
7586     c->bcr1 = 0;
7587
7588     /* The first dimension is equal for all cells */
7589     c->c[0][0] = comm->cell_x0[dim0];
7590     if (bDistMB)
7591     {
7592         c->bc[0] = c->c[0][0];
7593     }
7594     if (dd->ndim >= 2)
7595     {
7596         dim1 = dd->dim[1];
7597         /* This cell row is only seen from the first row */
7598         c->c[1][0] = comm->cell_x0[dim1];
7599         /* All rows can see this row */
7600         c->c[1][1] = comm->cell_x0[dim1];
7601         if (dd->bGridJump)
7602         {
7603             c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7604             if (bDistMB)
7605             {
7606                 /* For the multi-body distance we need the maximum */
7607                 c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7608             }
7609         }
7610         /* Set the upper-right corner for rounding */
7611         c->cr0 = comm->cell_x1[dim0];
7612
7613         if (dd->ndim >= 3)
7614         {
7615             dim2 = dd->dim[2];
7616             for (j = 0; j < 4; j++)
7617             {
7618                 c->c[2][j] = comm->cell_x0[dim2];
7619             }
7620             if (dd->bGridJump)
7621             {
7622                 /* Use the maximum of the i-cells that see a j-cell */
7623                 for (i = 0; i < zones->nizone; i++)
7624                 {
7625                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7626                     {
7627                         if (j >= 4)
7628                         {
7629                             c->c[2][j-4] =
7630                                 max(c->c[2][j-4],
7631                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7632                         }
7633                     }
7634                 }
7635                 if (bDistMB)
7636                 {
7637                     /* For the multi-body distance we need the maximum */
7638                     c->bc[2] = comm->cell_x0[dim2];
7639                     for (i = 0; i < 2; i++)
7640                     {
7641                         for (j = 0; j < 2; j++)
7642                         {
7643                             c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
7644                         }
7645                     }
7646                 }
7647             }
7648
7649             /* Set the upper-right corner for rounding */
7650             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7651              * Only cell (0,0,0) can see cell 7 (1,1,1)
7652              */
7653             c->cr1[0] = comm->cell_x1[dim1];
7654             c->cr1[3] = comm->cell_x1[dim1];
7655             if (dd->bGridJump)
7656             {
7657                 c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7658                 if (bDistMB)
7659                 {
7660                     /* For the multi-body distance we need the maximum */
7661                     c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7662                 }
7663             }
7664         }
7665     }
7666 }
7667
7668 /* Determine which cg's we need to send in this pulse from this zone */
7669 static void
7670 get_zone_pulse_cgs(gmx_domdec_t *dd,
7671                    int zonei, int zone,
7672                    int cg0, int cg1,
7673                    const int *index_gl,
7674                    const int *cgindex,
7675                    int dim, int dim_ind,
7676                    int dim0, int dim1, int dim2,
7677                    real r_comm2, real r_bcomm2,
7678                    matrix box,
7679                    ivec tric_dist,
7680                    rvec *normal,
7681                    real skew_fac2_d, real skew_fac_01,
7682                    rvec *v_d, rvec *v_0, rvec *v_1,
7683                    const dd_corners_t *c,
7684                    rvec sf2_round,
7685                    gmx_bool bDistBonded,
7686                    gmx_bool bBondComm,
7687                    gmx_bool bDist2B,
7688                    gmx_bool bDistMB,
7689                    rvec *cg_cm,
7690                    int *cginfo,
7691                    gmx_domdec_ind_t *ind,
7692                    int **ibuf, int *ibuf_nalloc,
7693                    vec_rvec_t *vbuf,
7694                    int *nsend_ptr,
7695                    int *nat_ptr,
7696                    int *nsend_z_ptr)
7697 {
7698     gmx_domdec_comm_t *comm;
7699     gmx_bool           bScrew;
7700     gmx_bool           bDistMB_pulse;
7701     int                cg, i;
7702     real               r2, rb2, r, tric_sh;
7703     rvec               rn, rb;
7704     int                dimd;
7705     int                nsend_z, nsend, nat;
7706
7707     comm = dd->comm;
7708
7709     bScrew = (dd->bScrewPBC && dim == XX);
7710
7711     bDistMB_pulse = (bDistMB && bDistBonded);
7712
7713     nsend_z = 0;
7714     nsend   = *nsend_ptr;
7715     nat     = *nat_ptr;
7716
7717     for (cg = cg0; cg < cg1; cg++)
7718     {
7719         r2  = 0;
7720         rb2 = 0;
7721         if (tric_dist[dim_ind] == 0)
7722         {
7723             /* Rectangular direction, easy */
7724             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7725             if (r > 0)
7726             {
7727                 r2 += r*r;
7728             }
7729             if (bDistMB_pulse)
7730             {
7731                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7732                 if (r > 0)
7733                 {
7734                     rb2 += r*r;
7735                 }
7736             }
7737             /* Rounding gives at most a 16% reduction
7738              * in communicated atoms
7739              */
7740             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7741             {
7742                 r = cg_cm[cg][dim0] - c->cr0;
7743                 /* This is the first dimension, so always r >= 0 */
7744                 r2 += r*r;
7745                 if (bDistMB_pulse)
7746                 {
7747                     rb2 += r*r;
7748                 }
7749             }
7750             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7751             {
7752                 r = cg_cm[cg][dim1] - c->cr1[zone];
7753                 if (r > 0)
7754                 {
7755                     r2 += r*r;
7756                 }
7757                 if (bDistMB_pulse)
7758                 {
7759                     r = cg_cm[cg][dim1] - c->bcr1;
7760                     if (r > 0)
7761                     {
7762                         rb2 += r*r;
7763                     }
7764                 }
7765             }
7766         }
7767         else
7768         {
7769             /* Triclinic direction, more complicated */
7770             clear_rvec(rn);
7771             clear_rvec(rb);
7772             /* Rounding, conservative as the skew_fac multiplication
7773              * will slightly underestimate the distance.
7774              */
7775             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7776             {
7777                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7778                 for (i = dim0+1; i < DIM; i++)
7779                 {
7780                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7781                 }
7782                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7783                 if (bDistMB_pulse)
7784                 {
7785                     rb[dim0] = rn[dim0];
7786                     rb2      = r2;
7787                 }
7788                 /* Take care that the cell planes along dim0 might not
7789                  * be orthogonal to those along dim1 and dim2.
7790                  */
7791                 for (i = 1; i <= dim_ind; i++)
7792                 {
7793                     dimd = dd->dim[i];
7794                     if (normal[dim0][dimd] > 0)
7795                     {
7796                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7797                         if (bDistMB_pulse)
7798                         {
7799                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7800                         }
7801                     }
7802                 }
7803             }
7804             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7805             {
7806                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7807                 tric_sh   = 0;
7808                 for (i = dim1+1; i < DIM; i++)
7809                 {
7810                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7811                 }
7812                 rn[dim1] += tric_sh;
7813                 if (rn[dim1] > 0)
7814                 {
7815                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7816                     /* Take care of coupling of the distances
7817                      * to the planes along dim0 and dim1 through dim2.
7818                      */
7819                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7820                     /* Take care that the cell planes along dim1
7821                      * might not be orthogonal to that along dim2.
7822                      */
7823                     if (normal[dim1][dim2] > 0)
7824                     {
7825                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7826                     }
7827                 }
7828                 if (bDistMB_pulse)
7829                 {
7830                     rb[dim1] +=
7831                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7832                     if (rb[dim1] > 0)
7833                     {
7834                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7835                         /* Take care of coupling of the distances
7836                          * to the planes along dim0 and dim1 through dim2.
7837                          */
7838                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7839                         /* Take care that the cell planes along dim1
7840                          * might not be orthogonal to that along dim2.
7841                          */
7842                         if (normal[dim1][dim2] > 0)
7843                         {
7844                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7845                         }
7846                     }
7847                 }
7848             }
7849             /* The distance along the communication direction */
7850             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7851             tric_sh  = 0;
7852             for (i = dim+1; i < DIM; i++)
7853             {
7854                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7855             }
7856             rn[dim] += tric_sh;
7857             if (rn[dim] > 0)
7858             {
7859                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7860                 /* Take care of coupling of the distances
7861                  * to the planes along dim0 and dim1 through dim2.
7862                  */
7863                 if (dim_ind == 1 && zonei == 1)
7864                 {
7865                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7866                 }
7867             }
7868             if (bDistMB_pulse)
7869             {
7870                 clear_rvec(rb);
7871                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7872                 if (rb[dim] > 0)
7873                 {
7874                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7875                     /* Take care of coupling of the distances
7876                      * to the planes along dim0 and dim1 through dim2.
7877                      */
7878                     if (dim_ind == 1 && zonei == 1)
7879                     {
7880                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7881                     }
7882                 }
7883             }
7884         }
7885
7886         if (r2 < r_comm2 ||
7887             (bDistBonded &&
7888              ((bDistMB && rb2 < r_bcomm2) ||
7889               (bDist2B && r2  < r_bcomm2)) &&
7890              (!bBondComm ||
7891               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7892                missing_link(comm->cglink, index_gl[cg],
7893                             comm->bLocalCG)))))
7894         {
7895             /* Make an index to the local charge groups */
7896             if (nsend+1 > ind->nalloc)
7897             {
7898                 ind->nalloc = over_alloc_large(nsend+1);
7899                 srenew(ind->index, ind->nalloc);
7900             }
7901             if (nsend+1 > *ibuf_nalloc)
7902             {
7903                 *ibuf_nalloc = over_alloc_large(nsend+1);
7904                 srenew(*ibuf, *ibuf_nalloc);
7905             }
7906             ind->index[nsend] = cg;
7907             (*ibuf)[nsend]    = index_gl[cg];
7908             nsend_z++;
7909             vec_rvec_check_alloc(vbuf, nsend+1);
7910
7911             if (dd->ci[dim] == 0)
7912             {
7913                 /* Correct cg_cm for pbc */
7914                 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7915                 if (bScrew)
7916                 {
7917                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7918                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7919                 }
7920             }
7921             else
7922             {
7923                 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7924             }
7925             nsend++;
7926             nat += cgindex[cg+1] - cgindex[cg];
7927         }
7928     }
7929
7930     *nsend_ptr   = nsend;
7931     *nat_ptr     = nat;
7932     *nsend_z_ptr = nsend_z;
7933 }
7934
7935 static void setup_dd_communication(gmx_domdec_t *dd,
7936                                    matrix box, gmx_ddbox_t *ddbox,
7937                                    t_forcerec *fr, t_state *state, rvec **f)
7938 {
7939     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7940     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
7941     int                    c, i, j, cg, cg_gl, nrcg;
7942     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7943     gmx_domdec_comm_t     *comm;
7944     gmx_domdec_zones_t    *zones;
7945     gmx_domdec_comm_dim_t *cd;
7946     gmx_domdec_ind_t      *ind;
7947     cginfo_mb_t           *cginfo_mb;
7948     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
7949     real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
7950     dd_corners_t           corners;
7951     ivec                   tric_dist;
7952     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
7953     real                   skew_fac2_d, skew_fac_01;
7954     rvec                   sf2_round;
7955     int                    nsend, nat;
7956     int                    th;
7957
7958     if (debug)
7959     {
7960         fprintf(debug, "Setting up DD communication\n");
7961     }
7962
7963     comm  = dd->comm;
7964
7965     switch (fr->cutoff_scheme)
7966     {
7967         case ecutsGROUP:
7968             cg_cm = fr->cg_cm;
7969             break;
7970         case ecutsVERLET:
7971             cg_cm = state->x;
7972             break;
7973         default:
7974             gmx_incons("unimplemented");
7975             cg_cm = NULL;
7976     }
7977
7978     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
7979     {
7980         dim = dd->dim[dim_ind];
7981
7982         /* Check if we need to use triclinic distances */
7983         tric_dist[dim_ind] = 0;
7984         for (i = 0; i <= dim_ind; i++)
7985         {
7986             if (ddbox->tric_dir[dd->dim[i]])
7987             {
7988                 tric_dist[dim_ind] = 1;
7989             }
7990         }
7991     }
7992
7993     bBondComm = comm->bBondComm;
7994
7995     /* Do we need to determine extra distances for multi-body bondeds? */
7996     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
7997
7998     /* Do we need to determine extra distances for only two-body bondeds? */
7999     bDist2B = (bBondComm && !bDistMB);
8000
8001     r_comm2  = sqr(comm->cutoff);
8002     r_bcomm2 = sqr(comm->cutoff_mbody);
8003
8004     if (debug)
8005     {
8006         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
8007     }
8008
8009     zones = &comm->zones;
8010
8011     dim0 = dd->dim[0];
8012     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8013     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8014
8015     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8016
8017     /* Triclinic stuff */
8018     normal      = ddbox->normal;
8019     skew_fac_01 = 0;
8020     if (dd->ndim >= 2)
8021     {
8022         v_0 = ddbox->v[dim0];
8023         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8024         {
8025             /* Determine the coupling coefficient for the distances
8026              * to the cell planes along dim0 and dim1 through dim2.
8027              * This is required for correct rounding.
8028              */
8029             skew_fac_01 =
8030                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8031             if (debug)
8032             {
8033                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8034             }
8035         }
8036     }
8037     if (dd->ndim >= 3)
8038     {
8039         v_1 = ddbox->v[dim1];
8040     }
8041
8042     zone_cg_range = zones->cg_range;
8043     index_gl      = dd->index_gl;
8044     cgindex       = dd->cgindex;
8045     cginfo_mb     = fr->cginfo_mb;
8046
8047     zone_cg_range[0]   = 0;
8048     zone_cg_range[1]   = dd->ncg_home;
8049     comm->zone_ncg1[0] = dd->ncg_home;
8050     pos_cg             = dd->ncg_home;
8051
8052     nat_tot = dd->nat_home;
8053     nzone   = 1;
8054     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8055     {
8056         dim = dd->dim[dim_ind];
8057         cd  = &comm->cd[dim_ind];
8058
8059         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8060         {
8061             /* No pbc in this dimension, the first node should not comm. */
8062             nzone_send = 0;
8063         }
8064         else
8065         {
8066             nzone_send = nzone;
8067         }
8068
8069         v_d         = ddbox->v[dim];
8070         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8071
8072         cd->bInPlace = TRUE;
8073         for (p = 0; p < cd->np; p++)
8074         {
8075             /* Only atoms communicated in the first pulse are used
8076              * for multi-body bonded interactions or for bBondComm.
8077              */
8078             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8079
8080             ind   = &cd->ind[p];
8081             nsend = 0;
8082             nat   = 0;
8083             for (zone = 0; zone < nzone_send; zone++)
8084             {
8085                 if (tric_dist[dim_ind] && dim_ind > 0)
8086                 {
8087                     /* Determine slightly more optimized skew_fac's
8088                      * for rounding.
8089                      * This reduces the number of communicated atoms
8090                      * by about 10% for 3D DD of rhombic dodecahedra.
8091                      */
8092                     for (dimd = 0; dimd < dim; dimd++)
8093                     {
8094                         sf2_round[dimd] = 1;
8095                         if (ddbox->tric_dir[dimd])
8096                         {
8097                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8098                             {
8099                                 /* If we are shifted in dimension i
8100                                  * and the cell plane is tilted forward
8101                                  * in dimension i, skip this coupling.
8102                                  */
8103                                 if (!(zones->shift[nzone+zone][i] &&
8104                                       ddbox->v[dimd][i][dimd] >= 0))
8105                                 {
8106                                     sf2_round[dimd] +=
8107                                         sqr(ddbox->v[dimd][i][dimd]);
8108                                 }
8109                             }
8110                             sf2_round[dimd] = 1/sf2_round[dimd];
8111                         }
8112                     }
8113                 }
8114
8115                 zonei = zone_perm[dim_ind][zone];
8116                 if (p == 0)
8117                 {
8118                     /* Here we permutate the zones to obtain a convenient order
8119                      * for neighbor searching
8120                      */
8121                     cg0 = zone_cg_range[zonei];
8122                     cg1 = zone_cg_range[zonei+1];
8123                 }
8124                 else
8125                 {
8126                     /* Look only at the cg's received in the previous grid pulse
8127                      */
8128                     cg1 = zone_cg_range[nzone+zone+1];
8129                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8130                 }
8131
8132 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8133                 for (th = 0; th < comm->nth; th++)
8134                 {
8135                     gmx_domdec_ind_t *ind_p;
8136                     int             **ibuf_p, *ibuf_nalloc_p;
8137                     vec_rvec_t       *vbuf_p;
8138                     int              *nsend_p, *nat_p;
8139                     int              *nsend_zone_p;
8140                     int               cg0_th, cg1_th;
8141
8142                     if (th == 0)
8143                     {
8144                         /* Thread 0 writes in the comm buffers */
8145                         ind_p         = ind;
8146                         ibuf_p        = &comm->buf_int;
8147                         ibuf_nalloc_p = &comm->nalloc_int;
8148                         vbuf_p        = &comm->vbuf;
8149                         nsend_p       = &nsend;
8150                         nat_p         = &nat;
8151                         nsend_zone_p  = &ind->nsend[zone];
8152                     }
8153                     else
8154                     {
8155                         /* Other threads write into temp buffers */
8156                         ind_p         = &comm->dth[th].ind;
8157                         ibuf_p        = &comm->dth[th].ibuf;
8158                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8159                         vbuf_p        = &comm->dth[th].vbuf;
8160                         nsend_p       = &comm->dth[th].nsend;
8161                         nat_p         = &comm->dth[th].nat;
8162                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8163
8164                         comm->dth[th].nsend      = 0;
8165                         comm->dth[th].nat        = 0;
8166                         comm->dth[th].nsend_zone = 0;
8167                     }
8168
8169                     if (comm->nth == 1)
8170                     {
8171                         cg0_th = cg0;
8172                         cg1_th = cg1;
8173                     }
8174                     else
8175                     {
8176                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8177                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8178                     }
8179
8180                     /* Get the cg's for this pulse in this zone */
8181                     get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8182                                        index_gl, cgindex,
8183                                        dim, dim_ind, dim0, dim1, dim2,
8184                                        r_comm2, r_bcomm2,
8185                                        box, tric_dist,
8186                                        normal, skew_fac2_d, skew_fac_01,
8187                                        v_d, v_0, v_1, &corners, sf2_round,
8188                                        bDistBonded, bBondComm,
8189                                        bDist2B, bDistMB,
8190                                        cg_cm, fr->cginfo,
8191                                        ind_p,
8192                                        ibuf_p, ibuf_nalloc_p,
8193                                        vbuf_p,
8194                                        nsend_p, nat_p,
8195                                        nsend_zone_p);
8196                 }
8197
8198                 /* Append data of threads>=1 to the communication buffers */
8199                 for (th = 1; th < comm->nth; th++)
8200                 {
8201                     dd_comm_setup_work_t *dth;
8202                     int                   i, ns1;
8203
8204                     dth = &comm->dth[th];
8205
8206                     ns1 = nsend + dth->nsend_zone;
8207                     if (ns1 > ind->nalloc)
8208                     {
8209                         ind->nalloc = over_alloc_dd(ns1);
8210                         srenew(ind->index, ind->nalloc);
8211                     }
8212                     if (ns1 > comm->nalloc_int)
8213                     {
8214                         comm->nalloc_int = over_alloc_dd(ns1);
8215                         srenew(comm->buf_int, comm->nalloc_int);
8216                     }
8217                     if (ns1 > comm->vbuf.nalloc)
8218                     {
8219                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8220                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8221                     }
8222
8223                     for (i = 0; i < dth->nsend_zone; i++)
8224                     {
8225                         ind->index[nsend]    = dth->ind.index[i];
8226                         comm->buf_int[nsend] = dth->ibuf[i];
8227                         copy_rvec(dth->vbuf.v[i],
8228                                   comm->vbuf.v[nsend]);
8229                         nsend++;
8230                     }
8231                     nat              += dth->nat;
8232                     ind->nsend[zone] += dth->nsend_zone;
8233                 }
8234             }
8235             /* Clear the counts in case we do not have pbc */
8236             for (zone = nzone_send; zone < nzone; zone++)
8237             {
8238                 ind->nsend[zone] = 0;
8239             }
8240             ind->nsend[nzone]   = nsend;
8241             ind->nsend[nzone+1] = nat;
8242             /* Communicate the number of cg's and atoms to receive */
8243             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8244                             ind->nsend, nzone+2,
8245                             ind->nrecv, nzone+2);
8246
8247             /* The rvec buffer is also required for atom buffers of size nsend
8248              * in dd_move_x and dd_move_f.
8249              */
8250             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8251
8252             if (p > 0)
8253             {
8254                 /* We can receive in place if only the last zone is not empty */
8255                 for (zone = 0; zone < nzone-1; zone++)
8256                 {
8257                     if (ind->nrecv[zone] > 0)
8258                     {
8259                         cd->bInPlace = FALSE;
8260                     }
8261                 }
8262                 if (!cd->bInPlace)
8263                 {
8264                     /* The int buffer is only required here for the cg indices */
8265                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8266                     {
8267                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8268                         srenew(comm->buf_int2, comm->nalloc_int2);
8269                     }
8270                     /* The rvec buffer is also required for atom buffers
8271                      * of size nrecv in dd_move_x and dd_move_f.
8272                      */
8273                     i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8274                     vec_rvec_check_alloc(&comm->vbuf2, i);
8275                 }
8276             }
8277
8278             /* Make space for the global cg indices */
8279             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8280                 || dd->cg_nalloc == 0)
8281             {
8282                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8283                 srenew(index_gl, dd->cg_nalloc);
8284                 srenew(cgindex, dd->cg_nalloc+1);
8285             }
8286             /* Communicate the global cg indices */
8287             if (cd->bInPlace)
8288             {
8289                 recv_i = index_gl + pos_cg;
8290             }
8291             else
8292             {
8293                 recv_i = comm->buf_int2;
8294             }
8295             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8296                             comm->buf_int, nsend,
8297                             recv_i,        ind->nrecv[nzone]);
8298
8299             /* Make space for cg_cm */
8300             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8301             if (fr->cutoff_scheme == ecutsGROUP)
8302             {
8303                 cg_cm = fr->cg_cm;
8304             }
8305             else
8306             {
8307                 cg_cm = state->x;
8308             }
8309             /* Communicate cg_cm */
8310             if (cd->bInPlace)
8311             {
8312                 recv_vr = cg_cm + pos_cg;
8313             }
8314             else
8315             {
8316                 recv_vr = comm->vbuf2.v;
8317             }
8318             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8319                              comm->vbuf.v, nsend,
8320                              recv_vr,      ind->nrecv[nzone]);
8321
8322             /* Make the charge group index */
8323             if (cd->bInPlace)
8324             {
8325                 zone = (p == 0 ? 0 : nzone - 1);
8326                 while (zone < nzone)
8327                 {
8328                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8329                     {
8330                         cg_gl              = index_gl[pos_cg];
8331                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8332                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8333                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8334                         if (bBondComm)
8335                         {
8336                             /* Update the charge group presence,
8337                              * so we can use it in the next pass of the loop.
8338                              */
8339                             comm->bLocalCG[cg_gl] = TRUE;
8340                         }
8341                         pos_cg++;
8342                     }
8343                     if (p == 0)
8344                     {
8345                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8346                     }
8347                     zone++;
8348                     zone_cg_range[nzone+zone] = pos_cg;
8349                 }
8350             }
8351             else
8352             {
8353                 /* This part of the code is never executed with bBondComm. */
8354                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8355                                  index_gl, recv_i, cg_cm, recv_vr,
8356                                  cgindex, fr->cginfo_mb, fr->cginfo);
8357                 pos_cg += ind->nrecv[nzone];
8358             }
8359             nat_tot += ind->nrecv[nzone+1];
8360         }
8361         if (!cd->bInPlace)
8362         {
8363             /* Store the atom block for easy copying of communication buffers */
8364             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8365         }
8366         nzone += nzone;
8367     }
8368     dd->index_gl = index_gl;
8369     dd->cgindex  = cgindex;
8370
8371     dd->ncg_tot          = zone_cg_range[zones->n];
8372     dd->nat_tot          = nat_tot;
8373     comm->nat[ddnatHOME] = dd->nat_home;
8374     for (i = ddnatZONE; i < ddnatNR; i++)
8375     {
8376         comm->nat[i] = dd->nat_tot;
8377     }
8378
8379     if (!bBondComm)
8380     {
8381         /* We don't need to update cginfo, since that was alrady done above.
8382          * So we pass NULL for the forcerec.
8383          */
8384         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8385                       NULL, comm->bLocalCG);
8386     }
8387
8388     if (debug)
8389     {
8390         fprintf(debug, "Finished setting up DD communication, zones:");
8391         for (c = 0; c < zones->n; c++)
8392         {
8393             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8394         }
8395         fprintf(debug, "\n");
8396     }
8397 }
8398
8399 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8400 {
8401     int c;
8402
8403     for (c = 0; c < zones->nizone; c++)
8404     {
8405         zones->izone[c].cg1  = zones->cg_range[c+1];
8406         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8407         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8408     }
8409 }
8410
8411 static void set_zones_size(gmx_domdec_t *dd,
8412                            matrix box, const gmx_ddbox_t *ddbox,
8413                            int zone_start, int zone_end)
8414 {
8415     gmx_domdec_comm_t  *comm;
8416     gmx_domdec_zones_t *zones;
8417     gmx_bool            bDistMB;
8418     int                 z, zi, zj0, zj1, d, dim;
8419     real                rcs, rcmbs;
8420     int                 i, j;
8421     real                size_j, add_tric;
8422     real                vol;
8423
8424     comm = dd->comm;
8425
8426     zones = &comm->zones;
8427
8428     /* Do we need to determine extra distances for multi-body bondeds? */
8429     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8430
8431     for (z = zone_start; z < zone_end; z++)
8432     {
8433         /* Copy cell limits to zone limits.
8434          * Valid for non-DD dims and non-shifted dims.
8435          */
8436         copy_rvec(comm->cell_x0, zones->size[z].x0);
8437         copy_rvec(comm->cell_x1, zones->size[z].x1);
8438     }
8439
8440     for (d = 0; d < dd->ndim; d++)
8441     {
8442         dim = dd->dim[d];
8443
8444         for (z = 0; z < zones->n; z++)
8445         {
8446             /* With a staggered grid we have different sizes
8447              * for non-shifted dimensions.
8448              */
8449             if (dd->bGridJump && zones->shift[z][dim] == 0)
8450             {
8451                 if (d == 1)
8452                 {
8453                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8454                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8455                 }
8456                 else if (d == 2)
8457                 {
8458                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8459                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8460                 }
8461             }
8462         }
8463
8464         rcs   = comm->cutoff;
8465         rcmbs = comm->cutoff_mbody;
8466         if (ddbox->tric_dir[dim])
8467         {
8468             rcs   /= ddbox->skew_fac[dim];
8469             rcmbs /= ddbox->skew_fac[dim];
8470         }
8471
8472         /* Set the lower limit for the shifted zone dimensions */
8473         for (z = zone_start; z < zone_end; z++)
8474         {
8475             if (zones->shift[z][dim] > 0)
8476             {
8477                 dim = dd->dim[d];
8478                 if (!dd->bGridJump || d == 0)
8479                 {
8480                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8481                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8482                 }
8483                 else
8484                 {
8485                     /* Here we take the lower limit of the zone from
8486                      * the lowest domain of the zone below.
8487                      */
8488                     if (z < 4)
8489                     {
8490                         zones->size[z].x0[dim] =
8491                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8492                     }
8493                     else
8494                     {
8495                         if (d == 1)
8496                         {
8497                             zones->size[z].x0[dim] =
8498                                 zones->size[zone_perm[2][z-4]].x0[dim];
8499                         }
8500                         else
8501                         {
8502                             zones->size[z].x0[dim] =
8503                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8504                         }
8505                     }
8506                     /* A temporary limit, is updated below */
8507                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8508
8509                     if (bDistMB)
8510                     {
8511                         for (zi = 0; zi < zones->nizone; zi++)
8512                         {
8513                             if (zones->shift[zi][dim] == 0)
8514                             {
8515                                 /* This takes the whole zone into account.
8516                                  * With multiple pulses this will lead
8517                                  * to a larger zone then strictly necessary.
8518                                  */
8519                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8520                                                              zones->size[zi].x1[dim]+rcmbs);
8521                             }
8522                         }
8523                     }
8524                 }
8525             }
8526         }
8527
8528         /* Loop over the i-zones to set the upper limit of each
8529          * j-zone they see.
8530          */
8531         for (zi = 0; zi < zones->nizone; zi++)
8532         {
8533             if (zones->shift[zi][dim] == 0)
8534             {
8535                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8536                 {
8537                     if (zones->shift[z][dim] > 0)
8538                     {
8539                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8540                                                      zones->size[zi].x1[dim]+rcs);
8541                     }
8542                 }
8543             }
8544         }
8545     }
8546
8547     for (z = zone_start; z < zone_end; z++)
8548     {
8549         /* Initialization only required to keep the compiler happy */
8550         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8551         int  nc, c;
8552
8553         /* To determine the bounding box for a zone we need to find
8554          * the extreme corners of 4, 2 or 1 corners.
8555          */
8556         nc = 1 << (ddbox->npbcdim - 1);
8557
8558         for (c = 0; c < nc; c++)
8559         {
8560             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8561             corner[XX] = 0;
8562             if ((c & 1) == 0)
8563             {
8564                 corner[YY] = zones->size[z].x0[YY];
8565             }
8566             else
8567             {
8568                 corner[YY] = zones->size[z].x1[YY];
8569             }
8570             if ((c & 2) == 0)
8571             {
8572                 corner[ZZ] = zones->size[z].x0[ZZ];
8573             }
8574             else
8575             {
8576                 corner[ZZ] = zones->size[z].x1[ZZ];
8577             }
8578             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8579             {
8580                 /* With 1D domain decomposition the cg's are not in
8581                  * the triclinic box, but triclinic x-y and rectangular y-z.
8582                  * Shift y back, so it will later end up at 0.
8583                  */
8584                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8585             }
8586             /* Apply the triclinic couplings */
8587             for (i = YY; i < ddbox->npbcdim; i++)
8588             {
8589                 for (j = XX; j < i; j++)
8590                 {
8591                     corner[j] += corner[i]*box[i][j]/box[i][i];
8592                 }
8593             }
8594             if (c == 0)
8595             {
8596                 copy_rvec(corner, corner_min);
8597                 copy_rvec(corner, corner_max);
8598             }
8599             else
8600             {
8601                 for (i = 0; i < DIM; i++)
8602                 {
8603                     corner_min[i] = min(corner_min[i], corner[i]);
8604                     corner_max[i] = max(corner_max[i], corner[i]);
8605                 }
8606             }
8607         }
8608         /* Copy the extreme cornes without offset along x */
8609         for (i = 0; i < DIM; i++)
8610         {
8611             zones->size[z].bb_x0[i] = corner_min[i];
8612             zones->size[z].bb_x1[i] = corner_max[i];
8613         }
8614         /* Add the offset along x */
8615         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8616         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8617     }
8618
8619     if (zone_start == 0)
8620     {
8621         vol = 1;
8622         for (dim = 0; dim < DIM; dim++)
8623         {
8624             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8625         }
8626         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8627     }
8628
8629     if (debug)
8630     {
8631         for (z = zone_start; z < zone_end; z++)
8632         {
8633             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8634                     z,
8635                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8636                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8637                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8638             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8639                     z,
8640                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8641                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8642                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8643         }
8644     }
8645 }
8646
8647 static int comp_cgsort(const void *a, const void *b)
8648 {
8649     int           comp;
8650
8651     gmx_cgsort_t *cga, *cgb;
8652     cga = (gmx_cgsort_t *)a;
8653     cgb = (gmx_cgsort_t *)b;
8654
8655     comp = cga->nsc - cgb->nsc;
8656     if (comp == 0)
8657     {
8658         comp = cga->ind_gl - cgb->ind_gl;
8659     }
8660
8661     return comp;
8662 }
8663
8664 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8665                          int *a, int *buf)
8666 {
8667     int i;
8668
8669     /* Order the data */
8670     for (i = 0; i < n; i++)
8671     {
8672         buf[i] = a[sort[i].ind];
8673     }
8674
8675     /* Copy back to the original array */
8676     for (i = 0; i < n; i++)
8677     {
8678         a[i] = buf[i];
8679     }
8680 }
8681
8682 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8683                          rvec *v, rvec *buf)
8684 {
8685     int i;
8686
8687     /* Order the data */
8688     for (i = 0; i < n; i++)
8689     {
8690         copy_rvec(v[sort[i].ind], buf[i]);
8691     }
8692
8693     /* Copy back to the original array */
8694     for (i = 0; i < n; i++)
8695     {
8696         copy_rvec(buf[i], v[i]);
8697     }
8698 }
8699
8700 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8701                            rvec *v, rvec *buf)
8702 {
8703     int a, atot, cg, cg0, cg1, i;
8704
8705     if (cgindex == NULL)
8706     {
8707         /* Avoid the useless loop of the atoms within a cg */
8708         order_vec_cg(ncg, sort, v, buf);
8709
8710         return;
8711     }
8712
8713     /* Order the data */
8714     a = 0;
8715     for (cg = 0; cg < ncg; cg++)
8716     {
8717         cg0 = cgindex[sort[cg].ind];
8718         cg1 = cgindex[sort[cg].ind+1];
8719         for (i = cg0; i < cg1; i++)
8720         {
8721             copy_rvec(v[i], buf[a]);
8722             a++;
8723         }
8724     }
8725     atot = a;
8726
8727     /* Copy back to the original array */
8728     for (a = 0; a < atot; a++)
8729     {
8730         copy_rvec(buf[a], v[a]);
8731     }
8732 }
8733
8734 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8735                          int nsort_new, gmx_cgsort_t *sort_new,
8736                          gmx_cgsort_t *sort1)
8737 {
8738     int i1, i2, i_new;
8739
8740     /* The new indices are not very ordered, so we qsort them */
8741     qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8742
8743     /* sort2 is already ordered, so now we can merge the two arrays */
8744     i1    = 0;
8745     i2    = 0;
8746     i_new = 0;
8747     while (i2 < nsort2 || i_new < nsort_new)
8748     {
8749         if (i2 == nsort2)
8750         {
8751             sort1[i1++] = sort_new[i_new++];
8752         }
8753         else if (i_new == nsort_new)
8754         {
8755             sort1[i1++] = sort2[i2++];
8756         }
8757         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8758                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8759                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8760         {
8761             sort1[i1++] = sort2[i2++];
8762         }
8763         else
8764         {
8765             sort1[i1++] = sort_new[i_new++];
8766         }
8767     }
8768 }
8769
8770 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8771 {
8772     gmx_domdec_sort_t *sort;
8773     gmx_cgsort_t      *cgsort, *sort_i;
8774     int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
8775     int                sort_last, sort_skip;
8776
8777     sort = dd->comm->sort;
8778
8779     a = fr->ns.grid->cell_index;
8780
8781     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8782
8783     if (ncg_home_old >= 0)
8784     {
8785         /* The charge groups that remained in the same ns grid cell
8786          * are completely ordered. So we can sort efficiently by sorting
8787          * the charge groups that did move into the stationary list.
8788          */
8789         ncg_new   = 0;
8790         nsort2    = 0;
8791         nsort_new = 0;
8792         for (i = 0; i < dd->ncg_home; i++)
8793         {
8794             /* Check if this cg did not move to another node */
8795             if (a[i] < moved)
8796             {
8797                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8798                 {
8799                     /* This cg is new on this node or moved ns grid cell */
8800                     if (nsort_new >= sort->sort_new_nalloc)
8801                     {
8802                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8803                         srenew(sort->sort_new, sort->sort_new_nalloc);
8804                     }
8805                     sort_i = &(sort->sort_new[nsort_new++]);
8806                 }
8807                 else
8808                 {
8809                     /* This cg did not move */
8810                     sort_i = &(sort->sort2[nsort2++]);
8811                 }
8812                 /* Sort on the ns grid cell indices
8813                  * and the global topology index.
8814                  * index_gl is irrelevant with cell ns,
8815                  * but we set it here anyhow to avoid a conditional.
8816                  */
8817                 sort_i->nsc    = a[i];
8818                 sort_i->ind_gl = dd->index_gl[i];
8819                 sort_i->ind    = i;
8820                 ncg_new++;
8821             }
8822         }
8823         if (debug)
8824         {
8825             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8826                     nsort2, nsort_new);
8827         }
8828         /* Sort efficiently */
8829         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8830                      sort->sort);
8831     }
8832     else
8833     {
8834         cgsort  = sort->sort;
8835         ncg_new = 0;
8836         for (i = 0; i < dd->ncg_home; i++)
8837         {
8838             /* Sort on the ns grid cell indices
8839              * and the global topology index
8840              */
8841             cgsort[i].nsc    = a[i];
8842             cgsort[i].ind_gl = dd->index_gl[i];
8843             cgsort[i].ind    = i;
8844             if (cgsort[i].nsc < moved)
8845             {
8846                 ncg_new++;
8847             }
8848         }
8849         if (debug)
8850         {
8851             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8852         }
8853         /* Determine the order of the charge groups using qsort */
8854         qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8855     }
8856
8857     return ncg_new;
8858 }
8859
8860 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8861 {
8862     gmx_cgsort_t *sort;
8863     int           ncg_new, i, *a, na;
8864
8865     sort = dd->comm->sort->sort;
8866
8867     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8868
8869     ncg_new = 0;
8870     for (i = 0; i < na; i++)
8871     {
8872         if (a[i] >= 0)
8873         {
8874             sort[ncg_new].ind = a[i];
8875             ncg_new++;
8876         }
8877     }
8878
8879     return ncg_new;
8880 }
8881
8882 static void dd_sort_state(gmx_domdec_t *dd, int ePBC,
8883                           rvec *cgcm, t_forcerec *fr, t_state *state,
8884                           int ncg_home_old)
8885 {
8886     gmx_domdec_sort_t *sort;
8887     gmx_cgsort_t      *cgsort, *sort_i;
8888     int               *cgindex;
8889     int                ncg_new, i, *ibuf, cgsize;
8890     rvec              *vbuf;
8891
8892     sort = dd->comm->sort;
8893
8894     if (dd->ncg_home > sort->sort_nalloc)
8895     {
8896         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8897         srenew(sort->sort, sort->sort_nalloc);
8898         srenew(sort->sort2, sort->sort_nalloc);
8899     }
8900     cgsort = sort->sort;
8901
8902     switch (fr->cutoff_scheme)
8903     {
8904         case ecutsGROUP:
8905             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8906             break;
8907         case ecutsVERLET:
8908             ncg_new = dd_sort_order_nbnxn(dd, fr);
8909             break;
8910         default:
8911             gmx_incons("unimplemented");
8912             ncg_new = 0;
8913     }
8914
8915     /* We alloc with the old size, since cgindex is still old */
8916     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8917     vbuf = dd->comm->vbuf.v;
8918
8919     if (dd->comm->bCGs)
8920     {
8921         cgindex = dd->cgindex;
8922     }
8923     else
8924     {
8925         cgindex = NULL;
8926     }
8927
8928     /* Remove the charge groups which are no longer at home here */
8929     dd->ncg_home = ncg_new;
8930     if (debug)
8931     {
8932         fprintf(debug, "Set the new home charge group count to %d\n",
8933                 dd->ncg_home);
8934     }
8935
8936     /* Reorder the state */
8937     for (i = 0; i < estNR; i++)
8938     {
8939         if (EST_DISTR(i) && (state->flags & (1<<i)))
8940         {
8941             switch (i)
8942             {
8943                 case estX:
8944                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
8945                     break;
8946                 case estV:
8947                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
8948                     break;
8949                 case estSDX:
8950                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
8951                     break;
8952                 case estCGP:
8953                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
8954                     break;
8955                 case estLD_RNG:
8956                 case estLD_RNGI:
8957                 case estDISRE_INITF:
8958                 case estDISRE_RM3TAV:
8959                 case estORIRE_INITF:
8960                 case estORIRE_DTAV:
8961                     /* No ordering required */
8962                     break;
8963                 default:
8964                     gmx_incons("Unknown state entry encountered in dd_sort_state");
8965                     break;
8966             }
8967         }
8968     }
8969     if (fr->cutoff_scheme == ecutsGROUP)
8970     {
8971         /* Reorder cgcm */
8972         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
8973     }
8974
8975     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8976     {
8977         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8978         srenew(sort->ibuf, sort->ibuf_nalloc);
8979     }
8980     ibuf = sort->ibuf;
8981     /* Reorder the global cg index */
8982     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
8983     /* Reorder the cginfo */
8984     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
8985     /* Rebuild the local cg index */
8986     if (dd->comm->bCGs)
8987     {
8988         ibuf[0] = 0;
8989         for (i = 0; i < dd->ncg_home; i++)
8990         {
8991             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
8992             ibuf[i+1] = ibuf[i] + cgsize;
8993         }
8994         for (i = 0; i < dd->ncg_home+1; i++)
8995         {
8996             dd->cgindex[i] = ibuf[i];
8997         }
8998     }
8999     else
9000     {
9001         for (i = 0; i < dd->ncg_home+1; i++)
9002         {
9003             dd->cgindex[i] = i;
9004         }
9005     }
9006     /* Set the home atom number */
9007     dd->nat_home = dd->cgindex[dd->ncg_home];
9008
9009     if (fr->cutoff_scheme == ecutsVERLET)
9010     {
9011         /* The atoms are now exactly in grid order, update the grid order */
9012         nbnxn_set_atomorder(fr->nbv->nbs);
9013     }
9014     else
9015     {
9016         /* Copy the sorted ns cell indices back to the ns grid struct */
9017         for (i = 0; i < dd->ncg_home; i++)
9018         {
9019             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9020         }
9021         fr->ns.grid->nr = dd->ncg_home;
9022     }
9023 }
9024
9025 static void add_dd_statistics(gmx_domdec_t *dd)
9026 {
9027     gmx_domdec_comm_t *comm;
9028     int                ddnat;
9029
9030     comm = dd->comm;
9031
9032     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9033     {
9034         comm->sum_nat[ddnat-ddnatZONE] +=
9035             comm->nat[ddnat] - comm->nat[ddnat-1];
9036     }
9037     comm->ndecomp++;
9038 }
9039
9040 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9041 {
9042     gmx_domdec_comm_t *comm;
9043     int                ddnat;
9044
9045     comm = dd->comm;
9046
9047     /* Reset all the statistics and counters for total run counting */
9048     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9049     {
9050         comm->sum_nat[ddnat-ddnatZONE] = 0;
9051     }
9052     comm->ndecomp   = 0;
9053     comm->nload     = 0;
9054     comm->load_step = 0;
9055     comm->load_sum  = 0;
9056     comm->load_max  = 0;
9057     clear_ivec(comm->load_lim);
9058     comm->load_mdf = 0;
9059     comm->load_pme = 0;
9060 }
9061
9062 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9063 {
9064     gmx_domdec_comm_t *comm;
9065     int                ddnat;
9066     double             av;
9067
9068     comm = cr->dd->comm;
9069
9070     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9071
9072     if (fplog == NULL)
9073     {
9074         return;
9075     }
9076
9077     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9078
9079     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9080     {
9081         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9082         switch (ddnat)
9083         {
9084             case ddnatZONE:
9085                 fprintf(fplog,
9086                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9087                         2, av);
9088                 break;
9089             case ddnatVSITE:
9090                 if (cr->dd->vsite_comm)
9091                 {
9092                     fprintf(fplog,
9093                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9094                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9095                             av);
9096                 }
9097                 break;
9098             case ddnatCON:
9099                 if (cr->dd->constraint_comm)
9100                 {
9101                     fprintf(fplog,
9102                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9103                             1 + ir->nLincsIter, av);
9104                 }
9105                 break;
9106             default:
9107                 gmx_incons(" Unknown type for DD statistics");
9108         }
9109     }
9110     fprintf(fplog, "\n");
9111
9112     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9113     {
9114         print_dd_load_av(fplog, cr->dd);
9115     }
9116 }
9117
9118 void dd_partition_system(FILE                *fplog,
9119                          gmx_large_int_t      step,
9120                          t_commrec           *cr,
9121                          gmx_bool             bMasterState,
9122                          int                  nstglobalcomm,
9123                          t_state             *state_global,
9124                          gmx_mtop_t          *top_global,
9125                          t_inputrec          *ir,
9126                          t_state             *state_local,
9127                          rvec               **f,
9128                          t_mdatoms           *mdatoms,
9129                          gmx_localtop_t      *top_local,
9130                          t_forcerec          *fr,
9131                          gmx_vsite_t         *vsite,
9132                          gmx_shellfc_t        shellfc,
9133                          gmx_constr_t         constr,
9134                          t_nrnb              *nrnb,
9135                          gmx_wallcycle_t      wcycle,
9136                          gmx_bool             bVerbose)
9137 {
9138     gmx_domdec_t      *dd;
9139     gmx_domdec_comm_t *comm;
9140     gmx_ddbox_t        ddbox = {0};
9141     t_block           *cgs_gl;
9142     gmx_large_int_t    step_pcoupl;
9143     rvec               cell_ns_x0, cell_ns_x1;
9144     int                i, j, n, cg0 = 0, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9145     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
9146     gmx_bool           bRedist, bSortCG, bResortAll;
9147     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9148     real               grid_density;
9149     char               sbuf[22];
9150
9151     dd   = cr->dd;
9152     comm = dd->comm;
9153
9154     bBoxChanged = (bMasterState || DEFORM(*ir));
9155     if (ir->epc != epcNO)
9156     {
9157         /* With nstpcouple > 1 pressure coupling happens.
9158          * one step after calculating the pressure.
9159          * Box scaling happens at the end of the MD step,
9160          * after the DD partitioning.
9161          * We therefore have to do DLB in the first partitioning
9162          * after an MD step where P-coupling occured.
9163          * We need to determine the last step in which p-coupling occurred.
9164          * MRS -- need to validate this for vv?
9165          */
9166         n = ir->nstpcouple;
9167         if (n == 1)
9168         {
9169             step_pcoupl = step - 1;
9170         }
9171         else
9172         {
9173             step_pcoupl = ((step - 1)/n)*n + 1;
9174         }
9175         if (step_pcoupl >= comm->partition_step)
9176         {
9177             bBoxChanged = TRUE;
9178         }
9179     }
9180
9181     bNStGlobalComm = (step % nstglobalcomm == 0);
9182
9183     if (!comm->bDynLoadBal)
9184     {
9185         bDoDLB = FALSE;
9186     }
9187     else
9188     {
9189         /* Should we do dynamic load balacing this step?
9190          * Since it requires (possibly expensive) global communication,
9191          * we might want to do DLB less frequently.
9192          */
9193         if (bBoxChanged || ir->epc != epcNO)
9194         {
9195             bDoDLB = bBoxChanged;
9196         }
9197         else
9198         {
9199             bDoDLB = bNStGlobalComm;
9200         }
9201     }
9202
9203     /* Check if we have recorded loads on the nodes */
9204     if (comm->bRecordLoad && dd_load_count(comm))
9205     {
9206         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9207         {
9208             /* Check if we should use DLB at the second partitioning
9209              * and every 100 partitionings,
9210              * so the extra communication cost is negligible.
9211              */
9212             n         = max(100, nstglobalcomm);
9213             bCheckDLB = (comm->n_load_collect == 0 ||
9214                          comm->n_load_have % n == n-1);
9215         }
9216         else
9217         {
9218             bCheckDLB = FALSE;
9219         }
9220
9221         /* Print load every nstlog, first and last step to the log file */
9222         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9223                     comm->n_load_collect == 0 ||
9224                     (ir->nsteps >= 0 &&
9225                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9226
9227         /* Avoid extra communication due to verbose screen output
9228          * when nstglobalcomm is set.
9229          */
9230         if (bDoDLB || bLogLoad || bCheckDLB ||
9231             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9232         {
9233             get_load_distribution(dd, wcycle);
9234             if (DDMASTER(dd))
9235             {
9236                 if (bLogLoad)
9237                 {
9238                     dd_print_load(fplog, dd, step-1);
9239                 }
9240                 if (bVerbose)
9241                 {
9242                     dd_print_load_verbose(dd);
9243                 }
9244             }
9245             comm->n_load_collect++;
9246
9247             if (bCheckDLB)
9248             {
9249                 /* Since the timings are node dependent, the master decides */
9250                 if (DDMASTER(dd))
9251                 {
9252                     bTurnOnDLB =
9253                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9254                     if (debug)
9255                     {
9256                         fprintf(debug, "step %s, imb loss %f\n",
9257                                 gmx_step_str(step, sbuf),
9258                                 dd_force_imb_perf_loss(dd));
9259                     }
9260                 }
9261                 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
9262                 if (bTurnOnDLB)
9263                 {
9264                     turn_on_dlb(fplog, cr, step);
9265                     bDoDLB = TRUE;
9266                 }
9267             }
9268         }
9269         comm->n_load_have++;
9270     }
9271
9272     cgs_gl = &comm->cgs_gl;
9273
9274     bRedist = FALSE;
9275     if (bMasterState)
9276     {
9277         /* Clear the old state */
9278         clear_dd_indices(dd, 0, 0);
9279
9280         set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9281                   TRUE, cgs_gl, state_global->x, &ddbox);
9282
9283         get_cg_distribution(fplog, step, dd, cgs_gl,
9284                             state_global->box, &ddbox, state_global->x);
9285
9286         dd_distribute_state(dd, cgs_gl,
9287                             state_global, state_local, f);
9288
9289         dd_make_local_cgs(dd, &top_local->cgs);
9290
9291         /* Ensure that we have space for the new distribution */
9292         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9293
9294         if (fr->cutoff_scheme == ecutsGROUP)
9295         {
9296             calc_cgcm(fplog, 0, dd->ncg_home,
9297                       &top_local->cgs, state_local->x, fr->cg_cm);
9298         }
9299
9300         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9301
9302         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9303
9304         cg0 = 0;
9305     }
9306     else if (state_local->ddp_count != dd->ddp_count)
9307     {
9308         if (state_local->ddp_count > dd->ddp_count)
9309         {
9310             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9311         }
9312
9313         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9314         {
9315             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9316         }
9317
9318         /* Clear the old state */
9319         clear_dd_indices(dd, 0, 0);
9320
9321         /* Build the new indices */
9322         rebuild_cgindex(dd, cgs_gl->index, state_local);
9323         make_dd_indices(dd, cgs_gl->index, 0);
9324
9325         if (fr->cutoff_scheme == ecutsGROUP)
9326         {
9327             /* Redetermine the cg COMs */
9328             calc_cgcm(fplog, 0, dd->ncg_home,
9329                       &top_local->cgs, state_local->x, fr->cg_cm);
9330         }
9331
9332         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9333
9334         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9335
9336         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9337                   TRUE, &top_local->cgs, state_local->x, &ddbox);
9338
9339         bRedist = comm->bDynLoadBal;
9340     }
9341     else
9342     {
9343         /* We have the full state, only redistribute the cgs */
9344
9345         /* Clear the non-home indices */
9346         clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9347
9348         /* Avoid global communication for dim's without pbc and -gcom */
9349         if (!bNStGlobalComm)
9350         {
9351             copy_rvec(comm->box0, ddbox.box0    );
9352             copy_rvec(comm->box_size, ddbox.box_size);
9353         }
9354         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9355                   bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
9356
9357         bBoxChanged = TRUE;
9358         bRedist     = TRUE;
9359     }
9360     /* For dim's without pbc and -gcom */
9361     copy_rvec(ddbox.box0, comm->box0    );
9362     copy_rvec(ddbox.box_size, comm->box_size);
9363
9364     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9365                       step, wcycle);
9366
9367     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9368     {
9369         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9370     }
9371
9372     /* Check if we should sort the charge groups */
9373     if (comm->nstSortCG > 0)
9374     {
9375         bSortCG = (bMasterState ||
9376                    (bRedist && (step % comm->nstSortCG == 0)));
9377     }
9378     else
9379     {
9380         bSortCG = FALSE;
9381     }
9382
9383     ncg_home_old = dd->ncg_home;
9384
9385     ncg_moved = 0;
9386     if (bRedist)
9387     {
9388         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9389
9390         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9391                            state_local, f, fr, mdatoms,
9392                            !bSortCG, nrnb, &cg0, &ncg_moved);
9393
9394         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9395     }
9396
9397     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9398                           dd, &ddbox,
9399                           &comm->cell_x0, &comm->cell_x1,
9400                           dd->ncg_home, fr->cg_cm,
9401                           cell_ns_x0, cell_ns_x1, &grid_density);
9402
9403     if (bBoxChanged)
9404     {
9405         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9406     }
9407
9408     switch (fr->cutoff_scheme)
9409     {
9410         case ecutsGROUP:
9411             copy_ivec(fr->ns.grid->n, ncells_old);
9412             grid_first(fplog, fr->ns.grid, dd, &ddbox, fr->ePBC,
9413                        state_local->box, cell_ns_x0, cell_ns_x1,
9414                        fr->rlistlong, grid_density);
9415             break;
9416         case ecutsVERLET:
9417             nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9418             break;
9419         default:
9420             gmx_incons("unimplemented");
9421     }
9422     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9423     copy_ivec(ddbox.tric_dir, comm->tric_dir);
9424
9425     if (bSortCG)
9426     {
9427         wallcycle_sub_start(wcycle, ewcsDD_GRID);
9428
9429         /* Sort the state on charge group position.
9430          * This enables exact restarts from this step.
9431          * It also improves performance by about 15% with larger numbers
9432          * of atoms per node.
9433          */
9434
9435         /* Fill the ns grid with the home cell,
9436          * so we can sort with the indices.
9437          */
9438         set_zones_ncg_home(dd);
9439
9440         switch (fr->cutoff_scheme)
9441         {
9442             case ecutsVERLET:
9443                 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9444
9445                 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9446                                   0,
9447                                   comm->zones.size[0].bb_x0,
9448                                   comm->zones.size[0].bb_x1,
9449                                   0, dd->ncg_home,
9450                                   comm->zones.dens_zone0,
9451                                   fr->cginfo,
9452                                   state_local->x,
9453                                   ncg_moved, bRedist ? comm->moved : NULL,
9454                                   fr->nbv->grp[eintLocal].kernel_type,
9455                                   fr->nbv->grp[eintLocal].nbat);
9456
9457                 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9458                 break;
9459             case ecutsGROUP:
9460                 fill_grid(fplog, &comm->zones, fr->ns.grid, dd->ncg_home,
9461                           0, dd->ncg_home, fr->cg_cm);
9462
9463                 copy_ivec(fr->ns.grid->n, ncells_new);
9464                 break;
9465             default:
9466                 gmx_incons("unimplemented");
9467         }
9468
9469         bResortAll = bMasterState;
9470
9471         /* Check if we can user the old order and ns grid cell indices
9472          * of the charge groups to sort the charge groups efficiently.
9473          */
9474         if (ncells_new[XX] != ncells_old[XX] ||
9475             ncells_new[YY] != ncells_old[YY] ||
9476             ncells_new[ZZ] != ncells_old[ZZ])
9477         {
9478             bResortAll = TRUE;
9479         }
9480
9481         if (debug)
9482         {
9483             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9484                     gmx_step_str(step, sbuf), dd->ncg_home);
9485         }
9486         dd_sort_state(dd, ir->ePBC, fr->cg_cm, fr, state_local,
9487                       bResortAll ? -1 : ncg_home_old);
9488         /* Rebuild all the indices */
9489         cg0 = 0;
9490         ga2la_clear(dd->ga2la);
9491
9492         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9493     }
9494
9495     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9496
9497     /* Setup up the communication and communicate the coordinates */
9498     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9499
9500     /* Set the indices */
9501     make_dd_indices(dd, cgs_gl->index, cg0);
9502
9503     /* Set the charge group boundaries for neighbor searching */
9504     set_cg_boundaries(&comm->zones);
9505
9506     if (fr->cutoff_scheme == ecutsVERLET)
9507     {
9508         set_zones_size(dd, state_local->box, &ddbox,
9509                        bSortCG ? 1 : 0, comm->zones.n);
9510     }
9511
9512     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9513
9514     /*
9515        write_dd_pdb("dd_home",step,"dump",top_global,cr,
9516                  -1,state_local->x,state_local->box);
9517      */
9518
9519     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9520
9521     /* Extract a local topology from the global topology */
9522     for (i = 0; i < dd->ndim; i++)
9523     {
9524         np[dd->dim[i]] = comm->cd[i].np;
9525     }
9526     dd_make_local_top(fplog, dd, &comm->zones, dd->npbcdim, state_local->box,
9527                       comm->cellsize_min, np,
9528                       fr,
9529                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
9530                       vsite, top_global, top_local);
9531
9532     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9533
9534     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9535
9536     /* Set up the special atom communication */
9537     n = comm->nat[ddnatZONE];
9538     for (i = ddnatZONE+1; i < ddnatNR; i++)
9539     {
9540         switch (i)
9541         {
9542             case ddnatVSITE:
9543                 if (vsite && vsite->n_intercg_vsite)
9544                 {
9545                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
9546                 }
9547                 break;
9548             case ddnatCON:
9549                 if (dd->bInterCGcons || dd->bInterCGsettles)
9550                 {
9551                     /* Only for inter-cg constraints we need special code */
9552                     n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9553                                                   constr, ir->nProjOrder,
9554                                                   top_local->idef.il);
9555                 }
9556                 break;
9557             default:
9558                 gmx_incons("Unknown special atom type setup");
9559         }
9560         comm->nat[i] = n;
9561     }
9562
9563     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9564
9565     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9566
9567     /* Make space for the extra coordinates for virtual site
9568      * or constraint communication.
9569      */
9570     state_local->natoms = comm->nat[ddnatNR-1];
9571     if (state_local->natoms > state_local->nalloc)
9572     {
9573         dd_realloc_state(state_local, f, state_local->natoms);
9574     }
9575
9576     if (fr->bF_NoVirSum)
9577     {
9578         if (vsite && vsite->n_intercg_vsite)
9579         {
9580             nat_f_novirsum = comm->nat[ddnatVSITE];
9581         }
9582         else
9583         {
9584             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9585             {
9586                 nat_f_novirsum = dd->nat_tot;
9587             }
9588             else
9589             {
9590                 nat_f_novirsum = dd->nat_home;
9591             }
9592         }
9593     }
9594     else
9595     {
9596         nat_f_novirsum = 0;
9597     }
9598
9599     /* Set the number of atoms required for the force calculation.
9600      * Forces need to be constrained when using a twin-range setup
9601      * or with energy minimization. For simple simulations we could
9602      * avoid some allocation, zeroing and copying, but this is
9603      * probably not worth the complications ande checking.
9604      */
9605     forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9606                         dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9607
9608     /* We make the all mdatoms up to nat_tot_con.
9609      * We could save some work by only setting invmass
9610      * between nat_tot and nat_tot_con.
9611      */
9612     /* This call also sets the new number of home particles to dd->nat_home */
9613     atoms2md(top_global, ir,
9614              comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms);
9615
9616     /* Now we have the charges we can sort the FE interactions */
9617     dd_sort_local_top(dd, mdatoms, top_local);
9618
9619     if (vsite != NULL)
9620     {
9621         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9622         split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
9623     }
9624
9625     if (shellfc)
9626     {
9627         /* Make the local shell stuff, currently no communication is done */
9628         make_local_shells(cr, mdatoms, shellfc);
9629     }
9630
9631     if (ir->implicit_solvent)
9632     {
9633         make_local_gb(cr, fr->born, ir->gb_algorithm);
9634     }
9635
9636     init_bonded_thread_force_reduction(fr, &top_local->idef);
9637
9638     if (!(cr->duty & DUTY_PME))
9639     {
9640         /* Send the charges to our PME only node */
9641         gmx_pme_send_q(cr, mdatoms->nChargePerturbed,
9642                        mdatoms->chargeA, mdatoms->chargeB,
9643                        dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9644     }
9645
9646     if (constr)
9647     {
9648         set_constraints(constr, top_local, ir, mdatoms, cr);
9649     }
9650
9651     if (ir->ePull != epullNO)
9652     {
9653         /* Update the local pull groups */
9654         dd_make_local_pull_groups(dd, ir->pull, mdatoms);
9655     }
9656
9657     if (ir->bRot)
9658     {
9659         /* Update the local rotation groups */
9660         dd_make_local_rotation_groups(dd, ir->rot);
9661     }
9662
9663
9664     add_dd_statistics(dd);
9665
9666     /* Make sure we only count the cycles for this DD partitioning */
9667     clear_dd_cycle_counts(dd);
9668
9669     /* Because the order of the atoms might have changed since
9670      * the last vsite construction, we need to communicate the constructing
9671      * atom coordinates again (for spreading the forces this MD step).
9672      */
9673     dd_move_x_vsites(dd, state_local->box, state_local->x);
9674
9675     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9676
9677     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9678     {
9679         dd_move_x(dd, state_local->box, state_local->x);
9680         write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9681                      -1, state_local->x, state_local->box);
9682     }
9683
9684     /* Store the partitioning step */
9685     comm->partition_step = step;
9686
9687     /* Increase the DD partitioning counter */
9688     dd->ddp_count++;
9689     /* The state currently matches this DD partitioning count, store it */
9690     state_local->ddp_count = dd->ddp_count;
9691     if (bMasterState)
9692     {
9693         /* The DD master node knows the complete cg distribution,
9694          * store the count so we can possibly skip the cg info communication.
9695          */
9696         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9697     }
9698
9699     if (comm->DD_debug > 0)
9700     {
9701         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9702         check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9703                                 "after partitioning");
9704     }
9705 }