src/mdlib/domdec.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2008
   5  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   6  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   7  * others, as listed in the AUTHORS file in the top-level source
   8  * directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 #ifdef HAVE_CONFIG_H
  38 #include <config.h>
  39 #endif
  40
  41 #include <stdio.h>
  42 #include <time.h>
  43 #include <math.h>
  44 #include <string.h>
  45 #include <stdlib.h>
  46 #include "typedefs.h"
  47 #include "smalloc.h"
  48 #include "gmx_fatal.h"
  49 #include "gmx_fatal_collective.h"
  50 #include "vec.h"
  51 #include "domdec.h"
  52 #include "domdec_network.h"
  53 #include "nrnb.h"
  54 #include "pbc.h"
  55 #include "chargegroup.h"
  56 #include "constr.h"
  57 #include "mdatoms.h"
  58 #include "names.h"
  59 #include "pdbio.h"
  60 #include "futil.h"
  61 #include "force.h"
  62 #include "pme.h"
  63 #include "pull.h"
  64 #include "pull_rotation.h"
  65 #include "gmx_wallcycle.h"
  66 #include "mdrun.h"
  67 #include "nsgrid.h"
  68 #include "shellfc.h"
  69 #include "mtop_util.h"
  70 #include "gmxfio.h"
  71 #include "gmx_ga2la.h"
  72 #include "gmx_sort.h"
  73 #include "nbnxn_search.h"
  74 #include "bondf.h"
  75 #include "gmx_omp_nthreads.h"
  76
  77 #ifdef GMX_LIB_MPI
  78 #include <mpi.h>
  79 #endif
  80 #ifdef GMX_THREAD_MPI
  81 #include "tmpi.h"
  82 #endif
  83
  84 #define DDRANK(dd, rank)    (rank)
  85 #define DDMASTERRANK(dd)   (dd->masterrank)
  86
  87 typedef struct gmx_domdec_master
  88 {
  89     /* The cell boundaries */
  90     real **cell_x;
  91     /* The global charge group division */
  92     int   *ncg;    /* Number of home charge groups for each node */
  93     int   *index;  /* Index of nnodes+1 into cg */
  94     int   *cg;     /* Global charge group index */
  95     int   *nat;    /* Number of home atoms for each node. */
  96     int   *ibuf;   /* Buffer for communication */
  97     rvec  *vbuf;   /* Buffer for state scattering and gathering */
  98 } gmx_domdec_master_t;
  99
 100 typedef struct
 101 {
 102     /* The numbers of charge groups to send and receive for each cell
 103      * that requires communication, the last entry contains the total
 104      * number of atoms that needs to be communicated.
 105      */
 106     int  nsend[DD_MAXIZONE+2];
 107     int  nrecv[DD_MAXIZONE+2];
 108     /* The charge groups to send */
 109     int *index;
 110     int  nalloc;
 111     /* The atom range for non-in-place communication */
 112     int  cell2at0[DD_MAXIZONE];
 113     int  cell2at1[DD_MAXIZONE];
 114 } gmx_domdec_ind_t;
 115
 116 typedef struct
 117 {
 118     int               np;       /* Number of grid pulses in this dimension */
 119     int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 120     gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 121     int               np_nalloc;
 122     gmx_bool          bInPlace; /* Can we communicate in place?            */
 123 } gmx_domdec_comm_dim_t;
 124
 125 typedef struct
 126 {
 127     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 128     real     *cell_f;      /* State var.: cell boundaries, box relative      */
 129     real     *old_cell_f;  /* Temp. var.: old cell size                      */
 130     real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 131     real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 132     real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 133     real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 134     gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 135     real     *buf_ncd;     /* Temp. var.                                     */
 136 } gmx_domdec_root_t;
 137
 138 #define DD_NLOAD_MAX 9
 139
 140 /* Here floats are accurate enough, since these variables
 141  * only influence the load balancing, not the actual MD results.
 142  */
 143 typedef struct
 144 {
 145     int    nload;
 146     float *load;
 147     float  sum;
 148     float  max;
 149     float  sum_m;
 150     float  cvol_min;
 151     float  mdf;
 152     float  pme;
 153     int    flags;
 154 } gmx_domdec_load_t;
 155
 156 typedef struct
 157 {
 158     int  nsc;
 159     int  ind_gl;
 160     int  ind;
 161 } gmx_cgsort_t;
 162
 163 typedef struct
 164 {
 165     gmx_cgsort_t *sort;
 166     gmx_cgsort_t *sort2;
 167     int           sort_nalloc;
 168     gmx_cgsort_t *sort_new;
 169     int           sort_new_nalloc;
 170     int          *ibuf;
 171     int           ibuf_nalloc;
 172 } gmx_domdec_sort_t;
 173
 174 typedef struct
 175 {
 176     rvec *v;
 177     int   nalloc;
 178 } vec_rvec_t;
 179
 180 /* This enum determines the order of the coordinates.
 181  * ddnatHOME and ddnatZONE should be first and second,
 182  * the others can be ordered as wanted.
 183  */
 184 enum {
 185     ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 186 };
 187
 188 enum {
 189     edlbAUTO, edlbNO, edlbYES, edlbNR
 190 };
 191 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 192
 193 typedef struct
 194 {
 195     int      dim;       /* The dimension                                          */
 196     gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 197     int      nslab;     /* The number of PME slabs in this dimension              */
 198     real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 199     int     *pp_min;    /* The minimum pp node location, size nslab               */
 200     int     *pp_max;    /* The maximum pp node location,size nslab                */
 201     int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 202 } gmx_ddpme_t;
 203
 204 typedef struct
 205 {
 206     real min0;    /* The minimum bottom of this zone                        */
 207     real max1;    /* The maximum top of this zone                           */
 208     real min1;    /* The minimum top of this zone                           */
 209     real mch0;    /* The maximum bottom communicaton height for this zone   */
 210     real mch1;    /* The maximum top communicaton height for this zone      */
 211     real p1_0;    /* The bottom value of the first cell in this zone        */
 212     real p1_1;    /* The top value of the first cell in this zone           */
 213 } gmx_ddzone_t;
 214
 215 typedef struct
 216 {
 217     gmx_domdec_ind_t ind;
 218     int             *ibuf;
 219     int              ibuf_nalloc;
 220     vec_rvec_t       vbuf;
 221     int              nsend;
 222     int              nat;
 223     int              nsend_zone;
 224 } dd_comm_setup_work_t;
 225
 226 typedef struct gmx_domdec_comm
 227 {
 228     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 229      * unless stated otherwise.
 230      */
 231
 232     /* The number of decomposition dimensions for PME, 0: no PME */
 233     int         npmedecompdim;
 234     /* The number of nodes doing PME (PP/PME or only PME) */
 235     int         npmenodes;
 236     int         npmenodes_x;
 237     int         npmenodes_y;
 238     /* The communication setup including the PME only nodes */
 239     gmx_bool    bCartesianPP_PME;
 240     ivec        ntot;
 241     int         cartpmedim;
 242     int        *pmenodes;          /* size npmenodes                         */
 243     int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 244                                     * but with bCartesianPP_PME              */
 245     gmx_ddpme_t ddpme[2];
 246
 247     /* The DD particle-particle nodes only */
 248     gmx_bool bCartesianPP;
 249     int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 250
 251     /* The global charge groups */
 252     t_block cgs_gl;
 253
 254     /* Should we sort the cgs */
 255     int                nstSortCG;
 256     gmx_domdec_sort_t *sort;
 257
 258     /* Are there charge groups? */
 259     gmx_bool bCGs;
 260
 261     /* Are there bonded and multi-body interactions between charge groups? */
 262     gmx_bool bInterCGBondeds;
 263     gmx_bool bInterCGMultiBody;
 264
 265     /* Data for the optional bonded interaction atom communication range */
 266     gmx_bool  bBondComm;
 267     t_blocka *cglink;
 268     char     *bLocalCG;
 269
 270     /* The DLB option */
 271     int      eDLB;
 272     /* Are we actually using DLB? */
 273     gmx_bool bDynLoadBal;
 274
 275     /* Cell sizes for static load balancing, first index cartesian */
 276     real **slb_frac;
 277
 278     /* The width of the communicated boundaries */
 279     real     cutoff_mbody;
 280     real     cutoff;
 281     /* The minimum cell size (including triclinic correction) */
 282     rvec     cellsize_min;
 283     /* For dlb, for use with edlbAUTO */
 284     rvec     cellsize_min_dlb;
 285     /* The lower limit for the DD cell size with DLB */
 286     real     cellsize_limit;
 287     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 288     gmx_bool bVacDLBNoLimit;
 289
 290     /* With PME load balancing we set limits on DLB */
 291     gmx_bool bPMELoadBalDLBLimits;
 292     /* DLB needs to take into account that we want to allow this maximum
 293      * cut-off (for PME load balancing), this could limit cell boundaries.
 294      */
 295     real PMELoadBal_max_cutoff;
 296
 297     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 298     ivec tric_dir;
 299     /* box0 and box_size are required with dim's without pbc and -gcom */
 300     rvec box0;
 301     rvec box_size;
 302
 303     /* The cell boundaries */
 304     rvec cell_x0;
 305     rvec cell_x1;
 306
 307     /* The old location of the cell boundaries, to check cg displacements */
 308     rvec old_cell_x0;
 309     rvec old_cell_x1;
 310
 311     /* The communication setup and charge group boundaries for the zones */
 312     gmx_domdec_zones_t zones;
 313
 314     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 315      * cell boundaries of neighboring cells for dynamic load balancing.
 316      */
 317     gmx_ddzone_t zone_d1[2];
 318     gmx_ddzone_t zone_d2[2][2];
 319
 320     /* The coordinate/force communication setup and indices */
 321     gmx_domdec_comm_dim_t cd[DIM];
 322     /* The maximum number of cells to communicate with in one dimension */
 323     int                   maxpulse;
 324
 325     /* Which cg distribution is stored on the master node */
 326     int master_cg_ddp_count;
 327
 328     /* The number of cg's received from the direct neighbors */
 329     int  zone_ncg1[DD_MAXZONE];
 330
 331     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 332     int  nat[ddnatNR];
 333
 334     /* Array for signalling if atoms have moved to another domain */
 335     int  *moved;
 336     int   moved_nalloc;
 337
 338     /* Communication buffer for general use */
 339     int  *buf_int;
 340     int   nalloc_int;
 341
 342     /* Communication buffer for general use */
 343     vec_rvec_t vbuf;
 344
 345     /* Temporary storage for thread parallel communication setup */
 346     int                   nth;
 347     dd_comm_setup_work_t *dth;
 348
 349     /* Communication buffers only used with multiple grid pulses */
 350     int       *buf_int2;
 351     int        nalloc_int2;
 352     vec_rvec_t vbuf2;
 353
 354     /* Communication buffers for local redistribution */
 355     int  **cggl_flag;
 356     int    cggl_flag_nalloc[DIM*2];
 357     rvec **cgcm_state;
 358     int    cgcm_state_nalloc[DIM*2];
 359
 360     /* Cell sizes for dynamic load balancing */
 361     gmx_domdec_root_t **root;
 362     real               *cell_f_row;
 363     real                cell_f0[DIM];
 364     real                cell_f1[DIM];
 365     real                cell_f_max0[DIM];
 366     real                cell_f_min1[DIM];
 367
 368     /* Stuff for load communication */
 369     gmx_bool           bRecordLoad;
 370     gmx_domdec_load_t *load;
 371 #ifdef GMX_MPI
 372     MPI_Comm          *mpi_comm_load;
 373 #endif
 374
 375     /* Maximum DLB scaling per load balancing step in percent */
 376     int dlb_scale_lim;
 377
 378     /* Cycle counters */
 379     float  cycl[ddCyclNr];
 380     int    cycl_n[ddCyclNr];
 381     float  cycl_max[ddCyclNr];
 382     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 383     int    eFlop;
 384     double flop;
 385     int    flop_n;
 386     /* Have often have did we have load measurements */
 387     int    n_load_have;
 388     /* Have often have we collected the load measurements */
 389     int    n_load_collect;
 390
 391     /* Statistics */
 392     double sum_nat[ddnatNR-ddnatZONE];
 393     int    ndecomp;
 394     int    nload;
 395     double load_step;
 396     double load_sum;
 397     double load_max;
 398     ivec   load_lim;
 399     double load_mdf;
 400     double load_pme;
 401
 402     /* The last partition step */
 403     gmx_large_int_t partition_step;
 404
 405     /* Debugging */
 406     int  nstDDDump;
 407     int  nstDDDumpGrid;
 408     int  DD_debug;
 409 } gmx_domdec_comm_t;
 410
 411 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 412 #define DD_CGIBS 2
 413
 414 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 415 #define DD_FLAG_NRCG  65535
 416 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 417 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 418
 419 /* Zone permutation required to obtain consecutive charge groups
 420  * for neighbor searching.
 421  */
 422 static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 423
 424 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 425  * components see only j zones with that component 0.
 426  */
 427
 428 /* The DD zone order */
 429 static const ivec dd_zo[DD_MAXZONE] =
 430 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 431
 432 /* The 3D setup */
 433 #define dd_z3n  8
 434 #define dd_zp3n 4
 435 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 436
 437 /* The 2D setup */
 438 #define dd_z2n  4
 439 #define dd_zp2n 2
 440 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 441
 442 /* The 1D setup */
 443 #define dd_z1n  2
 444 #define dd_zp1n 1
 445 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 446
 447 /* Factors used to avoid problems due to rounding issues */
 448 #define DD_CELL_MARGIN       1.0001
 449 #define DD_CELL_MARGIN2      1.00005
 450 /* Factor to account for pressure scaling during nstlist steps */
 451 #define DD_PRES_SCALE_MARGIN 1.02
 452
 453 /* Allowed performance loss before we DLB or warn */
 454 #define DD_PERF_LOSS 0.05
 455
 456 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 457
 458 /* Use separate MPI send and receive commands
 459  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 460  * This saves memory (and some copying for small nnodes).
 461  * For high parallelization scatter and gather calls are used.
 462  */
 463 #define GMX_DD_NNODES_SENDRECV 4
 464
 465
 466 /*
 467    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 468
 469    static void index2xyz(ivec nc,int ind,ivec xyz)
 470    {
 471    xyz[XX] = ind % nc[XX];
 472    xyz[YY] = (ind / nc[XX]) % nc[YY];
 473    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 474    }
 475  */
 476
 477 /* This order is required to minimize the coordinate communication in PME
 478  * which uses decomposition in the x direction.
 479  */
 480 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 481
 482 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 483 {
 484     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 485     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 486     xyz[ZZ] = ind % nc[ZZ];
 487 }
 488
 489 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 490 {
 491     int ddindex;
 492     int ddnodeid = -1;
 493
 494     ddindex = dd_index(dd->nc, c);
 495     if (dd->comm->bCartesianPP_PME)
 496     {
 497         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 498     }
 499     else if (dd->comm->bCartesianPP)
 500     {
 501 #ifdef GMX_MPI
 502         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 503 #endif
 504     }
 505     else
 506     {
 507         ddnodeid = ddindex;
 508     }
 509
 510     return ddnodeid;
 511 }
 512
 513 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 514 {
 515     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 516 }
 517
 518 int ddglatnr(gmx_domdec_t *dd, int i)
 519 {
 520     int atnr;
 521
 522     if (dd == NULL)
 523     {
 524         atnr = i + 1;
 525     }
 526     else
 527     {
 528         if (i >= dd->comm->nat[ddnatNR-1])
 529         {
 530             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 531         }
 532         atnr = dd->gatindex[i] + 1;
 533     }
 534
 535     return atnr;
 536 }
 537
 538 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 539 {
 540     return &dd->comm->cgs_gl;
 541 }
 542
 543 static void vec_rvec_init(vec_rvec_t *v)
 544 {
 545     v->nalloc = 0;
 546     v->v      = NULL;
 547 }
 548
 549 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 550 {
 551     if (n > v->nalloc)
 552     {
 553         v->nalloc = over_alloc_dd(n);
 554         srenew(v->v, v->nalloc);
 555     }
 556 }
 557
 558 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 559 {
 560     int i;
 561
 562     if (state->ddp_count != dd->ddp_count)
 563     {
 564         gmx_incons("The state does not the domain decomposition state");
 565     }
 566
 567     state->ncg_gl = dd->ncg_home;
 568     if (state->ncg_gl > state->cg_gl_nalloc)
 569     {
 570         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 571         srenew(state->cg_gl, state->cg_gl_nalloc);
 572     }
 573     for (i = 0; i < state->ncg_gl; i++)
 574     {
 575         state->cg_gl[i] = dd->index_gl[i];
 576     }
 577
 578     state->ddp_count_cg_gl = dd->ddp_count;
 579 }
 580
 581 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 582 {
 583     return &dd->comm->zones;
 584 }
 585
 586 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 587                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 588 {
 589     gmx_domdec_zones_t *zones;
 590     int                 izone, d, dim;
 591
 592     zones = &dd->comm->zones;
 593
 594     izone = 0;
 595     while (icg >= zones->izone[izone].cg1)
 596     {
 597         izone++;
 598     }
 599
 600     if (izone == 0)
 601     {
 602         *jcg0 = icg;
 603     }
 604     else if (izone < zones->nizone)
 605     {
 606         *jcg0 = zones->izone[izone].jcg0;
 607     }
 608     else
 609     {
 610         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 611                   icg, izone, zones->nizone);
 612     }
 613
 614     *jcg1 = zones->izone[izone].jcg1;
 615
 616     for (d = 0; d < dd->ndim; d++)
 617     {
 618         dim         = dd->dim[d];
 619         shift0[dim] = zones->izone[izone].shift0[dim];
 620         shift1[dim] = zones->izone[izone].shift1[dim];
 621         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 622         {
 623             /* A conservative approach, this can be optimized */
 624             shift0[dim] -= 1;
 625             shift1[dim] += 1;
 626         }
 627     }
 628 }
 629
 630 int dd_natoms_vsite(gmx_domdec_t *dd)
 631 {
 632     return dd->comm->nat[ddnatVSITE];
 633 }
 634
 635 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 636 {
 637     *at_start = dd->comm->nat[ddnatCON-1];
 638     *at_end   = dd->comm->nat[ddnatCON];
 639 }
 640
 641 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 642 {
 643     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 644     int                   *index, *cgindex;
 645     gmx_domdec_comm_t     *comm;
 646     gmx_domdec_comm_dim_t *cd;
 647     gmx_domdec_ind_t      *ind;
 648     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 649     gmx_bool               bPBC, bScrew;
 650
 651     comm = dd->comm;
 652
 653     cgindex = dd->cgindex;
 654
 655     buf = comm->vbuf.v;
 656
 657     nzone   = 1;
 658     nat_tot = dd->nat_home;
 659     for (d = 0; d < dd->ndim; d++)
 660     {
 661         bPBC   = (dd->ci[dd->dim[d]] == 0);
 662         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 663         if (bPBC)
 664         {
 665             copy_rvec(box[dd->dim[d]], shift);
 666         }
 667         cd = &comm->cd[d];
 668         for (p = 0; p < cd->np; p++)
 669         {
 670             ind   = &cd->ind[p];
 671             index = ind->index;
 672             n     = 0;
 673             if (!bPBC)
 674             {
 675                 for (i = 0; i < ind->nsend[nzone]; i++)
 676                 {
 677                     at0 = cgindex[index[i]];
 678                     at1 = cgindex[index[i]+1];
 679                     for (j = at0; j < at1; j++)
 680                     {
 681                         copy_rvec(x[j], buf[n]);
 682                         n++;
 683                     }
 684                 }
 685             }
 686             else if (!bScrew)
 687             {
 688                 for (i = 0; i < ind->nsend[nzone]; i++)
 689                 {
 690                     at0 = cgindex[index[i]];
 691                     at1 = cgindex[index[i]+1];
 692                     for (j = at0; j < at1; j++)
 693                     {
 694                         /* We need to shift the coordinates */
 695                         rvec_add(x[j], shift, buf[n]);
 696                         n++;
 697                     }
 698                 }
 699             }
 700             else
 701             {
 702                 for (i = 0; i < ind->nsend[nzone]; i++)
 703                 {
 704                     at0 = cgindex[index[i]];
 705                     at1 = cgindex[index[i]+1];
 706                     for (j = at0; j < at1; j++)
 707                     {
 708                         /* Shift x */
 709                         buf[n][XX] = x[j][XX] + shift[XX];
 710                         /* Rotate y and z.
 711                          * This operation requires a special shift force
 712                          * treatment, which is performed in calc_vir.
 713                          */
 714                         buf[n][YY] = box[YY][YY] - x[j][YY];
 715                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 716                         n++;
 717                     }
 718                 }
 719             }
 720
 721             if (cd->bInPlace)
 722             {
 723                 rbuf = x + nat_tot;
 724             }
 725             else
 726             {
 727                 rbuf = comm->vbuf2.v;
 728             }
 729             /* Send and receive the coordinates */
 730             dd_sendrecv_rvec(dd, d, dddirBackward,
 731                              buf,  ind->nsend[nzone+1],
 732                              rbuf, ind->nrecv[nzone+1]);
 733             if (!cd->bInPlace)
 734             {
 735                 j = 0;
 736                 for (zone = 0; zone < nzone; zone++)
 737                 {
 738                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 739                     {
 740                         copy_rvec(rbuf[j], x[i]);
 741                         j++;
 742                     }
 743                 }
 744             }
 745             nat_tot += ind->nrecv[nzone+1];
 746         }
 747         nzone += nzone;
 748     }
 749 }
 750
 751 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 752 {
 753     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 754     int                   *index, *cgindex;
 755     gmx_domdec_comm_t     *comm;
 756     gmx_domdec_comm_dim_t *cd;
 757     gmx_domdec_ind_t      *ind;
 758     rvec                  *buf, *sbuf;
 759     ivec                   vis;
 760     int                    is;
 761     gmx_bool               bPBC, bScrew;
 762
 763     comm = dd->comm;
 764
 765     cgindex = dd->cgindex;
 766
 767     buf = comm->vbuf.v;
 768
 769     n       = 0;
 770     nzone   = comm->zones.n/2;
 771     nat_tot = dd->nat_tot;
 772     for (d = dd->ndim-1; d >= 0; d--)
 773     {
 774         bPBC   = (dd->ci[dd->dim[d]] == 0);
 775         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 776         if (fshift == NULL && !bScrew)
 777         {
 778             bPBC = FALSE;
 779         }
 780         /* Determine which shift vector we need */
 781         clear_ivec(vis);
 782         vis[dd->dim[d]] = 1;
 783         is              = IVEC2IS(vis);
 784
 785         cd = &comm->cd[d];
 786         for (p = cd->np-1; p >= 0; p--)
 787         {
 788             ind      = &cd->ind[p];
 789             nat_tot -= ind->nrecv[nzone+1];
 790             if (cd->bInPlace)
 791             {
 792                 sbuf = f + nat_tot;
 793             }
 794             else
 795             {
 796                 sbuf = comm->vbuf2.v;
 797                 j    = 0;
 798                 for (zone = 0; zone < nzone; zone++)
 799                 {
 800                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 801                     {
 802                         copy_rvec(f[i], sbuf[j]);
 803                         j++;
 804                     }
 805                 }
 806             }
 807             /* Communicate the forces */
 808             dd_sendrecv_rvec(dd, d, dddirForward,
 809                              sbuf, ind->nrecv[nzone+1],
 810                              buf,  ind->nsend[nzone+1]);
 811             index = ind->index;
 812             /* Add the received forces */
 813             n = 0;
 814             if (!bPBC)
 815             {
 816                 for (i = 0; i < ind->nsend[nzone]; i++)
 817                 {
 818                     at0 = cgindex[index[i]];
 819                     at1 = cgindex[index[i]+1];
 820                     for (j = at0; j < at1; j++)
 821                     {
 822                         rvec_inc(f[j], buf[n]);
 823                         n++;
 824                     }
 825                 }
 826             }
 827             else if (!bScrew)
 828             {
 829                 for (i = 0; i < ind->nsend[nzone]; i++)
 830                 {
 831                     at0 = cgindex[index[i]];
 832                     at1 = cgindex[index[i]+1];
 833                     for (j = at0; j < at1; j++)
 834                     {
 835                         rvec_inc(f[j], buf[n]);
 836                         /* Add this force to the shift force */
 837                         rvec_inc(fshift[is], buf[n]);
 838                         n++;
 839                     }
 840                 }
 841             }
 842             else
 843             {
 844                 for (i = 0; i < ind->nsend[nzone]; i++)
 845                 {
 846                     at0 = cgindex[index[i]];
 847                     at1 = cgindex[index[i]+1];
 848                     for (j = at0; j < at1; j++)
 849                     {
 850                         /* Rotate the force */
 851                         f[j][XX] += buf[n][XX];
 852                         f[j][YY] -= buf[n][YY];
 853                         f[j][ZZ] -= buf[n][ZZ];
 854                         if (fshift)
 855                         {
 856                             /* Add this force to the shift force */
 857                             rvec_inc(fshift[is], buf[n]);
 858                         }
 859                         n++;
 860                     }
 861                 }
 862             }
 863         }
 864         nzone /= 2;
 865     }
 866 }
 867
 868 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 869 {
 870     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 871     int                   *index, *cgindex;
 872     gmx_domdec_comm_t     *comm;
 873     gmx_domdec_comm_dim_t *cd;
 874     gmx_domdec_ind_t      *ind;
 875     real                  *buf, *rbuf;
 876
 877     comm = dd->comm;
 878
 879     cgindex = dd->cgindex;
 880
 881     buf = &comm->vbuf.v[0][0];
 882
 883     nzone   = 1;
 884     nat_tot = dd->nat_home;
 885     for (d = 0; d < dd->ndim; d++)
 886     {
 887         cd = &comm->cd[d];
 888         for (p = 0; p < cd->np; p++)
 889         {
 890             ind   = &cd->ind[p];
 891             index = ind->index;
 892             n     = 0;
 893             for (i = 0; i < ind->nsend[nzone]; i++)
 894             {
 895                 at0 = cgindex[index[i]];
 896                 at1 = cgindex[index[i]+1];
 897                 for (j = at0; j < at1; j++)
 898                 {
 899                     buf[n] = v[j];
 900                     n++;
 901                 }
 902             }
 903
 904             if (cd->bInPlace)
 905             {
 906                 rbuf = v + nat_tot;
 907             }
 908             else
 909             {
 910                 rbuf = &comm->vbuf2.v[0][0];
 911             }
 912             /* Send and receive the coordinates */
 913             dd_sendrecv_real(dd, d, dddirBackward,
 914                              buf,  ind->nsend[nzone+1],
 915                              rbuf, ind->nrecv[nzone+1]);
 916             if (!cd->bInPlace)
 917             {
 918                 j = 0;
 919                 for (zone = 0; zone < nzone; zone++)
 920                 {
 921                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 922                     {
 923                         v[i] = rbuf[j];
 924                         j++;
 925                     }
 926                 }
 927             }
 928             nat_tot += ind->nrecv[nzone+1];
 929         }
 930         nzone += nzone;
 931     }
 932 }
 933
 934 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 935 {
 936     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 937     int                   *index, *cgindex;
 938     gmx_domdec_comm_t     *comm;
 939     gmx_domdec_comm_dim_t *cd;
 940     gmx_domdec_ind_t      *ind;
 941     real                  *buf, *sbuf;
 942
 943     comm = dd->comm;
 944
 945     cgindex = dd->cgindex;
 946
 947     buf = &comm->vbuf.v[0][0];
 948
 949     n       = 0;
 950     nzone   = comm->zones.n/2;
 951     nat_tot = dd->nat_tot;
 952     for (d = dd->ndim-1; d >= 0; d--)
 953     {
 954         cd = &comm->cd[d];
 955         for (p = cd->np-1; p >= 0; p--)
 956         {
 957             ind      = &cd->ind[p];
 958             nat_tot -= ind->nrecv[nzone+1];
 959             if (cd->bInPlace)
 960             {
 961                 sbuf = v + nat_tot;
 962             }
 963             else
 964             {
 965                 sbuf = &comm->vbuf2.v[0][0];
 966                 j    = 0;
 967                 for (zone = 0; zone < nzone; zone++)
 968                 {
 969                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 970                     {
 971                         sbuf[j] = v[i];
 972                         j++;
 973                     }
 974                 }
 975             }
 976             /* Communicate the forces */
 977             dd_sendrecv_real(dd, d, dddirForward,
 978                              sbuf, ind->nrecv[nzone+1],
 979                              buf,  ind->nsend[nzone+1]);
 980             index = ind->index;
 981             /* Add the received forces */
 982             n = 0;
 983             for (i = 0; i < ind->nsend[nzone]; i++)
 984             {
 985                 at0 = cgindex[index[i]];
 986                 at1 = cgindex[index[i]+1];
 987                 for (j = at0; j < at1; j++)
 988                 {
 989                     v[j] += buf[n];
 990                     n++;
 991                 }
 992             }
 993         }
 994         nzone /= 2;
 995     }
 996 }
 997
 998 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 999 {
1000     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
1001             d, i, j,
1002             zone->min0, zone->max1,
1003             zone->mch0, zone->mch0,
1004             zone->p1_0, zone->p1_1);
1005 }
1006
1007
1008 #define DDZONECOMM_MAXZONE  5
1009 #define DDZONECOMM_BUFSIZE  3
1010
1011 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
1012                                int ddimind, int direction,
1013                                gmx_ddzone_t *buf_s, int n_s,
1014                                gmx_ddzone_t *buf_r, int n_r)
1015 {
1016 #define ZBS  DDZONECOMM_BUFSIZE
1017     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1018     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1019     int  i;
1020
1021     for (i = 0; i < n_s; i++)
1022     {
1023         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1024         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1025         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1026         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1027         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1028         vbuf_s[i*ZBS+1][2] = 0;
1029         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1030         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1031         vbuf_s[i*ZBS+2][2] = 0;
1032     }
1033
1034     dd_sendrecv_rvec(dd, ddimind, direction,
1035                      vbuf_s, n_s*ZBS,
1036                      vbuf_r, n_r*ZBS);
1037
1038     for (i = 0; i < n_r; i++)
1039     {
1040         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1041         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1042         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1043         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1044         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1045         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1046         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1047     }
1048
1049 #undef ZBS
1050 }
1051
1052 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
1053                           rvec cell_ns_x0, rvec cell_ns_x1)
1054 {
1055     int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
1056     gmx_ddzone_t      *zp;
1057     gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
1058     gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
1059     gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
1060     rvec               extr_s[2], extr_r[2];
1061     rvec               dh;
1062     real               dist_d, c = 0, det;
1063     gmx_domdec_comm_t *comm;
1064     gmx_bool           bPBC, bUse;
1065
1066     comm = dd->comm;
1067
1068     for (d = 1; d < dd->ndim; d++)
1069     {
1070         dim      = dd->dim[d];
1071         zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1072         zp->min0 = cell_ns_x0[dim];
1073         zp->max1 = cell_ns_x1[dim];
1074         zp->min1 = cell_ns_x1[dim];
1075         zp->mch0 = cell_ns_x0[dim];
1076         zp->mch1 = cell_ns_x1[dim];
1077         zp->p1_0 = cell_ns_x0[dim];
1078         zp->p1_1 = cell_ns_x1[dim];
1079     }
1080
1081     for (d = dd->ndim-2; d >= 0; d--)
1082     {
1083         dim  = dd->dim[d];
1084         bPBC = (dim < ddbox->npbcdim);
1085
1086         /* Use an rvec to store two reals */
1087         extr_s[d][0] = comm->cell_f0[d+1];
1088         extr_s[d][1] = comm->cell_f1[d+1];
1089         extr_s[d][2] = comm->cell_f1[d+1];
1090
1091         pos = 0;
1092         /* Store the extremes in the backward sending buffer,
1093          * so the get updated separately from the forward communication.
1094          */
1095         for (d1 = d; d1 < dd->ndim-1; d1++)
1096         {
1097             /* We invert the order to be able to use the same loop for buf_e */
1098             buf_s[pos].min0 = extr_s[d1][1];
1099             buf_s[pos].max1 = extr_s[d1][0];
1100             buf_s[pos].min1 = extr_s[d1][2];
1101             buf_s[pos].mch0 = 0;
1102             buf_s[pos].mch1 = 0;
1103             /* Store the cell corner of the dimension we communicate along */
1104             buf_s[pos].p1_0 = comm->cell_x0[dim];
1105             buf_s[pos].p1_1 = 0;
1106             pos++;
1107         }
1108
1109         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1110         pos++;
1111
1112         if (dd->ndim == 3 && d == 0)
1113         {
1114             buf_s[pos] = comm->zone_d2[0][1];
1115             pos++;
1116             buf_s[pos] = comm->zone_d1[0];
1117             pos++;
1118         }
1119
1120         /* We only need to communicate the extremes
1121          * in the forward direction
1122          */
1123         npulse = comm->cd[d].np;
1124         if (bPBC)
1125         {
1126             /* Take the minimum to avoid double communication */
1127             npulse_min = min(npulse, dd->nc[dim]-1-npulse);
1128         }
1129         else
1130         {
1131             /* Without PBC we should really not communicate over
1132              * the boundaries, but implementing that complicates
1133              * the communication setup and therefore we simply
1134              * do all communication, but ignore some data.
1135              */
1136             npulse_min = npulse;
1137         }
1138         for (p = 0; p < npulse_min; p++)
1139         {
1140             /* Communicate the extremes forward */
1141             bUse = (bPBC || dd->ci[dim] > 0);
1142
1143             dd_sendrecv_rvec(dd, d, dddirForward,
1144                              extr_s+d, dd->ndim-d-1,
1145                              extr_r+d, dd->ndim-d-1);
1146
1147             if (bUse)
1148             {
1149                 for (d1 = d; d1 < dd->ndim-1; d1++)
1150                 {
1151                     extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
1152                     extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
1153                     extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
1154                 }
1155             }
1156         }
1157
1158         buf_size = pos;
1159         for (p = 0; p < npulse; p++)
1160         {
1161             /* Communicate all the zone information backward */
1162             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1163
1164             dd_sendrecv_ddzone(dd, d, dddirBackward,
1165                                buf_s, buf_size,
1166                                buf_r, buf_size);
1167
1168             clear_rvec(dh);
1169             if (p > 0)
1170             {
1171                 for (d1 = d+1; d1 < dd->ndim; d1++)
1172                 {
1173                     /* Determine the decrease of maximum required
1174                      * communication height along d1 due to the distance along d,
1175                      * this avoids a lot of useless atom communication.
1176                      */
1177                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1178
1179                     if (ddbox->tric_dir[dim])
1180                     {
1181                         /* c is the off-diagonal coupling between the cell planes
1182                          * along directions d and d1.
1183                          */
1184                         c = ddbox->v[dim][dd->dim[d1]][dim];
1185                     }
1186                     else
1187                     {
1188                         c = 0;
1189                     }
1190                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1191                     if (det > 0)
1192                     {
1193                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1194                     }
1195                     else
1196                     {
1197                         /* A negative value signals out of range */
1198                         dh[d1] = -1;
1199                     }
1200                 }
1201             }
1202
1203             /* Accumulate the extremes over all pulses */
1204             for (i = 0; i < buf_size; i++)
1205             {
1206                 if (p == 0)
1207                 {
1208                     buf_e[i] = buf_r[i];
1209                 }
1210                 else
1211                 {
1212                     if (bUse)
1213                     {
1214                         buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
1215                         buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
1216                         buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
1217                     }
1218
1219                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1220                     {
1221                         d1 = 1;
1222                     }
1223                     else
1224                     {
1225                         d1 = d + 1;
1226                     }
1227                     if (bUse && dh[d1] >= 0)
1228                     {
1229                         buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
1230                         buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
1231                     }
1232                 }
1233                 /* Copy the received buffer to the send buffer,
1234                  * to pass the data through with the next pulse.
1235                  */
1236                 buf_s[i] = buf_r[i];
1237             }
1238             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1239                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1240             {
1241                 /* Store the extremes */
1242                 pos = 0;
1243
1244                 for (d1 = d; d1 < dd->ndim-1; d1++)
1245                 {
1246                     extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
1247                     extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
1248                     extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
1249                     pos++;
1250                 }
1251
1252                 if (d == 1 || (d == 0 && dd->ndim == 3))
1253                 {
1254                     for (i = d; i < 2; i++)
1255                     {
1256                         comm->zone_d2[1-d][i] = buf_e[pos];
1257                         pos++;
1258                     }
1259                 }
1260                 if (d == 0)
1261                 {
1262                     comm->zone_d1[1] = buf_e[pos];
1263                     pos++;
1264                 }
1265             }
1266         }
1267     }
1268
1269     if (dd->ndim >= 2)
1270     {
1271         dim = dd->dim[1];
1272         for (i = 0; i < 2; i++)
1273         {
1274             if (debug)
1275             {
1276                 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1277             }
1278             cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1279             cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1280         }
1281     }
1282     if (dd->ndim >= 3)
1283     {
1284         dim = dd->dim[2];
1285         for (i = 0; i < 2; i++)
1286         {
1287             for (j = 0; j < 2; j++)
1288             {
1289                 if (debug)
1290                 {
1291                     print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1292                 }
1293                 cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1294                 cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1295             }
1296         }
1297     }
1298     for (d = 1; d < dd->ndim; d++)
1299     {
1300         comm->cell_f_max0[d] = extr_s[d-1][0];
1301         comm->cell_f_min1[d] = extr_s[d-1][1];
1302         if (debug)
1303         {
1304             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1305                     d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1306         }
1307     }
1308 }
1309
1310 static void dd_collect_cg(gmx_domdec_t *dd,
1311                           t_state      *state_local)
1312 {
1313     gmx_domdec_master_t *ma = NULL;
1314     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1315     t_block             *cgs_gl;
1316
1317     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1318     {
1319         /* The master has the correct distribution */
1320         return;
1321     }
1322
1323     if (state_local->ddp_count == dd->ddp_count)
1324     {
1325         ncg_home = dd->ncg_home;
1326         cg       = dd->index_gl;
1327         nat_home = dd->nat_home;
1328     }
1329     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1330     {
1331         cgs_gl = &dd->comm->cgs_gl;
1332
1333         ncg_home = state_local->ncg_gl;
1334         cg       = state_local->cg_gl;
1335         nat_home = 0;
1336         for (i = 0; i < ncg_home; i++)
1337         {
1338             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1339         }
1340     }
1341     else
1342     {
1343         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1344     }
1345
1346     buf2[0] = dd->ncg_home;
1347     buf2[1] = dd->nat_home;
1348     if (DDMASTER(dd))
1349     {
1350         ma   = dd->ma;
1351         ibuf = ma->ibuf;
1352     }
1353     else
1354     {
1355         ibuf = NULL;
1356     }
1357     /* Collect the charge group and atom counts on the master */
1358     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1359
1360     if (DDMASTER(dd))
1361     {
1362         ma->index[0] = 0;
1363         for (i = 0; i < dd->nnodes; i++)
1364         {
1365             ma->ncg[i]     = ma->ibuf[2*i];
1366             ma->nat[i]     = ma->ibuf[2*i+1];
1367             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1368
1369         }
1370         /* Make byte counts and indices */
1371         for (i = 0; i < dd->nnodes; i++)
1372         {
1373             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1374             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1375         }
1376         if (debug)
1377         {
1378             fprintf(debug, "Initial charge group distribution: ");
1379             for (i = 0; i < dd->nnodes; i++)
1380             {
1381                 fprintf(debug, " %d", ma->ncg[i]);
1382             }
1383             fprintf(debug, "\n");
1384         }
1385     }
1386
1387     /* Collect the charge group indices on the master */
1388     dd_gatherv(dd,
1389                dd->ncg_home*sizeof(int), dd->index_gl,
1390                DDMASTER(dd) ? ma->ibuf : NULL,
1391                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1392                DDMASTER(dd) ? ma->cg : NULL);
1393
1394     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1395 }
1396
1397 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1398                                     rvec *lv, rvec *v)
1399 {
1400     gmx_domdec_master_t *ma;
1401     int                  n, i, c, a, nalloc = 0;
1402     rvec                *buf = NULL;
1403     t_block             *cgs_gl;
1404
1405     ma = dd->ma;
1406
1407     if (!DDMASTER(dd))
1408     {
1409 #ifdef GMX_MPI
1410         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1411                  dd->rank, dd->mpi_comm_all);
1412 #endif
1413     }
1414     else
1415     {
1416         /* Copy the master coordinates to the global array */
1417         cgs_gl = &dd->comm->cgs_gl;
1418
1419         n = DDMASTERRANK(dd);
1420         a = 0;
1421         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1422         {
1423             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1424             {
1425                 copy_rvec(lv[a++], v[c]);
1426             }
1427         }
1428
1429         for (n = 0; n < dd->nnodes; n++)
1430         {
1431             if (n != dd->rank)
1432             {
1433                 if (ma->nat[n] > nalloc)
1434                 {
1435                     nalloc = over_alloc_dd(ma->nat[n]);
1436                     srenew(buf, nalloc);
1437                 }
1438 #ifdef GMX_MPI
1439                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1440                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1441 #endif
1442                 a = 0;
1443                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1444                 {
1445                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1446                     {
1447                         copy_rvec(buf[a++], v[c]);
1448                     }
1449                 }
1450             }
1451         }
1452         sfree(buf);
1453     }
1454 }
1455
1456 static void get_commbuffer_counts(gmx_domdec_t *dd,
1457                                   int **counts, int **disps)
1458 {
1459     gmx_domdec_master_t *ma;
1460     int                  n;
1461
1462     ma = dd->ma;
1463
1464     /* Make the rvec count and displacment arrays */
1465     *counts  = ma->ibuf;
1466     *disps   = ma->ibuf + dd->nnodes;
1467     for (n = 0; n < dd->nnodes; n++)
1468     {
1469         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1470         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1471     }
1472 }
1473
1474 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1475                                    rvec *lv, rvec *v)
1476 {
1477     gmx_domdec_master_t *ma;
1478     int                 *rcounts = NULL, *disps = NULL;
1479     int                  n, i, c, a;
1480     rvec                *buf = NULL;
1481     t_block             *cgs_gl;
1482
1483     ma = dd->ma;
1484
1485     if (DDMASTER(dd))
1486     {
1487         get_commbuffer_counts(dd, &rcounts, &disps);
1488
1489         buf = ma->vbuf;
1490     }
1491
1492     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1493
1494     if (DDMASTER(dd))
1495     {
1496         cgs_gl = &dd->comm->cgs_gl;
1497
1498         a = 0;
1499         for (n = 0; n < dd->nnodes; n++)
1500         {
1501             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1502             {
1503                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1504                 {
1505                     copy_rvec(buf[a++], v[c]);
1506                 }
1507             }
1508         }
1509     }
1510 }
1511
1512 void dd_collect_vec(gmx_domdec_t *dd,
1513                     t_state *state_local, rvec *lv, rvec *v)
1514 {
1515     gmx_domdec_master_t *ma;
1516     int                  n, i, c, a, nalloc = 0;
1517     rvec                *buf = NULL;
1518
1519     dd_collect_cg(dd, state_local);
1520
1521     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1522     {
1523         dd_collect_vec_sendrecv(dd, lv, v);
1524     }
1525     else
1526     {
1527         dd_collect_vec_gatherv(dd, lv, v);
1528     }
1529 }
1530
1531
1532 void dd_collect_state(gmx_domdec_t *dd,
1533                       t_state *state_local, t_state *state)
1534 {
1535     int est, i, j, nh;
1536
1537     nh = state->nhchainlength;
1538
1539     if (DDMASTER(dd))
1540     {
1541         for (i = 0; i < efptNR; i++)
1542         {
1543             state->lambda[i] = state_local->lambda[i];
1544         }
1545         state->fep_state = state_local->fep_state;
1546         state->veta      = state_local->veta;
1547         state->vol0      = state_local->vol0;
1548         copy_mat(state_local->box, state->box);
1549         copy_mat(state_local->boxv, state->boxv);
1550         copy_mat(state_local->svir_prev, state->svir_prev);
1551         copy_mat(state_local->fvir_prev, state->fvir_prev);
1552         copy_mat(state_local->pres_prev, state->pres_prev);
1553
1554
1555         for (i = 0; i < state_local->ngtc; i++)
1556         {
1557             for (j = 0; j < nh; j++)
1558             {
1559                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1560                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1561             }
1562             state->therm_integral[i] = state_local->therm_integral[i];
1563         }
1564         for (i = 0; i < state_local->nnhpres; i++)
1565         {
1566             for (j = 0; j < nh; j++)
1567             {
1568                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1569                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1570             }
1571         }
1572     }
1573     for (est = 0; est < estNR; est++)
1574     {
1575         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1576         {
1577             switch (est)
1578             {
1579                 case estX:
1580                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1581                     break;
1582                 case estV:
1583                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1584                     break;
1585                 case estSDX:
1586                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1587                     break;
1588                 case estCGP:
1589                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1590                     break;
1591                 case estLD_RNG:
1592                     if (state->nrngi == 1)
1593                     {
1594                         if (DDMASTER(dd))
1595                         {
1596                             for (i = 0; i < state_local->nrng; i++)
1597                             {
1598                                 state->ld_rng[i] = state_local->ld_rng[i];
1599                             }
1600                         }
1601                     }
1602                     else
1603                     {
1604                         dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
1605                                   state_local->ld_rng, state->ld_rng);
1606                     }
1607                     break;
1608                 case estLD_RNGI:
1609                     if (state->nrngi == 1)
1610                     {
1611                         if (DDMASTER(dd))
1612                         {
1613                             state->ld_rngi[0] = state_local->ld_rngi[0];
1614                         }
1615                     }
1616                     else
1617                     {
1618                         dd_gather(dd, sizeof(state->ld_rngi[0]),
1619                                   state_local->ld_rngi, state->ld_rngi);
1620                     }
1621                     break;
1622                 case estDISRE_INITF:
1623                 case estDISRE_RM3TAV:
1624                 case estORIRE_INITF:
1625                 case estORIRE_DTAV:
1626                     break;
1627                 default:
1628                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1629             }
1630         }
1631     }
1632 }
1633
1634 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1635 {
1636     int est;
1637
1638     if (debug)
1639     {
1640         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1641     }
1642
1643     state->nalloc = over_alloc_dd(nalloc);
1644
1645     for (est = 0; est < estNR; est++)
1646     {
1647         if (EST_DISTR(est) && (state->flags & (1<<est)))
1648         {
1649             switch (est)
1650             {
1651                 case estX:
1652                     srenew(state->x, state->nalloc);
1653                     break;
1654                 case estV:
1655                     srenew(state->v, state->nalloc);
1656                     break;
1657                 case estSDX:
1658                     srenew(state->sd_X, state->nalloc);
1659                     break;
1660                 case estCGP:
1661                     srenew(state->cg_p, state->nalloc);
1662                     break;
1663                 case estLD_RNG:
1664                 case estLD_RNGI:
1665                 case estDISRE_INITF:
1666                 case estDISRE_RM3TAV:
1667                 case estORIRE_INITF:
1668                 case estORIRE_DTAV:
1669                     /* No reallocation required */
1670                     break;
1671                 default:
1672                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1673             }
1674         }
1675     }
1676
1677     if (f != NULL)
1678     {
1679         srenew(*f, state->nalloc);
1680     }
1681 }
1682
1683 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1684                                int nalloc)
1685 {
1686     if (nalloc > fr->cg_nalloc)
1687     {
1688         if (debug)
1689         {
1690             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1691         }
1692         fr->cg_nalloc = over_alloc_dd(nalloc);
1693         srenew(fr->cginfo, fr->cg_nalloc);
1694         if (fr->cutoff_scheme == ecutsGROUP)
1695         {
1696             srenew(fr->cg_cm, fr->cg_nalloc);
1697         }
1698     }
1699     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1700     {
1701         /* We don't use charge groups, we use x in state to set up
1702          * the atom communication.
1703          */
1704         dd_realloc_state(state, f, nalloc);
1705     }
1706 }
1707
1708 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1709                                        rvec *v, rvec *lv)
1710 {
1711     gmx_domdec_master_t *ma;
1712     int                  n, i, c, a, nalloc = 0;
1713     rvec                *buf = NULL;
1714
1715     if (DDMASTER(dd))
1716     {
1717         ma  = dd->ma;
1718
1719         for (n = 0; n < dd->nnodes; n++)
1720         {
1721             if (n != dd->rank)
1722             {
1723                 if (ma->nat[n] > nalloc)
1724                 {
1725                     nalloc = over_alloc_dd(ma->nat[n]);
1726                     srenew(buf, nalloc);
1727                 }
1728                 /* Use lv as a temporary buffer */
1729                 a = 0;
1730                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1731                 {
1732                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1733                     {
1734                         copy_rvec(v[c], buf[a++]);
1735                     }
1736                 }
1737                 if (a != ma->nat[n])
1738                 {
1739                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1740                               a, ma->nat[n]);
1741                 }
1742
1743 #ifdef GMX_MPI
1744                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1745                          DDRANK(dd, n), n, dd->mpi_comm_all);
1746 #endif
1747             }
1748         }
1749         sfree(buf);
1750         n = DDMASTERRANK(dd);
1751         a = 0;
1752         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1753         {
1754             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1755             {
1756                 copy_rvec(v[c], lv[a++]);
1757             }
1758         }
1759     }
1760     else
1761     {
1762 #ifdef GMX_MPI
1763         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1764                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1765 #endif
1766     }
1767 }
1768
1769 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1770                                        rvec *v, rvec *lv)
1771 {
1772     gmx_domdec_master_t *ma;
1773     int                 *scounts = NULL, *disps = NULL;
1774     int                  n, i, c, a, nalloc = 0;
1775     rvec                *buf = NULL;
1776
1777     if (DDMASTER(dd))
1778     {
1779         ma  = dd->ma;
1780
1781         get_commbuffer_counts(dd, &scounts, &disps);
1782
1783         buf = ma->vbuf;
1784         a   = 0;
1785         for (n = 0; n < dd->nnodes; n++)
1786         {
1787             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1788             {
1789                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1790                 {
1791                     copy_rvec(v[c], buf[a++]);
1792                 }
1793             }
1794         }
1795     }
1796
1797     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1798 }
1799
1800 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1801 {
1802     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1803     {
1804         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1805     }
1806     else
1807     {
1808         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1809     }
1810 }
1811
1812 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1813                                 t_state *state, t_state *state_local,
1814                                 rvec **f)
1815 {
1816     int  i, j, nh;
1817
1818     nh = state->nhchainlength;
1819
1820     if (DDMASTER(dd))
1821     {
1822         for (i = 0; i < efptNR; i++)
1823         {
1824             state_local->lambda[i] = state->lambda[i];
1825         }
1826         state_local->fep_state = state->fep_state;
1827         state_local->veta      = state->veta;
1828         state_local->vol0      = state->vol0;
1829         copy_mat(state->box, state_local->box);
1830         copy_mat(state->box_rel, state_local->box_rel);
1831         copy_mat(state->boxv, state_local->boxv);
1832         copy_mat(state->svir_prev, state_local->svir_prev);
1833         copy_mat(state->fvir_prev, state_local->fvir_prev);
1834         for (i = 0; i < state_local->ngtc; i++)
1835         {
1836             for (j = 0; j < nh; j++)
1837             {
1838                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1839                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1840             }
1841             state_local->therm_integral[i] = state->therm_integral[i];
1842         }
1843         for (i = 0; i < state_local->nnhpres; i++)
1844         {
1845             for (j = 0; j < nh; j++)
1846             {
1847                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1848                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1849             }
1850         }
1851     }
1852     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1853     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1854     dd_bcast(dd, sizeof(real), &state_local->veta);
1855     dd_bcast(dd, sizeof(real), &state_local->vol0);
1856     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1857     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1858     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1859     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1860     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1861     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1862     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1863     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1864     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1865     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1866
1867     if (dd->nat_home > state_local->nalloc)
1868     {
1869         dd_realloc_state(state_local, f, dd->nat_home);
1870     }
1871     for (i = 0; i < estNR; i++)
1872     {
1873         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1874         {
1875             switch (i)
1876             {
1877                 case estX:
1878                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1879                     break;
1880                 case estV:
1881                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1882                     break;
1883                 case estSDX:
1884                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1885                     break;
1886                 case estCGP:
1887                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1888                     break;
1889                 case estLD_RNG:
1890                     if (state->nrngi == 1)
1891                     {
1892                         dd_bcastc(dd,
1893                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
1894                                   state->ld_rng, state_local->ld_rng);
1895                     }
1896                     else
1897                     {
1898                         dd_scatter(dd,
1899                                    state_local->nrng*sizeof(state_local->ld_rng[0]),
1900                                    state->ld_rng, state_local->ld_rng);
1901                     }
1902                     break;
1903                 case estLD_RNGI:
1904                     if (state->nrngi == 1)
1905                     {
1906                         dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
1907                                   state->ld_rngi, state_local->ld_rngi);
1908                     }
1909                     else
1910                     {
1911                         dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
1912                                    state->ld_rngi, state_local->ld_rngi);
1913                     }
1914                     break;
1915                 case estDISRE_INITF:
1916                 case estDISRE_RM3TAV:
1917                 case estORIRE_INITF:
1918                 case estORIRE_DTAV:
1919                     /* Not implemented yet */
1920                     break;
1921                 default:
1922                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1923             }
1924         }
1925     }
1926 }
1927
1928 static char dim2char(int dim)
1929 {
1930     char c = '?';
1931
1932     switch (dim)
1933     {
1934         case XX: c = 'X'; break;
1935         case YY: c = 'Y'; break;
1936         case ZZ: c = 'Z'; break;
1937         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1938     }
1939
1940     return c;
1941 }
1942
1943 static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step,
1944                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1945 {
1946     rvec   grid_s[2], *grid_r = NULL, cx, r;
1947     char   fname[STRLEN], format[STRLEN], buf[22];
1948     FILE  *out;
1949     int    a, i, d, z, y, x;
1950     matrix tric;
1951     real   vol;
1952
1953     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1954     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1955
1956     if (DDMASTER(dd))
1957     {
1958         snew(grid_r, 2*dd->nnodes);
1959     }
1960
1961     dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
1962
1963     if (DDMASTER(dd))
1964     {
1965         for (d = 0; d < DIM; d++)
1966         {
1967             for (i = 0; i < DIM; i++)
1968             {
1969                 if (d == i)
1970                 {
1971                     tric[d][i] = 1;
1972                 }
1973                 else
1974                 {
1975                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1976                     {
1977                         tric[d][i] = box[i][d]/box[i][i];
1978                     }
1979                     else
1980                     {
1981                         tric[d][i] = 0;
1982                     }
1983                 }
1984             }
1985         }
1986         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1987         sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
1988         out = gmx_fio_fopen(fname, "w");
1989         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1990         a = 1;
1991         for (i = 0; i < dd->nnodes; i++)
1992         {
1993             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1994             for (d = 0; d < DIM; d++)
1995             {
1996                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1997             }
1998             for (z = 0; z < 2; z++)
1999             {
2000                 for (y = 0; y < 2; y++)
2001                 {
2002                     for (x = 0; x < 2; x++)
2003                     {
2004                         cx[XX] = grid_r[i*2+x][XX];
2005                         cx[YY] = grid_r[i*2+y][YY];
2006                         cx[ZZ] = grid_r[i*2+z][ZZ];
2007                         mvmul(tric, cx, r);
2008                         fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
2009                                 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
2010                     }
2011                 }
2012             }
2013             for (d = 0; d < DIM; d++)
2014             {
2015                 for (x = 0; x < 4; x++)
2016                 {
2017                     switch (d)
2018                     {
2019                         case 0: y = 1 + i*8 + 2*x; break;
2020                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
2021                         case 2: y = 1 + i*8 + x; break;
2022                     }
2023                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
2024                 }
2025             }
2026         }
2027         gmx_fio_fclose(out);
2028         sfree(grid_r);
2029     }
2030 }
2031
2032 void write_dd_pdb(const char *fn, gmx_large_int_t step, const char *title,
2033                   gmx_mtop_t *mtop, t_commrec *cr,
2034                   int natoms, rvec x[], matrix box)
2035 {
2036     char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
2037     FILE         *out;
2038     int           i, ii, resnr, c;
2039     char         *atomname, *resname;
2040     real          b;
2041     gmx_domdec_t *dd;
2042
2043     dd = cr->dd;
2044     if (natoms == -1)
2045     {
2046         natoms = dd->comm->nat[ddnatVSITE];
2047     }
2048
2049     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
2050
2051     sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
2052     sprintf(format4, "%s%s\n", pdbformat4, "%6.2f%6.2f");
2053
2054     out = gmx_fio_fopen(fname, "w");
2055
2056     fprintf(out, "TITLE     %s\n", title);
2057     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2058     for (i = 0; i < natoms; i++)
2059     {
2060         ii = dd->gatindex[i];
2061         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
2062         if (i < dd->comm->nat[ddnatZONE])
2063         {
2064             c = 0;
2065             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2066             {
2067                 c++;
2068             }
2069             b = c;
2070         }
2071         else if (i < dd->comm->nat[ddnatVSITE])
2072         {
2073             b = dd->comm->zones.n;
2074         }
2075         else
2076         {
2077             b = dd->comm->zones.n + 1;
2078         }
2079         fprintf(out, strlen(atomname) < 4 ? format : format4,
2080                 "ATOM", (ii+1)%100000,
2081                 atomname, resname, ' ', resnr%10000, ' ',
2082                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
2083     }
2084     fprintf(out, "TER\n");
2085
2086     gmx_fio_fclose(out);
2087 }
2088
2089 real dd_cutoff_mbody(gmx_domdec_t *dd)
2090 {
2091     gmx_domdec_comm_t *comm;
2092     int                di;
2093     real               r;
2094
2095     comm = dd->comm;
2096
2097     r = -1;
2098     if (comm->bInterCGBondeds)
2099     {
2100         if (comm->cutoff_mbody > 0)
2101         {
2102             r = comm->cutoff_mbody;
2103         }
2104         else
2105         {
2106             /* cutoff_mbody=0 means we do not have DLB */
2107             r = comm->cellsize_min[dd->dim[0]];
2108             for (di = 1; di < dd->ndim; di++)
2109             {
2110                 r = min(r, comm->cellsize_min[dd->dim[di]]);
2111             }
2112             if (comm->bBondComm)
2113             {
2114                 r = max(r, comm->cutoff_mbody);
2115             }
2116             else
2117             {
2118                 r = min(r, comm->cutoff);
2119             }
2120         }
2121     }
2122
2123     return r;
2124 }
2125
2126 real dd_cutoff_twobody(gmx_domdec_t *dd)
2127 {
2128     real r_mb;
2129
2130     r_mb = dd_cutoff_mbody(dd);
2131
2132     return max(dd->comm->cutoff, r_mb);
2133 }
2134
2135
2136 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
2137 {
2138     int nc, ntot;
2139
2140     nc   = dd->nc[dd->comm->cartpmedim];
2141     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2142     copy_ivec(coord, coord_pme);
2143     coord_pme[dd->comm->cartpmedim] =
2144         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2145 }
2146
2147 static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
2148 {
2149     /* Here we assign a PME node to communicate with this DD node
2150      * by assuming that the major index of both is x.
2151      * We add cr->npmenodes/2 to obtain an even distribution.
2152      */
2153     return (ddindex*npme + npme/2)/ndd;
2154 }
2155
2156 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
2157 {
2158     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
2159 }
2160
2161 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
2162 {
2163     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
2164 }
2165
2166 static int *dd_pmenodes(t_commrec *cr)
2167 {
2168     int *pmenodes;
2169     int  n, i, p0, p1;
2170
2171     snew(pmenodes, cr->npmenodes);
2172     n = 0;
2173     for (i = 0; i < cr->dd->nnodes; i++)
2174     {
2175         p0 = cr_ddindex2pmeindex(cr, i);
2176         p1 = cr_ddindex2pmeindex(cr, i+1);
2177         if (i+1 == cr->dd->nnodes || p1 > p0)
2178         {
2179             if (debug)
2180             {
2181                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
2182             }
2183             pmenodes[n] = i + 1 + n;
2184             n++;
2185         }
2186     }
2187
2188     return pmenodes;
2189 }
2190
2191 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
2192 {
2193     gmx_domdec_t *dd;
2194     ivec          coords, coords_pme, nc;
2195     int           slab;
2196
2197     dd = cr->dd;
2198     /*
2199        if (dd->comm->bCartesian) {
2200        gmx_ddindex2xyz(dd->nc,ddindex,coords);
2201        dd_coords2pmecoords(dd,coords,coords_pme);
2202        copy_ivec(dd->ntot,nc);
2203        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2204        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2205
2206        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2207        } else {
2208        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2209        }
2210      */
2211     coords[XX] = x;
2212     coords[YY] = y;
2213     coords[ZZ] = z;
2214     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
2215
2216     return slab;
2217 }
2218
2219 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
2220 {
2221     gmx_domdec_comm_t *comm;
2222     ivec               coords;
2223     int                ddindex, nodeid = -1;
2224
2225     comm = cr->dd->comm;
2226
2227     coords[XX] = x;
2228     coords[YY] = y;
2229     coords[ZZ] = z;
2230     if (comm->bCartesianPP_PME)
2231     {
2232 #ifdef GMX_MPI
2233         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
2234 #endif
2235     }
2236     else
2237     {
2238         ddindex = dd_index(cr->dd->nc, coords);
2239         if (comm->bCartesianPP)
2240         {
2241             nodeid = comm->ddindex2simnodeid[ddindex];
2242         }
2243         else
2244         {
2245             if (comm->pmenodes)
2246             {
2247                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
2248             }
2249             else
2250             {
2251                 nodeid = ddindex;
2252             }
2253         }
2254     }
2255
2256     return nodeid;
2257 }
2258
2259 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
2260 {
2261     gmx_domdec_t      *dd;
2262     gmx_domdec_comm_t *comm;
2263     ivec               coord, coord_pme;
2264     int                i;
2265     int                pmenode = -1;
2266
2267     dd   = cr->dd;
2268     comm = dd->comm;
2269
2270     /* This assumes a uniform x domain decomposition grid cell size */
2271     if (comm->bCartesianPP_PME)
2272     {
2273 #ifdef GMX_MPI
2274         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2275         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2276         {
2277             /* This is a PP node */
2278             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2279             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2280         }
2281 #endif
2282     }
2283     else if (comm->bCartesianPP)
2284     {
2285         if (sim_nodeid < dd->nnodes)
2286         {
2287             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2288         }
2289     }
2290     else
2291     {
2292         /* This assumes DD cells with identical x coordinates
2293          * are numbered sequentially.
2294          */
2295         if (dd->comm->pmenodes == NULL)
2296         {
2297             if (sim_nodeid < dd->nnodes)
2298             {
2299                 /* The DD index equals the nodeid */
2300                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2301             }
2302         }
2303         else
2304         {
2305             i = 0;
2306             while (sim_nodeid > dd->comm->pmenodes[i])
2307             {
2308                 i++;
2309             }
2310             if (sim_nodeid < dd->comm->pmenodes[i])
2311             {
2312                 pmenode = dd->comm->pmenodes[i];
2313             }
2314         }
2315     }
2316
2317     return pmenode;
2318 }
2319
2320 gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
2321 {
2322     gmx_bool bPMEOnlyNode;
2323
2324     if (DOMAINDECOMP(cr))
2325     {
2326         bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
2327     }
2328     else
2329     {
2330         bPMEOnlyNode = FALSE;
2331     }
2332
2333     return bPMEOnlyNode;
2334 }
2335
2336 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2337                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2338 {
2339     gmx_domdec_t *dd;
2340     int           x, y, z;
2341     ivec          coord, coord_pme;
2342
2343     dd = cr->dd;
2344
2345     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2346
2347     *nmy_ddnodes = 0;
2348     for (x = 0; x < dd->nc[XX]; x++)
2349     {
2350         for (y = 0; y < dd->nc[YY]; y++)
2351         {
2352             for (z = 0; z < dd->nc[ZZ]; z++)
2353             {
2354                 if (dd->comm->bCartesianPP_PME)
2355                 {
2356                     coord[XX] = x;
2357                     coord[YY] = y;
2358                     coord[ZZ] = z;
2359                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2360                     if (dd->ci[XX] == coord_pme[XX] &&
2361                         dd->ci[YY] == coord_pme[YY] &&
2362                         dd->ci[ZZ] == coord_pme[ZZ])
2363                     {
2364                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2365                     }
2366                 }
2367                 else
2368                 {
2369                     /* The slab corresponds to the nodeid in the PME group */
2370                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2371                     {
2372                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2373                     }
2374                 }
2375             }
2376         }
2377     }
2378
2379     /* The last PP-only node is the peer node */
2380     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2381
2382     if (debug)
2383     {
2384         fprintf(debug, "Receive coordinates from PP nodes:");
2385         for (x = 0; x < *nmy_ddnodes; x++)
2386         {
2387             fprintf(debug, " %d", (*my_ddnodes)[x]);
2388         }
2389         fprintf(debug, "\n");
2390     }
2391 }
2392
2393 static gmx_bool receive_vir_ener(t_commrec *cr)
2394 {
2395     gmx_domdec_comm_t *comm;
2396     int                pmenode, coords[DIM], rank;
2397     gmx_bool           bReceive;
2398
2399     bReceive = TRUE;
2400     if (cr->npmenodes < cr->dd->nnodes)
2401     {
2402         comm = cr->dd->comm;
2403         if (comm->bCartesianPP_PME)
2404         {
2405             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2406 #ifdef GMX_MPI
2407             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2408             coords[comm->cartpmedim]++;
2409             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2410             {
2411                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2412                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2413                 {
2414                     /* This is not the last PP node for pmenode */
2415                     bReceive = FALSE;
2416                 }
2417             }
2418 #endif
2419         }
2420         else
2421         {
2422             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2423             if (cr->sim_nodeid+1 < cr->nnodes &&
2424                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2425             {
2426                 /* This is not the last PP node for pmenode */
2427                 bReceive = FALSE;
2428             }
2429         }
2430     }
2431
2432     return bReceive;
2433 }
2434
2435 static void set_zones_ncg_home(gmx_domdec_t *dd)
2436 {
2437     gmx_domdec_zones_t *zones;
2438     int                 i;
2439
2440     zones = &dd->comm->zones;
2441
2442     zones->cg_range[0] = 0;
2443     for (i = 1; i < zones->n+1; i++)
2444     {
2445         zones->cg_range[i] = dd->ncg_home;
2446     }
2447 }
2448
2449 static void rebuild_cgindex(gmx_domdec_t *dd,
2450                             const int *gcgs_index, t_state *state)
2451 {
2452     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2453
2454     ind        = state->cg_gl;
2455     dd_cg_gl   = dd->index_gl;
2456     cgindex    = dd->cgindex;
2457     nat        = 0;
2458     cgindex[0] = nat;
2459     for (i = 0; i < state->ncg_gl; i++)
2460     {
2461         cgindex[i]  = nat;
2462         cg_gl       = ind[i];
2463         dd_cg_gl[i] = cg_gl;
2464         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2465     }
2466     cgindex[i] = nat;
2467
2468     dd->ncg_home = state->ncg_gl;
2469     dd->nat_home = nat;
2470
2471     set_zones_ncg_home(dd);
2472 }
2473
2474 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2475 {
2476     while (cg >= cginfo_mb->cg_end)
2477     {
2478         cginfo_mb++;
2479     }
2480
2481     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2482 }
2483
2484 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2485                           t_forcerec *fr, char *bLocalCG)
2486 {
2487     cginfo_mb_t *cginfo_mb;
2488     int         *cginfo;
2489     int          cg;
2490
2491     if (fr != NULL)
2492     {
2493         cginfo_mb = fr->cginfo_mb;
2494         cginfo    = fr->cginfo;
2495
2496         for (cg = cg0; cg < cg1; cg++)
2497         {
2498             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2499         }
2500     }
2501
2502     if (bLocalCG != NULL)
2503     {
2504         for (cg = cg0; cg < cg1; cg++)
2505         {
2506             bLocalCG[index_gl[cg]] = TRUE;
2507         }
2508     }
2509 }
2510
2511 static void make_dd_indices(gmx_domdec_t *dd,
2512                             const int *gcgs_index, int cg_start)
2513 {
2514     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2515     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2516     gmx_ga2la_t *ga2la;
2517     char        *bLocalCG;
2518     gmx_bool     bCGs;
2519
2520     bLocalCG = dd->comm->bLocalCG;
2521
2522     if (dd->nat_tot > dd->gatindex_nalloc)
2523     {
2524         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2525         srenew(dd->gatindex, dd->gatindex_nalloc);
2526     }
2527
2528     nzone      = dd->comm->zones.n;
2529     zone2cg    = dd->comm->zones.cg_range;
2530     zone_ncg1  = dd->comm->zone_ncg1;
2531     index_gl   = dd->index_gl;
2532     gatindex   = dd->gatindex;
2533     bCGs       = dd->comm->bCGs;
2534
2535     if (zone2cg[1] != dd->ncg_home)
2536     {
2537         gmx_incons("dd->ncg_zone is not up to date");
2538     }
2539
2540     /* Make the local to global and global to local atom index */
2541     a = dd->cgindex[cg_start];
2542     for (zone = 0; zone < nzone; zone++)
2543     {
2544         if (zone == 0)
2545         {
2546             cg0 = cg_start;
2547         }
2548         else
2549         {
2550             cg0 = zone2cg[zone];
2551         }
2552         cg1    = zone2cg[zone+1];
2553         cg1_p1 = cg0 + zone_ncg1[zone];
2554
2555         for (cg = cg0; cg < cg1; cg++)
2556         {
2557             zone1 = zone;
2558             if (cg >= cg1_p1)
2559             {
2560                 /* Signal that this cg is from more than one pulse away */
2561                 zone1 += nzone;
2562             }
2563             cg_gl = index_gl[cg];
2564             if (bCGs)
2565             {
2566                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2567                 {
2568                     gatindex[a] = a_gl;
2569                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2570                     a++;
2571                 }
2572             }
2573             else
2574             {
2575                 gatindex[a] = cg_gl;
2576                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2577                 a++;
2578             }
2579         }
2580     }
2581 }
2582
2583 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2584                           const char *where)
2585 {
2586     int ncg, i, ngl, nerr;
2587
2588     nerr = 0;
2589     if (bLocalCG == NULL)
2590     {
2591         return nerr;
2592     }
2593     for (i = 0; i < dd->ncg_tot; i++)
2594     {
2595         if (!bLocalCG[dd->index_gl[i]])
2596         {
2597             fprintf(stderr,
2598                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2599             nerr++;
2600         }
2601     }
2602     ngl = 0;
2603     for (i = 0; i < ncg_sys; i++)
2604     {
2605         if (bLocalCG[i])
2606         {
2607             ngl++;
2608         }
2609     }
2610     if (ngl != dd->ncg_tot)
2611     {
2612         fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2613         nerr++;
2614     }
2615
2616     return nerr;
2617 }
2618
2619 static void check_index_consistency(gmx_domdec_t *dd,
2620                                     int natoms_sys, int ncg_sys,
2621                                     const char *where)
2622 {
2623     int   nerr, ngl, i, a, cell;
2624     int  *have;
2625
2626     nerr = 0;
2627
2628     if (dd->comm->DD_debug > 1)
2629     {
2630         snew(have, natoms_sys);
2631         for (a = 0; a < dd->nat_tot; a++)
2632         {
2633             if (have[dd->gatindex[a]] > 0)
2634             {
2635                 fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2636             }
2637             else
2638             {
2639                 have[dd->gatindex[a]] = a + 1;
2640             }
2641         }
2642         sfree(have);
2643     }
2644
2645     snew(have, dd->nat_tot);
2646
2647     ngl  = 0;
2648     for (i = 0; i < natoms_sys; i++)
2649     {
2650         if (ga2la_get(dd->ga2la, i, &a, &cell))
2651         {
2652             if (a >= dd->nat_tot)
2653             {
2654                 fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2655                 nerr++;
2656             }
2657             else
2658             {
2659                 have[a] = 1;
2660                 if (dd->gatindex[a] != i)
2661                 {
2662                     fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2663                     nerr++;
2664                 }
2665             }
2666             ngl++;
2667         }
2668     }
2669     if (ngl != dd->nat_tot)
2670     {
2671         fprintf(stderr,
2672                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2673                 dd->rank, where, ngl, dd->nat_tot);
2674     }
2675     for (a = 0; a < dd->nat_tot; a++)
2676     {
2677         if (have[a] == 0)
2678         {
2679             fprintf(stderr,
2680                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2681                     dd->rank, where, a+1, dd->gatindex[a]+1);
2682         }
2683     }
2684     sfree(have);
2685
2686     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2687
2688     if (nerr > 0)
2689     {
2690         gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies",
2691                   dd->rank, where, nerr);
2692     }
2693 }
2694
2695 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2696 {
2697     int   i;
2698     char *bLocalCG;
2699
2700     if (a_start == 0)
2701     {
2702         /* Clear the whole list without searching */
2703         ga2la_clear(dd->ga2la);
2704     }
2705     else
2706     {
2707         for (i = a_start; i < dd->nat_tot; i++)
2708         {
2709             ga2la_del(dd->ga2la, dd->gatindex[i]);
2710         }
2711     }
2712
2713     bLocalCG = dd->comm->bLocalCG;
2714     if (bLocalCG)
2715     {
2716         for (i = cg_start; i < dd->ncg_tot; i++)
2717         {
2718             bLocalCG[dd->index_gl[i]] = FALSE;
2719         }
2720     }
2721
2722     dd_clear_local_vsite_indices(dd);
2723
2724     if (dd->constraints)
2725     {
2726         dd_clear_local_constraint_indices(dd);
2727     }
2728 }
2729
2730 /* This function should be used for moving the domain boudaries during DLB,
2731  * for obtaining the minimum cell size. It checks the initially set limit
2732  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2733  * and, possibly, a longer cut-off limit set for PME load balancing.
2734  */
2735 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2736 {
2737     real cellsize_min;
2738
2739     cellsize_min = comm->cellsize_min[dim];
2740
2741     if (!comm->bVacDLBNoLimit)
2742     {
2743         /* The cut-off might have changed, e.g. by PME load balacning,
2744          * from the value used to set comm->cellsize_min, so check it.
2745          */
2746         cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2747
2748         if (comm->bPMELoadBalDLBLimits)
2749         {
2750             /* Check for the cut-off limit set by the PME load balancing */
2751             cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2752         }
2753     }
2754
2755     return cellsize_min;
2756 }
2757
2758 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2759                             int dim_ind)
2760 {
2761     real grid_jump_limit;
2762
2763     /* The distance between the boundaries of cells at distance
2764      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2765      * and by the fact that cells should not be shifted by more than
2766      * half their size, such that cg's only shift by one cell
2767      * at redecomposition.
2768      */
2769     grid_jump_limit = comm->cellsize_limit;
2770     if (!comm->bVacDLBNoLimit)
2771     {
2772         if (comm->bPMELoadBalDLBLimits)
2773         {
2774             cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
2775         }
2776         grid_jump_limit = max(grid_jump_limit,
2777                               cutoff/comm->cd[dim_ind].np);
2778     }
2779
2780     return grid_jump_limit;
2781 }
2782
2783 static gmx_bool check_grid_jump(gmx_large_int_t step,
2784                                 gmx_domdec_t   *dd,
2785                                 real            cutoff,
2786                                 gmx_ddbox_t    *ddbox,
2787                                 gmx_bool        bFatal)
2788 {
2789     gmx_domdec_comm_t *comm;
2790     int                d, dim;
2791     real               limit, bfac;
2792     gmx_bool           bInvalid;
2793
2794     bInvalid = FALSE;
2795
2796     comm = dd->comm;
2797
2798     for (d = 1; d < dd->ndim; d++)
2799     {
2800         dim   = dd->dim[d];
2801         limit = grid_jump_limit(comm, cutoff, d);
2802         bfac  = ddbox->box_size[dim];
2803         if (ddbox->tric_dir[dim])
2804         {
2805             bfac *= ddbox->skew_fac[dim];
2806         }
2807         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2808                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2809         {
2810             bInvalid = TRUE;
2811
2812             if (bFatal)
2813             {
2814                 char buf[22];
2815
2816                 /* This error should never be triggered under normal
2817                  * circumstances, but you never know ...
2818                  */
2819                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2820                           gmx_step_str(step, buf),
2821                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2822             }
2823         }
2824     }
2825
2826     return bInvalid;
2827 }
2828
2829 static int dd_load_count(gmx_domdec_comm_t *comm)
2830 {
2831     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2832 }
2833
2834 static float dd_force_load(gmx_domdec_comm_t *comm)
2835 {
2836     float load;
2837
2838     if (comm->eFlop)
2839     {
2840         load = comm->flop;
2841         if (comm->eFlop > 1)
2842         {
2843             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2844         }
2845     }
2846     else
2847     {
2848         load = comm->cycl[ddCyclF];
2849         if (comm->cycl_n[ddCyclF] > 1)
2850         {
2851             /* Subtract the maximum of the last n cycle counts
2852              * to get rid of possible high counts due to other soures,
2853              * for instance system activity, that would otherwise
2854              * affect the dynamic load balancing.
2855              */
2856             load -= comm->cycl_max[ddCyclF];
2857         }
2858     }
2859
2860     return load;
2861 }
2862
2863 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2864 {
2865     gmx_domdec_comm_t *comm;
2866     int                i;
2867
2868     comm = dd->comm;
2869
2870     snew(*dim_f, dd->nc[dim]+1);
2871     (*dim_f)[0] = 0;
2872     for (i = 1; i < dd->nc[dim]; i++)
2873     {
2874         if (comm->slb_frac[dim])
2875         {
2876             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2877         }
2878         else
2879         {
2880             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2881         }
2882     }
2883     (*dim_f)[dd->nc[dim]] = 1;
2884 }
2885
2886 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2887 {
2888     int  pmeindex, slab, nso, i;
2889     ivec xyz;
2890
2891     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2892     {
2893         ddpme->dim = YY;
2894     }
2895     else
2896     {
2897         ddpme->dim = dimind;
2898     }
2899     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2900
2901     ddpme->nslab = (ddpme->dim == 0 ?
2902                     dd->comm->npmenodes_x :
2903                     dd->comm->npmenodes_y);
2904
2905     if (ddpme->nslab <= 1)
2906     {
2907         return;
2908     }
2909
2910     nso = dd->comm->npmenodes/ddpme->nslab;
2911     /* Determine for each PME slab the PP location range for dimension dim */
2912     snew(ddpme->pp_min, ddpme->nslab);
2913     snew(ddpme->pp_max, ddpme->nslab);
2914     for (slab = 0; slab < ddpme->nslab; slab++)
2915     {
2916         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2917         ddpme->pp_max[slab] = 0;
2918     }
2919     for (i = 0; i < dd->nnodes; i++)
2920     {
2921         ddindex2xyz(dd->nc, i, xyz);
2922         /* For y only use our y/z slab.
2923          * This assumes that the PME x grid size matches the DD grid size.
2924          */
2925         if (dimind == 0 || xyz[XX] == dd->ci[XX])
2926         {
2927             pmeindex = ddindex2pmeindex(dd, i);
2928             if (dimind == 0)
2929             {
2930                 slab = pmeindex/nso;
2931             }
2932             else
2933             {
2934                 slab = pmeindex % ddpme->nslab;
2935             }
2936             ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
2937             ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
2938         }
2939     }
2940
2941     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2942 }
2943
2944 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2945 {
2946     if (dd->comm->ddpme[0].dim == XX)
2947     {
2948         return dd->comm->ddpme[0].maxshift;
2949     }
2950     else
2951     {
2952         return 0;
2953     }
2954 }
2955
2956 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2957 {
2958     if (dd->comm->ddpme[0].dim == YY)
2959     {
2960         return dd->comm->ddpme[0].maxshift;
2961     }
2962     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2963     {
2964         return dd->comm->ddpme[1].maxshift;
2965     }
2966     else
2967     {
2968         return 0;
2969     }
2970 }
2971
2972 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2973                              gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
2974 {
2975     gmx_domdec_comm_t *comm;
2976     int                nc, ns, s;
2977     int               *xmin, *xmax;
2978     real               range, pme_boundary;
2979     int                sh;
2980
2981     comm = dd->comm;
2982     nc   = dd->nc[ddpme->dim];
2983     ns   = ddpme->nslab;
2984
2985     if (!ddpme->dim_match)
2986     {
2987         /* PP decomposition is not along dim: the worst situation */
2988         sh = ns/2;
2989     }
2990     else if (ns <= 3 || (bUniform && ns == nc))
2991     {
2992         /* The optimal situation */
2993         sh = 1;
2994     }
2995     else
2996     {
2997         /* We need to check for all pme nodes which nodes they
2998          * could possibly need to communicate with.
2999          */
3000         xmin = ddpme->pp_min;
3001         xmax = ddpme->pp_max;
3002         /* Allow for atoms to be maximally 2/3 times the cut-off
3003          * out of their DD cell. This is a reasonable balance between
3004          * between performance and support for most charge-group/cut-off
3005          * combinations.
3006          */
3007         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
3008         /* Avoid extra communication when we are exactly at a boundary */
3009         range *= 0.999;
3010
3011         sh = 1;
3012         for (s = 0; s < ns; s++)
3013         {
3014             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
3015             pme_boundary = (real)s/ns;
3016             while (sh+1 < ns &&
3017                    ((s-(sh+1) >= 0 &&
3018                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
3019                     (s-(sh+1) <  0 &&
3020                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
3021             {
3022                 sh++;
3023             }
3024             pme_boundary = (real)(s+1)/ns;
3025             while (sh+1 < ns &&
3026                    ((s+(sh+1) <  ns &&
3027                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
3028                     (s+(sh+1) >= ns &&
3029                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
3030             {
3031                 sh++;
3032             }
3033         }
3034     }
3035
3036     ddpme->maxshift = sh;
3037
3038     if (debug)
3039     {
3040         fprintf(debug, "PME slab communication range for dim %d is %d\n",
3041                 ddpme->dim, ddpme->maxshift);
3042     }
3043 }
3044
3045 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3046 {
3047     int d, dim;
3048
3049     for (d = 0; d < dd->ndim; d++)
3050     {
3051         dim = dd->dim[d];
3052         if (dim < ddbox->nboundeddim &&
3053             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3054             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3055         {
3056             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3057                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3058                       dd->nc[dim], dd->comm->cellsize_limit);
3059         }
3060     }
3061 }
3062
3063 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
3064                                   gmx_bool bMaster, ivec npulse)
3065 {
3066     gmx_domdec_comm_t *comm;
3067     int                d, j;
3068     rvec               cellsize_min;
3069     real              *cell_x, cell_dx, cellsize;
3070
3071     comm = dd->comm;
3072
3073     for (d = 0; d < DIM; d++)
3074     {
3075         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3076         npulse[d]       = 1;
3077         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3078         {
3079             /* Uniform grid */
3080             cell_dx = ddbox->box_size[d]/dd->nc[d];
3081             if (bMaster)
3082             {
3083                 for (j = 0; j < dd->nc[d]+1; j++)
3084                 {
3085                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3086                 }
3087             }
3088             else
3089             {
3090                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3091                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3092             }
3093             cellsize = cell_dx*ddbox->skew_fac[d];
3094             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3095             {
3096                 npulse[d]++;
3097             }
3098             cellsize_min[d] = cellsize;
3099         }
3100         else
3101         {
3102             /* Statically load balanced grid */
3103             /* Also when we are not doing a master distribution we determine
3104              * all cell borders in a loop to obtain identical values
3105              * to the master distribution case and to determine npulse.
3106              */
3107             if (bMaster)
3108             {
3109                 cell_x = dd->ma->cell_x[d];
3110             }
3111             else
3112             {
3113                 snew(cell_x, dd->nc[d]+1);
3114             }
3115             cell_x[0] = ddbox->box0[d];
3116             for (j = 0; j < dd->nc[d]; j++)
3117             {
3118                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
3119                 cell_x[j+1] = cell_x[j] + cell_dx;
3120                 cellsize    = cell_dx*ddbox->skew_fac[d];
3121                 while (cellsize*npulse[d] < comm->cutoff &&
3122                        npulse[d] < dd->nc[d]-1)
3123                 {
3124                     npulse[d]++;
3125                 }
3126                 cellsize_min[d] = min(cellsize_min[d], cellsize);
3127             }
3128             if (!bMaster)
3129             {
3130                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3131                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3132                 sfree(cell_x);
3133             }
3134         }
3135         /* The following limitation is to avoid that a cell would receive
3136          * some of its own home charge groups back over the periodic boundary.
3137          * Double charge groups cause trouble with the global indices.
3138          */
3139         if (d < ddbox->npbcdim &&
3140             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3141         {
3142             gmx_fatal_collective(FARGS, NULL, dd,
3143                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3144                                  dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
3145                                  comm->cutoff,
3146                                  dd->nc[d], dd->nc[d],
3147                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
3148         }
3149     }
3150
3151     if (!comm->bDynLoadBal)
3152     {
3153         copy_rvec(cellsize_min, comm->cellsize_min);
3154     }
3155
3156     for (d = 0; d < comm->npmedecompdim; d++)
3157     {
3158         set_pme_maxshift(dd, &comm->ddpme[d],
3159                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
3160                          comm->ddpme[d].slb_dim_f);
3161     }
3162 }
3163
3164
3165 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3166                                                   int d, int dim, gmx_domdec_root_t *root,
3167                                                   gmx_ddbox_t *ddbox,
3168                                                   gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[])
3169 {
3170     gmx_domdec_comm_t *comm;
3171     int                ncd, i, j, nmin, nmin_old;
3172     gmx_bool           bLimLo, bLimHi;
3173     real              *cell_size;
3174     real               fac, halfway, cellsize_limit_f_i, region_size;
3175     gmx_bool           bPBC, bLastHi = FALSE;
3176     int                nrange[] = {range[0], range[1]};
3177
3178     region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
3179
3180     comm = dd->comm;
3181
3182     ncd = dd->nc[dim];
3183
3184     bPBC = (dim < ddbox->npbcdim);
3185
3186     cell_size = root->buf_ncd;
3187
3188     if (debug)
3189     {
3190         fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
3191     }
3192
3193     /* First we need to check if the scaling does not make cells
3194      * smaller than the smallest allowed size.
3195      * We need to do this iteratively, since if a cell is too small,
3196      * it needs to be enlarged, which makes all the other cells smaller,
3197      * which could in turn make another cell smaller than allowed.
3198      */
3199     for (i = range[0]; i < range[1]; i++)
3200     {
3201         root->bCellMin[i] = FALSE;
3202     }
3203     nmin = 0;
3204     do
3205     {
3206         nmin_old = nmin;
3207         /* We need the total for normalization */
3208         fac = 0;
3209         for (i = range[0]; i < range[1]; i++)
3210         {
3211             if (root->bCellMin[i] == FALSE)
3212             {
3213                 fac += cell_size[i];
3214             }
3215         }
3216         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3217         /* Determine the cell boundaries */
3218         for (i = range[0]; i < range[1]; i++)
3219         {
3220             if (root->bCellMin[i] == FALSE)
3221             {
3222                 cell_size[i] *= fac;
3223                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3224                 {
3225                     cellsize_limit_f_i = 0;
3226                 }
3227                 else
3228                 {
3229                     cellsize_limit_f_i = cellsize_limit_f;
3230                 }
3231                 if (cell_size[i] < cellsize_limit_f_i)
3232                 {
3233                     root->bCellMin[i] = TRUE;
3234                     cell_size[i]      = cellsize_limit_f_i;
3235                     nmin++;
3236                 }
3237             }
3238             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3239         }
3240     }
3241     while (nmin > nmin_old);
3242
3243     i            = range[1]-1;
3244     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3245     /* For this check we should not use DD_CELL_MARGIN,
3246      * but a slightly smaller factor,
3247      * since rounding could get use below the limit.
3248      */
3249     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3250     {
3251         char buf[22];
3252         gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3253                   gmx_step_str(step, buf),
3254                   dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3255                   ncd, comm->cellsize_min[dim]);
3256     }
3257
3258     root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
3259
3260     if (!bUniform)
3261     {
3262         /* Check if the boundary did not displace more than halfway
3263          * each of the cells it bounds, as this could cause problems,
3264          * especially when the differences between cell sizes are large.
3265          * If changes are applied, they will not make cells smaller
3266          * than the cut-off, as we check all the boundaries which
3267          * might be affected by a change and if the old state was ok,
3268          * the cells will at most be shrunk back to their old size.
3269          */
3270         for (i = range[0]+1; i < range[1]; i++)
3271         {
3272             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3273             if (root->cell_f[i] < halfway)
3274             {
3275                 root->cell_f[i] = halfway;
3276                 /* Check if the change also causes shifts of the next boundaries */
3277                 for (j = i+1; j < range[1]; j++)
3278                 {
3279                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3280                     {
3281                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3282                     }
3283                 }
3284             }
3285             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3286             if (root->cell_f[i] > halfway)
3287             {
3288                 root->cell_f[i] = halfway;
3289                 /* Check if the change also causes shifts of the next boundaries */
3290                 for (j = i-1; j >= range[0]+1; j--)
3291                 {
3292                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3293                     {
3294                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3295                     }
3296                 }
3297             }
3298         }
3299     }
3300
3301     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3302     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3303      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3304      * for a and b nrange is used */
3305     if (d > 0)
3306     {
3307         /* Take care of the staggering of the cell boundaries */
3308         if (bUniform)
3309         {
3310             for (i = range[0]; i < range[1]; i++)
3311             {
3312                 root->cell_f_max0[i] = root->cell_f[i];
3313                 root->cell_f_min1[i] = root->cell_f[i+1];
3314             }
3315         }
3316         else
3317         {
3318             for (i = range[0]+1; i < range[1]; i++)
3319             {
3320                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3321                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3322                 if (bLimLo && bLimHi)
3323                 {
3324                     /* Both limits violated, try the best we can */
3325                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3326                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3327                     nrange[0]       = range[0];
3328                     nrange[1]       = i;
3329                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3330
3331                     nrange[0] = i;
3332                     nrange[1] = range[1];
3333                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3334
3335                     return;
3336                 }
3337                 else if (bLimLo)
3338                 {
3339                     /* root->cell_f[i] = root->bound_min[i]; */
3340                     nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3341                     bLastHi   = FALSE;
3342                 }
3343                 else if (bLimHi && !bLastHi)
3344                 {
3345                     bLastHi = TRUE;
3346                     if (nrange[1] < range[1])   /* found a LimLo before */
3347                     {
3348                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3349                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3350                         nrange[0] = nrange[1];
3351                     }
3352                     root->cell_f[i] = root->bound_max[i];
3353                     nrange[1]       = i;
3354                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3355                     nrange[0] = i;
3356                     nrange[1] = range[1];
3357                 }
3358             }
3359             if (nrange[1] < range[1])   /* found last a LimLo */
3360             {
3361                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3362                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3363                 nrange[0] = nrange[1];
3364                 nrange[1] = range[1];
3365                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3366             }
3367             else if (nrange[0] > range[0]) /* found at least one LimHi */
3368             {
3369                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3370             }
3371         }
3372     }
3373 }
3374
3375
3376 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3377                                        int d, int dim, gmx_domdec_root_t *root,
3378                                        gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3379                                        gmx_bool bUniform, gmx_large_int_t step)
3380 {
3381     gmx_domdec_comm_t *comm;
3382     int                ncd, d1, i, j, pos;
3383     real              *cell_size;
3384     real               load_aver, load_i, imbalance, change, change_max, sc;
3385     real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3386     real               change_limit;
3387     real               relax = 0.5;
3388     gmx_bool           bPBC;
3389     int                range[] = { 0, 0 };
3390
3391     comm = dd->comm;
3392
3393     /* Convert the maximum change from the input percentage to a fraction */
3394     change_limit = comm->dlb_scale_lim*0.01;
3395
3396     ncd = dd->nc[dim];
3397
3398     bPBC = (dim < ddbox->npbcdim);
3399
3400     cell_size = root->buf_ncd;
3401
3402     /* Store the original boundaries */
3403     for (i = 0; i < ncd+1; i++)
3404     {
3405         root->old_cell_f[i] = root->cell_f[i];
3406     }
3407     if (bUniform)
3408     {
3409         for (i = 0; i < ncd; i++)
3410         {
3411             cell_size[i] = 1.0/ncd;
3412         }
3413     }
3414     else if (dd_load_count(comm))
3415     {
3416         load_aver  = comm->load[d].sum_m/ncd;
3417         change_max = 0;
3418         for (i = 0; i < ncd; i++)
3419         {
3420             /* Determine the relative imbalance of cell i */
3421             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3422             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3423             /* Determine the change of the cell size using underrelaxation */
3424             change     = -relax*imbalance;
3425             change_max = max(change_max, max(change, -change));
3426         }
3427         /* Limit the amount of scaling.
3428          * We need to use the same rescaling for all cells in one row,
3429          * otherwise the load balancing might not converge.
3430          */
3431         sc = relax;
3432         if (change_max > change_limit)
3433         {
3434             sc *= change_limit/change_max;
3435         }
3436         for (i = 0; i < ncd; i++)
3437         {
3438             /* Determine the relative imbalance of cell i */
3439             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3440             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3441             /* Determine the change of the cell size using underrelaxation */
3442             change       = -sc*imbalance;
3443             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3444         }
3445     }
3446
3447     cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3448     cellsize_limit_f *= DD_CELL_MARGIN;
3449     dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3450     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3451     if (ddbox->tric_dir[dim])
3452     {
3453         cellsize_limit_f /= ddbox->skew_fac[dim];
3454         dist_min_f       /= ddbox->skew_fac[dim];
3455     }
3456     if (bDynamicBox && d > 0)
3457     {
3458         dist_min_f *= DD_PRES_SCALE_MARGIN;
3459     }
3460     if (d > 0 && !bUniform)
3461     {
3462         /* Make sure that the grid is not shifted too much */
3463         for (i = 1; i < ncd; i++)
3464         {
3465             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3466             {
3467                 gmx_incons("Inconsistent DD boundary staggering limits!");
3468             }
3469             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3470             space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3471             if (space > 0)
3472             {
3473                 root->bound_min[i] += 0.5*space;
3474             }
3475             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3476             space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3477             if (space < 0)
3478             {
3479                 root->bound_max[i] += 0.5*space;
3480             }
3481             if (debug)
3482             {
3483                 fprintf(debug,
3484                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3485                         d, i,
3486                         root->cell_f_max0[i-1] + dist_min_f,
3487                         root->bound_min[i], root->cell_f[i], root->bound_max[i],
3488                         root->cell_f_min1[i] - dist_min_f);
3489             }
3490         }
3491     }
3492     range[1]          = ncd;
3493     root->cell_f[0]   = 0;
3494     root->cell_f[ncd] = 1;
3495     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3496
3497
3498     /* After the checks above, the cells should obey the cut-off
3499      * restrictions, but it does not hurt to check.
3500      */
3501     for (i = 0; i < ncd; i++)
3502     {
3503         if (debug)
3504         {
3505             fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
3506                     dim, i, root->cell_f[i], root->cell_f[i+1]);
3507         }
3508
3509         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3510             root->cell_f[i+1] - root->cell_f[i] <
3511             cellsize_limit_f/DD_CELL_MARGIN)
3512         {
3513             char buf[22];
3514             fprintf(stderr,
3515                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3516                     gmx_step_str(step, buf), dim2char(dim), i,
3517                     (root->cell_f[i+1] - root->cell_f[i])
3518                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3519         }
3520     }
3521
3522     pos = ncd + 1;
3523     /* Store the cell boundaries of the lower dimensions at the end */
3524     for (d1 = 0; d1 < d; d1++)
3525     {
3526         root->cell_f[pos++] = comm->cell_f0[d1];
3527         root->cell_f[pos++] = comm->cell_f1[d1];
3528     }
3529
3530     if (d < comm->npmedecompdim)
3531     {
3532         /* The master determines the maximum shift for
3533          * the coordinate communication between separate PME nodes.
3534          */
3535         set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3536     }
3537     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3538     if (d >= 1)
3539     {
3540         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3541     }
3542 }
3543
3544 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3545                                              gmx_ddbox_t *ddbox, int dimind)
3546 {
3547     gmx_domdec_comm_t *comm;
3548     int                dim;
3549
3550     comm = dd->comm;
3551
3552     /* Set the cell dimensions */
3553     dim                = dd->dim[dimind];
3554     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3555     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3556     if (dim >= ddbox->nboundeddim)
3557     {
3558         comm->cell_x0[dim] += ddbox->box0[dim];
3559         comm->cell_x1[dim] += ddbox->box0[dim];
3560     }
3561 }
3562
3563 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3564                                          int d, int dim, real *cell_f_row,
3565                                          gmx_ddbox_t *ddbox)
3566 {
3567     gmx_domdec_comm_t *comm;
3568     int                d1, dim1, pos;
3569
3570     comm = dd->comm;
3571
3572 #ifdef GMX_MPI
3573     /* Each node would only need to know two fractions,
3574      * but it is probably cheaper to broadcast the whole array.
3575      */
3576     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3577               0, comm->mpi_comm_load[d]);
3578 #endif
3579     /* Copy the fractions for this dimension from the buffer */
3580     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3581     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3582     /* The whole array was communicated, so set the buffer position */
3583     pos = dd->nc[dim] + 1;
3584     for (d1 = 0; d1 <= d; d1++)
3585     {
3586         if (d1 < d)
3587         {
3588             /* Copy the cell fractions of the lower dimensions */
3589             comm->cell_f0[d1] = cell_f_row[pos++];
3590             comm->cell_f1[d1] = cell_f_row[pos++];
3591         }
3592         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3593     }
3594     /* Convert the communicated shift from float to int */
3595     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3596     if (d >= 1)
3597     {
3598         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3599     }
3600 }
3601
3602 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3603                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3604                                          gmx_bool bUniform, gmx_large_int_t step)
3605 {
3606     gmx_domdec_comm_t *comm;
3607     int                d, dim, d1;
3608     gmx_bool           bRowMember, bRowRoot;
3609     real              *cell_f_row;
3610
3611     comm = dd->comm;
3612
3613     for (d = 0; d < dd->ndim; d++)
3614     {
3615         dim        = dd->dim[d];
3616         bRowMember = TRUE;
3617         bRowRoot   = TRUE;
3618         for (d1 = d; d1 < dd->ndim; d1++)
3619         {
3620             if (dd->ci[dd->dim[d1]] > 0)
3621             {
3622                 if (d1 > d)
3623                 {
3624                     bRowMember = FALSE;
3625                 }
3626                 bRowRoot = FALSE;
3627             }
3628         }
3629         if (bRowMember)
3630         {
3631             if (bRowRoot)
3632             {
3633                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3634                                            ddbox, bDynamicBox, bUniform, step);
3635                 cell_f_row = comm->root[d]->cell_f;
3636             }
3637             else
3638             {
3639                 cell_f_row = comm->cell_f_row;
3640             }
3641             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3642         }
3643     }
3644 }
3645
3646 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3647 {
3648     int d;
3649
3650     /* This function assumes the box is static and should therefore
3651      * not be called when the box has changed since the last
3652      * call to dd_partition_system.
3653      */
3654     for (d = 0; d < dd->ndim; d++)
3655     {
3656         relative_to_absolute_cell_bounds(dd, ddbox, d);
3657     }
3658 }
3659
3660
3661
3662 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3663                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3664                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3665                                   gmx_wallcycle_t wcycle)
3666 {
3667     gmx_domdec_comm_t *comm;
3668     int                dim;
3669
3670     comm = dd->comm;
3671
3672     if (bDoDLB)
3673     {
3674         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3675         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3676         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3677     }
3678     else if (bDynamicBox)
3679     {
3680         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3681     }
3682
3683     /* Set the dimensions for which no DD is used */
3684     for (dim = 0; dim < DIM; dim++)
3685     {
3686         if (dd->nc[dim] == 1)
3687         {
3688             comm->cell_x0[dim] = 0;
3689             comm->cell_x1[dim] = ddbox->box_size[dim];
3690             if (dim >= ddbox->nboundeddim)
3691             {
3692                 comm->cell_x0[dim] += ddbox->box0[dim];
3693                 comm->cell_x1[dim] += ddbox->box0[dim];
3694             }
3695         }
3696     }
3697 }
3698
3699 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3700 {
3701     int                    d, np, i;
3702     gmx_domdec_comm_dim_t *cd;
3703
3704     for (d = 0; d < dd->ndim; d++)
3705     {
3706         cd = &dd->comm->cd[d];
3707         np = npulse[dd->dim[d]];
3708         if (np > cd->np_nalloc)
3709         {
3710             if (debug)
3711             {
3712                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3713                         dim2char(dd->dim[d]), np);
3714             }
3715             if (DDMASTER(dd) && cd->np_nalloc > 0)
3716             {
3717                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3718             }
3719             srenew(cd->ind, np);
3720             for (i = cd->np_nalloc; i < np; i++)
3721             {
3722                 cd->ind[i].index  = NULL;
3723                 cd->ind[i].nalloc = 0;
3724             }
3725             cd->np_nalloc = np;
3726         }
3727         cd->np = np;
3728     }
3729 }
3730
3731
3732 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3733                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3734                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3735                               gmx_wallcycle_t wcycle)
3736 {
3737     gmx_domdec_comm_t *comm;
3738     int                d;
3739     ivec               npulse;
3740
3741     comm = dd->comm;
3742
3743     /* Copy the old cell boundaries for the cg displacement check */
3744     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3745     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3746
3747     if (comm->bDynLoadBal)
3748     {
3749         if (DDMASTER(dd))
3750         {
3751             check_box_size(dd, ddbox);
3752         }
3753         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3754     }
3755     else
3756     {
3757         set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse);
3758         realloc_comm_ind(dd, npulse);
3759     }
3760
3761     if (debug)
3762     {
3763         for (d = 0; d < DIM; d++)
3764         {
3765             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3766                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3767         }
3768     }
3769 }
3770
3771 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3772                                   gmx_ddbox_t *ddbox,
3773                                   rvec cell_ns_x0, rvec cell_ns_x1,
3774                                   gmx_large_int_t step)
3775 {
3776     gmx_domdec_comm_t *comm;
3777     int                dim_ind, dim;
3778
3779     comm = dd->comm;
3780
3781     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3782     {
3783         dim = dd->dim[dim_ind];
3784
3785         /* Without PBC we don't have restrictions on the outer cells */
3786         if (!(dim >= ddbox->npbcdim &&
3787               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3788             comm->bDynLoadBal &&
3789             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3790             comm->cellsize_min[dim])
3791         {
3792             char buf[22];
3793             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3794                       gmx_step_str(step, buf), dim2char(dim),
3795                       comm->cell_x1[dim] - comm->cell_x0[dim],
3796                       ddbox->skew_fac[dim],
3797                       dd->comm->cellsize_min[dim],
3798                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3799         }
3800     }
3801
3802     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3803     {
3804         /* Communicate the boundaries and update cell_ns_x0/1 */
3805         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3806         if (dd->bGridJump && dd->ndim > 1)
3807         {
3808             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3809         }
3810     }
3811 }
3812
3813 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3814 {
3815     if (YY < npbcdim)
3816     {
3817         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3818     }
3819     else
3820     {
3821         tcm[YY][XX] = 0;
3822     }
3823     if (ZZ < npbcdim)
3824     {
3825         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3826         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3827     }
3828     else
3829     {
3830         tcm[ZZ][XX] = 0;
3831         tcm[ZZ][YY] = 0;
3832     }
3833 }
3834
3835 static void check_screw_box(matrix box)
3836 {
3837     /* Mathematical limitation */
3838     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3839     {
3840         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3841     }
3842
3843     /* Limitation due to the asymmetry of the eighth shell method */
3844     if (box[ZZ][YY] != 0)
3845     {
3846         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3847     }
3848 }
3849
3850 static void distribute_cg(FILE *fplog, gmx_large_int_t step,
3851                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3852                           gmx_domdec_t *dd)
3853 {
3854     gmx_domdec_master_t *ma;
3855     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3856     int                  i, icg, j, k, k0, k1, d, npbcdim;
3857     matrix               tcm;
3858     rvec                 box_size, cg_cm;
3859     ivec                 ind;
3860     real                 nrcg, inv_ncg, pos_d;
3861     atom_id             *cgindex;
3862     gmx_bool             bUnbounded, bScrew;
3863
3864     ma = dd->ma;
3865
3866     if (tmp_ind == NULL)
3867     {
3868         snew(tmp_nalloc, dd->nnodes);
3869         snew(tmp_ind, dd->nnodes);
3870         for (i = 0; i < dd->nnodes; i++)
3871         {
3872             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3873             snew(tmp_ind[i], tmp_nalloc[i]);
3874         }
3875     }
3876
3877     /* Clear the count */
3878     for (i = 0; i < dd->nnodes; i++)
3879     {
3880         ma->ncg[i] = 0;
3881         ma->nat[i] = 0;
3882     }
3883
3884     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3885
3886     cgindex = cgs->index;
3887
3888     /* Compute the center of geometry for all charge groups */
3889     for (icg = 0; icg < cgs->nr; icg++)
3890     {
3891         k0      = cgindex[icg];
3892         k1      = cgindex[icg+1];
3893         nrcg    = k1 - k0;
3894         if (nrcg == 1)
3895         {
3896             copy_rvec(pos[k0], cg_cm);
3897         }
3898         else
3899         {
3900             inv_ncg = 1.0/nrcg;
3901
3902             clear_rvec(cg_cm);
3903             for (k = k0; (k < k1); k++)
3904             {
3905                 rvec_inc(cg_cm, pos[k]);
3906             }
3907             for (d = 0; (d < DIM); d++)
3908             {
3909                 cg_cm[d] *= inv_ncg;
3910             }
3911         }
3912         /* Put the charge group in the box and determine the cell index */
3913         for (d = DIM-1; d >= 0; d--)
3914         {
3915             pos_d = cg_cm[d];
3916             if (d < dd->npbcdim)
3917             {
3918                 bScrew = (dd->bScrewPBC && d == XX);
3919                 if (tric_dir[d] && dd->nc[d] > 1)
3920                 {
3921                     /* Use triclinic coordintates for this dimension */
3922                     for (j = d+1; j < DIM; j++)
3923                     {
3924                         pos_d += cg_cm[j]*tcm[j][d];
3925                     }
3926                 }
3927                 while (pos_d >= box[d][d])
3928                 {
3929                     pos_d -= box[d][d];
3930                     rvec_dec(cg_cm, box[d]);
3931                     if (bScrew)
3932                     {
3933                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3934                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3935                     }
3936                     for (k = k0; (k < k1); k++)
3937                     {
3938                         rvec_dec(pos[k], box[d]);
3939                         if (bScrew)
3940                         {
3941                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3942                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3943                         }
3944                     }
3945                 }
3946                 while (pos_d < 0)
3947                 {
3948                     pos_d += box[d][d];
3949                     rvec_inc(cg_cm, box[d]);
3950                     if (bScrew)
3951                     {
3952                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3953                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3954                     }
3955                     for (k = k0; (k < k1); k++)
3956                     {
3957                         rvec_inc(pos[k], box[d]);
3958                         if (bScrew)
3959                         {
3960                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3961                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3962                         }
3963                     }
3964                 }
3965             }
3966             /* This could be done more efficiently */
3967             ind[d] = 0;
3968             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3969             {
3970                 ind[d]++;
3971             }
3972         }
3973         i = dd_index(dd->nc, ind);
3974         if (ma->ncg[i] == tmp_nalloc[i])
3975         {
3976             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3977             srenew(tmp_ind[i], tmp_nalloc[i]);
3978         }
3979         tmp_ind[i][ma->ncg[i]] = icg;
3980         ma->ncg[i]++;
3981         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3982     }
3983
3984     k1 = 0;
3985     for (i = 0; i < dd->nnodes; i++)
3986     {
3987         ma->index[i] = k1;
3988         for (k = 0; k < ma->ncg[i]; k++)
3989         {
3990             ma->cg[k1++] = tmp_ind[i][k];
3991         }
3992     }
3993     ma->index[dd->nnodes] = k1;
3994
3995     for (i = 0; i < dd->nnodes; i++)
3996     {
3997         sfree(tmp_ind[i]);
3998     }
3999     sfree(tmp_ind);
4000     sfree(tmp_nalloc);
4001
4002     if (fplog)
4003     {
4004         char buf[22];
4005         fprintf(fplog, "Charge group distribution at step %s:",
4006                 gmx_step_str(step, buf));
4007         for (i = 0; i < dd->nnodes; i++)
4008         {
4009             fprintf(fplog, " %d", ma->ncg[i]);
4010         }
4011         fprintf(fplog, "\n");
4012     }
4013 }
4014
4015 static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd,
4016                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
4017                                 rvec pos[])
4018 {
4019     gmx_domdec_master_t *ma = NULL;
4020     ivec                 npulse;
4021     int                  i, cg_gl;
4022     int                 *ibuf, buf2[2] = { 0, 0 };
4023     gmx_bool             bMaster = DDMASTER(dd);
4024     if (bMaster)
4025     {
4026         ma = dd->ma;
4027
4028         if (dd->bScrewPBC)
4029         {
4030             check_screw_box(box);
4031         }
4032
4033         set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse);
4034
4035         distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
4036         for (i = 0; i < dd->nnodes; i++)
4037         {
4038             ma->ibuf[2*i]   = ma->ncg[i];
4039             ma->ibuf[2*i+1] = ma->nat[i];
4040         }
4041         ibuf = ma->ibuf;
4042     }
4043     else
4044     {
4045         ibuf = NULL;
4046     }
4047     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
4048
4049     dd->ncg_home = buf2[0];
4050     dd->nat_home = buf2[1];
4051     dd->ncg_tot  = dd->ncg_home;
4052     dd->nat_tot  = dd->nat_home;
4053     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4054     {
4055         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4056         srenew(dd->index_gl, dd->cg_nalloc);
4057         srenew(dd->cgindex, dd->cg_nalloc+1);
4058     }
4059     if (bMaster)
4060     {
4061         for (i = 0; i < dd->nnodes; i++)
4062         {
4063             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
4064             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4065         }
4066     }
4067
4068     dd_scatterv(dd,
4069                 DDMASTER(dd) ? ma->ibuf : NULL,
4070                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
4071                 DDMASTER(dd) ? ma->cg : NULL,
4072                 dd->ncg_home*sizeof(int), dd->index_gl);
4073
4074     /* Determine the home charge group sizes */
4075     dd->cgindex[0] = 0;
4076     for (i = 0; i < dd->ncg_home; i++)
4077     {
4078         cg_gl            = dd->index_gl[i];
4079         dd->cgindex[i+1] =
4080             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4081     }
4082
4083     if (debug)
4084     {
4085         fprintf(debug, "Home charge groups:\n");
4086         for (i = 0; i < dd->ncg_home; i++)
4087         {
4088             fprintf(debug, " %d", dd->index_gl[i]);
4089             if (i % 10 == 9)
4090             {
4091                 fprintf(debug, "\n");
4092             }
4093         }
4094         fprintf(debug, "\n");
4095     }
4096 }
4097
4098 static int compact_and_copy_vec_at(int ncg, int *move,
4099                                    int *cgindex,
4100                                    int nvec, int vec,
4101                                    rvec *src, gmx_domdec_comm_t *comm,
4102                                    gmx_bool bCompact)
4103 {
4104     int m, icg, i, i0, i1, nrcg;
4105     int home_pos;
4106     int pos_vec[DIM*2];
4107
4108     home_pos = 0;
4109
4110     for (m = 0; m < DIM*2; m++)
4111     {
4112         pos_vec[m] = 0;
4113     }
4114
4115     i0 = 0;
4116     for (icg = 0; icg < ncg; icg++)
4117     {
4118         i1 = cgindex[icg+1];
4119         m  = move[icg];
4120         if (m == -1)
4121         {
4122             if (bCompact)
4123             {
4124                 /* Compact the home array in place */
4125                 for (i = i0; i < i1; i++)
4126                 {
4127                     copy_rvec(src[i], src[home_pos++]);
4128                 }
4129             }
4130         }
4131         else
4132         {
4133             /* Copy to the communication buffer */
4134             nrcg        = i1 - i0;
4135             pos_vec[m] += 1 + vec*nrcg;
4136             for (i = i0; i < i1; i++)
4137             {
4138                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
4139             }
4140             pos_vec[m] += (nvec - vec - 1)*nrcg;
4141         }
4142         if (!bCompact)
4143         {
4144             home_pos += i1 - i0;
4145         }
4146         i0 = i1;
4147     }
4148
4149     return home_pos;
4150 }
4151
4152 static int compact_and_copy_vec_cg(int ncg, int *move,
4153                                    int *cgindex,
4154                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
4155                                    gmx_bool bCompact)
4156 {
4157     int m, icg, i0, i1, nrcg;
4158     int home_pos;
4159     int pos_vec[DIM*2];
4160
4161     home_pos = 0;
4162
4163     for (m = 0; m < DIM*2; m++)
4164     {
4165         pos_vec[m] = 0;
4166     }
4167
4168     i0 = 0;
4169     for (icg = 0; icg < ncg; icg++)
4170     {
4171         i1 = cgindex[icg+1];
4172         m  = move[icg];
4173         if (m == -1)
4174         {
4175             if (bCompact)
4176             {
4177                 /* Compact the home array in place */
4178                 copy_rvec(src[icg], src[home_pos++]);
4179             }
4180         }
4181         else
4182         {
4183             nrcg = i1 - i0;
4184             /* Copy to the communication buffer */
4185             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
4186             pos_vec[m] += 1 + nrcg*nvec;
4187         }
4188         i0 = i1;
4189     }
4190     if (!bCompact)
4191     {
4192         home_pos = ncg;
4193     }
4194
4195     return home_pos;
4196 }
4197
4198 static int compact_ind(int ncg, int *move,
4199                        int *index_gl, int *cgindex,
4200                        int *gatindex,
4201                        gmx_ga2la_t ga2la, char *bLocalCG,
4202                        int *cginfo)
4203 {
4204     int cg, nat, a0, a1, a, a_gl;
4205     int home_pos;
4206
4207     home_pos = 0;
4208     nat      = 0;
4209     for (cg = 0; cg < ncg; cg++)
4210     {
4211         a0 = cgindex[cg];
4212         a1 = cgindex[cg+1];
4213         if (move[cg] == -1)
4214         {
4215             /* Compact the home arrays in place.
4216              * Anything that can be done here avoids access to global arrays.
4217              */
4218             cgindex[home_pos] = nat;
4219             for (a = a0; a < a1; a++)
4220             {
4221                 a_gl          = gatindex[a];
4222                 gatindex[nat] = a_gl;
4223                 /* The cell number stays 0, so we don't need to set it */
4224                 ga2la_change_la(ga2la, a_gl, nat);
4225                 nat++;
4226             }
4227             index_gl[home_pos] = index_gl[cg];
4228             cginfo[home_pos]   = cginfo[cg];
4229             /* The charge group remains local, so bLocalCG does not change */
4230             home_pos++;
4231         }
4232         else
4233         {
4234             /* Clear the global indices */
4235             for (a = a0; a < a1; a++)
4236             {
4237                 ga2la_del(ga2la, gatindex[a]);
4238             }
4239             if (bLocalCG)
4240             {
4241                 bLocalCG[index_gl[cg]] = FALSE;
4242             }
4243         }
4244     }
4245     cgindex[home_pos] = nat;
4246
4247     return home_pos;
4248 }
4249
4250 static void clear_and_mark_ind(int ncg, int *move,
4251                                int *index_gl, int *cgindex, int *gatindex,
4252                                gmx_ga2la_t ga2la, char *bLocalCG,
4253                                int *cell_index)
4254 {
4255     int cg, a0, a1, a;
4256
4257     for (cg = 0; cg < ncg; cg++)
4258     {
4259         if (move[cg] >= 0)
4260         {
4261             a0 = cgindex[cg];
4262             a1 = cgindex[cg+1];
4263             /* Clear the global indices */
4264             for (a = a0; a < a1; a++)
4265             {
4266                 ga2la_del(ga2la, gatindex[a]);
4267             }
4268             if (bLocalCG)
4269             {
4270                 bLocalCG[index_gl[cg]] = FALSE;
4271             }
4272             /* Signal that this cg has moved using the ns cell index.
4273              * Here we set it to -1. fill_grid will change it
4274              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4275              */
4276             cell_index[cg] = -1;
4277         }
4278     }
4279 }
4280
4281 static void print_cg_move(FILE *fplog,
4282                           gmx_domdec_t *dd,
4283                           gmx_large_int_t step, int cg, int dim, int dir,
4284                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4285                           rvec cm_old, rvec cm_new, real pos_d)
4286 {
4287     gmx_domdec_comm_t *comm;
4288     char               buf[22];
4289
4290     comm = dd->comm;
4291
4292     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4293     if (bHaveLimitdAndCMOld)
4294     {
4295         fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4296                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4297     }
4298     else
4299     {
4300         fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4301                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4302     }
4303     fprintf(fplog, "distance out of cell %f\n",
4304             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4305     if (bHaveLimitdAndCMOld)
4306     {
4307         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4308                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4309     }
4310     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4311             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4312     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4313             dim2char(dim),
4314             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4315     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4316             dim2char(dim),
4317             comm->cell_x0[dim], comm->cell_x1[dim]);
4318 }
4319
4320 static void cg_move_error(FILE *fplog,
4321                           gmx_domdec_t *dd,
4322                           gmx_large_int_t step, int cg, int dim, int dir,
4323                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4324                           rvec cm_old, rvec cm_new, real pos_d)
4325 {
4326     if (fplog)
4327     {
4328         print_cg_move(fplog, dd, step, cg, dim, dir,
4329                       bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4330     }
4331     print_cg_move(stderr, dd, step, cg, dim, dir,
4332                   bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4333     gmx_fatal(FARGS,
4334               "A charge group moved too far between two domain decomposition steps\n"
4335               "This usually means that your system is not well equilibrated");
4336 }
4337
4338 static void rotate_state_atom(t_state *state, int a)
4339 {
4340     int est;
4341
4342     for (est = 0; est < estNR; est++)
4343     {
4344         if (EST_DISTR(est) && (state->flags & (1<<est)))
4345         {
4346             switch (est)
4347             {
4348                 case estX:
4349                     /* Rotate the complete state; for a rectangular box only */
4350                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4351                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4352                     break;
4353                 case estV:
4354                     state->v[a][YY] = -state->v[a][YY];
4355                     state->v[a][ZZ] = -state->v[a][ZZ];
4356                     break;
4357                 case estSDX:
4358                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4359                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4360                     break;
4361                 case estCGP:
4362                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4363                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4364                     break;
4365                 case estDISRE_INITF:
4366                 case estDISRE_RM3TAV:
4367                 case estORIRE_INITF:
4368                 case estORIRE_DTAV:
4369                     /* These are distances, so not affected by rotation */
4370                     break;
4371                 default:
4372                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4373             }
4374         }
4375     }
4376 }
4377
4378 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4379 {
4380     if (natoms > comm->moved_nalloc)
4381     {
4382         /* Contents should be preserved here */
4383         comm->moved_nalloc = over_alloc_dd(natoms);
4384         srenew(comm->moved, comm->moved_nalloc);
4385     }
4386
4387     return comm->moved;
4388 }
4389
4390 static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
4391                          gmx_domdec_t *dd,
4392                          t_state *state,
4393                          ivec tric_dir, matrix tcm,
4394                          rvec cell_x0, rvec cell_x1,
4395                          rvec limitd, rvec limit0, rvec limit1,
4396                          const int *cgindex,
4397                          int cg_start, int cg_end,
4398                          rvec *cg_cm,
4399                          int *move)
4400 {
4401     int      npbcdim;
4402     int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4403     int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4404     int      flag;
4405     gmx_bool bScrew;
4406     ivec     dev;
4407     real     inv_ncg, pos_d;
4408     rvec     cm_new;
4409
4410     npbcdim = dd->npbcdim;
4411
4412     for (cg = cg_start; cg < cg_end; cg++)
4413     {
4414         k0   = cgindex[cg];
4415         k1   = cgindex[cg+1];
4416         nrcg = k1 - k0;
4417         if (nrcg == 1)
4418         {
4419             copy_rvec(state->x[k0], cm_new);
4420         }
4421         else
4422         {
4423             inv_ncg = 1.0/nrcg;
4424
4425             clear_rvec(cm_new);
4426             for (k = k0; (k < k1); k++)
4427             {
4428                 rvec_inc(cm_new, state->x[k]);
4429             }
4430             for (d = 0; (d < DIM); d++)
4431             {
4432                 cm_new[d] = inv_ncg*cm_new[d];
4433             }
4434         }
4435
4436         clear_ivec(dev);
4437         /* Do pbc and check DD cell boundary crossings */
4438         for (d = DIM-1; d >= 0; d--)
4439         {
4440             if (dd->nc[d] > 1)
4441             {
4442                 bScrew = (dd->bScrewPBC && d == XX);
4443                 /* Determine the location of this cg in lattice coordinates */
4444                 pos_d = cm_new[d];
4445                 if (tric_dir[d])
4446                 {
4447                     for (d2 = d+1; d2 < DIM; d2++)
4448                     {
4449                         pos_d += cm_new[d2]*tcm[d2][d];
4450                     }
4451                 }
4452                 /* Put the charge group in the triclinic unit-cell */
4453                 if (pos_d >= cell_x1[d])
4454                 {
4455                     if (pos_d >= limit1[d])
4456                     {
4457                         cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
4458                                       cg_cm[cg], cm_new, pos_d);
4459                     }
4460                     dev[d] = 1;
4461                     if (dd->ci[d] == dd->nc[d] - 1)
4462                     {
4463                         rvec_dec(cm_new, state->box[d]);
4464                         if (bScrew)
4465                         {
4466                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4467                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4468                         }
4469                         for (k = k0; (k < k1); k++)
4470                         {
4471                             rvec_dec(state->x[k], state->box[d]);
4472                             if (bScrew)
4473                             {
4474                                 rotate_state_atom(state, k);
4475                             }
4476                         }
4477                     }
4478                 }
4479                 else if (pos_d < cell_x0[d])
4480                 {
4481                     if (pos_d < limit0[d])
4482                     {
4483                         cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
4484                                       cg_cm[cg], cm_new, pos_d);
4485                     }
4486                     dev[d] = -1;
4487                     if (dd->ci[d] == 0)
4488                     {
4489                         rvec_inc(cm_new, state->box[d]);
4490                         if (bScrew)
4491                         {
4492                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4493                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4494                         }
4495                         for (k = k0; (k < k1); k++)
4496                         {
4497                             rvec_inc(state->x[k], state->box[d]);
4498                             if (bScrew)
4499                             {
4500                                 rotate_state_atom(state, k);
4501                             }
4502                         }
4503                     }
4504                 }
4505             }
4506             else if (d < npbcdim)
4507             {
4508                 /* Put the charge group in the rectangular unit-cell */
4509                 while (cm_new[d] >= state->box[d][d])
4510                 {
4511                     rvec_dec(cm_new, state->box[d]);
4512                     for (k = k0; (k < k1); k++)
4513                     {
4514                         rvec_dec(state->x[k], state->box[d]);
4515                     }
4516                 }
4517                 while (cm_new[d] < 0)
4518                 {
4519                     rvec_inc(cm_new, state->box[d]);
4520                     for (k = k0; (k < k1); k++)
4521                     {
4522                         rvec_inc(state->x[k], state->box[d]);
4523                     }
4524                 }
4525             }
4526         }
4527
4528         copy_rvec(cm_new, cg_cm[cg]);
4529
4530         /* Determine where this cg should go */
4531         flag = 0;
4532         mc   = -1;
4533         for (d = 0; d < dd->ndim; d++)
4534         {
4535             dim = dd->dim[d];
4536             if (dev[dim] == 1)
4537             {
4538                 flag |= DD_FLAG_FW(d);
4539                 if (mc == -1)
4540                 {
4541                     mc = d*2;
4542                 }
4543             }
4544             else if (dev[dim] == -1)
4545             {
4546                 flag |= DD_FLAG_BW(d);
4547                 if (mc == -1)
4548                 {
4549                     if (dd->nc[dim] > 2)
4550                     {
4551                         mc = d*2 + 1;
4552                     }
4553                     else
4554                     {
4555                         mc = d*2;
4556                     }
4557                 }
4558             }
4559         }
4560         /* Temporarily store the flag in move */
4561         move[cg] = mc + flag;
4562     }
4563 }
4564
4565 static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step,
4566                                gmx_domdec_t *dd, ivec tric_dir,
4567                                t_state *state, rvec **f,
4568                                t_forcerec *fr, t_mdatoms *md,
4569                                gmx_bool bCompact,
4570                                t_nrnb *nrnb,
4571                                int *ncg_stay_home,
4572                                int *ncg_moved)
4573 {
4574     int               *move;
4575     int                npbcdim;
4576     int                ncg[DIM*2], nat[DIM*2];
4577     int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4578     int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4579     int                sbuf[2], rbuf[2];
4580     int                home_pos_cg, home_pos_at, buf_pos;
4581     int                flag;
4582     gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4583     gmx_bool           bScrew;
4584     ivec               dev;
4585     real               inv_ncg, pos_d;
4586     matrix             tcm;
4587     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
4588     atom_id           *cgindex;
4589     cginfo_mb_t       *cginfo_mb;
4590     gmx_domdec_comm_t *comm;
4591     int               *moved;
4592     int                nthread, thread;
4593
4594     if (dd->bScrewPBC)
4595     {
4596         check_screw_box(state->box);
4597     }
4598
4599     comm  = dd->comm;
4600     if (fr->cutoff_scheme == ecutsGROUP)
4601     {
4602         cg_cm = fr->cg_cm;
4603     }
4604
4605     for (i = 0; i < estNR; i++)
4606     {
4607         if (EST_DISTR(i))
4608         {
4609             switch (i)
4610             {
4611                 case estX: /* Always present */ break;
4612                 case estV:   bV   = (state->flags & (1<<i)); break;
4613                 case estSDX: bSDX = (state->flags & (1<<i)); break;
4614                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4615                 case estLD_RNG:
4616                 case estLD_RNGI:
4617                 case estDISRE_INITF:
4618                 case estDISRE_RM3TAV:
4619                 case estORIRE_INITF:
4620                 case estORIRE_DTAV:
4621                     /* No processing required */
4622                     break;
4623                 default:
4624                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4625             }
4626         }
4627     }
4628
4629     if (dd->ncg_tot > comm->nalloc_int)
4630     {
4631         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4632         srenew(comm->buf_int, comm->nalloc_int);
4633     }
4634     move = comm->buf_int;
4635
4636     /* Clear the count */
4637     for (c = 0; c < dd->ndim*2; c++)
4638     {
4639         ncg[c] = 0;
4640         nat[c] = 0;
4641     }
4642
4643     npbcdim = dd->npbcdim;
4644
4645     for (d = 0; (d < DIM); d++)
4646     {
4647         limitd[d] = dd->comm->cellsize_min[d];
4648         if (d >= npbcdim && dd->ci[d] == 0)
4649         {
4650             cell_x0[d] = -GMX_FLOAT_MAX;
4651         }
4652         else
4653         {
4654             cell_x0[d] = comm->cell_x0[d];
4655         }
4656         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4657         {
4658             cell_x1[d] = GMX_FLOAT_MAX;
4659         }
4660         else
4661         {
4662             cell_x1[d] = comm->cell_x1[d];
4663         }
4664         if (d < npbcdim)
4665         {
4666             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4667             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4668         }
4669         else
4670         {
4671             /* We check after communication if a charge group moved
4672              * more than one cell. Set the pre-comm check limit to float_max.
4673              */
4674             limit0[d] = -GMX_FLOAT_MAX;
4675             limit1[d] =  GMX_FLOAT_MAX;
4676         }
4677     }
4678
4679     make_tric_corr_matrix(npbcdim, state->box, tcm);
4680
4681     cgindex = dd->cgindex;
4682
4683     nthread = gmx_omp_nthreads_get(emntDomdec);
4684
4685     /* Compute the center of geometry for all home charge groups
4686      * and put them in the box and determine where they should go.
4687      */
4688 #pragma omp parallel for num_threads(nthread) schedule(static)
4689     for (thread = 0; thread < nthread; thread++)
4690     {
4691         calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4692                      cell_x0, cell_x1, limitd, limit0, limit1,
4693                      cgindex,
4694                      ( thread   *dd->ncg_home)/nthread,
4695                      ((thread+1)*dd->ncg_home)/nthread,
4696                      fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4697                      move);
4698     }
4699
4700     for (cg = 0; cg < dd->ncg_home; cg++)
4701     {
4702         if (move[cg] >= 0)
4703         {
4704             mc       = move[cg];
4705             flag     = mc & ~DD_FLAG_NRCG;
4706             mc       = mc & DD_FLAG_NRCG;
4707             move[cg] = mc;
4708
4709             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4710             {
4711                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4712                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4713             }
4714             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4715             /* We store the cg size in the lower 16 bits
4716              * and the place where the charge group should go
4717              * in the next 6 bits. This saves some communication volume.
4718              */
4719             nrcg = cgindex[cg+1] - cgindex[cg];
4720             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4721             ncg[mc] += 1;
4722             nat[mc] += nrcg;
4723         }
4724     }
4725
4726     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4727     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4728
4729     *ncg_moved = 0;
4730     for (i = 0; i < dd->ndim*2; i++)
4731     {
4732         *ncg_moved += ncg[i];
4733     }
4734
4735     nvec = 1;
4736     if (bV)
4737     {
4738         nvec++;
4739     }
4740     if (bSDX)
4741     {
4742         nvec++;
4743     }
4744     if (bCGP)
4745     {
4746         nvec++;
4747     }
4748
4749     /* Make sure the communication buffers are large enough */
4750     for (mc = 0; mc < dd->ndim*2; mc++)
4751     {
4752         nvr = ncg[mc] + nat[mc]*nvec;
4753         if (nvr > comm->cgcm_state_nalloc[mc])
4754         {
4755             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4756             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4757         }
4758     }
4759
4760     switch (fr->cutoff_scheme)
4761     {
4762         case ecutsGROUP:
4763             /* Recalculating cg_cm might be cheaper than communicating,
4764              * but that could give rise to rounding issues.
4765              */
4766             home_pos_cg =
4767                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4768                                         nvec, cg_cm, comm, bCompact);
4769             break;
4770         case ecutsVERLET:
4771             /* Without charge groups we send the moved atom coordinates
4772              * over twice. This is so the code below can be used without
4773              * many conditionals for both for with and without charge groups.
4774              */
4775             home_pos_cg =
4776                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4777                                         nvec, state->x, comm, FALSE);
4778             if (bCompact)
4779             {
4780                 home_pos_cg -= *ncg_moved;
4781             }
4782             break;
4783         default:
4784             gmx_incons("unimplemented");
4785             home_pos_cg = 0;
4786     }
4787
4788     vec         = 0;
4789     home_pos_at =
4790         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4791                                 nvec, vec++, state->x, comm, bCompact);
4792     if (bV)
4793     {
4794         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4795                                 nvec, vec++, state->v, comm, bCompact);
4796     }
4797     if (bSDX)
4798     {
4799         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4800                                 nvec, vec++, state->sd_X, comm, bCompact);
4801     }
4802     if (bCGP)
4803     {
4804         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4805                                 nvec, vec++, state->cg_p, comm, bCompact);
4806     }
4807
4808     if (bCompact)
4809     {
4810         compact_ind(dd->ncg_home, move,
4811                     dd->index_gl, dd->cgindex, dd->gatindex,
4812                     dd->ga2la, comm->bLocalCG,
4813                     fr->cginfo);
4814     }
4815     else
4816     {
4817         if (fr->cutoff_scheme == ecutsVERLET)
4818         {
4819             moved = get_moved(comm, dd->ncg_home);
4820
4821             for (k = 0; k < dd->ncg_home; k++)
4822             {
4823                 moved[k] = 0;
4824             }
4825         }
4826         else
4827         {
4828             moved = fr->ns.grid->cell_index;
4829         }
4830
4831         clear_and_mark_ind(dd->ncg_home, move,
4832                            dd->index_gl, dd->cgindex, dd->gatindex,
4833                            dd->ga2la, comm->bLocalCG,
4834                            moved);
4835     }
4836
4837     cginfo_mb = fr->cginfo_mb;
4838
4839     *ncg_stay_home = home_pos_cg;
4840     for (d = 0; d < dd->ndim; d++)
4841     {
4842         dim      = dd->dim[d];
4843         ncg_recv = 0;
4844         nat_recv = 0;
4845         nvr      = 0;
4846         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4847         {
4848             cdd = d*2 + dir;
4849             /* Communicate the cg and atom counts */
4850             sbuf[0] = ncg[cdd];
4851             sbuf[1] = nat[cdd];
4852             if (debug)
4853             {
4854                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4855                         d, dir, sbuf[0], sbuf[1]);
4856             }
4857             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4858
4859             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4860             {
4861                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4862                 srenew(comm->buf_int, comm->nalloc_int);
4863             }
4864
4865             /* Communicate the charge group indices, sizes and flags */
4866             dd_sendrecv_int(dd, d, dir,
4867                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4868                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4869
4870             nvs = ncg[cdd] + nat[cdd]*nvec;
4871             i   = rbuf[0]  + rbuf[1] *nvec;
4872             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4873
4874             /* Communicate cgcm and state */
4875             dd_sendrecv_rvec(dd, d, dir,
4876                              comm->cgcm_state[cdd], nvs,
4877                              comm->vbuf.v+nvr, i);
4878             ncg_recv += rbuf[0];
4879             nat_recv += rbuf[1];
4880             nvr      += i;
4881         }
4882
4883         /* Process the received charge groups */
4884         buf_pos = 0;
4885         for (cg = 0; cg < ncg_recv; cg++)
4886         {
4887             flag = comm->buf_int[cg*DD_CGIBS+1];
4888
4889             if (dim >= npbcdim && dd->nc[dim] > 2)
4890             {
4891                 /* No pbc in this dim and more than one domain boundary.
4892                  * We do a separate check if a charge group didn't move too far.
4893                  */
4894                 if (((flag & DD_FLAG_FW(d)) &&
4895                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4896                     ((flag & DD_FLAG_BW(d)) &&
4897                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4898                 {
4899                     cg_move_error(fplog, dd, step, cg, dim,
4900                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4901                                   FALSE, 0,
4902                                   comm->vbuf.v[buf_pos],
4903                                   comm->vbuf.v[buf_pos],
4904                                   comm->vbuf.v[buf_pos][dim]);
4905                 }
4906             }
4907
4908             mc = -1;
4909             if (d < dd->ndim-1)
4910             {
4911                 /* Check which direction this cg should go */
4912                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4913                 {
4914                     if (dd->bGridJump)
4915                     {
4916                         /* The cell boundaries for dimension d2 are not equal
4917                          * for each cell row of the lower dimension(s),
4918                          * therefore we might need to redetermine where
4919                          * this cg should go.
4920                          */
4921                         dim2 = dd->dim[d2];
4922                         /* If this cg crosses the box boundary in dimension d2
4923                          * we can use the communicated flag, so we do not
4924                          * have to worry about pbc.
4925                          */
4926                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4927                                (flag & DD_FLAG_FW(d2))) ||
4928                               (dd->ci[dim2] == 0 &&
4929                                (flag & DD_FLAG_BW(d2)))))
4930                         {
4931                             /* Clear the two flags for this dimension */
4932                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4933                             /* Determine the location of this cg
4934                              * in lattice coordinates
4935                              */
4936                             pos_d = comm->vbuf.v[buf_pos][dim2];
4937                             if (tric_dir[dim2])
4938                             {
4939                                 for (d3 = dim2+1; d3 < DIM; d3++)
4940                                 {
4941                                     pos_d +=
4942                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4943                                 }
4944                             }
4945                             /* Check of we are not at the box edge.
4946                              * pbc is only handled in the first step above,
4947                              * but this check could move over pbc while
4948                              * the first step did not due to different rounding.
4949                              */
4950                             if (pos_d >= cell_x1[dim2] &&
4951                                 dd->ci[dim2] != dd->nc[dim2]-1)
4952                             {
4953                                 flag |= DD_FLAG_FW(d2);
4954                             }
4955                             else if (pos_d < cell_x0[dim2] &&
4956                                      dd->ci[dim2] != 0)
4957                             {
4958                                 flag |= DD_FLAG_BW(d2);
4959                             }
4960                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4961                         }
4962                     }
4963                     /* Set to which neighboring cell this cg should go */
4964                     if (flag & DD_FLAG_FW(d2))
4965                     {
4966                         mc = d2*2;
4967                     }
4968                     else if (flag & DD_FLAG_BW(d2))
4969                     {
4970                         if (dd->nc[dd->dim[d2]] > 2)
4971                         {
4972                             mc = d2*2+1;
4973                         }
4974                         else
4975                         {
4976                             mc = d2*2;
4977                         }
4978                     }
4979                 }
4980             }
4981
4982             nrcg = flag & DD_FLAG_NRCG;
4983             if (mc == -1)
4984             {
4985                 if (home_pos_cg+1 > dd->cg_nalloc)
4986                 {
4987                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4988                     srenew(dd->index_gl, dd->cg_nalloc);
4989                     srenew(dd->cgindex, dd->cg_nalloc+1);
4990                 }
4991                 /* Set the global charge group index and size */
4992                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
4993                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4994                 /* Copy the state from the buffer */
4995                 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
4996                 if (fr->cutoff_scheme == ecutsGROUP)
4997                 {
4998                     cg_cm = fr->cg_cm;
4999                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
5000                 }
5001                 buf_pos++;
5002
5003                 /* Set the cginfo */
5004                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
5005                                                    dd->index_gl[home_pos_cg]);
5006                 if (comm->bLocalCG)
5007                 {
5008                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
5009                 }
5010
5011                 if (home_pos_at+nrcg > state->nalloc)
5012                 {
5013                     dd_realloc_state(state, f, home_pos_at+nrcg);
5014                 }
5015                 for (i = 0; i < nrcg; i++)
5016                 {
5017                     copy_rvec(comm->vbuf.v[buf_pos++],
5018                               state->x[home_pos_at+i]);
5019                 }
5020                 if (bV)
5021                 {
5022                     for (i = 0; i < nrcg; i++)
5023                     {
5024                         copy_rvec(comm->vbuf.v[buf_pos++],
5025                                   state->v[home_pos_at+i]);
5026                     }
5027                 }
5028                 if (bSDX)
5029                 {
5030                     for (i = 0; i < nrcg; i++)
5031                     {
5032                         copy_rvec(comm->vbuf.v[buf_pos++],
5033                                   state->sd_X[home_pos_at+i]);
5034                     }
5035                 }
5036                 if (bCGP)
5037                 {
5038                     for (i = 0; i < nrcg; i++)
5039                     {
5040                         copy_rvec(comm->vbuf.v[buf_pos++],
5041                                   state->cg_p[home_pos_at+i]);
5042                     }
5043                 }
5044                 home_pos_cg += 1;
5045                 home_pos_at += nrcg;
5046             }
5047             else
5048             {
5049                 /* Reallocate the buffers if necessary  */
5050                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
5051                 {
5052                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
5053                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
5054                 }
5055                 nvr = ncg[mc] + nat[mc]*nvec;
5056                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
5057                 {
5058                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
5059                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
5060                 }
5061                 /* Copy from the receive to the send buffers */
5062                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
5063                        comm->buf_int + cg*DD_CGIBS,
5064                        DD_CGIBS*sizeof(int));
5065                 memcpy(comm->cgcm_state[mc][nvr],
5066                        comm->vbuf.v[buf_pos],
5067                        (1+nrcg*nvec)*sizeof(rvec));
5068                 buf_pos += 1 + nrcg*nvec;
5069                 ncg[mc] += 1;
5070                 nat[mc] += nrcg;
5071             }
5072         }
5073     }
5074
5075     /* With sorting (!bCompact) the indices are now only partially up to date
5076      * and ncg_home and nat_home are not the real count, since there are
5077      * "holes" in the arrays for the charge groups that moved to neighbors.
5078      */
5079     if (fr->cutoff_scheme == ecutsVERLET)
5080     {
5081         moved = get_moved(comm, home_pos_cg);
5082
5083         for (i = dd->ncg_home; i < home_pos_cg; i++)
5084         {
5085             moved[i] = 0;
5086         }
5087     }
5088     dd->ncg_home = home_pos_cg;
5089     dd->nat_home = home_pos_at;
5090
5091     if (debug)
5092     {
5093         fprintf(debug,
5094                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5095                 *ncg_moved, dd->ncg_home-*ncg_moved);
5096
5097     }
5098 }
5099
5100 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
5101 {
5102     dd->comm->cycl[ddCycl] += cycles;
5103     dd->comm->cycl_n[ddCycl]++;
5104     if (cycles > dd->comm->cycl_max[ddCycl])
5105     {
5106         dd->comm->cycl_max[ddCycl] = cycles;
5107     }
5108 }
5109
5110 static double force_flop_count(t_nrnb *nrnb)
5111 {
5112     int         i;
5113     double      sum;
5114     const char *name;
5115
5116     sum = 0;
5117     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
5118     {
5119         /* To get closer to the real timings, we half the count
5120          * for the normal loops and again half it for water loops.
5121          */
5122         name = nrnb_str(i);
5123         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5124         {
5125             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5126         }
5127         else
5128         {
5129             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5130         }
5131     }
5132     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
5133     {
5134         name = nrnb_str(i);
5135         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5136         {
5137             sum += nrnb->n[i]*cost_nrnb(i);
5138         }
5139     }
5140     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
5141     {
5142         sum += nrnb->n[i]*cost_nrnb(i);
5143     }
5144
5145     return sum;
5146 }
5147
5148 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
5149 {
5150     if (dd->comm->eFlop)
5151     {
5152         dd->comm->flop -= force_flop_count(nrnb);
5153     }
5154 }
5155 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
5156 {
5157     if (dd->comm->eFlop)
5158     {
5159         dd->comm->flop += force_flop_count(nrnb);
5160         dd->comm->flop_n++;
5161     }
5162 }
5163
5164 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5165 {
5166     int i;
5167
5168     for (i = 0; i < ddCyclNr; i++)
5169     {
5170         dd->comm->cycl[i]     = 0;
5171         dd->comm->cycl_n[i]   = 0;
5172         dd->comm->cycl_max[i] = 0;
5173     }
5174     dd->comm->flop   = 0;
5175     dd->comm->flop_n = 0;
5176 }
5177
5178 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
5179 {
5180     gmx_domdec_comm_t *comm;
5181     gmx_domdec_load_t *load;
5182     gmx_domdec_root_t *root = NULL;
5183     int                d, dim, cid, i, pos;
5184     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
5185     gmx_bool           bSepPME;
5186
5187     if (debug)
5188     {
5189         fprintf(debug, "get_load_distribution start\n");
5190     }
5191
5192     wallcycle_start(wcycle, ewcDDCOMMLOAD);
5193
5194     comm = dd->comm;
5195
5196     bSepPME = (dd->pme_nodeid >= 0);
5197
5198     for (d = dd->ndim-1; d >= 0; d--)
5199     {
5200         dim = dd->dim[d];
5201         /* Check if we participate in the communication in this dimension */
5202         if (d == dd->ndim-1 ||
5203             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5204         {
5205             load = &comm->load[d];
5206             if (dd->bGridJump)
5207             {
5208                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5209             }
5210             pos = 0;
5211             if (d == dd->ndim-1)
5212             {
5213                 sbuf[pos++] = dd_force_load(comm);
5214                 sbuf[pos++] = sbuf[0];
5215                 if (dd->bGridJump)
5216                 {
5217                     sbuf[pos++] = sbuf[0];
5218                     sbuf[pos++] = cell_frac;
5219                     if (d > 0)
5220                     {
5221                         sbuf[pos++] = comm->cell_f_max0[d];
5222                         sbuf[pos++] = comm->cell_f_min1[d];
5223                     }
5224                 }
5225                 if (bSepPME)
5226                 {
5227                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5228                     sbuf[pos++] = comm->cycl[ddCyclPME];
5229                 }
5230             }
5231             else
5232             {
5233                 sbuf[pos++] = comm->load[d+1].sum;
5234                 sbuf[pos++] = comm->load[d+1].max;
5235                 if (dd->bGridJump)
5236                 {
5237                     sbuf[pos++] = comm->load[d+1].sum_m;
5238                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5239                     sbuf[pos++] = comm->load[d+1].flags;
5240                     if (d > 0)
5241                     {
5242                         sbuf[pos++] = comm->cell_f_max0[d];
5243                         sbuf[pos++] = comm->cell_f_min1[d];
5244                     }
5245                 }
5246                 if (bSepPME)
5247                 {
5248                     sbuf[pos++] = comm->load[d+1].mdf;
5249                     sbuf[pos++] = comm->load[d+1].pme;
5250                 }
5251             }
5252             load->nload = pos;
5253             /* Communicate a row in DD direction d.
5254              * The communicators are setup such that the root always has rank 0.
5255              */
5256 #ifdef GMX_MPI
5257             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5258                        load->load, load->nload*sizeof(float), MPI_BYTE,
5259                        0, comm->mpi_comm_load[d]);
5260 #endif
5261             if (dd->ci[dim] == dd->master_ci[dim])
5262             {
5263                 /* We are the root, process this row */
5264                 if (comm->bDynLoadBal)
5265                 {
5266                     root = comm->root[d];
5267                 }
5268                 load->sum      = 0;
5269                 load->max      = 0;
5270                 load->sum_m    = 0;
5271                 load->cvol_min = 1;
5272                 load->flags    = 0;
5273                 load->mdf      = 0;
5274                 load->pme      = 0;
5275                 pos            = 0;
5276                 for (i = 0; i < dd->nc[dim]; i++)
5277                 {
5278                     load->sum += load->load[pos++];
5279                     load->max  = max(load->max, load->load[pos]);
5280                     pos++;
5281                     if (dd->bGridJump)
5282                     {
5283                         if (root->bLimited)
5284                         {
5285                             /* This direction could not be load balanced properly,
5286                              * therefore we need to use the maximum iso the average load.
5287                              */
5288                             load->sum_m = max(load->sum_m, load->load[pos]);
5289                         }
5290                         else
5291                         {
5292                             load->sum_m += load->load[pos];
5293                         }
5294                         pos++;
5295                         load->cvol_min = min(load->cvol_min, load->load[pos]);
5296                         pos++;
5297                         if (d < dd->ndim-1)
5298                         {
5299                             load->flags = (int)(load->load[pos++] + 0.5);
5300                         }
5301                         if (d > 0)
5302                         {
5303                             root->cell_f_max0[i] = load->load[pos++];
5304                             root->cell_f_min1[i] = load->load[pos++];
5305                         }
5306                     }
5307                     if (bSepPME)
5308                     {
5309                         load->mdf = max(load->mdf, load->load[pos]);
5310                         pos++;
5311                         load->pme = max(load->pme, load->load[pos]);
5312                         pos++;
5313                     }
5314                 }
5315                 if (comm->bDynLoadBal && root->bLimited)
5316                 {
5317                     load->sum_m *= dd->nc[dim];
5318                     load->flags |= (1<<d);
5319                 }
5320             }
5321         }
5322     }
5323
5324     if (DDMASTER(dd))
5325     {
5326         comm->nload      += dd_load_count(comm);
5327         comm->load_step  += comm->cycl[ddCyclStep];
5328         comm->load_sum   += comm->load[0].sum;
5329         comm->load_max   += comm->load[0].max;
5330         if (comm->bDynLoadBal)
5331         {
5332             for (d = 0; d < dd->ndim; d++)
5333             {
5334                 if (comm->load[0].flags & (1<<d))
5335                 {
5336                     comm->load_lim[d]++;
5337                 }
5338             }
5339         }
5340         if (bSepPME)
5341         {
5342             comm->load_mdf += comm->load[0].mdf;
5343             comm->load_pme += comm->load[0].pme;
5344         }
5345     }
5346
5347     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5348
5349     if (debug)
5350     {
5351         fprintf(debug, "get_load_distribution finished\n");
5352     }
5353 }
5354
5355 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5356 {
5357     /* Return the relative performance loss on the total run time
5358      * due to the force calculation load imbalance.
5359      */
5360     if (dd->comm->nload > 0)
5361     {
5362         return
5363             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5364             (dd->comm->load_step*dd->nnodes);
5365     }
5366     else
5367     {
5368         return 0;
5369     }
5370 }
5371
5372 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5373 {
5374     char               buf[STRLEN];
5375     int                npp, npme, nnodes, d, limp;
5376     float              imbal, pme_f_ratio, lossf, lossp = 0;
5377     gmx_bool           bLim;
5378     gmx_domdec_comm_t *comm;
5379
5380     comm = dd->comm;
5381     if (DDMASTER(dd) && comm->nload > 0)
5382     {
5383         npp    = dd->nnodes;
5384         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5385         nnodes = npp + npme;
5386         imbal  = comm->load_max*npp/comm->load_sum - 1;
5387         lossf  = dd_force_imb_perf_loss(dd);
5388         sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5389         fprintf(fplog, "%s", buf);
5390         fprintf(stderr, "\n");
5391         fprintf(stderr, "%s", buf);
5392         sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5393         fprintf(fplog, "%s", buf);
5394         fprintf(stderr, "%s", buf);
5395         bLim = FALSE;
5396         if (comm->bDynLoadBal)
5397         {
5398             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5399             for (d = 0; d < dd->ndim; d++)
5400             {
5401                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5402                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5403                 if (limp >= 50)
5404                 {
5405                     bLim = TRUE;
5406                 }
5407             }
5408             sprintf(buf+strlen(buf), "\n");
5409             fprintf(fplog, "%s", buf);
5410             fprintf(stderr, "%s", buf);
5411         }
5412         if (npme > 0)
5413         {
5414             pme_f_ratio = comm->load_pme/comm->load_mdf;
5415             lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
5416             if (lossp <= 0)
5417             {
5418                 lossp *= (float)npme/(float)nnodes;
5419             }
5420             else
5421             {
5422                 lossp *= (float)npp/(float)nnodes;
5423             }
5424             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5425             fprintf(fplog, "%s", buf);
5426             fprintf(stderr, "%s", buf);
5427             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5428             fprintf(fplog, "%s", buf);
5429             fprintf(stderr, "%s", buf);
5430         }
5431         fprintf(fplog, "\n");
5432         fprintf(stderr, "\n");
5433
5434         if (lossf >= DD_PERF_LOSS)
5435         {
5436             sprintf(buf,
5437                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5438                     "      in the domain decomposition.\n", lossf*100);
5439             if (!comm->bDynLoadBal)
5440             {
5441                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5442             }
5443             else if (bLim)
5444             {
5445                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5446             }
5447             fprintf(fplog, "%s\n", buf);
5448             fprintf(stderr, "%s\n", buf);
5449         }
5450         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5451         {
5452             sprintf(buf,
5453                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5454                     "      had %s work to do than the PP nodes.\n"
5455                     "      You might want to %s the number of PME nodes\n"
5456                     "      or %s the cut-off and the grid spacing.\n",
5457                     fabs(lossp*100),
5458                     (lossp < 0) ? "less"     : "more",
5459                     (lossp < 0) ? "decrease" : "increase",
5460                     (lossp < 0) ? "decrease" : "increase");
5461             fprintf(fplog, "%s\n", buf);
5462             fprintf(stderr, "%s\n", buf);
5463         }
5464     }
5465 }
5466
5467 static float dd_vol_min(gmx_domdec_t *dd)
5468 {
5469     return dd->comm->load[0].cvol_min*dd->nnodes;
5470 }
5471
5472 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5473 {
5474     return dd->comm->load[0].flags;
5475 }
5476
5477 static float dd_f_imbal(gmx_domdec_t *dd)
5478 {
5479     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5480 }
5481
5482 float dd_pme_f_ratio(gmx_domdec_t *dd)
5483 {
5484     if (dd->comm->cycl_n[ddCyclPME] > 0)
5485     {
5486         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5487     }
5488     else
5489     {
5490         return -1.0;
5491     }
5492 }
5493
5494 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step)
5495 {
5496     int  flags, d;
5497     char buf[22];
5498
5499     flags = dd_load_flags(dd);
5500     if (flags)
5501     {
5502         fprintf(fplog,
5503                 "DD  load balancing is limited by minimum cell size in dimension");
5504         for (d = 0; d < dd->ndim; d++)
5505         {
5506             if (flags & (1<<d))
5507             {
5508                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5509             }
5510         }
5511         fprintf(fplog, "\n");
5512     }
5513     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5514     if (dd->comm->bDynLoadBal)
5515     {
5516         fprintf(fplog, "  vol min/aver %5.3f%c",
5517                 dd_vol_min(dd), flags ? '!' : ' ');
5518     }
5519     fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5520     if (dd->comm->cycl_n[ddCyclPME])
5521     {
5522         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5523     }
5524     fprintf(fplog, "\n\n");
5525 }
5526
5527 static void dd_print_load_verbose(gmx_domdec_t *dd)
5528 {
5529     if (dd->comm->bDynLoadBal)
5530     {
5531         fprintf(stderr, "vol %4.2f%c ",
5532                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5533     }
5534     fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5535     if (dd->comm->cycl_n[ddCyclPME])
5536     {
5537         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5538     }
5539 }
5540
5541 #ifdef GMX_MPI
5542 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5543 {
5544     MPI_Comm           c_row;
5545     int                dim, i, rank;
5546     ivec               loc_c;
5547     gmx_domdec_root_t *root;
5548     gmx_bool           bPartOfGroup = FALSE;
5549
5550     dim = dd->dim[dim_ind];
5551     copy_ivec(loc, loc_c);
5552     for (i = 0; i < dd->nc[dim]; i++)
5553     {
5554         loc_c[dim] = i;
5555         rank       = dd_index(dd->nc, loc_c);
5556         if (rank == dd->rank)
5557         {
5558             /* This process is part of the group */
5559             bPartOfGroup = TRUE;
5560         }
5561     }
5562     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5563                    &c_row);
5564     if (bPartOfGroup)
5565     {
5566         dd->comm->mpi_comm_load[dim_ind] = c_row;
5567         if (dd->comm->eDLB != edlbNO)
5568         {
5569             if (dd->ci[dim] == dd->master_ci[dim])
5570             {
5571                 /* This is the root process of this row */
5572                 snew(dd->comm->root[dim_ind], 1);
5573                 root = dd->comm->root[dim_ind];
5574                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5575                 snew(root->old_cell_f, dd->nc[dim]+1);
5576                 snew(root->bCellMin, dd->nc[dim]);
5577                 if (dim_ind > 0)
5578                 {
5579                     snew(root->cell_f_max0, dd->nc[dim]);
5580                     snew(root->cell_f_min1, dd->nc[dim]);
5581                     snew(root->bound_min, dd->nc[dim]);
5582                     snew(root->bound_max, dd->nc[dim]);
5583                 }
5584                 snew(root->buf_ncd, dd->nc[dim]);
5585             }
5586             else
5587             {
5588                 /* This is not a root process, we only need to receive cell_f */
5589                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5590             }
5591         }
5592         if (dd->ci[dim] == dd->master_ci[dim])
5593         {
5594             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5595         }
5596     }
5597 }
5598 #endif
5599
5600 static void make_load_communicators(gmx_domdec_t *dd)
5601 {
5602 #ifdef GMX_MPI
5603     int  dim0, dim1, i, j;
5604     ivec loc;
5605
5606     if (debug)
5607     {
5608         fprintf(debug, "Making load communicators\n");
5609     }
5610
5611     snew(dd->comm->load, dd->ndim);
5612     snew(dd->comm->mpi_comm_load, dd->ndim);
5613
5614     clear_ivec(loc);
5615     make_load_communicator(dd, 0, loc);
5616     if (dd->ndim > 1)
5617     {
5618         dim0 = dd->dim[0];
5619         for (i = 0; i < dd->nc[dim0]; i++)
5620         {
5621             loc[dim0] = i;
5622             make_load_communicator(dd, 1, loc);
5623         }
5624     }
5625     if (dd->ndim > 2)
5626     {
5627         dim0 = dd->dim[0];
5628         for (i = 0; i < dd->nc[dim0]; i++)
5629         {
5630             loc[dim0] = i;
5631             dim1      = dd->dim[1];
5632             for (j = 0; j < dd->nc[dim1]; j++)
5633             {
5634                 loc[dim1] = j;
5635                 make_load_communicator(dd, 2, loc);
5636             }
5637         }
5638     }
5639
5640     if (debug)
5641     {
5642         fprintf(debug, "Finished making load communicators\n");
5643     }
5644 #endif
5645 }
5646
5647 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5648 {
5649     gmx_bool                bZYX;
5650     int                     d, dim, i, j, m;
5651     ivec                    tmp, s;
5652     int                     nzone, nzonep;
5653     ivec                    dd_zp[DD_MAXIZONE];
5654     gmx_domdec_zones_t     *zones;
5655     gmx_domdec_ns_ranges_t *izone;
5656
5657     for (d = 0; d < dd->ndim; d++)
5658     {
5659         dim = dd->dim[d];
5660         copy_ivec(dd->ci, tmp);
5661         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5662         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5663         copy_ivec(dd->ci, tmp);
5664         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5665         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5666         if (debug)
5667         {
5668             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5669                     dd->rank, dim,
5670                     dd->neighbor[d][0],
5671                     dd->neighbor[d][1]);
5672         }
5673     }
5674
5675     if (fplog)
5676     {
5677         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5678                 dd->ndim,
5679                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5680                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5681     }
5682     switch (dd->ndim)
5683     {
5684         case 3:
5685             nzone  = dd_z3n;
5686             nzonep = dd_zp3n;
5687             for (i = 0; i < nzonep; i++)
5688             {
5689                 copy_ivec(dd_zp3[i], dd_zp[i]);
5690             }
5691             break;
5692         case 2:
5693             nzone  = dd_z2n;
5694             nzonep = dd_zp2n;
5695             for (i = 0; i < nzonep; i++)
5696             {
5697                 copy_ivec(dd_zp2[i], dd_zp[i]);
5698             }
5699             break;
5700         case 1:
5701             nzone  = dd_z1n;
5702             nzonep = dd_zp1n;
5703             for (i = 0; i < nzonep; i++)
5704             {
5705                 copy_ivec(dd_zp1[i], dd_zp[i]);
5706             }
5707             break;
5708         default:
5709             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5710             nzone  = 0;
5711             nzonep = 0;
5712     }
5713
5714     zones = &dd->comm->zones;
5715
5716     for (i = 0; i < nzone; i++)
5717     {
5718         m = 0;
5719         clear_ivec(zones->shift[i]);
5720         for (d = 0; d < dd->ndim; d++)
5721         {
5722             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5723         }
5724     }
5725
5726     zones->n = nzone;
5727     for (i = 0; i < nzone; i++)
5728     {
5729         for (d = 0; d < DIM; d++)
5730         {
5731             s[d] = dd->ci[d] - zones->shift[i][d];
5732             if (s[d] < 0)
5733             {
5734                 s[d] += dd->nc[d];
5735             }
5736             else if (s[d] >= dd->nc[d])
5737             {
5738                 s[d] -= dd->nc[d];
5739             }
5740         }
5741     }
5742     zones->nizone = nzonep;
5743     for (i = 0; i < zones->nizone; i++)
5744     {
5745         if (dd_zp[i][0] != i)
5746         {
5747             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5748         }
5749         izone     = &zones->izone[i];
5750         izone->j0 = dd_zp[i][1];
5751         izone->j1 = dd_zp[i][2];
5752         for (dim = 0; dim < DIM; dim++)
5753         {
5754             if (dd->nc[dim] == 1)
5755             {
5756                 /* All shifts should be allowed */
5757                 izone->shift0[dim] = -1;
5758                 izone->shift1[dim] = 1;
5759             }
5760             else
5761             {
5762                 /*
5763                    izone->shift0[d] = 0;
5764                    izone->shift1[d] = 0;
5765                    for(j=izone->j0; j<izone->j1; j++) {
5766                    if (dd->shift[j][d] > dd->shift[i][d])
5767                    izone->shift0[d] = -1;
5768                    if (dd->shift[j][d] < dd->shift[i][d])
5769                    izone->shift1[d] = 1;
5770                    }
5771                  */
5772
5773                 int shift_diff;
5774
5775                 /* Assume the shift are not more than 1 cell */
5776                 izone->shift0[dim] = 1;
5777                 izone->shift1[dim] = -1;
5778                 for (j = izone->j0; j < izone->j1; j++)
5779                 {
5780                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5781                     if (shift_diff < izone->shift0[dim])
5782                     {
5783                         izone->shift0[dim] = shift_diff;
5784                     }
5785                     if (shift_diff > izone->shift1[dim])
5786                     {
5787                         izone->shift1[dim] = shift_diff;
5788                     }
5789                 }
5790             }
5791         }
5792     }
5793
5794     if (dd->comm->eDLB != edlbNO)
5795     {
5796         snew(dd->comm->root, dd->ndim);
5797     }
5798
5799     if (dd->comm->bRecordLoad)
5800     {
5801         make_load_communicators(dd);
5802     }
5803 }
5804
5805 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int reorder)
5806 {
5807     gmx_domdec_t      *dd;
5808     gmx_domdec_comm_t *comm;
5809     int                i, rank, *buf;
5810     ivec               periods;
5811 #ifdef GMX_MPI
5812     MPI_Comm           comm_cart;
5813 #endif
5814
5815     dd   = cr->dd;
5816     comm = dd->comm;
5817
5818 #ifdef GMX_MPI
5819     if (comm->bCartesianPP)
5820     {
5821         /* Set up cartesian communication for the particle-particle part */
5822         if (fplog)
5823         {
5824             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5825                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5826         }
5827
5828         for (i = 0; i < DIM; i++)
5829         {
5830             periods[i] = TRUE;
5831         }
5832         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5833                         &comm_cart);
5834         /* We overwrite the old communicator with the new cartesian one */
5835         cr->mpi_comm_mygroup = comm_cart;
5836     }
5837
5838     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5839     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5840
5841     if (comm->bCartesianPP_PME)
5842     {
5843         /* Since we want to use the original cartesian setup for sim,
5844          * and not the one after split, we need to make an index.
5845          */
5846         snew(comm->ddindex2ddnodeid, dd->nnodes);
5847         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5848         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5849         /* Get the rank of the DD master,
5850          * above we made sure that the master node is a PP node.
5851          */
5852         if (MASTER(cr))
5853         {
5854             rank = dd->rank;
5855         }
5856         else
5857         {
5858             rank = 0;
5859         }
5860         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5861     }
5862     else if (comm->bCartesianPP)
5863     {
5864         if (cr->npmenodes == 0)
5865         {
5866             /* The PP communicator is also
5867              * the communicator for this simulation
5868              */
5869             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5870         }
5871         cr->nodeid = dd->rank;
5872
5873         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5874
5875         /* We need to make an index to go from the coordinates
5876          * to the nodeid of this simulation.
5877          */
5878         snew(comm->ddindex2simnodeid, dd->nnodes);
5879         snew(buf, dd->nnodes);
5880         if (cr->duty & DUTY_PP)
5881         {
5882             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5883         }
5884         /* Communicate the ddindex to simulation nodeid index */
5885         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5886                       cr->mpi_comm_mysim);
5887         sfree(buf);
5888
5889         /* Determine the master coordinates and rank.
5890          * The DD master should be the same node as the master of this sim.
5891          */
5892         for (i = 0; i < dd->nnodes; i++)
5893         {
5894             if (comm->ddindex2simnodeid[i] == 0)
5895             {
5896                 ddindex2xyz(dd->nc, i, dd->master_ci);
5897                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5898             }
5899         }
5900         if (debug)
5901         {
5902             fprintf(debug, "The master rank is %d\n", dd->masterrank);
5903         }
5904     }
5905     else
5906     {
5907         /* No Cartesian communicators */
5908         /* We use the rank in dd->comm->all as DD index */
5909         ddindex2xyz(dd->nc, dd->rank, dd->ci);
5910         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5911         dd->masterrank = 0;
5912         clear_ivec(dd->master_ci);
5913     }
5914 #endif
5915
5916     if (fplog)
5917     {
5918         fprintf(fplog,
5919                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5920                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5921     }
5922     if (debug)
5923     {
5924         fprintf(debug,
5925                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5926                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5927     }
5928 }
5929
5930 static void receive_ddindex2simnodeid(t_commrec *cr)
5931 {
5932     gmx_domdec_t      *dd;
5933
5934     gmx_domdec_comm_t *comm;
5935     int               *buf;
5936
5937     dd   = cr->dd;
5938     comm = dd->comm;
5939
5940 #ifdef GMX_MPI
5941     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5942     {
5943         snew(comm->ddindex2simnodeid, dd->nnodes);
5944         snew(buf, dd->nnodes);
5945         if (cr->duty & DUTY_PP)
5946         {
5947             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5948         }
5949 #ifdef GMX_MPI
5950         /* Communicate the ddindex to simulation nodeid index */
5951         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5952                       cr->mpi_comm_mysim);
5953 #endif
5954         sfree(buf);
5955     }
5956 #endif
5957 }
5958
5959 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5960                                                      int ncg, int natoms)
5961 {
5962     gmx_domdec_master_t *ma;
5963     int                  i;
5964
5965     snew(ma, 1);
5966
5967     snew(ma->ncg, dd->nnodes);
5968     snew(ma->index, dd->nnodes+1);
5969     snew(ma->cg, ncg);
5970     snew(ma->nat, dd->nnodes);
5971     snew(ma->ibuf, dd->nnodes*2);
5972     snew(ma->cell_x, DIM);
5973     for (i = 0; i < DIM; i++)
5974     {
5975         snew(ma->cell_x[i], dd->nc[i]+1);
5976     }
5977
5978     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5979     {
5980         ma->vbuf = NULL;
5981     }
5982     else
5983     {
5984         snew(ma->vbuf, natoms);
5985     }
5986
5987     return ma;
5988 }
5989
5990 static void split_communicator(FILE *fplog, t_commrec *cr, int dd_node_order,
5991                                int reorder)
5992 {
5993     gmx_domdec_t      *dd;
5994     gmx_domdec_comm_t *comm;
5995     int                i, rank;
5996     gmx_bool           bDiv[DIM];
5997     ivec               periods;
5998 #ifdef GMX_MPI
5999     MPI_Comm           comm_cart;
6000 #endif
6001
6002     dd   = cr->dd;
6003     comm = dd->comm;
6004
6005     if (comm->bCartesianPP)
6006     {
6007         for (i = 1; i < DIM; i++)
6008         {
6009             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
6010         }
6011         if (bDiv[YY] || bDiv[ZZ])
6012         {
6013             comm->bCartesianPP_PME = TRUE;
6014             /* If we have 2D PME decomposition, which is always in x+y,
6015              * we stack the PME only nodes in z.
6016              * Otherwise we choose the direction that provides the thinnest slab
6017              * of PME only nodes as this will have the least effect
6018              * on the PP communication.
6019              * But for the PME communication the opposite might be better.
6020              */
6021             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
6022                              !bDiv[YY] ||
6023                              dd->nc[YY] > dd->nc[ZZ]))
6024             {
6025                 comm->cartpmedim = ZZ;
6026             }
6027             else
6028             {
6029                 comm->cartpmedim = YY;
6030             }
6031             comm->ntot[comm->cartpmedim]
6032                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
6033         }
6034         else if (fplog)
6035         {
6036             fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
6037             fprintf(fplog,
6038                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
6039         }
6040     }
6041
6042 #ifdef GMX_MPI
6043     if (comm->bCartesianPP_PME)
6044     {
6045         if (fplog)
6046         {
6047             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
6048         }
6049
6050         for (i = 0; i < DIM; i++)
6051         {
6052             periods[i] = TRUE;
6053         }
6054         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
6055                         &comm_cart);
6056
6057         MPI_Comm_rank(comm_cart, &rank);
6058         if (MASTERNODE(cr) && rank != 0)
6059         {
6060             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
6061         }
6062
6063         /* With this assigment we loose the link to the original communicator
6064          * which will usually be MPI_COMM_WORLD, unless have multisim.
6065          */
6066         cr->mpi_comm_mysim = comm_cart;
6067         cr->sim_nodeid     = rank;
6068
6069         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
6070
6071         if (fplog)
6072         {
6073             fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
6074                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6075         }
6076
6077         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
6078         {
6079             cr->duty = DUTY_PP;
6080         }
6081         if (cr->npmenodes == 0 ||
6082             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
6083         {
6084             cr->duty = DUTY_PME;
6085         }
6086
6087         /* Split the sim communicator into PP and PME only nodes */
6088         MPI_Comm_split(cr->mpi_comm_mysim,
6089                        cr->duty,
6090                        dd_index(comm->ntot, dd->ci),
6091                        &cr->mpi_comm_mygroup);
6092     }
6093     else
6094     {
6095         switch (dd_node_order)
6096         {
6097             case ddnoPP_PME:
6098                 if (fplog)
6099                 {
6100                     fprintf(fplog, "Order of the nodes: PP first, PME last\n");
6101                 }
6102                 break;
6103             case ddnoINTERLEAVE:
6104                 /* Interleave the PP-only and PME-only nodes,
6105                  * as on clusters with dual-core machines this will double
6106                  * the communication bandwidth of the PME processes
6107                  * and thus speed up the PP <-> PME and inter PME communication.
6108                  */
6109                 if (fplog)
6110                 {
6111                     fprintf(fplog, "Interleaving PP and PME nodes\n");
6112                 }
6113                 comm->pmenodes = dd_pmenodes(cr);
6114                 break;
6115             case ddnoCARTESIAN:
6116                 break;
6117             default:
6118                 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
6119         }
6120
6121         if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6122         {
6123             cr->duty = DUTY_PME;
6124         }
6125         else
6126         {
6127             cr->duty = DUTY_PP;
6128         }
6129
6130         /* Split the sim communicator into PP and PME only nodes */
6131         MPI_Comm_split(cr->mpi_comm_mysim,
6132                        cr->duty,
6133                        cr->nodeid,
6134                        &cr->mpi_comm_mygroup);
6135         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6136     }
6137 #endif
6138
6139     if (fplog)
6140     {
6141         fprintf(fplog, "This is a %s only node\n\n",
6142                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6143     }
6144 }
6145
6146 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6147 {
6148     gmx_domdec_t      *dd;
6149     gmx_domdec_comm_t *comm;
6150     int                CartReorder;
6151
6152     dd   = cr->dd;
6153     comm = dd->comm;
6154
6155     copy_ivec(dd->nc, comm->ntot);
6156
6157     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6158     comm->bCartesianPP_PME = FALSE;
6159
6160     /* Reorder the nodes by default. This might change the MPI ranks.
6161      * Real reordering is only supported on very few architectures,
6162      * Blue Gene is one of them.
6163      */
6164     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6165
6166     if (cr->npmenodes > 0)
6167     {
6168         /* Split the communicator into a PP and PME part */
6169         split_communicator(fplog, cr, dd_node_order, CartReorder);
6170         if (comm->bCartesianPP_PME)
6171         {
6172             /* We (possibly) reordered the nodes in split_communicator,
6173              * so it is no longer required in make_pp_communicator.
6174              */
6175             CartReorder = FALSE;
6176         }
6177     }
6178     else
6179     {
6180         /* All nodes do PP and PME */
6181 #ifdef GMX_MPI
6182         /* We do not require separate communicators */
6183         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6184 #endif
6185     }
6186
6187     if (cr->duty & DUTY_PP)
6188     {
6189         /* Copy or make a new PP communicator */
6190         make_pp_communicator(fplog, cr, CartReorder);
6191     }
6192     else
6193     {
6194         receive_ddindex2simnodeid(cr);
6195     }
6196
6197     if (!(cr->duty & DUTY_PME))
6198     {
6199         /* Set up the commnuication to our PME node */
6200         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6201         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6202         if (debug)
6203         {
6204             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6205                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6206         }
6207     }
6208     else
6209     {
6210         dd->pme_nodeid = -1;
6211     }
6212
6213     if (DDMASTER(dd))
6214     {
6215         dd->ma = init_gmx_domdec_master_t(dd,
6216                                           comm->cgs_gl.nr,
6217                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6218     }
6219 }
6220
6221 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6222 {
6223     real  *slb_frac, tot;
6224     int    i, n;
6225     double dbl;
6226
6227     slb_frac = NULL;
6228     if (nc > 1 && size_string != NULL)
6229     {
6230         if (fplog)
6231         {
6232             fprintf(fplog, "Using static load balancing for the %s direction\n",
6233                     dir);
6234         }
6235         snew(slb_frac, nc);
6236         tot = 0;
6237         for (i = 0; i < nc; i++)
6238         {
6239             dbl = 0;
6240             sscanf(size_string, "%lf%n", &dbl, &n);
6241             if (dbl == 0)
6242             {
6243                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6244             }
6245             slb_frac[i]  = dbl;
6246             size_string += n;
6247             tot         += slb_frac[i];
6248         }
6249         /* Normalize */
6250         if (fplog)
6251         {
6252             fprintf(fplog, "Relative cell sizes:");
6253         }
6254         for (i = 0; i < nc; i++)
6255         {
6256             slb_frac[i] /= tot;
6257             if (fplog)
6258             {
6259                 fprintf(fplog, " %5.3f", slb_frac[i]);
6260             }
6261         }
6262         if (fplog)
6263         {
6264             fprintf(fplog, "\n");
6265         }
6266     }
6267
6268     return slb_frac;
6269 }
6270
6271 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6272 {
6273     int                  n, nmol, ftype;
6274     gmx_mtop_ilistloop_t iloop;
6275     t_ilist             *il;
6276
6277     n     = 0;
6278     iloop = gmx_mtop_ilistloop_init(mtop);
6279     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6280     {
6281         for (ftype = 0; ftype < F_NRE; ftype++)
6282         {
6283             if ((interaction_function[ftype].flags & IF_BOND) &&
6284                 NRAL(ftype) >  2)
6285             {
6286                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6287             }
6288         }
6289     }
6290
6291     return n;
6292 }
6293
6294 static int dd_nst_env(FILE *fplog, const char *env_var, int def)
6295 {
6296     char *val;
6297     int   nst;
6298
6299     nst = def;
6300     val = getenv(env_var);
6301     if (val)
6302     {
6303         if (sscanf(val, "%d", &nst) <= 0)
6304         {
6305             nst = 1;
6306         }
6307         if (fplog)
6308         {
6309             fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6310                     env_var, val, nst);
6311         }
6312     }
6313
6314     return nst;
6315 }
6316
6317 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6318 {
6319     if (MASTER(cr))
6320     {
6321         fprintf(stderr, "\n%s\n", warn_string);
6322     }
6323     if (fplog)
6324     {
6325         fprintf(fplog, "\n%s\n", warn_string);
6326     }
6327 }
6328
6329 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6330                                   t_inputrec *ir, FILE *fplog)
6331 {
6332     if (ir->ePBC == epbcSCREW &&
6333         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6334     {
6335         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6336     }
6337
6338     if (ir->ns_type == ensSIMPLE)
6339     {
6340         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6341     }
6342
6343     if (ir->nstlist == 0)
6344     {
6345         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6346     }
6347
6348     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6349     {
6350         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6351     }
6352 }
6353
6354 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6355 {
6356     int  di, d;
6357     real r;
6358
6359     r = ddbox->box_size[XX];
6360     for (di = 0; di < dd->ndim; di++)
6361     {
6362         d = dd->dim[di];
6363         /* Check using the initial average cell size */
6364         r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6365     }
6366
6367     return r;
6368 }
6369
6370 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6371                              const char *dlb_opt, gmx_bool bRecordLoad,
6372                              unsigned long Flags, t_inputrec *ir)
6373 {
6374     gmx_domdec_t *dd;
6375     int           eDLB = -1;
6376     char          buf[STRLEN];
6377
6378     switch (dlb_opt[0])
6379     {
6380         case 'a': eDLB = edlbAUTO; break;
6381         case 'n': eDLB = edlbNO;   break;
6382         case 'y': eDLB = edlbYES;  break;
6383         default: gmx_incons("Unknown dlb_opt");
6384     }
6385
6386     if (Flags & MD_RERUN)
6387     {
6388         return edlbNO;
6389     }
6390
6391     if (!EI_DYNAMICS(ir->eI))
6392     {
6393         if (eDLB == edlbYES)
6394         {
6395             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6396             dd_warning(cr, fplog, buf);
6397         }
6398
6399         return edlbNO;
6400     }
6401
6402     if (!bRecordLoad)
6403     {
6404         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6405
6406         return edlbNO;
6407     }
6408
6409     if (Flags & MD_REPRODUCIBLE)
6410     {
6411         switch (eDLB)
6412         {
6413             case edlbNO:
6414                 break;
6415             case edlbAUTO:
6416                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6417                 eDLB = edlbNO;
6418                 break;
6419             case edlbYES:
6420                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6421                 break;
6422             default:
6423                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
6424                 break;
6425         }
6426     }
6427
6428     return eDLB;
6429 }
6430
6431 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6432 {
6433     int dim;
6434
6435     dd->ndim = 0;
6436     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6437     {
6438         /* Decomposition order z,y,x */
6439         if (fplog)
6440         {
6441             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6442         }
6443         for (dim = DIM-1; dim >= 0; dim--)
6444         {
6445             if (dd->nc[dim] > 1)
6446             {
6447                 dd->dim[dd->ndim++] = dim;
6448             }
6449         }
6450     }
6451     else
6452     {
6453         /* Decomposition order x,y,z */
6454         for (dim = 0; dim < DIM; dim++)
6455         {
6456             if (dd->nc[dim] > 1)
6457             {
6458                 dd->dim[dd->ndim++] = dim;
6459             }
6460         }
6461     }
6462 }
6463
6464 static gmx_domdec_comm_t *init_dd_comm()
6465 {
6466     gmx_domdec_comm_t *comm;
6467     int                i;
6468
6469     snew(comm, 1);
6470     snew(comm->cggl_flag, DIM*2);
6471     snew(comm->cgcm_state, DIM*2);
6472     for (i = 0; i < DIM*2; i++)
6473     {
6474         comm->cggl_flag_nalloc[i]  = 0;
6475         comm->cgcm_state_nalloc[i] = 0;
6476     }
6477
6478     comm->nalloc_int = 0;
6479     comm->buf_int    = NULL;
6480
6481     vec_rvec_init(&comm->vbuf);
6482
6483     comm->n_load_have    = 0;
6484     comm->n_load_collect = 0;
6485
6486     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6487     {
6488         comm->sum_nat[i] = 0;
6489     }
6490     comm->ndecomp   = 0;
6491     comm->nload     = 0;
6492     comm->load_step = 0;
6493     comm->load_sum  = 0;
6494     comm->load_max  = 0;
6495     clear_ivec(comm->load_lim);
6496     comm->load_mdf  = 0;
6497     comm->load_pme  = 0;
6498
6499     return comm;
6500 }
6501
6502 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6503                                         unsigned long Flags,
6504                                         ivec nc,
6505                                         real comm_distance_min, real rconstr,
6506                                         const char *dlb_opt, real dlb_scale,
6507                                         const char *sizex, const char *sizey, const char *sizez,
6508                                         gmx_mtop_t *mtop, t_inputrec *ir,
6509                                         matrix box, rvec *x,
6510                                         gmx_ddbox_t *ddbox,
6511                                         int *npme_x, int *npme_y)
6512 {
6513     gmx_domdec_t      *dd;
6514     gmx_domdec_comm_t *comm;
6515     int                recload;
6516     int                d, i, j;
6517     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6518     gmx_bool           bC;
6519     char               buf[STRLEN];
6520
6521     if (fplog)
6522     {
6523         fprintf(fplog,
6524                 "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
6525     }
6526
6527     snew(dd, 1);
6528
6529     dd->comm = init_dd_comm();
6530     comm     = dd->comm;
6531     snew(comm->cggl_flag, DIM*2);
6532     snew(comm->cgcm_state, DIM*2);
6533
6534     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6535     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6536
6537     dd->bSendRecv2      = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0);
6538     comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10);
6539     comm->eFlop         = dd_nst_env(fplog, "GMX_DLB_FLOP", 0);
6540     recload             = dd_nst_env(fplog, "GMX_DD_LOAD", 1);
6541     comm->nstSortCG     = dd_nst_env(fplog, "GMX_DD_SORT", 1);
6542     comm->nstDDDump     = dd_nst_env(fplog, "GMX_DD_DUMP", 0);
6543     comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0);
6544     comm->DD_debug      = dd_nst_env(fplog, "GMX_DD_DEBUG", 0);
6545
6546     dd->pme_recv_f_alloc = 0;
6547     dd->pme_recv_f_buf   = NULL;
6548
6549     if (dd->bSendRecv2 && fplog)
6550     {
6551         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6552     }
6553     if (comm->eFlop)
6554     {
6555         if (fplog)
6556         {
6557             fprintf(fplog, "Will load balance based on FLOP count\n");
6558         }
6559         if (comm->eFlop > 1)
6560         {
6561             srand(1+cr->nodeid);
6562         }
6563         comm->bRecordLoad = TRUE;
6564     }
6565     else
6566     {
6567         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6568
6569     }
6570
6571     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6572
6573     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6574     if (fplog)
6575     {
6576         fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
6577     }
6578     dd->bGridJump              = comm->bDynLoadBal;
6579     comm->bPMELoadBalDLBLimits = FALSE;
6580
6581     if (comm->nstSortCG)
6582     {
6583         if (fplog)
6584         {
6585             if (comm->nstSortCG == 1)
6586             {
6587                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6588             }
6589             else
6590             {
6591                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6592                         comm->nstSortCG);
6593             }
6594         }
6595         snew(comm->sort, 1);
6596     }
6597     else
6598     {
6599         if (fplog)
6600         {
6601             fprintf(fplog, "Will not sort the charge groups\n");
6602         }
6603     }
6604
6605     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6606
6607     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6608     if (comm->bInterCGBondeds)
6609     {
6610         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6611     }
6612     else
6613     {
6614         comm->bInterCGMultiBody = FALSE;
6615     }
6616
6617     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6618     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6619
6620     if (ir->rlistlong == 0)
6621     {
6622         /* Set the cut-off to some very large value,
6623          * so we don't need if statements everywhere in the code.
6624          * We use sqrt, since the cut-off is squared in some places.
6625          */
6626         comm->cutoff   = GMX_CUTOFF_INF;
6627     }
6628     else
6629     {
6630         comm->cutoff   = ir->rlistlong;
6631     }
6632     comm->cutoff_mbody = 0;
6633
6634     comm->cellsize_limit = 0;
6635     comm->bBondComm      = FALSE;
6636
6637     if (comm->bInterCGBondeds)
6638     {
6639         if (comm_distance_min > 0)
6640         {
6641             comm->cutoff_mbody = comm_distance_min;
6642             if (Flags & MD_DDBONDCOMM)
6643             {
6644                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6645             }
6646             else
6647             {
6648                 comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
6649             }
6650             r_bonded_limit = comm->cutoff_mbody;
6651         }
6652         else if (ir->bPeriodicMols)
6653         {
6654             /* Can not easily determine the required cut-off */
6655             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6656             comm->cutoff_mbody = comm->cutoff/2;
6657             r_bonded_limit     = comm->cutoff_mbody;
6658         }
6659         else
6660         {
6661             if (MASTER(cr))
6662             {
6663                 dd_bonded_cg_distance(fplog, dd, mtop, ir, x, box,
6664                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6665             }
6666             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6667             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6668
6669             /* We use an initial margin of 10% for the minimum cell size,
6670              * except when we are just below the non-bonded cut-off.
6671              */
6672             if (Flags & MD_DDBONDCOMM)
6673             {
6674                 if (max(r_2b, r_mb) > comm->cutoff)
6675                 {
6676                     r_bonded        = max(r_2b, r_mb);
6677                     r_bonded_limit  = 1.1*r_bonded;
6678                     comm->bBondComm = TRUE;
6679                 }
6680                 else
6681                 {
6682                     r_bonded       = r_mb;
6683                     r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
6684                 }
6685                 /* We determine cutoff_mbody later */
6686             }
6687             else
6688             {
6689                 /* No special bonded communication,
6690                  * simply increase the DD cut-off.
6691                  */
6692                 r_bonded_limit     = 1.1*max(r_2b, r_mb);
6693                 comm->cutoff_mbody = r_bonded_limit;
6694                 comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
6695             }
6696         }
6697         comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
6698         if (fplog)
6699         {
6700             fprintf(fplog,
6701                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6702                     comm->cellsize_limit);
6703         }
6704     }
6705
6706     if (dd->bInterCGcons && rconstr <= 0)
6707     {
6708         /* There is a cell size limit due to the constraints (P-LINCS) */
6709         rconstr = constr_r_max(fplog, mtop, ir);
6710         if (fplog)
6711         {
6712             fprintf(fplog,
6713                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6714                     rconstr);
6715             if (rconstr > comm->cellsize_limit)
6716             {
6717                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6718             }
6719         }
6720     }
6721     else if (rconstr > 0 && fplog)
6722     {
6723         /* Here we do not check for dd->bInterCGcons,
6724          * because one can also set a cell size limit for virtual sites only
6725          * and at this point we don't know yet if there are intercg v-sites.
6726          */
6727         fprintf(fplog,
6728                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6729                 rconstr);
6730     }
6731     comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
6732
6733     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6734
6735     if (nc[XX] > 0)
6736     {
6737         copy_ivec(nc, dd->nc);
6738         set_dd_dim(fplog, dd);
6739         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6740
6741         if (cr->npmenodes == -1)
6742         {
6743             cr->npmenodes = 0;
6744         }
6745         acs = average_cellsize_min(dd, ddbox);
6746         if (acs < comm->cellsize_limit)
6747         {
6748             if (fplog)
6749             {
6750                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6751             }
6752             gmx_fatal_collective(FARGS, cr, NULL,
6753                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6754                                  acs, comm->cellsize_limit);
6755         }
6756     }
6757     else
6758     {
6759         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6760
6761         /* We need to choose the optimal DD grid and possibly PME nodes */
6762         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6763                                comm->eDLB != edlbNO, dlb_scale,
6764                                comm->cellsize_limit, comm->cutoff,
6765                                comm->bInterCGBondeds, comm->bInterCGMultiBody);
6766
6767         if (dd->nc[XX] == 0)
6768         {
6769             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6770             sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
6771                     !bC ? "-rdd" : "-rcon",
6772                     comm->eDLB != edlbNO ? " or -dds" : "",
6773                     bC ? " or your LINCS settings" : "");
6774
6775             gmx_fatal_collective(FARGS, cr, NULL,
6776                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6777                                  "%s\n"
6778                                  "Look in the log file for details on the domain decomposition",
6779                                  cr->nnodes-cr->npmenodes, limit, buf);
6780         }
6781         set_dd_dim(fplog, dd);
6782     }
6783
6784     if (fplog)
6785     {
6786         fprintf(fplog,
6787                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6788                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6789     }
6790
6791     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6792     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6793     {
6794         gmx_fatal_collective(FARGS, cr, NULL,
6795                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6796                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6797     }
6798     if (cr->npmenodes > dd->nnodes)
6799     {
6800         gmx_fatal_collective(FARGS, cr, NULL,
6801                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6802     }
6803     if (cr->npmenodes > 0)
6804     {
6805         comm->npmenodes = cr->npmenodes;
6806     }
6807     else
6808     {
6809         comm->npmenodes = dd->nnodes;
6810     }
6811
6812     if (EEL_PME(ir->coulombtype))
6813     {
6814         /* The following choices should match those
6815          * in comm_cost_est in domdec_setup.c.
6816          * Note that here the checks have to take into account
6817          * that the decomposition might occur in a different order than xyz
6818          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6819          * in which case they will not match those in comm_cost_est,
6820          * but since that is mainly for testing purposes that's fine.
6821          */
6822         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6823             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6824             getenv("GMX_PMEONEDD") == NULL)
6825         {
6826             comm->npmedecompdim = 2;
6827             comm->npmenodes_x   = dd->nc[XX];
6828             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6829         }
6830         else
6831         {
6832             /* In case nc is 1 in both x and y we could still choose to
6833              * decompose pme in y instead of x, but we use x for simplicity.
6834              */
6835             comm->npmedecompdim = 1;
6836             if (dd->dim[0] == YY)
6837             {
6838                 comm->npmenodes_x = 1;
6839                 comm->npmenodes_y = comm->npmenodes;
6840             }
6841             else
6842             {
6843                 comm->npmenodes_x = comm->npmenodes;
6844                 comm->npmenodes_y = 1;
6845             }
6846         }
6847         if (fplog)
6848         {
6849             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6850                     comm->npmenodes_x, comm->npmenodes_y, 1);
6851         }
6852     }
6853     else
6854     {
6855         comm->npmedecompdim = 0;
6856         comm->npmenodes_x   = 0;
6857         comm->npmenodes_y   = 0;
6858     }
6859
6860     /* Technically we don't need both of these,
6861      * but it simplifies code not having to recalculate it.
6862      */
6863     *npme_x = comm->npmenodes_x;
6864     *npme_y = comm->npmenodes_y;
6865
6866     snew(comm->slb_frac, DIM);
6867     if (comm->eDLB == edlbNO)
6868     {
6869         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6870         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6871         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6872     }
6873
6874     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6875     {
6876         if (comm->bBondComm || comm->eDLB != edlbNO)
6877         {
6878             /* Set the bonded communication distance to halfway
6879              * the minimum and the maximum,
6880              * since the extra communication cost is nearly zero.
6881              */
6882             acs                = average_cellsize_min(dd, ddbox);
6883             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6884             if (comm->eDLB != edlbNO)
6885             {
6886                 /* Check if this does not limit the scaling */
6887                 comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
6888             }
6889             if (!comm->bBondComm)
6890             {
6891                 /* Without bBondComm do not go beyond the n.b. cut-off */
6892                 comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
6893                 if (comm->cellsize_limit >= comm->cutoff)
6894                 {
6895                     /* We don't loose a lot of efficieny
6896                      * when increasing it to the n.b. cut-off.
6897                      * It can even be slightly faster, because we need
6898                      * less checks for the communication setup.
6899                      */
6900                     comm->cutoff_mbody = comm->cutoff;
6901                 }
6902             }
6903             /* Check if we did not end up below our original limit */
6904             comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
6905
6906             if (comm->cutoff_mbody > comm->cellsize_limit)
6907             {
6908                 comm->cellsize_limit = comm->cutoff_mbody;
6909             }
6910         }
6911         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6912     }
6913
6914     if (debug)
6915     {
6916         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6917                 "cellsize limit %f\n",
6918                 comm->bBondComm, comm->cellsize_limit);
6919     }
6920
6921     if (MASTER(cr))
6922     {
6923         check_dd_restrictions(cr, dd, ir, fplog);
6924     }
6925
6926     comm->partition_step = INT_MIN;
6927     dd->ddp_count        = 0;
6928
6929     clear_dd_cycle_counts(dd);
6930
6931     return dd;
6932 }
6933
6934 static void set_dlb_limits(gmx_domdec_t *dd)
6935
6936 {
6937     int d;
6938
6939     for (d = 0; d < dd->ndim; d++)
6940     {
6941         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
6942         dd->comm->cellsize_min[dd->dim[d]] =
6943             dd->comm->cellsize_min_dlb[dd->dim[d]];
6944     }
6945 }
6946
6947
6948 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
6949 {
6950     gmx_domdec_t      *dd;
6951     gmx_domdec_comm_t *comm;
6952     real               cellsize_min;
6953     int                d, nc, i;
6954     char               buf[STRLEN];
6955
6956     dd   = cr->dd;
6957     comm = dd->comm;
6958
6959     if (fplog)
6960     {
6961         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
6962     }
6963
6964     cellsize_min = comm->cellsize_min[dd->dim[0]];
6965     for (d = 1; d < dd->ndim; d++)
6966     {
6967         cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6968     }
6969
6970     if (cellsize_min < comm->cellsize_limit*1.05)
6971     {
6972         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6973
6974         /* Change DLB from "auto" to "no". */
6975         comm->eDLB = edlbNO;
6976
6977         return;
6978     }
6979
6980     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
6981     comm->bDynLoadBal = TRUE;
6982     dd->bGridJump     = TRUE;
6983
6984     set_dlb_limits(dd);
6985
6986     /* We can set the required cell size info here,
6987      * so we do not need to communicate this.
6988      * The grid is completely uniform.
6989      */
6990     for (d = 0; d < dd->ndim; d++)
6991     {
6992         if (comm->root[d])
6993         {
6994             comm->load[d].sum_m = comm->load[d].sum;
6995
6996             nc = dd->nc[dd->dim[d]];
6997             for (i = 0; i < nc; i++)
6998             {
6999                 comm->root[d]->cell_f[i]    = i/(real)nc;
7000                 if (d > 0)
7001                 {
7002                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
7003                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
7004                 }
7005             }
7006             comm->root[d]->cell_f[nc] = 1.0;
7007         }
7008     }
7009 }
7010
7011 static char *init_bLocalCG(gmx_mtop_t *mtop)
7012 {
7013     int   ncg, cg;
7014     char *bLocalCG;
7015
7016     ncg = ncg_mtop(mtop);
7017     snew(bLocalCG, ncg);
7018     for (cg = 0; cg < ncg; cg++)
7019     {
7020         bLocalCG[cg] = FALSE;
7021     }
7022
7023     return bLocalCG;
7024 }
7025
7026 void dd_init_bondeds(FILE *fplog,
7027                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
7028                      gmx_vsite_t *vsite, gmx_constr_t constr,
7029                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
7030 {
7031     gmx_domdec_comm_t *comm;
7032     gmx_bool           bBondComm;
7033     int                d;
7034
7035     dd_make_reverse_top(fplog, dd, mtop, vsite, constr, ir, bBCheck);
7036
7037     comm = dd->comm;
7038
7039     if (comm->bBondComm)
7040     {
7041         /* Communicate atoms beyond the cut-off for bonded interactions */
7042         comm = dd->comm;
7043
7044         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
7045
7046         comm->bLocalCG = init_bLocalCG(mtop);
7047     }
7048     else
7049     {
7050         /* Only communicate atoms based on cut-off */
7051         comm->cglink   = NULL;
7052         comm->bLocalCG = NULL;
7053     }
7054 }
7055
7056 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
7057                               t_inputrec *ir,
7058                               gmx_bool bDynLoadBal, real dlb_scale,
7059                               gmx_ddbox_t *ddbox)
7060 {
7061     gmx_domdec_comm_t *comm;
7062     int                d;
7063     ivec               np;
7064     real               limit, shrink;
7065     char               buf[64];
7066
7067     if (fplog == NULL)
7068     {
7069         return;
7070     }
7071
7072     comm = dd->comm;
7073
7074     if (bDynLoadBal)
7075     {
7076         fprintf(fplog, "The maximum number of communication pulses is:");
7077         for (d = 0; d < dd->ndim; d++)
7078         {
7079             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
7080         }
7081         fprintf(fplog, "\n");
7082         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
7083         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
7084         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
7085         for (d = 0; d < DIM; d++)
7086         {
7087             if (dd->nc[d] > 1)
7088             {
7089                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7090                 {
7091                     shrink = 0;
7092                 }
7093                 else
7094                 {
7095                     shrink =
7096                         comm->cellsize_min_dlb[d]/
7097                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7098                 }
7099                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
7100             }
7101         }
7102         fprintf(fplog, "\n");
7103     }
7104     else
7105     {
7106         set_dd_cell_sizes_slb(dd, ddbox, FALSE, np);
7107         fprintf(fplog, "The initial number of communication pulses is:");
7108         for (d = 0; d < dd->ndim; d++)
7109         {
7110             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
7111         }
7112         fprintf(fplog, "\n");
7113         fprintf(fplog, "The initial domain decomposition cell size is:");
7114         for (d = 0; d < DIM; d++)
7115         {
7116             if (dd->nc[d] > 1)
7117             {
7118                 fprintf(fplog, " %c %.2f nm",
7119                         dim2char(d), dd->comm->cellsize_min[d]);
7120             }
7121         }
7122         fprintf(fplog, "\n\n");
7123     }
7124
7125     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7126     {
7127         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7128         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7129                 "non-bonded interactions", "", comm->cutoff);
7130
7131         if (bDynLoadBal)
7132         {
7133             limit = dd->comm->cellsize_limit;
7134         }
7135         else
7136         {
7137             if (dynamic_dd_box(ddbox, ir))
7138             {
7139                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7140             }
7141             limit = dd->comm->cellsize_min[XX];
7142             for (d = 1; d < DIM; d++)
7143             {
7144                 limit = min(limit, dd->comm->cellsize_min[d]);
7145             }
7146         }
7147
7148         if (comm->bInterCGBondeds)
7149         {
7150             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7151                     "two-body bonded interactions", "(-rdd)",
7152                     max(comm->cutoff, comm->cutoff_mbody));
7153             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7154                     "multi-body bonded interactions", "(-rdd)",
7155                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
7156         }
7157         if (dd->vsite_comm)
7158         {
7159             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7160                     "virtual site constructions", "(-rcon)", limit);
7161         }
7162         if (dd->constraint_comm)
7163         {
7164             sprintf(buf, "atoms separated by up to %d constraints",
7165                     1+ir->nProjOrder);
7166             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7167                     buf, "(-rcon)", limit);
7168         }
7169         fprintf(fplog, "\n");
7170     }
7171
7172     fflush(fplog);
7173 }
7174
7175 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7176                                 real               dlb_scale,
7177                                 const t_inputrec  *ir,
7178                                 const gmx_ddbox_t *ddbox)
7179 {
7180     gmx_domdec_comm_t *comm;
7181     int                d, dim, npulse, npulse_d_max, npulse_d;
7182     gmx_bool           bNoCutOff;
7183
7184     comm = dd->comm;
7185
7186     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7187
7188     /* Determine the maximum number of comm. pulses in one dimension */
7189
7190     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7191
7192     /* Determine the maximum required number of grid pulses */
7193     if (comm->cellsize_limit >= comm->cutoff)
7194     {
7195         /* Only a single pulse is required */
7196         npulse = 1;
7197     }
7198     else if (!bNoCutOff && comm->cellsize_limit > 0)
7199     {
7200         /* We round down slightly here to avoid overhead due to the latency
7201          * of extra communication calls when the cut-off
7202          * would be only slightly longer than the cell size.
7203          * Later cellsize_limit is redetermined,
7204          * so we can not miss interactions due to this rounding.
7205          */
7206         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7207     }
7208     else
7209     {
7210         /* There is no cell size limit */
7211         npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7212     }
7213
7214     if (!bNoCutOff && npulse > 1)
7215     {
7216         /* See if we can do with less pulses, based on dlb_scale */
7217         npulse_d_max = 0;
7218         for (d = 0; d < dd->ndim; d++)
7219         {
7220             dim      = dd->dim[d];
7221             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7222                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7223             npulse_d_max = max(npulse_d_max, npulse_d);
7224         }
7225         npulse = min(npulse, npulse_d_max);
7226     }
7227
7228     /* This env var can override npulse */
7229     d = dd_nst_env(debug, "GMX_DD_NPULSE", 0);
7230     if (d > 0)
7231     {
7232         npulse = d;
7233     }
7234
7235     comm->maxpulse       = 1;
7236     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7237     for (d = 0; d < dd->ndim; d++)
7238     {
7239         comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
7240         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7241         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7242         comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
7243         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7244         {
7245             comm->bVacDLBNoLimit = FALSE;
7246         }
7247     }
7248
7249     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7250     if (!comm->bVacDLBNoLimit)
7251     {
7252         comm->cellsize_limit = max(comm->cellsize_limit,
7253                                    comm->cutoff/comm->maxpulse);
7254     }
7255     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7256     /* Set the minimum cell size for each DD dimension */
7257     for (d = 0; d < dd->ndim; d++)
7258     {
7259         if (comm->bVacDLBNoLimit ||
7260             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7261         {
7262             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7263         }
7264         else
7265         {
7266             comm->cellsize_min_dlb[dd->dim[d]] =
7267                 comm->cutoff/comm->cd[d].np_dlb;
7268         }
7269     }
7270     if (comm->cutoff_mbody <= 0)
7271     {
7272         comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
7273     }
7274     if (comm->bDynLoadBal)
7275     {
7276         set_dlb_limits(dd);
7277     }
7278 }
7279
7280 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7281 {
7282     /* If each molecule is a single charge group
7283      * or we use domain decomposition for each periodic dimension,
7284      * we do not need to take pbc into account for the bonded interactions.
7285      */
7286     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7287             !(dd->nc[XX] > 1 &&
7288               dd->nc[YY] > 1 &&
7289               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7290 }
7291
7292 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7293                        t_inputrec *ir, t_forcerec *fr,
7294                        gmx_ddbox_t *ddbox)
7295 {
7296     gmx_domdec_comm_t *comm;
7297     int                natoms_tot;
7298     real               vol_frac;
7299
7300     comm = dd->comm;
7301
7302     /* Initialize the thread data.
7303      * This can not be done in init_domain_decomposition,
7304      * as the numbers of threads is determined later.
7305      */
7306     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7307     if (comm->nth > 1)
7308     {
7309         snew(comm->dth, comm->nth);
7310     }
7311
7312     if (EEL_PME(ir->coulombtype))
7313     {
7314         init_ddpme(dd, &comm->ddpme[0], 0);
7315         if (comm->npmedecompdim >= 2)
7316         {
7317             init_ddpme(dd, &comm->ddpme[1], 1);
7318         }
7319     }
7320     else
7321     {
7322         comm->npmenodes = 0;
7323         if (dd->pme_nodeid >= 0)
7324         {
7325             gmx_fatal_collective(FARGS, NULL, dd,
7326                                  "Can not have separate PME nodes without PME electrostatics");
7327         }
7328     }
7329
7330     if (debug)
7331     {
7332         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7333     }
7334     if (comm->eDLB != edlbNO)
7335     {
7336         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7337     }
7338
7339     print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
7340     if (comm->eDLB == edlbAUTO)
7341     {
7342         if (fplog)
7343         {
7344             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7345         }
7346         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7347     }
7348
7349     if (ir->ePBC == epbcNONE)
7350     {
7351         vol_frac = 1 - 1/(double)dd->nnodes;
7352     }
7353     else
7354     {
7355         vol_frac =
7356             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7357     }
7358     if (debug)
7359     {
7360         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7361     }
7362     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7363
7364     dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
7365 }
7366
7367 static gmx_bool test_dd_cutoff(t_commrec *cr,
7368                                t_state *state, t_inputrec *ir,
7369                                real cutoff_req)
7370 {
7371     gmx_domdec_t *dd;
7372     gmx_ddbox_t   ddbox;
7373     int           d, dim, np;
7374     real          inv_cell_size;
7375     int           LocallyLimited;
7376
7377     dd = cr->dd;
7378
7379     set_ddbox(dd, FALSE, cr, ir, state->box,
7380               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7381
7382     LocallyLimited = 0;
7383
7384     for (d = 0; d < dd->ndim; d++)
7385     {
7386         dim = dd->dim[d];
7387
7388         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7389         if (dynamic_dd_box(&ddbox, ir))
7390         {
7391             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7392         }
7393
7394         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7395
7396         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7397             dd->comm->cd[d].np_dlb > 0)
7398         {
7399             if (np > dd->comm->cd[d].np_dlb)
7400             {
7401                 return FALSE;
7402             }
7403
7404             /* If a current local cell size is smaller than the requested
7405              * cut-off, we could still fix it, but this gets very complicated.
7406              * Without fixing here, we might actually need more checks.
7407              */
7408             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7409             {
7410                 LocallyLimited = 1;
7411             }
7412         }
7413     }
7414
7415     if (dd->comm->eDLB != edlbNO)
7416     {
7417         /* If DLB is not active yet, we don't need to check the grid jumps.
7418          * Actually we shouldn't, because then the grid jump data is not set.
7419          */
7420         if (dd->comm->bDynLoadBal &&
7421             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7422         {
7423             LocallyLimited = 1;
7424         }
7425
7426         gmx_sumi(1, &LocallyLimited, cr);
7427
7428         if (LocallyLimited > 0)
7429         {
7430             return FALSE;
7431         }
7432     }
7433
7434     return TRUE;
7435 }
7436
7437 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7438                           real cutoff_req)
7439 {
7440     gmx_bool bCutoffAllowed;
7441
7442     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7443
7444     if (bCutoffAllowed)
7445     {
7446         cr->dd->comm->cutoff = cutoff_req;
7447     }
7448
7449     return bCutoffAllowed;
7450 }
7451
7452 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7453 {
7454     gmx_domdec_comm_t *comm;
7455
7456     comm = cr->dd->comm;
7457
7458     /* Turn on the DLB limiting (might have been on already) */
7459     comm->bPMELoadBalDLBLimits = TRUE;
7460
7461     /* Change the cut-off limit */
7462     comm->PMELoadBal_max_cutoff = comm->cutoff;
7463 }
7464
7465 static void merge_cg_buffers(int ncell,
7466                              gmx_domdec_comm_dim_t *cd, int pulse,
7467                              int  *ncg_cell,
7468                              int  *index_gl, int  *recv_i,
7469                              rvec *cg_cm,    rvec *recv_vr,
7470                              int *cgindex,
7471                              cginfo_mb_t *cginfo_mb, int *cginfo)
7472 {
7473     gmx_domdec_ind_t *ind, *ind_p;
7474     int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
7475     int               shift, shift_at;
7476
7477     ind = &cd->ind[pulse];
7478
7479     /* First correct the already stored data */
7480     shift = ind->nrecv[ncell];
7481     for (cell = ncell-1; cell >= 0; cell--)
7482     {
7483         shift -= ind->nrecv[cell];
7484         if (shift > 0)
7485         {
7486             /* Move the cg's present from previous grid pulses */
7487             cg0                = ncg_cell[ncell+cell];
7488             cg1                = ncg_cell[ncell+cell+1];
7489             cgindex[cg1+shift] = cgindex[cg1];
7490             for (cg = cg1-1; cg >= cg0; cg--)
7491             {
7492                 index_gl[cg+shift] = index_gl[cg];
7493                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7494                 cgindex[cg+shift] = cgindex[cg];
7495                 cginfo[cg+shift]  = cginfo[cg];
7496             }
7497             /* Correct the already stored send indices for the shift */
7498             for (p = 1; p <= pulse; p++)
7499             {
7500                 ind_p = &cd->ind[p];
7501                 cg0   = 0;
7502                 for (c = 0; c < cell; c++)
7503                 {
7504                     cg0 += ind_p->nsend[c];
7505                 }
7506                 cg1 = cg0 + ind_p->nsend[cell];
7507                 for (cg = cg0; cg < cg1; cg++)
7508                 {
7509                     ind_p->index[cg] += shift;
7510                 }
7511             }
7512         }
7513     }
7514
7515     /* Merge in the communicated buffers */
7516     shift    = 0;
7517     shift_at = 0;
7518     cg0      = 0;
7519     for (cell = 0; cell < ncell; cell++)
7520     {
7521         cg1 = ncg_cell[ncell+cell+1] + shift;
7522         if (shift_at > 0)
7523         {
7524             /* Correct the old cg indices */
7525             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7526             {
7527                 cgindex[cg+1] += shift_at;
7528             }
7529         }
7530         for (cg = 0; cg < ind->nrecv[cell]; cg++)
7531         {
7532             /* Copy this charge group from the buffer */
7533             index_gl[cg1] = recv_i[cg0];
7534             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7535             /* Add it to the cgindex */
7536             cg_gl          = index_gl[cg1];
7537             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
7538             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
7539             cgindex[cg1+1] = cgindex[cg1] + nat;
7540             cg0++;
7541             cg1++;
7542             shift_at += nat;
7543         }
7544         shift                 += ind->nrecv[cell];
7545         ncg_cell[ncell+cell+1] = cg1;
7546     }
7547 }
7548
7549 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7550                                int nzone, int cg0, const int *cgindex)
7551 {
7552     int cg, zone, p;
7553
7554     /* Store the atom block boundaries for easy copying of communication buffers
7555      */
7556     cg = cg0;
7557     for (zone = 0; zone < nzone; zone++)
7558     {
7559         for (p = 0; p < cd->np; p++)
7560         {
7561             cd->ind[p].cell2at0[zone] = cgindex[cg];
7562             cg += cd->ind[p].nrecv[zone];
7563             cd->ind[p].cell2at1[zone] = cgindex[cg];
7564         }
7565     }
7566 }
7567
7568 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7569 {
7570     int      i;
7571     gmx_bool bMiss;
7572
7573     bMiss = FALSE;
7574     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7575     {
7576         if (!bLocalCG[link->a[i]])
7577         {
7578             bMiss = TRUE;
7579         }
7580     }
7581
7582     return bMiss;
7583 }
7584
7585 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7586 typedef struct {
7587     real c[DIM][4]; /* the corners for the non-bonded communication */
7588     real cr0;       /* corner for rounding */
7589     real cr1[4];    /* corners for rounding */
7590     real bc[DIM];   /* corners for bounded communication */
7591     real bcr1;      /* corner for rounding for bonded communication */
7592 } dd_corners_t;
7593
7594 /* Determine the corners of the domain(s) we are communicating with */
7595 static void
7596 set_dd_corners(const gmx_domdec_t *dd,
7597                int dim0, int dim1, int dim2,
7598                gmx_bool bDistMB,
7599                dd_corners_t *c)
7600 {
7601     const gmx_domdec_comm_t  *comm;
7602     const gmx_domdec_zones_t *zones;
7603     int i, j;
7604
7605     comm = dd->comm;
7606
7607     zones = &comm->zones;
7608
7609     /* Keep the compiler happy */
7610     c->cr0  = 0;
7611     c->bcr1 = 0;
7612
7613     /* The first dimension is equal for all cells */
7614     c->c[0][0] = comm->cell_x0[dim0];
7615     if (bDistMB)
7616     {
7617         c->bc[0] = c->c[0][0];
7618     }
7619     if (dd->ndim >= 2)
7620     {
7621         dim1 = dd->dim[1];
7622         /* This cell row is only seen from the first row */
7623         c->c[1][0] = comm->cell_x0[dim1];
7624         /* All rows can see this row */
7625         c->c[1][1] = comm->cell_x0[dim1];
7626         if (dd->bGridJump)
7627         {
7628             c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7629             if (bDistMB)
7630             {
7631                 /* For the multi-body distance we need the maximum */
7632                 c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7633             }
7634         }
7635         /* Set the upper-right corner for rounding */
7636         c->cr0 = comm->cell_x1[dim0];
7637
7638         if (dd->ndim >= 3)
7639         {
7640             dim2 = dd->dim[2];
7641             for (j = 0; j < 4; j++)
7642             {
7643                 c->c[2][j] = comm->cell_x0[dim2];
7644             }
7645             if (dd->bGridJump)
7646             {
7647                 /* Use the maximum of the i-cells that see a j-cell */
7648                 for (i = 0; i < zones->nizone; i++)
7649                 {
7650                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7651                     {
7652                         if (j >= 4)
7653                         {
7654                             c->c[2][j-4] =
7655                                 max(c->c[2][j-4],
7656                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7657                         }
7658                     }
7659                 }
7660                 if (bDistMB)
7661                 {
7662                     /* For the multi-body distance we need the maximum */
7663                     c->bc[2] = comm->cell_x0[dim2];
7664                     for (i = 0; i < 2; i++)
7665                     {
7666                         for (j = 0; j < 2; j++)
7667                         {
7668                             c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
7669                         }
7670                     }
7671                 }
7672             }
7673
7674             /* Set the upper-right corner for rounding */
7675             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7676              * Only cell (0,0,0) can see cell 7 (1,1,1)
7677              */
7678             c->cr1[0] = comm->cell_x1[dim1];
7679             c->cr1[3] = comm->cell_x1[dim1];
7680             if (dd->bGridJump)
7681             {
7682                 c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7683                 if (bDistMB)
7684                 {
7685                     /* For the multi-body distance we need the maximum */
7686                     c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7687                 }
7688             }
7689         }
7690     }
7691 }
7692
7693 /* Determine which cg's we need to send in this pulse from this zone */
7694 static void
7695 get_zone_pulse_cgs(gmx_domdec_t *dd,
7696                    int zonei, int zone,
7697                    int cg0, int cg1,
7698                    const int *index_gl,
7699                    const int *cgindex,
7700                    int dim, int dim_ind,
7701                    int dim0, int dim1, int dim2,
7702                    real r_comm2, real r_bcomm2,
7703                    matrix box,
7704                    ivec tric_dist,
7705                    rvec *normal,
7706                    real skew_fac2_d, real skew_fac_01,
7707                    rvec *v_d, rvec *v_0, rvec *v_1,
7708                    const dd_corners_t *c,
7709                    rvec sf2_round,
7710                    gmx_bool bDistBonded,
7711                    gmx_bool bBondComm,
7712                    gmx_bool bDist2B,
7713                    gmx_bool bDistMB,
7714                    rvec *cg_cm,
7715                    int *cginfo,
7716                    gmx_domdec_ind_t *ind,
7717                    int **ibuf, int *ibuf_nalloc,
7718                    vec_rvec_t *vbuf,
7719                    int *nsend_ptr,
7720                    int *nat_ptr,
7721                    int *nsend_z_ptr)
7722 {
7723     gmx_domdec_comm_t *comm;
7724     gmx_bool           bScrew;
7725     gmx_bool           bDistMB_pulse;
7726     int                cg, i;
7727     real               r2, rb2, r, tric_sh;
7728     rvec               rn, rb;
7729     int                dimd;
7730     int                nsend_z, nsend, nat;
7731
7732     comm = dd->comm;
7733
7734     bScrew = (dd->bScrewPBC && dim == XX);
7735
7736     bDistMB_pulse = (bDistMB && bDistBonded);
7737
7738     nsend_z = 0;
7739     nsend   = *nsend_ptr;
7740     nat     = *nat_ptr;
7741
7742     for (cg = cg0; cg < cg1; cg++)
7743     {
7744         r2  = 0;
7745         rb2 = 0;
7746         if (tric_dist[dim_ind] == 0)
7747         {
7748             /* Rectangular direction, easy */
7749             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7750             if (r > 0)
7751             {
7752                 r2 += r*r;
7753             }
7754             if (bDistMB_pulse)
7755             {
7756                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7757                 if (r > 0)
7758                 {
7759                     rb2 += r*r;
7760                 }
7761             }
7762             /* Rounding gives at most a 16% reduction
7763              * in communicated atoms
7764              */
7765             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7766             {
7767                 r = cg_cm[cg][dim0] - c->cr0;
7768                 /* This is the first dimension, so always r >= 0 */
7769                 r2 += r*r;
7770                 if (bDistMB_pulse)
7771                 {
7772                     rb2 += r*r;
7773                 }
7774             }
7775             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7776             {
7777                 r = cg_cm[cg][dim1] - c->cr1[zone];
7778                 if (r > 0)
7779                 {
7780                     r2 += r*r;
7781                 }
7782                 if (bDistMB_pulse)
7783                 {
7784                     r = cg_cm[cg][dim1] - c->bcr1;
7785                     if (r > 0)
7786                     {
7787                         rb2 += r*r;
7788                     }
7789                 }
7790             }
7791         }
7792         else
7793         {
7794             /* Triclinic direction, more complicated */
7795             clear_rvec(rn);
7796             clear_rvec(rb);
7797             /* Rounding, conservative as the skew_fac multiplication
7798              * will slightly underestimate the distance.
7799              */
7800             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7801             {
7802                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7803                 for (i = dim0+1; i < DIM; i++)
7804                 {
7805                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7806                 }
7807                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7808                 if (bDistMB_pulse)
7809                 {
7810                     rb[dim0] = rn[dim0];
7811                     rb2      = r2;
7812                 }
7813                 /* Take care that the cell planes along dim0 might not
7814                  * be orthogonal to those along dim1 and dim2.
7815                  */
7816                 for (i = 1; i <= dim_ind; i++)
7817                 {
7818                     dimd = dd->dim[i];
7819                     if (normal[dim0][dimd] > 0)
7820                     {
7821                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7822                         if (bDistMB_pulse)
7823                         {
7824                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7825                         }
7826                     }
7827                 }
7828             }
7829             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7830             {
7831                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7832                 tric_sh   = 0;
7833                 for (i = dim1+1; i < DIM; i++)
7834                 {
7835                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7836                 }
7837                 rn[dim1] += tric_sh;
7838                 if (rn[dim1] > 0)
7839                 {
7840                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7841                     /* Take care of coupling of the distances
7842                      * to the planes along dim0 and dim1 through dim2.
7843                      */
7844                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7845                     /* Take care that the cell planes along dim1
7846                      * might not be orthogonal to that along dim2.
7847                      */
7848                     if (normal[dim1][dim2] > 0)
7849                     {
7850                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7851                     }
7852                 }
7853                 if (bDistMB_pulse)
7854                 {
7855                     rb[dim1] +=
7856                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7857                     if (rb[dim1] > 0)
7858                     {
7859                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7860                         /* Take care of coupling of the distances
7861                          * to the planes along dim0 and dim1 through dim2.
7862                          */
7863                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7864                         /* Take care that the cell planes along dim1
7865                          * might not be orthogonal to that along dim2.
7866                          */
7867                         if (normal[dim1][dim2] > 0)
7868                         {
7869                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7870                         }
7871                     }
7872                 }
7873             }
7874             /* The distance along the communication direction */
7875             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7876             tric_sh  = 0;
7877             for (i = dim+1; i < DIM; i++)
7878             {
7879                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7880             }
7881             rn[dim] += tric_sh;
7882             if (rn[dim] > 0)
7883             {
7884                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7885                 /* Take care of coupling of the distances
7886                  * to the planes along dim0 and dim1 through dim2.
7887                  */
7888                 if (dim_ind == 1 && zonei == 1)
7889                 {
7890                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7891                 }
7892             }
7893             if (bDistMB_pulse)
7894             {
7895                 clear_rvec(rb);
7896                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7897                 if (rb[dim] > 0)
7898                 {
7899                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7900                     /* Take care of coupling of the distances
7901                      * to the planes along dim0 and dim1 through dim2.
7902                      */
7903                     if (dim_ind == 1 && zonei == 1)
7904                     {
7905                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7906                     }
7907                 }
7908             }
7909         }
7910
7911         if (r2 < r_comm2 ||
7912             (bDistBonded &&
7913              ((bDistMB && rb2 < r_bcomm2) ||
7914               (bDist2B && r2  < r_bcomm2)) &&
7915              (!bBondComm ||
7916               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7917                missing_link(comm->cglink, index_gl[cg],
7918                             comm->bLocalCG)))))
7919         {
7920             /* Make an index to the local charge groups */
7921             if (nsend+1 > ind->nalloc)
7922             {
7923                 ind->nalloc = over_alloc_large(nsend+1);
7924                 srenew(ind->index, ind->nalloc);
7925             }
7926             if (nsend+1 > *ibuf_nalloc)
7927             {
7928                 *ibuf_nalloc = over_alloc_large(nsend+1);
7929                 srenew(*ibuf, *ibuf_nalloc);
7930             }
7931             ind->index[nsend] = cg;
7932             (*ibuf)[nsend]    = index_gl[cg];
7933             nsend_z++;
7934             vec_rvec_check_alloc(vbuf, nsend+1);
7935
7936             if (dd->ci[dim] == 0)
7937             {
7938                 /* Correct cg_cm for pbc */
7939                 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7940                 if (bScrew)
7941                 {
7942                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7943                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7944                 }
7945             }
7946             else
7947             {
7948                 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7949             }
7950             nsend++;
7951             nat += cgindex[cg+1] - cgindex[cg];
7952         }
7953     }
7954
7955     *nsend_ptr   = nsend;
7956     *nat_ptr     = nat;
7957     *nsend_z_ptr = nsend_z;
7958 }
7959
7960 static void setup_dd_communication(gmx_domdec_t *dd,
7961                                    matrix box, gmx_ddbox_t *ddbox,
7962                                    t_forcerec *fr, t_state *state, rvec **f)
7963 {
7964     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7965     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
7966     int                    c, i, j, cg, cg_gl, nrcg;
7967     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7968     gmx_domdec_comm_t     *comm;
7969     gmx_domdec_zones_t    *zones;
7970     gmx_domdec_comm_dim_t *cd;
7971     gmx_domdec_ind_t      *ind;
7972     cginfo_mb_t           *cginfo_mb;
7973     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
7974     real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
7975     dd_corners_t           corners;
7976     ivec                   tric_dist;
7977     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
7978     real                   skew_fac2_d, skew_fac_01;
7979     rvec                   sf2_round;
7980     int                    nsend, nat;
7981     int                    th;
7982
7983     if (debug)
7984     {
7985         fprintf(debug, "Setting up DD communication\n");
7986     }
7987
7988     comm  = dd->comm;
7989
7990     switch (fr->cutoff_scheme)
7991     {
7992         case ecutsGROUP:
7993             cg_cm = fr->cg_cm;
7994             break;
7995         case ecutsVERLET:
7996             cg_cm = state->x;
7997             break;
7998         default:
7999             gmx_incons("unimplemented");
8000             cg_cm = NULL;
8001     }
8002
8003     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8004     {
8005         dim = dd->dim[dim_ind];
8006
8007         /* Check if we need to use triclinic distances */
8008         tric_dist[dim_ind] = 0;
8009         for (i = 0; i <= dim_ind; i++)
8010         {
8011             if (ddbox->tric_dir[dd->dim[i]])
8012             {
8013                 tric_dist[dim_ind] = 1;
8014             }
8015         }
8016     }
8017
8018     bBondComm = comm->bBondComm;
8019
8020     /* Do we need to determine extra distances for multi-body bondeds? */
8021     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8022
8023     /* Do we need to determine extra distances for only two-body bondeds? */
8024     bDist2B = (bBondComm && !bDistMB);
8025
8026     r_comm2  = sqr(comm->cutoff);
8027     r_bcomm2 = sqr(comm->cutoff_mbody);
8028
8029     if (debug)
8030     {
8031         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
8032     }
8033
8034     zones = &comm->zones;
8035
8036     dim0 = dd->dim[0];
8037     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8038     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8039
8040     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8041
8042     /* Triclinic stuff */
8043     normal      = ddbox->normal;
8044     skew_fac_01 = 0;
8045     if (dd->ndim >= 2)
8046     {
8047         v_0 = ddbox->v[dim0];
8048         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8049         {
8050             /* Determine the coupling coefficient for the distances
8051              * to the cell planes along dim0 and dim1 through dim2.
8052              * This is required for correct rounding.
8053              */
8054             skew_fac_01 =
8055                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8056             if (debug)
8057             {
8058                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8059             }
8060         }
8061     }
8062     if (dd->ndim >= 3)
8063     {
8064         v_1 = ddbox->v[dim1];
8065     }
8066
8067     zone_cg_range = zones->cg_range;
8068     index_gl      = dd->index_gl;
8069     cgindex       = dd->cgindex;
8070     cginfo_mb     = fr->cginfo_mb;
8071
8072     zone_cg_range[0]   = 0;
8073     zone_cg_range[1]   = dd->ncg_home;
8074     comm->zone_ncg1[0] = dd->ncg_home;
8075     pos_cg             = dd->ncg_home;
8076
8077     nat_tot = dd->nat_home;
8078     nzone   = 1;
8079     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8080     {
8081         dim = dd->dim[dim_ind];
8082         cd  = &comm->cd[dim_ind];
8083
8084         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8085         {
8086             /* No pbc in this dimension, the first node should not comm. */
8087             nzone_send = 0;
8088         }
8089         else
8090         {
8091             nzone_send = nzone;
8092         }
8093
8094         v_d         = ddbox->v[dim];
8095         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8096
8097         cd->bInPlace = TRUE;
8098         for (p = 0; p < cd->np; p++)
8099         {
8100             /* Only atoms communicated in the first pulse are used
8101              * for multi-body bonded interactions or for bBondComm.
8102              */
8103             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8104
8105             ind   = &cd->ind[p];
8106             nsend = 0;
8107             nat   = 0;
8108             for (zone = 0; zone < nzone_send; zone++)
8109             {
8110                 if (tric_dist[dim_ind] && dim_ind > 0)
8111                 {
8112                     /* Determine slightly more optimized skew_fac's
8113                      * for rounding.
8114                      * This reduces the number of communicated atoms
8115                      * by about 10% for 3D DD of rhombic dodecahedra.
8116                      */
8117                     for (dimd = 0; dimd < dim; dimd++)
8118                     {
8119                         sf2_round[dimd] = 1;
8120                         if (ddbox->tric_dir[dimd])
8121                         {
8122                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8123                             {
8124                                 /* If we are shifted in dimension i
8125                                  * and the cell plane is tilted forward
8126                                  * in dimension i, skip this coupling.
8127                                  */
8128                                 if (!(zones->shift[nzone+zone][i] &&
8129                                       ddbox->v[dimd][i][dimd] >= 0))
8130                                 {
8131                                     sf2_round[dimd] +=
8132                                         sqr(ddbox->v[dimd][i][dimd]);
8133                                 }
8134                             }
8135                             sf2_round[dimd] = 1/sf2_round[dimd];
8136                         }
8137                     }
8138                 }
8139
8140                 zonei = zone_perm[dim_ind][zone];
8141                 if (p == 0)
8142                 {
8143                     /* Here we permutate the zones to obtain a convenient order
8144                      * for neighbor searching
8145                      */
8146                     cg0 = zone_cg_range[zonei];
8147                     cg1 = zone_cg_range[zonei+1];
8148                 }
8149                 else
8150                 {
8151                     /* Look only at the cg's received in the previous grid pulse
8152                      */
8153                     cg1 = zone_cg_range[nzone+zone+1];
8154                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8155                 }
8156
8157 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8158                 for (th = 0; th < comm->nth; th++)
8159                 {
8160                     gmx_domdec_ind_t *ind_p;
8161                     int             **ibuf_p, *ibuf_nalloc_p;
8162                     vec_rvec_t       *vbuf_p;
8163                     int              *nsend_p, *nat_p;
8164                     int              *nsend_zone_p;
8165                     int               cg0_th, cg1_th;
8166
8167                     if (th == 0)
8168                     {
8169                         /* Thread 0 writes in the comm buffers */
8170                         ind_p         = ind;
8171                         ibuf_p        = &comm->buf_int;
8172                         ibuf_nalloc_p = &comm->nalloc_int;
8173                         vbuf_p        = &comm->vbuf;
8174                         nsend_p       = &nsend;
8175                         nat_p         = &nat;
8176                         nsend_zone_p  = &ind->nsend[zone];
8177                     }
8178                     else
8179                     {
8180                         /* Other threads write into temp buffers */
8181                         ind_p         = &comm->dth[th].ind;
8182                         ibuf_p        = &comm->dth[th].ibuf;
8183                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8184                         vbuf_p        = &comm->dth[th].vbuf;
8185                         nsend_p       = &comm->dth[th].nsend;
8186                         nat_p         = &comm->dth[th].nat;
8187                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8188
8189                         comm->dth[th].nsend      = 0;
8190                         comm->dth[th].nat        = 0;
8191                         comm->dth[th].nsend_zone = 0;
8192                     }
8193
8194                     if (comm->nth == 1)
8195                     {
8196                         cg0_th = cg0;
8197                         cg1_th = cg1;
8198                     }
8199                     else
8200                     {
8201                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8202                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8203                     }
8204
8205                     /* Get the cg's for this pulse in this zone */
8206                     get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8207                                        index_gl, cgindex,
8208                                        dim, dim_ind, dim0, dim1, dim2,
8209                                        r_comm2, r_bcomm2,
8210                                        box, tric_dist,
8211                                        normal, skew_fac2_d, skew_fac_01,
8212                                        v_d, v_0, v_1, &corners, sf2_round,
8213                                        bDistBonded, bBondComm,
8214                                        bDist2B, bDistMB,
8215                                        cg_cm, fr->cginfo,
8216                                        ind_p,
8217                                        ibuf_p, ibuf_nalloc_p,
8218                                        vbuf_p,
8219                                        nsend_p, nat_p,
8220                                        nsend_zone_p);
8221                 }
8222
8223                 /* Append data of threads>=1 to the communication buffers */
8224                 for (th = 1; th < comm->nth; th++)
8225                 {
8226                     dd_comm_setup_work_t *dth;
8227                     int                   i, ns1;
8228
8229                     dth = &comm->dth[th];
8230
8231                     ns1 = nsend + dth->nsend_zone;
8232                     if (ns1 > ind->nalloc)
8233                     {
8234                         ind->nalloc = over_alloc_dd(ns1);
8235                         srenew(ind->index, ind->nalloc);
8236                     }
8237                     if (ns1 > comm->nalloc_int)
8238                     {
8239                         comm->nalloc_int = over_alloc_dd(ns1);
8240                         srenew(comm->buf_int, comm->nalloc_int);
8241                     }
8242                     if (ns1 > comm->vbuf.nalloc)
8243                     {
8244                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8245                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8246                     }
8247
8248                     for (i = 0; i < dth->nsend_zone; i++)
8249                     {
8250                         ind->index[nsend]    = dth->ind.index[i];
8251                         comm->buf_int[nsend] = dth->ibuf[i];
8252                         copy_rvec(dth->vbuf.v[i],
8253                                   comm->vbuf.v[nsend]);
8254                         nsend++;
8255                     }
8256                     nat              += dth->nat;
8257                     ind->nsend[zone] += dth->nsend_zone;
8258                 }
8259             }
8260             /* Clear the counts in case we do not have pbc */
8261             for (zone = nzone_send; zone < nzone; zone++)
8262             {
8263                 ind->nsend[zone] = 0;
8264             }
8265             ind->nsend[nzone]   = nsend;
8266             ind->nsend[nzone+1] = nat;
8267             /* Communicate the number of cg's and atoms to receive */
8268             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8269                             ind->nsend, nzone+2,
8270                             ind->nrecv, nzone+2);
8271
8272             /* The rvec buffer is also required for atom buffers of size nsend
8273              * in dd_move_x and dd_move_f.
8274              */
8275             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8276
8277             if (p > 0)
8278             {
8279                 /* We can receive in place if only the last zone is not empty */
8280                 for (zone = 0; zone < nzone-1; zone++)
8281                 {
8282                     if (ind->nrecv[zone] > 0)
8283                     {
8284                         cd->bInPlace = FALSE;
8285                     }
8286                 }
8287                 if (!cd->bInPlace)
8288                 {
8289                     /* The int buffer is only required here for the cg indices */
8290                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8291                     {
8292                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8293                         srenew(comm->buf_int2, comm->nalloc_int2);
8294                     }
8295                     /* The rvec buffer is also required for atom buffers
8296                      * of size nrecv in dd_move_x and dd_move_f.
8297                      */
8298                     i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8299                     vec_rvec_check_alloc(&comm->vbuf2, i);
8300                 }
8301             }
8302
8303             /* Make space for the global cg indices */
8304             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8305                 || dd->cg_nalloc == 0)
8306             {
8307                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8308                 srenew(index_gl, dd->cg_nalloc);
8309                 srenew(cgindex, dd->cg_nalloc+1);
8310             }
8311             /* Communicate the global cg indices */
8312             if (cd->bInPlace)
8313             {
8314                 recv_i = index_gl + pos_cg;
8315             }
8316             else
8317             {
8318                 recv_i = comm->buf_int2;
8319             }
8320             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8321                             comm->buf_int, nsend,
8322                             recv_i,        ind->nrecv[nzone]);
8323
8324             /* Make space for cg_cm */
8325             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8326             if (fr->cutoff_scheme == ecutsGROUP)
8327             {
8328                 cg_cm = fr->cg_cm;
8329             }
8330             else
8331             {
8332                 cg_cm = state->x;
8333             }
8334             /* Communicate cg_cm */
8335             if (cd->bInPlace)
8336             {
8337                 recv_vr = cg_cm + pos_cg;
8338             }
8339             else
8340             {
8341                 recv_vr = comm->vbuf2.v;
8342             }
8343             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8344                              comm->vbuf.v, nsend,
8345                              recv_vr,      ind->nrecv[nzone]);
8346
8347             /* Make the charge group index */
8348             if (cd->bInPlace)
8349             {
8350                 zone = (p == 0 ? 0 : nzone - 1);
8351                 while (zone < nzone)
8352                 {
8353                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8354                     {
8355                         cg_gl              = index_gl[pos_cg];
8356                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8357                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8358                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8359                         if (bBondComm)
8360                         {
8361                             /* Update the charge group presence,
8362                              * so we can use it in the next pass of the loop.
8363                              */
8364                             comm->bLocalCG[cg_gl] = TRUE;
8365                         }
8366                         pos_cg++;
8367                     }
8368                     if (p == 0)
8369                     {
8370                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8371                     }
8372                     zone++;
8373                     zone_cg_range[nzone+zone] = pos_cg;
8374                 }
8375             }
8376             else
8377             {
8378                 /* This part of the code is never executed with bBondComm. */
8379                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8380                                  index_gl, recv_i, cg_cm, recv_vr,
8381                                  cgindex, fr->cginfo_mb, fr->cginfo);
8382                 pos_cg += ind->nrecv[nzone];
8383             }
8384             nat_tot += ind->nrecv[nzone+1];
8385         }
8386         if (!cd->bInPlace)
8387         {
8388             /* Store the atom block for easy copying of communication buffers */
8389             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8390         }
8391         nzone += nzone;
8392     }
8393     dd->index_gl = index_gl;
8394     dd->cgindex  = cgindex;
8395
8396     dd->ncg_tot          = zone_cg_range[zones->n];
8397     dd->nat_tot          = nat_tot;
8398     comm->nat[ddnatHOME] = dd->nat_home;
8399     for (i = ddnatZONE; i < ddnatNR; i++)
8400     {
8401         comm->nat[i] = dd->nat_tot;
8402     }
8403
8404     if (!bBondComm)
8405     {
8406         /* We don't need to update cginfo, since that was alrady done above.
8407          * So we pass NULL for the forcerec.
8408          */
8409         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8410                       NULL, comm->bLocalCG);
8411     }
8412
8413     if (debug)
8414     {
8415         fprintf(debug, "Finished setting up DD communication, zones:");
8416         for (c = 0; c < zones->n; c++)
8417         {
8418             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8419         }
8420         fprintf(debug, "\n");
8421     }
8422 }
8423
8424 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8425 {
8426     int c;
8427
8428     for (c = 0; c < zones->nizone; c++)
8429     {
8430         zones->izone[c].cg1  = zones->cg_range[c+1];
8431         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8432         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8433     }
8434 }
8435
8436 static void set_zones_size(gmx_domdec_t *dd,
8437                            matrix box, const gmx_ddbox_t *ddbox,
8438                            int zone_start, int zone_end)
8439 {
8440     gmx_domdec_comm_t  *comm;
8441     gmx_domdec_zones_t *zones;
8442     gmx_bool            bDistMB;
8443     int                 z, zi, zj0, zj1, d, dim;
8444     real                rcs, rcmbs;
8445     int                 i, j;
8446     real                size_j, add_tric;
8447     real                vol;
8448
8449     comm = dd->comm;
8450
8451     zones = &comm->zones;
8452
8453     /* Do we need to determine extra distances for multi-body bondeds? */
8454     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8455
8456     for (z = zone_start; z < zone_end; z++)
8457     {
8458         /* Copy cell limits to zone limits.
8459          * Valid for non-DD dims and non-shifted dims.
8460          */
8461         copy_rvec(comm->cell_x0, zones->size[z].x0);
8462         copy_rvec(comm->cell_x1, zones->size[z].x1);
8463     }
8464
8465     for (d = 0; d < dd->ndim; d++)
8466     {
8467         dim = dd->dim[d];
8468
8469         for (z = 0; z < zones->n; z++)
8470         {
8471             /* With a staggered grid we have different sizes
8472              * for non-shifted dimensions.
8473              */
8474             if (dd->bGridJump && zones->shift[z][dim] == 0)
8475             {
8476                 if (d == 1)
8477                 {
8478                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8479                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8480                 }
8481                 else if (d == 2)
8482                 {
8483                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8484                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8485                 }
8486             }
8487         }
8488
8489         rcs   = comm->cutoff;
8490         rcmbs = comm->cutoff_mbody;
8491         if (ddbox->tric_dir[dim])
8492         {
8493             rcs   /= ddbox->skew_fac[dim];
8494             rcmbs /= ddbox->skew_fac[dim];
8495         }
8496
8497         /* Set the lower limit for the shifted zone dimensions */
8498         for (z = zone_start; z < zone_end; z++)
8499         {
8500             if (zones->shift[z][dim] > 0)
8501             {
8502                 dim = dd->dim[d];
8503                 if (!dd->bGridJump || d == 0)
8504                 {
8505                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8506                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8507                 }
8508                 else
8509                 {
8510                     /* Here we take the lower limit of the zone from
8511                      * the lowest domain of the zone below.
8512                      */
8513                     if (z < 4)
8514                     {
8515                         zones->size[z].x0[dim] =
8516                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8517                     }
8518                     else
8519                     {
8520                         if (d == 1)
8521                         {
8522                             zones->size[z].x0[dim] =
8523                                 zones->size[zone_perm[2][z-4]].x0[dim];
8524                         }
8525                         else
8526                         {
8527                             zones->size[z].x0[dim] =
8528                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8529                         }
8530                     }
8531                     /* A temporary limit, is updated below */
8532                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8533
8534                     if (bDistMB)
8535                     {
8536                         for (zi = 0; zi < zones->nizone; zi++)
8537                         {
8538                             if (zones->shift[zi][dim] == 0)
8539                             {
8540                                 /* This takes the whole zone into account.
8541                                  * With multiple pulses this will lead
8542                                  * to a larger zone then strictly necessary.
8543                                  */
8544                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8545                                                              zones->size[zi].x1[dim]+rcmbs);
8546                             }
8547                         }
8548                     }
8549                 }
8550             }
8551         }
8552
8553         /* Loop over the i-zones to set the upper limit of each
8554          * j-zone they see.
8555          */
8556         for (zi = 0; zi < zones->nizone; zi++)
8557         {
8558             if (zones->shift[zi][dim] == 0)
8559             {
8560                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8561                 {
8562                     if (zones->shift[z][dim] > 0)
8563                     {
8564                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8565                                                      zones->size[zi].x1[dim]+rcs);
8566                     }
8567                 }
8568             }
8569         }
8570     }
8571
8572     for (z = zone_start; z < zone_end; z++)
8573     {
8574         /* Initialization only required to keep the compiler happy */
8575         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8576         int  nc, c;
8577
8578         /* To determine the bounding box for a zone we need to find
8579          * the extreme corners of 4, 2 or 1 corners.
8580          */
8581         nc = 1 << (ddbox->npbcdim - 1);
8582
8583         for (c = 0; c < nc; c++)
8584         {
8585             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8586             corner[XX] = 0;
8587             if ((c & 1) == 0)
8588             {
8589                 corner[YY] = zones->size[z].x0[YY];
8590             }
8591             else
8592             {
8593                 corner[YY] = zones->size[z].x1[YY];
8594             }
8595             if ((c & 2) == 0)
8596             {
8597                 corner[ZZ] = zones->size[z].x0[ZZ];
8598             }
8599             else
8600             {
8601                 corner[ZZ] = zones->size[z].x1[ZZ];
8602             }
8603             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8604             {
8605                 /* With 1D domain decomposition the cg's are not in
8606                  * the triclinic box, but triclinic x-y and rectangular y-z.
8607                  * Shift y back, so it will later end up at 0.
8608                  */
8609                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8610             }
8611             /* Apply the triclinic couplings */
8612             for (i = YY; i < ddbox->npbcdim; i++)
8613             {
8614                 for (j = XX; j < i; j++)
8615                 {
8616                     corner[j] += corner[i]*box[i][j]/box[i][i];
8617                 }
8618             }
8619             if (c == 0)
8620             {
8621                 copy_rvec(corner, corner_min);
8622                 copy_rvec(corner, corner_max);
8623             }
8624             else
8625             {
8626                 for (i = 0; i < DIM; i++)
8627                 {
8628                     corner_min[i] = min(corner_min[i], corner[i]);
8629                     corner_max[i] = max(corner_max[i], corner[i]);
8630                 }
8631             }
8632         }
8633         /* Copy the extreme cornes without offset along x */
8634         for (i = 0; i < DIM; i++)
8635         {
8636             zones->size[z].bb_x0[i] = corner_min[i];
8637             zones->size[z].bb_x1[i] = corner_max[i];
8638         }
8639         /* Add the offset along x */
8640         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8641         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8642     }
8643
8644     if (zone_start == 0)
8645     {
8646         vol = 1;
8647         for (dim = 0; dim < DIM; dim++)
8648         {
8649             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8650         }
8651         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8652     }
8653
8654     if (debug)
8655     {
8656         for (z = zone_start; z < zone_end; z++)
8657         {
8658             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8659                     z,
8660                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8661                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8662                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8663             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8664                     z,
8665                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8666                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8667                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8668         }
8669     }
8670 }
8671
8672 static int comp_cgsort(const void *a, const void *b)
8673 {
8674     int           comp;
8675
8676     gmx_cgsort_t *cga, *cgb;
8677     cga = (gmx_cgsort_t *)a;
8678     cgb = (gmx_cgsort_t *)b;
8679
8680     comp = cga->nsc - cgb->nsc;
8681     if (comp == 0)
8682     {
8683         comp = cga->ind_gl - cgb->ind_gl;
8684     }
8685
8686     return comp;
8687 }
8688
8689 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8690                          int *a, int *buf)
8691 {
8692     int i;
8693
8694     /* Order the data */
8695     for (i = 0; i < n; i++)
8696     {
8697         buf[i] = a[sort[i].ind];
8698     }
8699
8700     /* Copy back to the original array */
8701     for (i = 0; i < n; i++)
8702     {
8703         a[i] = buf[i];
8704     }
8705 }
8706
8707 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8708                          rvec *v, rvec *buf)
8709 {
8710     int i;
8711
8712     /* Order the data */
8713     for (i = 0; i < n; i++)
8714     {
8715         copy_rvec(v[sort[i].ind], buf[i]);
8716     }
8717
8718     /* Copy back to the original array */
8719     for (i = 0; i < n; i++)
8720     {
8721         copy_rvec(buf[i], v[i]);
8722     }
8723 }
8724
8725 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8726                            rvec *v, rvec *buf)
8727 {
8728     int a, atot, cg, cg0, cg1, i;
8729
8730     if (cgindex == NULL)
8731     {
8732         /* Avoid the useless loop of the atoms within a cg */
8733         order_vec_cg(ncg, sort, v, buf);
8734
8735         return;
8736     }
8737
8738     /* Order the data */
8739     a = 0;
8740     for (cg = 0; cg < ncg; cg++)
8741     {
8742         cg0 = cgindex[sort[cg].ind];
8743         cg1 = cgindex[sort[cg].ind+1];
8744         for (i = cg0; i < cg1; i++)
8745         {
8746             copy_rvec(v[i], buf[a]);
8747             a++;
8748         }
8749     }
8750     atot = a;
8751
8752     /* Copy back to the original array */
8753     for (a = 0; a < atot; a++)
8754     {
8755         copy_rvec(buf[a], v[a]);
8756     }
8757 }
8758
8759 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8760                          int nsort_new, gmx_cgsort_t *sort_new,
8761                          gmx_cgsort_t *sort1)
8762 {
8763     int i1, i2, i_new;
8764
8765     /* The new indices are not very ordered, so we qsort them */
8766     qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8767
8768     /* sort2 is already ordered, so now we can merge the two arrays */
8769     i1    = 0;
8770     i2    = 0;
8771     i_new = 0;
8772     while (i2 < nsort2 || i_new < nsort_new)
8773     {
8774         if (i2 == nsort2)
8775         {
8776             sort1[i1++] = sort_new[i_new++];
8777         }
8778         else if (i_new == nsort_new)
8779         {
8780             sort1[i1++] = sort2[i2++];
8781         }
8782         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8783                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8784                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8785         {
8786             sort1[i1++] = sort2[i2++];
8787         }
8788         else
8789         {
8790             sort1[i1++] = sort_new[i_new++];
8791         }
8792     }
8793 }
8794
8795 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8796 {
8797     gmx_domdec_sort_t *sort;
8798     gmx_cgsort_t      *cgsort, *sort_i;
8799     int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
8800     int                sort_last, sort_skip;
8801
8802     sort = dd->comm->sort;
8803
8804     a = fr->ns.grid->cell_index;
8805
8806     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8807
8808     if (ncg_home_old >= 0)
8809     {
8810         /* The charge groups that remained in the same ns grid cell
8811          * are completely ordered. So we can sort efficiently by sorting
8812          * the charge groups that did move into the stationary list.
8813          */
8814         ncg_new   = 0;
8815         nsort2    = 0;
8816         nsort_new = 0;
8817         for (i = 0; i < dd->ncg_home; i++)
8818         {
8819             /* Check if this cg did not move to another node */
8820             if (a[i] < moved)
8821             {
8822                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8823                 {
8824                     /* This cg is new on this node or moved ns grid cell */
8825                     if (nsort_new >= sort->sort_new_nalloc)
8826                     {
8827                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8828                         srenew(sort->sort_new, sort->sort_new_nalloc);
8829                     }
8830                     sort_i = &(sort->sort_new[nsort_new++]);
8831                 }
8832                 else
8833                 {
8834                     /* This cg did not move */
8835                     sort_i = &(sort->sort2[nsort2++]);
8836                 }
8837                 /* Sort on the ns grid cell indices
8838                  * and the global topology index.
8839                  * index_gl is irrelevant with cell ns,
8840                  * but we set it here anyhow to avoid a conditional.
8841                  */
8842                 sort_i->nsc    = a[i];
8843                 sort_i->ind_gl = dd->index_gl[i];
8844                 sort_i->ind    = i;
8845                 ncg_new++;
8846             }
8847         }
8848         if (debug)
8849         {
8850             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8851                     nsort2, nsort_new);
8852         }
8853         /* Sort efficiently */
8854         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8855                      sort->sort);
8856     }
8857     else
8858     {
8859         cgsort  = sort->sort;
8860         ncg_new = 0;
8861         for (i = 0; i < dd->ncg_home; i++)
8862         {
8863             /* Sort on the ns grid cell indices
8864              * and the global topology index
8865              */
8866             cgsort[i].nsc    = a[i];
8867             cgsort[i].ind_gl = dd->index_gl[i];
8868             cgsort[i].ind    = i;
8869             if (cgsort[i].nsc < moved)
8870             {
8871                 ncg_new++;
8872             }
8873         }
8874         if (debug)
8875         {
8876             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8877         }
8878         /* Determine the order of the charge groups using qsort */
8879         qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8880     }
8881
8882     return ncg_new;
8883 }
8884
8885 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8886 {
8887     gmx_cgsort_t *sort;
8888     int           ncg_new, i, *a, na;
8889
8890     sort = dd->comm->sort->sort;
8891
8892     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8893
8894     ncg_new = 0;
8895     for (i = 0; i < na; i++)
8896     {
8897         if (a[i] >= 0)
8898         {
8899             sort[ncg_new].ind = a[i];
8900             ncg_new++;
8901         }
8902     }
8903
8904     return ncg_new;
8905 }
8906
8907 static void dd_sort_state(gmx_domdec_t *dd, int ePBC,
8908                           rvec *cgcm, t_forcerec *fr, t_state *state,
8909                           int ncg_home_old)
8910 {
8911     gmx_domdec_sort_t *sort;
8912     gmx_cgsort_t      *cgsort, *sort_i;
8913     int               *cgindex;
8914     int                ncg_new, i, *ibuf, cgsize;
8915     rvec              *vbuf;
8916
8917     sort = dd->comm->sort;
8918
8919     if (dd->ncg_home > sort->sort_nalloc)
8920     {
8921         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8922         srenew(sort->sort, sort->sort_nalloc);
8923         srenew(sort->sort2, sort->sort_nalloc);
8924     }
8925     cgsort = sort->sort;
8926
8927     switch (fr->cutoff_scheme)
8928     {
8929         case ecutsGROUP:
8930             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8931             break;
8932         case ecutsVERLET:
8933             ncg_new = dd_sort_order_nbnxn(dd, fr);
8934             break;
8935         default:
8936             gmx_incons("unimplemented");
8937             ncg_new = 0;
8938     }
8939
8940     /* We alloc with the old size, since cgindex is still old */
8941     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8942     vbuf = dd->comm->vbuf.v;
8943
8944     if (dd->comm->bCGs)
8945     {
8946         cgindex = dd->cgindex;
8947     }
8948     else
8949     {
8950         cgindex = NULL;
8951     }
8952
8953     /* Remove the charge groups which are no longer at home here */
8954     dd->ncg_home = ncg_new;
8955     if (debug)
8956     {
8957         fprintf(debug, "Set the new home charge group count to %d\n",
8958                 dd->ncg_home);
8959     }
8960
8961     /* Reorder the state */
8962     for (i = 0; i < estNR; i++)
8963     {
8964         if (EST_DISTR(i) && (state->flags & (1<<i)))
8965         {
8966             switch (i)
8967             {
8968                 case estX:
8969                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
8970                     break;
8971                 case estV:
8972                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
8973                     break;
8974                 case estSDX:
8975                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
8976                     break;
8977                 case estCGP:
8978                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
8979                     break;
8980                 case estLD_RNG:
8981                 case estLD_RNGI:
8982                 case estDISRE_INITF:
8983                 case estDISRE_RM3TAV:
8984                 case estORIRE_INITF:
8985                 case estORIRE_DTAV:
8986                     /* No ordering required */
8987                     break;
8988                 default:
8989                     gmx_incons("Unknown state entry encountered in dd_sort_state");
8990                     break;
8991             }
8992         }
8993     }
8994     if (fr->cutoff_scheme == ecutsGROUP)
8995     {
8996         /* Reorder cgcm */
8997         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
8998     }
8999
9000     if (dd->ncg_home+1 > sort->ibuf_nalloc)
9001     {
9002         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9003         srenew(sort->ibuf, sort->ibuf_nalloc);
9004     }
9005     ibuf = sort->ibuf;
9006     /* Reorder the global cg index */
9007     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9008     /* Reorder the cginfo */
9009     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9010     /* Rebuild the local cg index */
9011     if (dd->comm->bCGs)
9012     {
9013         ibuf[0] = 0;
9014         for (i = 0; i < dd->ncg_home; i++)
9015         {
9016             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9017             ibuf[i+1] = ibuf[i] + cgsize;
9018         }
9019         for (i = 0; i < dd->ncg_home+1; i++)
9020         {
9021             dd->cgindex[i] = ibuf[i];
9022         }
9023     }
9024     else
9025     {
9026         for (i = 0; i < dd->ncg_home+1; i++)
9027         {
9028             dd->cgindex[i] = i;
9029         }
9030     }
9031     /* Set the home atom number */
9032     dd->nat_home = dd->cgindex[dd->ncg_home];
9033
9034     if (fr->cutoff_scheme == ecutsVERLET)
9035     {
9036         /* The atoms are now exactly in grid order, update the grid order */
9037         nbnxn_set_atomorder(fr->nbv->nbs);
9038     }
9039     else
9040     {
9041         /* Copy the sorted ns cell indices back to the ns grid struct */
9042         for (i = 0; i < dd->ncg_home; i++)
9043         {
9044             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9045         }
9046         fr->ns.grid->nr = dd->ncg_home;
9047     }
9048 }
9049
9050 static void add_dd_statistics(gmx_domdec_t *dd)
9051 {
9052     gmx_domdec_comm_t *comm;
9053     int                ddnat;
9054
9055     comm = dd->comm;
9056
9057     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9058     {
9059         comm->sum_nat[ddnat-ddnatZONE] +=
9060             comm->nat[ddnat] - comm->nat[ddnat-1];
9061     }
9062     comm->ndecomp++;
9063 }
9064
9065 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9066 {
9067     gmx_domdec_comm_t *comm;
9068     int                ddnat;
9069
9070     comm = dd->comm;
9071
9072     /* Reset all the statistics and counters for total run counting */
9073     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9074     {
9075         comm->sum_nat[ddnat-ddnatZONE] = 0;
9076     }
9077     comm->ndecomp   = 0;
9078     comm->nload     = 0;
9079     comm->load_step = 0;
9080     comm->load_sum  = 0;
9081     comm->load_max  = 0;
9082     clear_ivec(comm->load_lim);
9083     comm->load_mdf = 0;
9084     comm->load_pme = 0;
9085 }
9086
9087 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9088 {
9089     gmx_domdec_comm_t *comm;
9090     int                ddnat;
9091     double             av;
9092
9093     comm = cr->dd->comm;
9094
9095     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9096
9097     if (fplog == NULL)
9098     {
9099         return;
9100     }
9101
9102     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9103
9104     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9105     {
9106         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9107         switch (ddnat)
9108         {
9109             case ddnatZONE:
9110                 fprintf(fplog,
9111                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9112                         2, av);
9113                 break;
9114             case ddnatVSITE:
9115                 if (cr->dd->vsite_comm)
9116                 {
9117                     fprintf(fplog,
9118                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9119                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9120                             av);
9121                 }
9122                 break;
9123             case ddnatCON:
9124                 if (cr->dd->constraint_comm)
9125                 {
9126                     fprintf(fplog,
9127                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9128                             1 + ir->nLincsIter, av);
9129                 }
9130                 break;
9131             default:
9132                 gmx_incons(" Unknown type for DD statistics");
9133         }
9134     }
9135     fprintf(fplog, "\n");
9136
9137     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9138     {
9139         print_dd_load_av(fplog, cr->dd);
9140     }
9141 }
9142
9143 void dd_partition_system(FILE                *fplog,
9144                          gmx_large_int_t      step,
9145                          t_commrec           *cr,
9146                          gmx_bool             bMasterState,
9147                          int                  nstglobalcomm,
9148                          t_state             *state_global,
9149                          gmx_mtop_t          *top_global,
9150                          t_inputrec          *ir,
9151                          t_state             *state_local,
9152                          rvec               **f,
9153                          t_mdatoms           *mdatoms,
9154                          gmx_localtop_t      *top_local,
9155                          t_forcerec          *fr,
9156                          gmx_vsite_t         *vsite,
9157                          gmx_shellfc_t        shellfc,
9158                          gmx_constr_t         constr,
9159                          t_nrnb              *nrnb,
9160                          gmx_wallcycle_t      wcycle,
9161                          gmx_bool             bVerbose)
9162 {
9163     gmx_domdec_t      *dd;
9164     gmx_domdec_comm_t *comm;
9165     gmx_ddbox_t        ddbox = {0};
9166     t_block           *cgs_gl;
9167     gmx_large_int_t    step_pcoupl;
9168     rvec               cell_ns_x0, cell_ns_x1;
9169     int                i, j, n, cg0 = 0, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9170     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
9171     gmx_bool           bRedist, bSortCG, bResortAll;
9172     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9173     real               grid_density;
9174     char               sbuf[22];
9175
9176     dd   = cr->dd;
9177     comm = dd->comm;
9178
9179     bBoxChanged = (bMasterState || DEFORM(*ir));
9180     if (ir->epc != epcNO)
9181     {
9182         /* With nstpcouple > 1 pressure coupling happens.
9183          * one step after calculating the pressure.
9184          * Box scaling happens at the end of the MD step,
9185          * after the DD partitioning.
9186          * We therefore have to do DLB in the first partitioning
9187          * after an MD step where P-coupling occured.
9188          * We need to determine the last step in which p-coupling occurred.
9189          * MRS -- need to validate this for vv?
9190          */
9191         n = ir->nstpcouple;
9192         if (n == 1)
9193         {
9194             step_pcoupl = step - 1;
9195         }
9196         else
9197         {
9198             step_pcoupl = ((step - 1)/n)*n + 1;
9199         }
9200         if (step_pcoupl >= comm->partition_step)
9201         {
9202             bBoxChanged = TRUE;
9203         }
9204     }
9205
9206     bNStGlobalComm = (step % nstglobalcomm == 0);
9207
9208     if (!comm->bDynLoadBal)
9209     {
9210         bDoDLB = FALSE;
9211     }
9212     else
9213     {
9214         /* Should we do dynamic load balacing this step?
9215          * Since it requires (possibly expensive) global communication,
9216          * we might want to do DLB less frequently.
9217          */
9218         if (bBoxChanged || ir->epc != epcNO)
9219         {
9220             bDoDLB = bBoxChanged;
9221         }
9222         else
9223         {
9224             bDoDLB = bNStGlobalComm;
9225         }
9226     }
9227
9228     /* Check if we have recorded loads on the nodes */
9229     if (comm->bRecordLoad && dd_load_count(comm))
9230     {
9231         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9232         {
9233             /* Check if we should use DLB at the second partitioning
9234              * and every 100 partitionings,
9235              * so the extra communication cost is negligible.
9236              */
9237             n         = max(100, nstglobalcomm);
9238             bCheckDLB = (comm->n_load_collect == 0 ||
9239                          comm->n_load_have % n == n-1);
9240         }
9241         else
9242         {
9243             bCheckDLB = FALSE;
9244         }
9245
9246         /* Print load every nstlog, first and last step to the log file */
9247         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9248                     comm->n_load_collect == 0 ||
9249                     (ir->nsteps >= 0 &&
9250                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9251
9252         /* Avoid extra communication due to verbose screen output
9253          * when nstglobalcomm is set.
9254          */
9255         if (bDoDLB || bLogLoad || bCheckDLB ||
9256             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9257         {
9258             get_load_distribution(dd, wcycle);
9259             if (DDMASTER(dd))
9260             {
9261                 if (bLogLoad)
9262                 {
9263                     dd_print_load(fplog, dd, step-1);
9264                 }
9265                 if (bVerbose)
9266                 {
9267                     dd_print_load_verbose(dd);
9268                 }
9269             }
9270             comm->n_load_collect++;
9271
9272             if (bCheckDLB)
9273             {
9274                 /* Since the timings are node dependent, the master decides */
9275                 if (DDMASTER(dd))
9276                 {
9277                     bTurnOnDLB =
9278                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9279                     if (debug)
9280                     {
9281                         fprintf(debug, "step %s, imb loss %f\n",
9282                                 gmx_step_str(step, sbuf),
9283                                 dd_force_imb_perf_loss(dd));
9284                     }
9285                 }
9286                 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
9287                 if (bTurnOnDLB)
9288                 {
9289                     turn_on_dlb(fplog, cr, step);
9290                     bDoDLB = TRUE;
9291                 }
9292             }
9293         }
9294         comm->n_load_have++;
9295     }
9296
9297     cgs_gl = &comm->cgs_gl;
9298
9299     bRedist = FALSE;
9300     if (bMasterState)
9301     {
9302         /* Clear the old state */
9303         clear_dd_indices(dd, 0, 0);
9304
9305         set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9306                   TRUE, cgs_gl, state_global->x, &ddbox);
9307
9308         get_cg_distribution(fplog, step, dd, cgs_gl,
9309                             state_global->box, &ddbox, state_global->x);
9310
9311         dd_distribute_state(dd, cgs_gl,
9312                             state_global, state_local, f);
9313
9314         dd_make_local_cgs(dd, &top_local->cgs);
9315
9316         /* Ensure that we have space for the new distribution */
9317         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9318
9319         if (fr->cutoff_scheme == ecutsGROUP)
9320         {
9321             calc_cgcm(fplog, 0, dd->ncg_home,
9322                       &top_local->cgs, state_local->x, fr->cg_cm);
9323         }
9324
9325         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9326
9327         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9328
9329         cg0 = 0;
9330     }
9331     else if (state_local->ddp_count != dd->ddp_count)
9332     {
9333         if (state_local->ddp_count > dd->ddp_count)
9334         {
9335             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9336         }
9337
9338         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9339         {
9340             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9341         }
9342
9343         /* Clear the old state */
9344         clear_dd_indices(dd, 0, 0);
9345
9346         /* Build the new indices */
9347         rebuild_cgindex(dd, cgs_gl->index, state_local);
9348         make_dd_indices(dd, cgs_gl->index, 0);
9349
9350         if (fr->cutoff_scheme == ecutsGROUP)
9351         {
9352             /* Redetermine the cg COMs */
9353             calc_cgcm(fplog, 0, dd->ncg_home,
9354                       &top_local->cgs, state_local->x, fr->cg_cm);
9355         }
9356
9357         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9358
9359         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9360
9361         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9362                   TRUE, &top_local->cgs, state_local->x, &ddbox);
9363
9364         bRedist = comm->bDynLoadBal;
9365     }
9366     else
9367     {
9368         /* We have the full state, only redistribute the cgs */
9369
9370         /* Clear the non-home indices */
9371         clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9372
9373         /* Avoid global communication for dim's without pbc and -gcom */
9374         if (!bNStGlobalComm)
9375         {
9376             copy_rvec(comm->box0, ddbox.box0    );
9377             copy_rvec(comm->box_size, ddbox.box_size);
9378         }
9379         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9380                   bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
9381
9382         bBoxChanged = TRUE;
9383         bRedist     = TRUE;
9384     }
9385     /* For dim's without pbc and -gcom */
9386     copy_rvec(ddbox.box0, comm->box0    );
9387     copy_rvec(ddbox.box_size, comm->box_size);
9388
9389     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9390                       step, wcycle);
9391
9392     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9393     {
9394         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9395     }
9396
9397     /* Check if we should sort the charge groups */
9398     if (comm->nstSortCG > 0)
9399     {
9400         bSortCG = (bMasterState ||
9401                    (bRedist && (step % comm->nstSortCG == 0)));
9402     }
9403     else
9404     {
9405         bSortCG = FALSE;
9406     }
9407
9408     ncg_home_old = dd->ncg_home;
9409
9410     ncg_moved = 0;
9411     if (bRedist)
9412     {
9413         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9414
9415         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9416                            state_local, f, fr, mdatoms,
9417                            !bSortCG, nrnb, &cg0, &ncg_moved);
9418
9419         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9420     }
9421
9422     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9423                           dd, &ddbox,
9424                           &comm->cell_x0, &comm->cell_x1,
9425                           dd->ncg_home, fr->cg_cm,
9426                           cell_ns_x0, cell_ns_x1, &grid_density);
9427
9428     if (bBoxChanged)
9429     {
9430         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9431     }
9432
9433     switch (fr->cutoff_scheme)
9434     {
9435         case ecutsGROUP:
9436             copy_ivec(fr->ns.grid->n, ncells_old);
9437             grid_first(fplog, fr->ns.grid, dd, &ddbox, fr->ePBC,
9438                        state_local->box, cell_ns_x0, cell_ns_x1,
9439                        fr->rlistlong, grid_density);
9440             break;
9441         case ecutsVERLET:
9442             nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9443             break;
9444         default:
9445             gmx_incons("unimplemented");
9446     }
9447     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9448     copy_ivec(ddbox.tric_dir, comm->tric_dir);
9449
9450     if (bSortCG)
9451     {
9452         wallcycle_sub_start(wcycle, ewcsDD_GRID);
9453
9454         /* Sort the state on charge group position.
9455          * This enables exact restarts from this step.
9456          * It also improves performance by about 15% with larger numbers
9457          * of atoms per node.
9458          */
9459
9460         /* Fill the ns grid with the home cell,
9461          * so we can sort with the indices.
9462          */
9463         set_zones_ncg_home(dd);
9464
9465         switch (fr->cutoff_scheme)
9466         {
9467             case ecutsVERLET:
9468                 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9469
9470                 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9471                                   0,
9472                                   comm->zones.size[0].bb_x0,
9473                                   comm->zones.size[0].bb_x1,
9474                                   0, dd->ncg_home,
9475                                   comm->zones.dens_zone0,
9476                                   fr->cginfo,
9477                                   state_local->x,
9478                                   ncg_moved, bRedist ? comm->moved : NULL,
9479                                   fr->nbv->grp[eintLocal].kernel_type,
9480                                   fr->nbv->grp[eintLocal].nbat);
9481
9482                 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9483                 break;
9484             case ecutsGROUP:
9485                 fill_grid(fplog, &comm->zones, fr->ns.grid, dd->ncg_home,
9486                           0, dd->ncg_home, fr->cg_cm);
9487
9488                 copy_ivec(fr->ns.grid->n, ncells_new);
9489                 break;
9490             default:
9491                 gmx_incons("unimplemented");
9492         }
9493
9494         bResortAll = bMasterState;
9495
9496         /* Check if we can user the old order and ns grid cell indices
9497          * of the charge groups to sort the charge groups efficiently.
9498          */
9499         if (ncells_new[XX] != ncells_old[XX] ||
9500             ncells_new[YY] != ncells_old[YY] ||
9501             ncells_new[ZZ] != ncells_old[ZZ])
9502         {
9503             bResortAll = TRUE;
9504         }
9505
9506         if (debug)
9507         {
9508             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9509                     gmx_step_str(step, sbuf), dd->ncg_home);
9510         }
9511         dd_sort_state(dd, ir->ePBC, fr->cg_cm, fr, state_local,
9512                       bResortAll ? -1 : ncg_home_old);
9513         /* Rebuild all the indices */
9514         cg0 = 0;
9515         ga2la_clear(dd->ga2la);
9516
9517         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9518     }
9519
9520     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9521
9522     /* Setup up the communication and communicate the coordinates */
9523     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9524
9525     /* Set the indices */
9526     make_dd_indices(dd, cgs_gl->index, cg0);
9527
9528     /* Set the charge group boundaries for neighbor searching */
9529     set_cg_boundaries(&comm->zones);
9530
9531     if (fr->cutoff_scheme == ecutsVERLET)
9532     {
9533         set_zones_size(dd, state_local->box, &ddbox,
9534                        bSortCG ? 1 : 0, comm->zones.n);
9535     }
9536
9537     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9538
9539     /*
9540        write_dd_pdb("dd_home",step,"dump",top_global,cr,
9541                  -1,state_local->x,state_local->box);
9542      */
9543
9544     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9545
9546     /* Extract a local topology from the global topology */
9547     for (i = 0; i < dd->ndim; i++)
9548     {
9549         np[dd->dim[i]] = comm->cd[i].np;
9550     }
9551     dd_make_local_top(fplog, dd, &comm->zones, dd->npbcdim, state_local->box,
9552                       comm->cellsize_min, np,
9553                       fr,
9554                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
9555                       vsite, top_global, top_local);
9556
9557     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9558
9559     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9560
9561     /* Set up the special atom communication */
9562     n = comm->nat[ddnatZONE];
9563     for (i = ddnatZONE+1; i < ddnatNR; i++)
9564     {
9565         switch (i)
9566         {
9567             case ddnatVSITE:
9568                 if (vsite && vsite->n_intercg_vsite)
9569                 {
9570                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
9571                 }
9572                 break;
9573             case ddnatCON:
9574                 if (dd->bInterCGcons || dd->bInterCGsettles)
9575                 {
9576                     /* Only for inter-cg constraints we need special code */
9577                     n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9578                                                   constr, ir->nProjOrder,
9579                                                   top_local->idef.il);
9580                 }
9581                 break;
9582             default:
9583                 gmx_incons("Unknown special atom type setup");
9584         }
9585         comm->nat[i] = n;
9586     }
9587
9588     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9589
9590     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9591
9592     /* Make space for the extra coordinates for virtual site
9593      * or constraint communication.
9594      */
9595     state_local->natoms = comm->nat[ddnatNR-1];
9596     if (state_local->natoms > state_local->nalloc)
9597     {
9598         dd_realloc_state(state_local, f, state_local->natoms);
9599     }
9600
9601     if (fr->bF_NoVirSum)
9602     {
9603         if (vsite && vsite->n_intercg_vsite)
9604         {
9605             nat_f_novirsum = comm->nat[ddnatVSITE];
9606         }
9607         else
9608         {
9609             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9610             {
9611                 nat_f_novirsum = dd->nat_tot;
9612             }
9613             else
9614             {
9615                 nat_f_novirsum = dd->nat_home;
9616             }
9617         }
9618     }
9619     else
9620     {
9621         nat_f_novirsum = 0;
9622     }
9623
9624     /* Set the number of atoms required for the force calculation.
9625      * Forces need to be constrained when using a twin-range setup
9626      * or with energy minimization. For simple simulations we could
9627      * avoid some allocation, zeroing and copying, but this is
9628      * probably not worth the complications ande checking.
9629      */
9630     forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9631                         dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9632
9633     /* We make the all mdatoms up to nat_tot_con.
9634      * We could save some work by only setting invmass
9635      * between nat_tot and nat_tot_con.
9636      */
9637     /* This call also sets the new number of home particles to dd->nat_home */
9638     atoms2md(top_global, ir,
9639              comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms);
9640
9641     /* Now we have the charges we can sort the FE interactions */
9642     dd_sort_local_top(dd, mdatoms, top_local);
9643
9644     if (vsite != NULL)
9645     {
9646         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9647         split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
9648     }
9649
9650     if (shellfc)
9651     {
9652         /* Make the local shell stuff, currently no communication is done */
9653         make_local_shells(cr, mdatoms, shellfc);
9654     }
9655
9656     if (ir->implicit_solvent)
9657     {
9658         make_local_gb(cr, fr->born, ir->gb_algorithm);
9659     }
9660
9661     init_bonded_thread_force_reduction(fr, &top_local->idef);
9662
9663     if (!(cr->duty & DUTY_PME))
9664     {
9665         /* Send the charges to our PME only node */
9666         gmx_pme_send_q(cr, mdatoms->nChargePerturbed,
9667                        mdatoms->chargeA, mdatoms->chargeB,
9668                        dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9669     }
9670
9671     if (constr)
9672     {
9673         set_constraints(constr, top_local, ir, mdatoms, cr);
9674     }
9675
9676     if (ir->ePull != epullNO)
9677     {
9678         /* Update the local pull groups */
9679         dd_make_local_pull_groups(dd, ir->pull, mdatoms);
9680     }
9681
9682     if (ir->bRot)
9683     {
9684         /* Update the local rotation groups */
9685         dd_make_local_rotation_groups(dd, ir->rot);
9686     }
9687
9688
9689     add_dd_statistics(dd);
9690
9691     /* Make sure we only count the cycles for this DD partitioning */
9692     clear_dd_cycle_counts(dd);
9693
9694     /* Because the order of the atoms might have changed since
9695      * the last vsite construction, we need to communicate the constructing
9696      * atom coordinates again (for spreading the forces this MD step).
9697      */
9698     dd_move_x_vsites(dd, state_local->box, state_local->x);
9699
9700     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9701
9702     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9703     {
9704         dd_move_x(dd, state_local->box, state_local->x);
9705         write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9706                      -1, state_local->x, state_local->box);
9707     }
9708
9709     /* Store the partitioning step */
9710     comm->partition_step = step;
9711
9712     /* Increase the DD partitioning counter */
9713     dd->ddp_count++;
9714     /* The state currently matches this DD partitioning count, store it */
9715     state_local->ddp_count = dd->ddp_count;
9716     if (bMasterState)
9717     {
9718         /* The DD master node knows the complete cg distribution,
9719          * store the count so we can possibly skip the cg info communication.
9720          */
9721         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9722     }
9723
9724     if (comm->DD_debug > 0)
9725     {
9726         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9727         check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9728                                 "after partitioning");
9729     }
9730 }