src/gromacs/mdlib/domdec.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifdef HAVE_CONFIG_H
  37 #include <config.h>
  38 #endif
  39
  40 #include <stdio.h>
  41 #include <time.h>
  42 #include <math.h>
  43 #include <string.h>
  44 #include <stdlib.h>
  45 #include "typedefs.h"
  46 #include "smalloc.h"
  47 #include "gmx_fatal.h"
  48 #include "gmx_fatal_collective.h"
  49 #include "vec.h"
  50 #include "domdec.h"
  51 #include "domdec_network.h"
  52 #include "nrnb.h"
  53 #include "pbc.h"
  54 #include "chargegroup.h"
  55 #include "constr.h"
  56 #include "mdatoms.h"
  57 #include "names.h"
  58 #include "force.h"
  59 #include "pme.h"
  60 #include "pull.h"
  61 #include "pull_rotation.h"
  62 #include "mdrun.h"
  63 #include "nsgrid.h"
  64 #include "shellfc.h"
  65 #include "mtop_util.h"
  66 #include "gmx_ga2la.h"
  67 #include "macros.h"
  68 #include "nbnxn_search.h"
  69 #include "bondf.h"
  70 #include "gmx_omp_nthreads.h"
  71 #include "gpu_utils.h"
  72
  73 #include "gromacs/fileio/futil.h"
  74 #include "gromacs/fileio/gmxfio.h"
  75 #include "gromacs/fileio/pdbio.h"
  76 #include "gromacs/timing/wallcycle.h"
  77 #include "gromacs/utility/gmxmpi.h"
  78 #include "gromacs/utility/qsort_threadsafe.h"
  79
  80 #define DDRANK(dd, rank)    (rank)
  81 #define DDMASTERRANK(dd)   (dd->masterrank)
  82
  83 typedef struct gmx_domdec_master
  84 {
  85     /* The cell boundaries */
  86     real **cell_x;
  87     /* The global charge group division */
  88     int   *ncg;    /* Number of home charge groups for each node */
  89     int   *index;  /* Index of nnodes+1 into cg */
  90     int   *cg;     /* Global charge group index */
  91     int   *nat;    /* Number of home atoms for each node. */
  92     int   *ibuf;   /* Buffer for communication */
  93     rvec  *vbuf;   /* Buffer for state scattering and gathering */
  94 } gmx_domdec_master_t;
  95
  96 typedef struct
  97 {
  98     /* The numbers of charge groups to send and receive for each cell
  99      * that requires communication, the last entry contains the total
 100      * number of atoms that needs to be communicated.
 101      */
 102     int  nsend[DD_MAXIZONE+2];
 103     int  nrecv[DD_MAXIZONE+2];
 104     /* The charge groups to send */
 105     int *index;
 106     int  nalloc;
 107     /* The atom range for non-in-place communication */
 108     int  cell2at0[DD_MAXIZONE];
 109     int  cell2at1[DD_MAXIZONE];
 110 } gmx_domdec_ind_t;
 111
 112 typedef struct
 113 {
 114     int               np;       /* Number of grid pulses in this dimension */
 115     int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 116     gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 117     int               np_nalloc;
 118     gmx_bool          bInPlace; /* Can we communicate in place?            */
 119 } gmx_domdec_comm_dim_t;
 120
 121 typedef struct
 122 {
 123     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 124     real     *cell_f;      /* State var.: cell boundaries, box relative      */
 125     real     *old_cell_f;  /* Temp. var.: old cell size                      */
 126     real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 127     real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 128     real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 129     real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 130     gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 131     real     *buf_ncd;     /* Temp. var.                                     */
 132 } gmx_domdec_root_t;
 133
 134 #define DD_NLOAD_MAX 9
 135
 136 /* Here floats are accurate enough, since these variables
 137  * only influence the load balancing, not the actual MD results.
 138  */
 139 typedef struct
 140 {
 141     int    nload;
 142     float *load;
 143     float  sum;
 144     float  max;
 145     float  sum_m;
 146     float  cvol_min;
 147     float  mdf;
 148     float  pme;
 149     int    flags;
 150 } gmx_domdec_load_t;
 151
 152 typedef struct
 153 {
 154     int  nsc;
 155     int  ind_gl;
 156     int  ind;
 157 } gmx_cgsort_t;
 158
 159 typedef struct
 160 {
 161     gmx_cgsort_t *sort;
 162     gmx_cgsort_t *sort2;
 163     int           sort_nalloc;
 164     gmx_cgsort_t *sort_new;
 165     int           sort_new_nalloc;
 166     int          *ibuf;
 167     int           ibuf_nalloc;
 168 } gmx_domdec_sort_t;
 169
 170 typedef struct
 171 {
 172     rvec *v;
 173     int   nalloc;
 174 } vec_rvec_t;
 175
 176 /* This enum determines the order of the coordinates.
 177  * ddnatHOME and ddnatZONE should be first and second,
 178  * the others can be ordered as wanted.
 179  */
 180 enum {
 181     ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 182 };
 183
 184 enum {
 185     edlbAUTO, edlbNO, edlbYES, edlbNR
 186 };
 187 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 188
 189 typedef struct
 190 {
 191     int      dim;       /* The dimension                                          */
 192     gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 193     int      nslab;     /* The number of PME slabs in this dimension              */
 194     real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 195     int     *pp_min;    /* The minimum pp node location, size nslab               */
 196     int     *pp_max;    /* The maximum pp node location,size nslab                */
 197     int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 198 } gmx_ddpme_t;
 199
 200 typedef struct
 201 {
 202     real min0;    /* The minimum bottom of this zone                        */
 203     real max1;    /* The maximum top of this zone                           */
 204     real min1;    /* The minimum top of this zone                           */
 205     real mch0;    /* The maximum bottom communicaton height for this zone   */
 206     real mch1;    /* The maximum top communicaton height for this zone      */
 207     real p1_0;    /* The bottom value of the first cell in this zone        */
 208     real p1_1;    /* The top value of the first cell in this zone           */
 209 } gmx_ddzone_t;
 210
 211 typedef struct
 212 {
 213     gmx_domdec_ind_t ind;
 214     int             *ibuf;
 215     int              ibuf_nalloc;
 216     vec_rvec_t       vbuf;
 217     int              nsend;
 218     int              nat;
 219     int              nsend_zone;
 220 } dd_comm_setup_work_t;
 221
 222 typedef struct gmx_domdec_comm
 223 {
 224     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 225      * unless stated otherwise.
 226      */
 227
 228     /* The number of decomposition dimensions for PME, 0: no PME */
 229     int         npmedecompdim;
 230     /* The number of nodes doing PME (PP/PME or only PME) */
 231     int         npmenodes;
 232     int         npmenodes_x;
 233     int         npmenodes_y;
 234     /* The communication setup including the PME only nodes */
 235     gmx_bool    bCartesianPP_PME;
 236     ivec        ntot;
 237     int         cartpmedim;
 238     int        *pmenodes;          /* size npmenodes                         */
 239     int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 240                                     * but with bCartesianPP_PME              */
 241     gmx_ddpme_t ddpme[2];
 242
 243     /* The DD particle-particle nodes only */
 244     gmx_bool bCartesianPP;
 245     int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 246
 247     /* The global charge groups */
 248     t_block cgs_gl;
 249
 250     /* Should we sort the cgs */
 251     int                nstSortCG;
 252     gmx_domdec_sort_t *sort;
 253
 254     /* Are there charge groups? */
 255     gmx_bool bCGs;
 256
 257     /* Are there bonded and multi-body interactions between charge groups? */
 258     gmx_bool bInterCGBondeds;
 259     gmx_bool bInterCGMultiBody;
 260
 261     /* Data for the optional bonded interaction atom communication range */
 262     gmx_bool  bBondComm;
 263     t_blocka *cglink;
 264     char     *bLocalCG;
 265
 266     /* The DLB option */
 267     int      eDLB;
 268     /* Are we actually using DLB? */
 269     gmx_bool bDynLoadBal;
 270
 271     /* Cell sizes for static load balancing, first index cartesian */
 272     real **slb_frac;
 273
 274     /* The width of the communicated boundaries */
 275     real     cutoff_mbody;
 276     real     cutoff;
 277     /* The minimum cell size (including triclinic correction) */
 278     rvec     cellsize_min;
 279     /* For dlb, for use with edlbAUTO */
 280     rvec     cellsize_min_dlb;
 281     /* The lower limit for the DD cell size with DLB */
 282     real     cellsize_limit;
 283     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 284     gmx_bool bVacDLBNoLimit;
 285
 286     /* With PME load balancing we set limits on DLB */
 287     gmx_bool bPMELoadBalDLBLimits;
 288     /* DLB needs to take into account that we want to allow this maximum
 289      * cut-off (for PME load balancing), this could limit cell boundaries.
 290      */
 291     real PMELoadBal_max_cutoff;
 292
 293     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 294     ivec tric_dir;
 295     /* box0 and box_size are required with dim's without pbc and -gcom */
 296     rvec box0;
 297     rvec box_size;
 298
 299     /* The cell boundaries */
 300     rvec cell_x0;
 301     rvec cell_x1;
 302
 303     /* The old location of the cell boundaries, to check cg displacements */
 304     rvec old_cell_x0;
 305     rvec old_cell_x1;
 306
 307     /* The communication setup and charge group boundaries for the zones */
 308     gmx_domdec_zones_t zones;
 309
 310     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 311      * cell boundaries of neighboring cells for dynamic load balancing.
 312      */
 313     gmx_ddzone_t zone_d1[2];
 314     gmx_ddzone_t zone_d2[2][2];
 315
 316     /* The coordinate/force communication setup and indices */
 317     gmx_domdec_comm_dim_t cd[DIM];
 318     /* The maximum number of cells to communicate with in one dimension */
 319     int                   maxpulse;
 320
 321     /* Which cg distribution is stored on the master node */
 322     int master_cg_ddp_count;
 323
 324     /* The number of cg's received from the direct neighbors */
 325     int  zone_ncg1[DD_MAXZONE];
 326
 327     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 328     int  nat[ddnatNR];
 329
 330     /* Array for signalling if atoms have moved to another domain */
 331     int  *moved;
 332     int   moved_nalloc;
 333
 334     /* Communication buffer for general use */
 335     int  *buf_int;
 336     int   nalloc_int;
 337
 338     /* Communication buffer for general use */
 339     vec_rvec_t vbuf;
 340
 341     /* Temporary storage for thread parallel communication setup */
 342     int                   nth;
 343     dd_comm_setup_work_t *dth;
 344
 345     /* Communication buffers only used with multiple grid pulses */
 346     int       *buf_int2;
 347     int        nalloc_int2;
 348     vec_rvec_t vbuf2;
 349
 350     /* Communication buffers for local redistribution */
 351     int  **cggl_flag;
 352     int    cggl_flag_nalloc[DIM*2];
 353     rvec **cgcm_state;
 354     int    cgcm_state_nalloc[DIM*2];
 355
 356     /* Cell sizes for dynamic load balancing */
 357     gmx_domdec_root_t **root;
 358     real               *cell_f_row;
 359     real                cell_f0[DIM];
 360     real                cell_f1[DIM];
 361     real                cell_f_max0[DIM];
 362     real                cell_f_min1[DIM];
 363
 364     /* Stuff for load communication */
 365     gmx_bool           bRecordLoad;
 366     gmx_domdec_load_t *load;
 367     int                nrank_gpu_shared;
 368 #ifdef GMX_MPI
 369     MPI_Comm          *mpi_comm_load;
 370     MPI_Comm           mpi_comm_gpu_shared;
 371 #endif
 372
 373     /* Maximum DLB scaling per load balancing step in percent */
 374     int dlb_scale_lim;
 375
 376     /* Cycle counters */
 377     float  cycl[ddCyclNr];
 378     int    cycl_n[ddCyclNr];
 379     float  cycl_max[ddCyclNr];
 380     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 381     int    eFlop;
 382     double flop;
 383     int    flop_n;
 384     /* Have often have did we have load measurements */
 385     int    n_load_have;
 386     /* Have often have we collected the load measurements */
 387     int    n_load_collect;
 388
 389     /* Statistics */
 390     double sum_nat[ddnatNR-ddnatZONE];
 391     int    ndecomp;
 392     int    nload;
 393     double load_step;
 394     double load_sum;
 395     double load_max;
 396     ivec   load_lim;
 397     double load_mdf;
 398     double load_pme;
 399
 400     /* The last partition step */
 401     gmx_int64_t partition_step;
 402
 403     /* Debugging */
 404     int  nstDDDump;
 405     int  nstDDDumpGrid;
 406     int  DD_debug;
 407 } gmx_domdec_comm_t;
 408
 409 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 410 #define DD_CGIBS 2
 411
 412 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 413 #define DD_FLAG_NRCG  65535
 414 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 415 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 416
 417 /* Zone permutation required to obtain consecutive charge groups
 418  * for neighbor searching.
 419  */
 420 static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 421
 422 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 423  * components see only j zones with that component 0.
 424  */
 425
 426 /* The DD zone order */
 427 static const ivec dd_zo[DD_MAXZONE] =
 428 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 429
 430 /* The 3D setup */
 431 #define dd_z3n  8
 432 #define dd_zp3n 4
 433 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 434
 435 /* The 2D setup */
 436 #define dd_z2n  4
 437 #define dd_zp2n 2
 438 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 439
 440 /* The 1D setup */
 441 #define dd_z1n  2
 442 #define dd_zp1n 1
 443 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 444
 445 /* Factors used to avoid problems due to rounding issues */
 446 #define DD_CELL_MARGIN       1.0001
 447 #define DD_CELL_MARGIN2      1.00005
 448 /* Factor to account for pressure scaling during nstlist steps */
 449 #define DD_PRES_SCALE_MARGIN 1.02
 450
 451 /* Allowed performance loss before we DLB or warn */
 452 #define DD_PERF_LOSS 0.05
 453
 454 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 455
 456 /* Use separate MPI send and receive commands
 457  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 458  * This saves memory (and some copying for small nnodes).
 459  * For high parallelization scatter and gather calls are used.
 460  */
 461 #define GMX_DD_NNODES_SENDRECV 4
 462
 463
 464 /*
 465    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 466
 467    static void index2xyz(ivec nc,int ind,ivec xyz)
 468    {
 469    xyz[XX] = ind % nc[XX];
 470    xyz[YY] = (ind / nc[XX]) % nc[YY];
 471    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 472    }
 473  */
 474
 475 /* This order is required to minimize the coordinate communication in PME
 476  * which uses decomposition in the x direction.
 477  */
 478 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 479
 480 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 481 {
 482     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 483     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 484     xyz[ZZ] = ind % nc[ZZ];
 485 }
 486
 487 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 488 {
 489     int ddindex;
 490     int ddnodeid = -1;
 491
 492     ddindex = dd_index(dd->nc, c);
 493     if (dd->comm->bCartesianPP_PME)
 494     {
 495         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 496     }
 497     else if (dd->comm->bCartesianPP)
 498     {
 499 #ifdef GMX_MPI
 500         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 501 #endif
 502     }
 503     else
 504     {
 505         ddnodeid = ddindex;
 506     }
 507
 508     return ddnodeid;
 509 }
 510
 511 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 512 {
 513     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 514 }
 515
 516 int ddglatnr(gmx_domdec_t *dd, int i)
 517 {
 518     int atnr;
 519
 520     if (dd == NULL)
 521     {
 522         atnr = i + 1;
 523     }
 524     else
 525     {
 526         if (i >= dd->comm->nat[ddnatNR-1])
 527         {
 528             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 529         }
 530         atnr = dd->gatindex[i] + 1;
 531     }
 532
 533     return atnr;
 534 }
 535
 536 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 537 {
 538     return &dd->comm->cgs_gl;
 539 }
 540
 541 static void vec_rvec_init(vec_rvec_t *v)
 542 {
 543     v->nalloc = 0;
 544     v->v      = NULL;
 545 }
 546
 547 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 548 {
 549     if (n > v->nalloc)
 550     {
 551         v->nalloc = over_alloc_dd(n);
 552         srenew(v->v, v->nalloc);
 553     }
 554 }
 555
 556 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 557 {
 558     int i;
 559
 560     if (state->ddp_count != dd->ddp_count)
 561     {
 562         gmx_incons("The state does not the domain decomposition state");
 563     }
 564
 565     state->ncg_gl = dd->ncg_home;
 566     if (state->ncg_gl > state->cg_gl_nalloc)
 567     {
 568         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 569         srenew(state->cg_gl, state->cg_gl_nalloc);
 570     }
 571     for (i = 0; i < state->ncg_gl; i++)
 572     {
 573         state->cg_gl[i] = dd->index_gl[i];
 574     }
 575
 576     state->ddp_count_cg_gl = dd->ddp_count;
 577 }
 578
 579 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 580 {
 581     return &dd->comm->zones;
 582 }
 583
 584 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 585                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 586 {
 587     gmx_domdec_zones_t *zones;
 588     int                 izone, d, dim;
 589
 590     zones = &dd->comm->zones;
 591
 592     izone = 0;
 593     while (icg >= zones->izone[izone].cg1)
 594     {
 595         izone++;
 596     }
 597
 598     if (izone == 0)
 599     {
 600         *jcg0 = icg;
 601     }
 602     else if (izone < zones->nizone)
 603     {
 604         *jcg0 = zones->izone[izone].jcg0;
 605     }
 606     else
 607     {
 608         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 609                   icg, izone, zones->nizone);
 610     }
 611
 612     *jcg1 = zones->izone[izone].jcg1;
 613
 614     for (d = 0; d < dd->ndim; d++)
 615     {
 616         dim         = dd->dim[d];
 617         shift0[dim] = zones->izone[izone].shift0[dim];
 618         shift1[dim] = zones->izone[izone].shift1[dim];
 619         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 620         {
 621             /* A conservative approach, this can be optimized */
 622             shift0[dim] -= 1;
 623             shift1[dim] += 1;
 624         }
 625     }
 626 }
 627
 628 int dd_natoms_vsite(gmx_domdec_t *dd)
 629 {
 630     return dd->comm->nat[ddnatVSITE];
 631 }
 632
 633 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 634 {
 635     *at_start = dd->comm->nat[ddnatCON-1];
 636     *at_end   = dd->comm->nat[ddnatCON];
 637 }
 638
 639 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 640 {
 641     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 642     int                   *index, *cgindex;
 643     gmx_domdec_comm_t     *comm;
 644     gmx_domdec_comm_dim_t *cd;
 645     gmx_domdec_ind_t      *ind;
 646     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 647     gmx_bool               bPBC, bScrew;
 648
 649     comm = dd->comm;
 650
 651     cgindex = dd->cgindex;
 652
 653     buf = comm->vbuf.v;
 654
 655     nzone   = 1;
 656     nat_tot = dd->nat_home;
 657     for (d = 0; d < dd->ndim; d++)
 658     {
 659         bPBC   = (dd->ci[dd->dim[d]] == 0);
 660         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 661         if (bPBC)
 662         {
 663             copy_rvec(box[dd->dim[d]], shift);
 664         }
 665         cd = &comm->cd[d];
 666         for (p = 0; p < cd->np; p++)
 667         {
 668             ind   = &cd->ind[p];
 669             index = ind->index;
 670             n     = 0;
 671             if (!bPBC)
 672             {
 673                 for (i = 0; i < ind->nsend[nzone]; i++)
 674                 {
 675                     at0 = cgindex[index[i]];
 676                     at1 = cgindex[index[i]+1];
 677                     for (j = at0; j < at1; j++)
 678                     {
 679                         copy_rvec(x[j], buf[n]);
 680                         n++;
 681                     }
 682                 }
 683             }
 684             else if (!bScrew)
 685             {
 686                 for (i = 0; i < ind->nsend[nzone]; i++)
 687                 {
 688                     at0 = cgindex[index[i]];
 689                     at1 = cgindex[index[i]+1];
 690                     for (j = at0; j < at1; j++)
 691                     {
 692                         /* We need to shift the coordinates */
 693                         rvec_add(x[j], shift, buf[n]);
 694                         n++;
 695                     }
 696                 }
 697             }
 698             else
 699             {
 700                 for (i = 0; i < ind->nsend[nzone]; i++)
 701                 {
 702                     at0 = cgindex[index[i]];
 703                     at1 = cgindex[index[i]+1];
 704                     for (j = at0; j < at1; j++)
 705                     {
 706                         /* Shift x */
 707                         buf[n][XX] = x[j][XX] + shift[XX];
 708                         /* Rotate y and z.
 709                          * This operation requires a special shift force
 710                          * treatment, which is performed in calc_vir.
 711                          */
 712                         buf[n][YY] = box[YY][YY] - x[j][YY];
 713                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 714                         n++;
 715                     }
 716                 }
 717             }
 718
 719             if (cd->bInPlace)
 720             {
 721                 rbuf = x + nat_tot;
 722             }
 723             else
 724             {
 725                 rbuf = comm->vbuf2.v;
 726             }
 727             /* Send and receive the coordinates */
 728             dd_sendrecv_rvec(dd, d, dddirBackward,
 729                              buf,  ind->nsend[nzone+1],
 730                              rbuf, ind->nrecv[nzone+1]);
 731             if (!cd->bInPlace)
 732             {
 733                 j = 0;
 734                 for (zone = 0; zone < nzone; zone++)
 735                 {
 736                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 737                     {
 738                         copy_rvec(rbuf[j], x[i]);
 739                         j++;
 740                     }
 741                 }
 742             }
 743             nat_tot += ind->nrecv[nzone+1];
 744         }
 745         nzone += nzone;
 746     }
 747 }
 748
 749 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 750 {
 751     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 752     int                   *index, *cgindex;
 753     gmx_domdec_comm_t     *comm;
 754     gmx_domdec_comm_dim_t *cd;
 755     gmx_domdec_ind_t      *ind;
 756     rvec                  *buf, *sbuf;
 757     ivec                   vis;
 758     int                    is;
 759     gmx_bool               bPBC, bScrew;
 760
 761     comm = dd->comm;
 762
 763     cgindex = dd->cgindex;
 764
 765     buf = comm->vbuf.v;
 766
 767     n       = 0;
 768     nzone   = comm->zones.n/2;
 769     nat_tot = dd->nat_tot;
 770     for (d = dd->ndim-1; d >= 0; d--)
 771     {
 772         bPBC   = (dd->ci[dd->dim[d]] == 0);
 773         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 774         if (fshift == NULL && !bScrew)
 775         {
 776             bPBC = FALSE;
 777         }
 778         /* Determine which shift vector we need */
 779         clear_ivec(vis);
 780         vis[dd->dim[d]] = 1;
 781         is              = IVEC2IS(vis);
 782
 783         cd = &comm->cd[d];
 784         for (p = cd->np-1; p >= 0; p--)
 785         {
 786             ind      = &cd->ind[p];
 787             nat_tot -= ind->nrecv[nzone+1];
 788             if (cd->bInPlace)
 789             {
 790                 sbuf = f + nat_tot;
 791             }
 792             else
 793             {
 794                 sbuf = comm->vbuf2.v;
 795                 j    = 0;
 796                 for (zone = 0; zone < nzone; zone++)
 797                 {
 798                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 799                     {
 800                         copy_rvec(f[i], sbuf[j]);
 801                         j++;
 802                     }
 803                 }
 804             }
 805             /* Communicate the forces */
 806             dd_sendrecv_rvec(dd, d, dddirForward,
 807                              sbuf, ind->nrecv[nzone+1],
 808                              buf,  ind->nsend[nzone+1]);
 809             index = ind->index;
 810             /* Add the received forces */
 811             n = 0;
 812             if (!bPBC)
 813             {
 814                 for (i = 0; i < ind->nsend[nzone]; i++)
 815                 {
 816                     at0 = cgindex[index[i]];
 817                     at1 = cgindex[index[i]+1];
 818                     for (j = at0; j < at1; j++)
 819                     {
 820                         rvec_inc(f[j], buf[n]);
 821                         n++;
 822                     }
 823                 }
 824             }
 825             else if (!bScrew)
 826             {
 827                 for (i = 0; i < ind->nsend[nzone]; i++)
 828                 {
 829                     at0 = cgindex[index[i]];
 830                     at1 = cgindex[index[i]+1];
 831                     for (j = at0; j < at1; j++)
 832                     {
 833                         rvec_inc(f[j], buf[n]);
 834                         /* Add this force to the shift force */
 835                         rvec_inc(fshift[is], buf[n]);
 836                         n++;
 837                     }
 838                 }
 839             }
 840             else
 841             {
 842                 for (i = 0; i < ind->nsend[nzone]; i++)
 843                 {
 844                     at0 = cgindex[index[i]];
 845                     at1 = cgindex[index[i]+1];
 846                     for (j = at0; j < at1; j++)
 847                     {
 848                         /* Rotate the force */
 849                         f[j][XX] += buf[n][XX];
 850                         f[j][YY] -= buf[n][YY];
 851                         f[j][ZZ] -= buf[n][ZZ];
 852                         if (fshift)
 853                         {
 854                             /* Add this force to the shift force */
 855                             rvec_inc(fshift[is], buf[n]);
 856                         }
 857                         n++;
 858                     }
 859                 }
 860             }
 861         }
 862         nzone /= 2;
 863     }
 864 }
 865
 866 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 867 {
 868     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 869     int                   *index, *cgindex;
 870     gmx_domdec_comm_t     *comm;
 871     gmx_domdec_comm_dim_t *cd;
 872     gmx_domdec_ind_t      *ind;
 873     real                  *buf, *rbuf;
 874
 875     comm = dd->comm;
 876
 877     cgindex = dd->cgindex;
 878
 879     buf = &comm->vbuf.v[0][0];
 880
 881     nzone   = 1;
 882     nat_tot = dd->nat_home;
 883     for (d = 0; d < dd->ndim; d++)
 884     {
 885         cd = &comm->cd[d];
 886         for (p = 0; p < cd->np; p++)
 887         {
 888             ind   = &cd->ind[p];
 889             index = ind->index;
 890             n     = 0;
 891             for (i = 0; i < ind->nsend[nzone]; i++)
 892             {
 893                 at0 = cgindex[index[i]];
 894                 at1 = cgindex[index[i]+1];
 895                 for (j = at0; j < at1; j++)
 896                 {
 897                     buf[n] = v[j];
 898                     n++;
 899                 }
 900             }
 901
 902             if (cd->bInPlace)
 903             {
 904                 rbuf = v + nat_tot;
 905             }
 906             else
 907             {
 908                 rbuf = &comm->vbuf2.v[0][0];
 909             }
 910             /* Send and receive the coordinates */
 911             dd_sendrecv_real(dd, d, dddirBackward,
 912                              buf,  ind->nsend[nzone+1],
 913                              rbuf, ind->nrecv[nzone+1]);
 914             if (!cd->bInPlace)
 915             {
 916                 j = 0;
 917                 for (zone = 0; zone < nzone; zone++)
 918                 {
 919                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 920                     {
 921                         v[i] = rbuf[j];
 922                         j++;
 923                     }
 924                 }
 925             }
 926             nat_tot += ind->nrecv[nzone+1];
 927         }
 928         nzone += nzone;
 929     }
 930 }
 931
 932 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 933 {
 934     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 935     int                   *index, *cgindex;
 936     gmx_domdec_comm_t     *comm;
 937     gmx_domdec_comm_dim_t *cd;
 938     gmx_domdec_ind_t      *ind;
 939     real                  *buf, *sbuf;
 940
 941     comm = dd->comm;
 942
 943     cgindex = dd->cgindex;
 944
 945     buf = &comm->vbuf.v[0][0];
 946
 947     n       = 0;
 948     nzone   = comm->zones.n/2;
 949     nat_tot = dd->nat_tot;
 950     for (d = dd->ndim-1; d >= 0; d--)
 951     {
 952         cd = &comm->cd[d];
 953         for (p = cd->np-1; p >= 0; p--)
 954         {
 955             ind      = &cd->ind[p];
 956             nat_tot -= ind->nrecv[nzone+1];
 957             if (cd->bInPlace)
 958             {
 959                 sbuf = v + nat_tot;
 960             }
 961             else
 962             {
 963                 sbuf = &comm->vbuf2.v[0][0];
 964                 j    = 0;
 965                 for (zone = 0; zone < nzone; zone++)
 966                 {
 967                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 968                     {
 969                         sbuf[j] = v[i];
 970                         j++;
 971                     }
 972                 }
 973             }
 974             /* Communicate the forces */
 975             dd_sendrecv_real(dd, d, dddirForward,
 976                              sbuf, ind->nrecv[nzone+1],
 977                              buf,  ind->nsend[nzone+1]);
 978             index = ind->index;
 979             /* Add the received forces */
 980             n = 0;
 981             for (i = 0; i < ind->nsend[nzone]; i++)
 982             {
 983                 at0 = cgindex[index[i]];
 984                 at1 = cgindex[index[i]+1];
 985                 for (j = at0; j < at1; j++)
 986                 {
 987                     v[j] += buf[n];
 988                     n++;
 989                 }
 990             }
 991         }
 992         nzone /= 2;
 993     }
 994 }
 995
 996 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 997 {
 998     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
 999             d, i, j,
1000             zone->min0, zone->max1,
1001             zone->mch0, zone->mch0,
1002             zone->p1_0, zone->p1_1);
1003 }
1004
1005
1006 #define DDZONECOMM_MAXZONE  5
1007 #define DDZONECOMM_BUFSIZE  3
1008
1009 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
1010                                int ddimind, int direction,
1011                                gmx_ddzone_t *buf_s, int n_s,
1012                                gmx_ddzone_t *buf_r, int n_r)
1013 {
1014 #define ZBS  DDZONECOMM_BUFSIZE
1015     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1016     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1017     int  i;
1018
1019     for (i = 0; i < n_s; i++)
1020     {
1021         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1022         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1023         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1024         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1025         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1026         vbuf_s[i*ZBS+1][2] = 0;
1027         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1028         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1029         vbuf_s[i*ZBS+2][2] = 0;
1030     }
1031
1032     dd_sendrecv_rvec(dd, ddimind, direction,
1033                      vbuf_s, n_s*ZBS,
1034                      vbuf_r, n_r*ZBS);
1035
1036     for (i = 0; i < n_r; i++)
1037     {
1038         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1039         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1040         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1041         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1042         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1043         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1044         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1045     }
1046
1047 #undef ZBS
1048 }
1049
1050 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
1051                           rvec cell_ns_x0, rvec cell_ns_x1)
1052 {
1053     int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
1054     gmx_ddzone_t      *zp;
1055     gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
1056     gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
1057     gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
1058     rvec               extr_s[2], extr_r[2];
1059     rvec               dh;
1060     real               dist_d, c = 0, det;
1061     gmx_domdec_comm_t *comm;
1062     gmx_bool           bPBC, bUse;
1063
1064     comm = dd->comm;
1065
1066     for (d = 1; d < dd->ndim; d++)
1067     {
1068         dim      = dd->dim[d];
1069         zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1070         zp->min0 = cell_ns_x0[dim];
1071         zp->max1 = cell_ns_x1[dim];
1072         zp->min1 = cell_ns_x1[dim];
1073         zp->mch0 = cell_ns_x0[dim];
1074         zp->mch1 = cell_ns_x1[dim];
1075         zp->p1_0 = cell_ns_x0[dim];
1076         zp->p1_1 = cell_ns_x1[dim];
1077     }
1078
1079     for (d = dd->ndim-2; d >= 0; d--)
1080     {
1081         dim  = dd->dim[d];
1082         bPBC = (dim < ddbox->npbcdim);
1083
1084         /* Use an rvec to store two reals */
1085         extr_s[d][0] = comm->cell_f0[d+1];
1086         extr_s[d][1] = comm->cell_f1[d+1];
1087         extr_s[d][2] = comm->cell_f1[d+1];
1088
1089         pos = 0;
1090         /* Store the extremes in the backward sending buffer,
1091          * so the get updated separately from the forward communication.
1092          */
1093         for (d1 = d; d1 < dd->ndim-1; d1++)
1094         {
1095             /* We invert the order to be able to use the same loop for buf_e */
1096             buf_s[pos].min0 = extr_s[d1][1];
1097             buf_s[pos].max1 = extr_s[d1][0];
1098             buf_s[pos].min1 = extr_s[d1][2];
1099             buf_s[pos].mch0 = 0;
1100             buf_s[pos].mch1 = 0;
1101             /* Store the cell corner of the dimension we communicate along */
1102             buf_s[pos].p1_0 = comm->cell_x0[dim];
1103             buf_s[pos].p1_1 = 0;
1104             pos++;
1105         }
1106
1107         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1108         pos++;
1109
1110         if (dd->ndim == 3 && d == 0)
1111         {
1112             buf_s[pos] = comm->zone_d2[0][1];
1113             pos++;
1114             buf_s[pos] = comm->zone_d1[0];
1115             pos++;
1116         }
1117
1118         /* We only need to communicate the extremes
1119          * in the forward direction
1120          */
1121         npulse = comm->cd[d].np;
1122         if (bPBC)
1123         {
1124             /* Take the minimum to avoid double communication */
1125             npulse_min = min(npulse, dd->nc[dim]-1-npulse);
1126         }
1127         else
1128         {
1129             /* Without PBC we should really not communicate over
1130              * the boundaries, but implementing that complicates
1131              * the communication setup and therefore we simply
1132              * do all communication, but ignore some data.
1133              */
1134             npulse_min = npulse;
1135         }
1136         for (p = 0; p < npulse_min; p++)
1137         {
1138             /* Communicate the extremes forward */
1139             bUse = (bPBC || dd->ci[dim] > 0);
1140
1141             dd_sendrecv_rvec(dd, d, dddirForward,
1142                              extr_s+d, dd->ndim-d-1,
1143                              extr_r+d, dd->ndim-d-1);
1144
1145             if (bUse)
1146             {
1147                 for (d1 = d; d1 < dd->ndim-1; d1++)
1148                 {
1149                     extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
1150                     extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
1151                     extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
1152                 }
1153             }
1154         }
1155
1156         buf_size = pos;
1157         for (p = 0; p < npulse; p++)
1158         {
1159             /* Communicate all the zone information backward */
1160             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1161
1162             dd_sendrecv_ddzone(dd, d, dddirBackward,
1163                                buf_s, buf_size,
1164                                buf_r, buf_size);
1165
1166             clear_rvec(dh);
1167             if (p > 0)
1168             {
1169                 for (d1 = d+1; d1 < dd->ndim; d1++)
1170                 {
1171                     /* Determine the decrease of maximum required
1172                      * communication height along d1 due to the distance along d,
1173                      * this avoids a lot of useless atom communication.
1174                      */
1175                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1176
1177                     if (ddbox->tric_dir[dim])
1178                     {
1179                         /* c is the off-diagonal coupling between the cell planes
1180                          * along directions d and d1.
1181                          */
1182                         c = ddbox->v[dim][dd->dim[d1]][dim];
1183                     }
1184                     else
1185                     {
1186                         c = 0;
1187                     }
1188                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1189                     if (det > 0)
1190                     {
1191                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1192                     }
1193                     else
1194                     {
1195                         /* A negative value signals out of range */
1196                         dh[d1] = -1;
1197                     }
1198                 }
1199             }
1200
1201             /* Accumulate the extremes over all pulses */
1202             for (i = 0; i < buf_size; i++)
1203             {
1204                 if (p == 0)
1205                 {
1206                     buf_e[i] = buf_r[i];
1207                 }
1208                 else
1209                 {
1210                     if (bUse)
1211                     {
1212                         buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
1213                         buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
1214                         buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
1215                     }
1216
1217                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1218                     {
1219                         d1 = 1;
1220                     }
1221                     else
1222                     {
1223                         d1 = d + 1;
1224                     }
1225                     if (bUse && dh[d1] >= 0)
1226                     {
1227                         buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
1228                         buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
1229                     }
1230                 }
1231                 /* Copy the received buffer to the send buffer,
1232                  * to pass the data through with the next pulse.
1233                  */
1234                 buf_s[i] = buf_r[i];
1235             }
1236             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1237                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1238             {
1239                 /* Store the extremes */
1240                 pos = 0;
1241
1242                 for (d1 = d; d1 < dd->ndim-1; d1++)
1243                 {
1244                     extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
1245                     extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
1246                     extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
1247                     pos++;
1248                 }
1249
1250                 if (d == 1 || (d == 0 && dd->ndim == 3))
1251                 {
1252                     for (i = d; i < 2; i++)
1253                     {
1254                         comm->zone_d2[1-d][i] = buf_e[pos];
1255                         pos++;
1256                     }
1257                 }
1258                 if (d == 0)
1259                 {
1260                     comm->zone_d1[1] = buf_e[pos];
1261                     pos++;
1262                 }
1263             }
1264         }
1265     }
1266
1267     if (dd->ndim >= 2)
1268     {
1269         dim = dd->dim[1];
1270         for (i = 0; i < 2; i++)
1271         {
1272             if (debug)
1273             {
1274                 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1275             }
1276             cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1277             cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1278         }
1279     }
1280     if (dd->ndim >= 3)
1281     {
1282         dim = dd->dim[2];
1283         for (i = 0; i < 2; i++)
1284         {
1285             for (j = 0; j < 2; j++)
1286             {
1287                 if (debug)
1288                 {
1289                     print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1290                 }
1291                 cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1292                 cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1293             }
1294         }
1295     }
1296     for (d = 1; d < dd->ndim; d++)
1297     {
1298         comm->cell_f_max0[d] = extr_s[d-1][0];
1299         comm->cell_f_min1[d] = extr_s[d-1][1];
1300         if (debug)
1301         {
1302             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1303                     d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1304         }
1305     }
1306 }
1307
1308 static void dd_collect_cg(gmx_domdec_t *dd,
1309                           t_state      *state_local)
1310 {
1311     gmx_domdec_master_t *ma = NULL;
1312     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1313     t_block             *cgs_gl;
1314
1315     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1316     {
1317         /* The master has the correct distribution */
1318         return;
1319     }
1320
1321     if (state_local->ddp_count == dd->ddp_count)
1322     {
1323         ncg_home = dd->ncg_home;
1324         cg       = dd->index_gl;
1325         nat_home = dd->nat_home;
1326     }
1327     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1328     {
1329         cgs_gl = &dd->comm->cgs_gl;
1330
1331         ncg_home = state_local->ncg_gl;
1332         cg       = state_local->cg_gl;
1333         nat_home = 0;
1334         for (i = 0; i < ncg_home; i++)
1335         {
1336             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1337         }
1338     }
1339     else
1340     {
1341         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1342     }
1343
1344     buf2[0] = dd->ncg_home;
1345     buf2[1] = dd->nat_home;
1346     if (DDMASTER(dd))
1347     {
1348         ma   = dd->ma;
1349         ibuf = ma->ibuf;
1350     }
1351     else
1352     {
1353         ibuf = NULL;
1354     }
1355     /* Collect the charge group and atom counts on the master */
1356     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1357
1358     if (DDMASTER(dd))
1359     {
1360         ma->index[0] = 0;
1361         for (i = 0; i < dd->nnodes; i++)
1362         {
1363             ma->ncg[i]     = ma->ibuf[2*i];
1364             ma->nat[i]     = ma->ibuf[2*i+1];
1365             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1366
1367         }
1368         /* Make byte counts and indices */
1369         for (i = 0; i < dd->nnodes; i++)
1370         {
1371             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1372             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1373         }
1374         if (debug)
1375         {
1376             fprintf(debug, "Initial charge group distribution: ");
1377             for (i = 0; i < dd->nnodes; i++)
1378             {
1379                 fprintf(debug, " %d", ma->ncg[i]);
1380             }
1381             fprintf(debug, "\n");
1382         }
1383     }
1384
1385     /* Collect the charge group indices on the master */
1386     dd_gatherv(dd,
1387                dd->ncg_home*sizeof(int), dd->index_gl,
1388                DDMASTER(dd) ? ma->ibuf : NULL,
1389                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1390                DDMASTER(dd) ? ma->cg : NULL);
1391
1392     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1393 }
1394
1395 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1396                                     rvec *lv, rvec *v)
1397 {
1398     gmx_domdec_master_t *ma;
1399     int                  n, i, c, a, nalloc = 0;
1400     rvec                *buf = NULL;
1401     t_block             *cgs_gl;
1402
1403     ma = dd->ma;
1404
1405     if (!DDMASTER(dd))
1406     {
1407 #ifdef GMX_MPI
1408         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1409                  dd->rank, dd->mpi_comm_all);
1410 #endif
1411     }
1412     else
1413     {
1414         /* Copy the master coordinates to the global array */
1415         cgs_gl = &dd->comm->cgs_gl;
1416
1417         n = DDMASTERRANK(dd);
1418         a = 0;
1419         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1420         {
1421             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1422             {
1423                 copy_rvec(lv[a++], v[c]);
1424             }
1425         }
1426
1427         for (n = 0; n < dd->nnodes; n++)
1428         {
1429             if (n != dd->rank)
1430             {
1431                 if (ma->nat[n] > nalloc)
1432                 {
1433                     nalloc = over_alloc_dd(ma->nat[n]);
1434                     srenew(buf, nalloc);
1435                 }
1436 #ifdef GMX_MPI
1437                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1438                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1439 #endif
1440                 a = 0;
1441                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1442                 {
1443                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1444                     {
1445                         copy_rvec(buf[a++], v[c]);
1446                     }
1447                 }
1448             }
1449         }
1450         sfree(buf);
1451     }
1452 }
1453
1454 static void get_commbuffer_counts(gmx_domdec_t *dd,
1455                                   int **counts, int **disps)
1456 {
1457     gmx_domdec_master_t *ma;
1458     int                  n;
1459
1460     ma = dd->ma;
1461
1462     /* Make the rvec count and displacment arrays */
1463     *counts  = ma->ibuf;
1464     *disps   = ma->ibuf + dd->nnodes;
1465     for (n = 0; n < dd->nnodes; n++)
1466     {
1467         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1468         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1469     }
1470 }
1471
1472 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1473                                    rvec *lv, rvec *v)
1474 {
1475     gmx_domdec_master_t *ma;
1476     int                 *rcounts = NULL, *disps = NULL;
1477     int                  n, i, c, a;
1478     rvec                *buf = NULL;
1479     t_block             *cgs_gl;
1480
1481     ma = dd->ma;
1482
1483     if (DDMASTER(dd))
1484     {
1485         get_commbuffer_counts(dd, &rcounts, &disps);
1486
1487         buf = ma->vbuf;
1488     }
1489
1490     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1491
1492     if (DDMASTER(dd))
1493     {
1494         cgs_gl = &dd->comm->cgs_gl;
1495
1496         a = 0;
1497         for (n = 0; n < dd->nnodes; n++)
1498         {
1499             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1500             {
1501                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1502                 {
1503                     copy_rvec(buf[a++], v[c]);
1504                 }
1505             }
1506         }
1507     }
1508 }
1509
1510 void dd_collect_vec(gmx_domdec_t *dd,
1511                     t_state *state_local, rvec *lv, rvec *v)
1512 {
1513     gmx_domdec_master_t *ma;
1514     int                  n, i, c, a, nalloc = 0;
1515     rvec                *buf = NULL;
1516
1517     dd_collect_cg(dd, state_local);
1518
1519     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1520     {
1521         dd_collect_vec_sendrecv(dd, lv, v);
1522     }
1523     else
1524     {
1525         dd_collect_vec_gatherv(dd, lv, v);
1526     }
1527 }
1528
1529
1530 void dd_collect_state(gmx_domdec_t *dd,
1531                       t_state *state_local, t_state *state)
1532 {
1533     int est, i, j, nh;
1534
1535     nh = state->nhchainlength;
1536
1537     if (DDMASTER(dd))
1538     {
1539         for (i = 0; i < efptNR; i++)
1540         {
1541             state->lambda[i] = state_local->lambda[i];
1542         }
1543         state->fep_state = state_local->fep_state;
1544         state->veta      = state_local->veta;
1545         state->vol0      = state_local->vol0;
1546         copy_mat(state_local->box, state->box);
1547         copy_mat(state_local->boxv, state->boxv);
1548         copy_mat(state_local->svir_prev, state->svir_prev);
1549         copy_mat(state_local->fvir_prev, state->fvir_prev);
1550         copy_mat(state_local->pres_prev, state->pres_prev);
1551
1552         for (i = 0; i < state_local->ngtc; i++)
1553         {
1554             for (j = 0; j < nh; j++)
1555             {
1556                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1557                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1558             }
1559             state->therm_integral[i] = state_local->therm_integral[i];
1560         }
1561         for (i = 0; i < state_local->nnhpres; i++)
1562         {
1563             for (j = 0; j < nh; j++)
1564             {
1565                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1566                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1567             }
1568         }
1569     }
1570     for (est = 0; est < estNR; est++)
1571     {
1572         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1573         {
1574             switch (est)
1575             {
1576                 case estX:
1577                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1578                     break;
1579                 case estV:
1580                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1581                     break;
1582                 case estSDX:
1583                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1584                     break;
1585                 case estCGP:
1586                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1587                     break;
1588                 case estLD_RNG:
1589                     if (state->nrngi == 1)
1590                     {
1591                         if (DDMASTER(dd))
1592                         {
1593                             for (i = 0; i < state_local->nrng; i++)
1594                             {
1595                                 state->ld_rng[i] = state_local->ld_rng[i];
1596                             }
1597                         }
1598                     }
1599                     else
1600                     {
1601                         dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
1602                                   state_local->ld_rng, state->ld_rng);
1603                     }
1604                     break;
1605                 case estLD_RNGI:
1606                     if (state->nrngi == 1)
1607                     {
1608                         if (DDMASTER(dd))
1609                         {
1610                             state->ld_rngi[0] = state_local->ld_rngi[0];
1611                         }
1612                     }
1613                     else
1614                     {
1615                         dd_gather(dd, sizeof(state->ld_rngi[0]),
1616                                   state_local->ld_rngi, state->ld_rngi);
1617                     }
1618                     break;
1619                 case estDISRE_INITF:
1620                 case estDISRE_RM3TAV:
1621                 case estORIRE_INITF:
1622                 case estORIRE_DTAV:
1623                     break;
1624                 default:
1625                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1626             }
1627         }
1628     }
1629 }
1630
1631 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1632 {
1633     int est;
1634
1635     if (debug)
1636     {
1637         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1638     }
1639
1640     state->nalloc = over_alloc_dd(nalloc);
1641
1642     for (est = 0; est < estNR; est++)
1643     {
1644         if (EST_DISTR(est) && (state->flags & (1<<est)))
1645         {
1646             switch (est)
1647             {
1648                 case estX:
1649                     srenew(state->x, state->nalloc);
1650                     break;
1651                 case estV:
1652                     srenew(state->v, state->nalloc);
1653                     break;
1654                 case estSDX:
1655                     srenew(state->sd_X, state->nalloc);
1656                     break;
1657                 case estCGP:
1658                     srenew(state->cg_p, state->nalloc);
1659                     break;
1660                 case estLD_RNG:
1661                 case estLD_RNGI:
1662                 case estDISRE_INITF:
1663                 case estDISRE_RM3TAV:
1664                 case estORIRE_INITF:
1665                 case estORIRE_DTAV:
1666                     /* No reallocation required */
1667                     break;
1668                 default:
1669                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1670             }
1671         }
1672     }
1673
1674     if (f != NULL)
1675     {
1676         srenew(*f, state->nalloc);
1677     }
1678 }
1679
1680 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1681                                int nalloc)
1682 {
1683     if (nalloc > fr->cg_nalloc)
1684     {
1685         if (debug)
1686         {
1687             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1688         }
1689         fr->cg_nalloc = over_alloc_dd(nalloc);
1690         srenew(fr->cginfo, fr->cg_nalloc);
1691         if (fr->cutoff_scheme == ecutsGROUP)
1692         {
1693             srenew(fr->cg_cm, fr->cg_nalloc);
1694         }
1695     }
1696     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1697     {
1698         /* We don't use charge groups, we use x in state to set up
1699          * the atom communication.
1700          */
1701         dd_realloc_state(state, f, nalloc);
1702     }
1703 }
1704
1705 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1706                                        rvec *v, rvec *lv)
1707 {
1708     gmx_domdec_master_t *ma;
1709     int                  n, i, c, a, nalloc = 0;
1710     rvec                *buf = NULL;
1711
1712     if (DDMASTER(dd))
1713     {
1714         ma  = dd->ma;
1715
1716         for (n = 0; n < dd->nnodes; n++)
1717         {
1718             if (n != dd->rank)
1719             {
1720                 if (ma->nat[n] > nalloc)
1721                 {
1722                     nalloc = over_alloc_dd(ma->nat[n]);
1723                     srenew(buf, nalloc);
1724                 }
1725                 /* Use lv as a temporary buffer */
1726                 a = 0;
1727                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1728                 {
1729                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1730                     {
1731                         copy_rvec(v[c], buf[a++]);
1732                     }
1733                 }
1734                 if (a != ma->nat[n])
1735                 {
1736                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1737                               a, ma->nat[n]);
1738                 }
1739
1740 #ifdef GMX_MPI
1741                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1742                          DDRANK(dd, n), n, dd->mpi_comm_all);
1743 #endif
1744             }
1745         }
1746         sfree(buf);
1747         n = DDMASTERRANK(dd);
1748         a = 0;
1749         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1750         {
1751             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1752             {
1753                 copy_rvec(v[c], lv[a++]);
1754             }
1755         }
1756     }
1757     else
1758     {
1759 #ifdef GMX_MPI
1760         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1761                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1762 #endif
1763     }
1764 }
1765
1766 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1767                                        rvec *v, rvec *lv)
1768 {
1769     gmx_domdec_master_t *ma;
1770     int                 *scounts = NULL, *disps = NULL;
1771     int                  n, i, c, a, nalloc = 0;
1772     rvec                *buf = NULL;
1773
1774     if (DDMASTER(dd))
1775     {
1776         ma  = dd->ma;
1777
1778         get_commbuffer_counts(dd, &scounts, &disps);
1779
1780         buf = ma->vbuf;
1781         a   = 0;
1782         for (n = 0; n < dd->nnodes; n++)
1783         {
1784             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1785             {
1786                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1787                 {
1788                     copy_rvec(v[c], buf[a++]);
1789                 }
1790             }
1791         }
1792     }
1793
1794     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1795 }
1796
1797 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1798 {
1799     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1800     {
1801         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1802     }
1803     else
1804     {
1805         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1806     }
1807 }
1808
1809 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1810 {
1811     int i;
1812     dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1813     dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1814     dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1815
1816     if (dfhist->nlambda > 0)
1817     {
1818         int nlam = dfhist->nlambda;
1819         dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1820         dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1821         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1822         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1823         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1824         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1825
1826         for (i = 0; i < nlam; i++)
1827         {
1828             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1829             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1830             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1831             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1832             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1833             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
1834         }
1835     }
1836 }
1837
1838 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1839                                 t_state *state, t_state *state_local,
1840                                 rvec **f)
1841 {
1842     int  i, j, nh;
1843
1844     nh = state->nhchainlength;
1845
1846     if (DDMASTER(dd))
1847     {
1848         for (i = 0; i < efptNR; i++)
1849         {
1850             state_local->lambda[i] = state->lambda[i];
1851         }
1852         state_local->fep_state = state->fep_state;
1853         state_local->veta      = state->veta;
1854         state_local->vol0      = state->vol0;
1855         copy_mat(state->box, state_local->box);
1856         copy_mat(state->box_rel, state_local->box_rel);
1857         copy_mat(state->boxv, state_local->boxv);
1858         copy_mat(state->svir_prev, state_local->svir_prev);
1859         copy_mat(state->fvir_prev, state_local->fvir_prev);
1860         copy_df_history(&state_local->dfhist, &state->dfhist);
1861         for (i = 0; i < state_local->ngtc; i++)
1862         {
1863             for (j = 0; j < nh; j++)
1864             {
1865                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1866                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1867             }
1868             state_local->therm_integral[i] = state->therm_integral[i];
1869         }
1870         for (i = 0; i < state_local->nnhpres; i++)
1871         {
1872             for (j = 0; j < nh; j++)
1873             {
1874                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1875                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1876             }
1877         }
1878     }
1879     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1880     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1881     dd_bcast(dd, sizeof(real), &state_local->veta);
1882     dd_bcast(dd, sizeof(real), &state_local->vol0);
1883     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1884     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1885     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1886     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1887     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1888     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1889     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1890     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1891     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1892     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1893
1894     /* communicate df_history -- required for restarting from checkpoint */
1895     dd_distribute_dfhist(dd, &state_local->dfhist);
1896
1897     if (dd->nat_home > state_local->nalloc)
1898     {
1899         dd_realloc_state(state_local, f, dd->nat_home);
1900     }
1901     for (i = 0; i < estNR; i++)
1902     {
1903         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1904         {
1905             switch (i)
1906             {
1907                 case estX:
1908                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1909                     break;
1910                 case estV:
1911                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1912                     break;
1913                 case estSDX:
1914                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1915                     break;
1916                 case estCGP:
1917                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1918                     break;
1919                 case estLD_RNG:
1920                     if (state->nrngi == 1)
1921                     {
1922                         dd_bcastc(dd,
1923                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
1924                                   state->ld_rng, state_local->ld_rng);
1925                     }
1926                     else
1927                     {
1928                         dd_scatter(dd,
1929                                    state_local->nrng*sizeof(state_local->ld_rng[0]),
1930                                    state->ld_rng, state_local->ld_rng);
1931                     }
1932                     break;
1933                 case estLD_RNGI:
1934                     if (state->nrngi == 1)
1935                     {
1936                         dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
1937                                   state->ld_rngi, state_local->ld_rngi);
1938                     }
1939                     else
1940                     {
1941                         dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
1942                                    state->ld_rngi, state_local->ld_rngi);
1943                     }
1944                     break;
1945                 case estDISRE_INITF:
1946                 case estDISRE_RM3TAV:
1947                 case estORIRE_INITF:
1948                 case estORIRE_DTAV:
1949                     /* Not implemented yet */
1950                     break;
1951                 default:
1952                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1953             }
1954         }
1955     }
1956 }
1957
1958 static char dim2char(int dim)
1959 {
1960     char c = '?';
1961
1962     switch (dim)
1963     {
1964         case XX: c = 'X'; break;
1965         case YY: c = 'Y'; break;
1966         case ZZ: c = 'Z'; break;
1967         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1968     }
1969
1970     return c;
1971 }
1972
1973 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1974                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1975 {
1976     rvec   grid_s[2], *grid_r = NULL, cx, r;
1977     char   fname[STRLEN], format[STRLEN], buf[22];
1978     FILE  *out;
1979     int    a, i, d, z, y, x;
1980     matrix tric;
1981     real   vol;
1982
1983     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1984     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1985
1986     if (DDMASTER(dd))
1987     {
1988         snew(grid_r, 2*dd->nnodes);
1989     }
1990
1991     dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
1992
1993     if (DDMASTER(dd))
1994     {
1995         for (d = 0; d < DIM; d++)
1996         {
1997             for (i = 0; i < DIM; i++)
1998             {
1999                 if (d == i)
2000                 {
2001                     tric[d][i] = 1;
2002                 }
2003                 else
2004                 {
2005                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
2006                     {
2007                         tric[d][i] = box[i][d]/box[i][i];
2008                     }
2009                     else
2010                     {
2011                         tric[d][i] = 0;
2012                     }
2013                 }
2014             }
2015         }
2016         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
2017         sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
2018         out = gmx_fio_fopen(fname, "w");
2019         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2020         a = 1;
2021         for (i = 0; i < dd->nnodes; i++)
2022         {
2023             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
2024             for (d = 0; d < DIM; d++)
2025             {
2026                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
2027             }
2028             for (z = 0; z < 2; z++)
2029             {
2030                 for (y = 0; y < 2; y++)
2031                 {
2032                     for (x = 0; x < 2; x++)
2033                     {
2034                         cx[XX] = grid_r[i*2+x][XX];
2035                         cx[YY] = grid_r[i*2+y][YY];
2036                         cx[ZZ] = grid_r[i*2+z][ZZ];
2037                         mvmul(tric, cx, r);
2038                         fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
2039                                 ' ', 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
2040                     }
2041                 }
2042             }
2043             for (d = 0; d < DIM; d++)
2044             {
2045                 for (x = 0; x < 4; x++)
2046                 {
2047                     switch (d)
2048                     {
2049                         case 0: y = 1 + i*8 + 2*x; break;
2050                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
2051                         case 2: y = 1 + i*8 + x; break;
2052                     }
2053                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
2054                 }
2055             }
2056         }
2057         gmx_fio_fclose(out);
2058         sfree(grid_r);
2059     }
2060 }
2061
2062 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
2063                   gmx_mtop_t *mtop, t_commrec *cr,
2064                   int natoms, rvec x[], matrix box)
2065 {
2066     char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
2067     FILE         *out;
2068     int           i, ii, resnr, c;
2069     char         *atomname, *resname;
2070     real          b;
2071     gmx_domdec_t *dd;
2072
2073     dd = cr->dd;
2074     if (natoms == -1)
2075     {
2076         natoms = dd->comm->nat[ddnatVSITE];
2077     }
2078
2079     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
2080
2081     sprintf(format, "%s%s\n", get_pdbformat(), "%6.2f%6.2f");
2082     sprintf(format4, "%s%s\n", get_pdbformat4(), "%6.2f%6.2f");
2083
2084     out = gmx_fio_fopen(fname, "w");
2085
2086     fprintf(out, "TITLE     %s\n", title);
2087     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2088     for (i = 0; i < natoms; i++)
2089     {
2090         ii = dd->gatindex[i];
2091         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
2092         if (i < dd->comm->nat[ddnatZONE])
2093         {
2094             c = 0;
2095             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2096             {
2097                 c++;
2098             }
2099             b = c;
2100         }
2101         else if (i < dd->comm->nat[ddnatVSITE])
2102         {
2103             b = dd->comm->zones.n;
2104         }
2105         else
2106         {
2107             b = dd->comm->zones.n + 1;
2108         }
2109         fprintf(out, strlen(atomname) < 4 ? format : format4,
2110                 "ATOM", (ii+1)%100000,
2111                 atomname, resname, ' ', resnr%10000, ' ',
2112                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
2113     }
2114     fprintf(out, "TER\n");
2115
2116     gmx_fio_fclose(out);
2117 }
2118
2119 real dd_cutoff_mbody(gmx_domdec_t *dd)
2120 {
2121     gmx_domdec_comm_t *comm;
2122     int                di;
2123     real               r;
2124
2125     comm = dd->comm;
2126
2127     r = -1;
2128     if (comm->bInterCGBondeds)
2129     {
2130         if (comm->cutoff_mbody > 0)
2131         {
2132             r = comm->cutoff_mbody;
2133         }
2134         else
2135         {
2136             /* cutoff_mbody=0 means we do not have DLB */
2137             r = comm->cellsize_min[dd->dim[0]];
2138             for (di = 1; di < dd->ndim; di++)
2139             {
2140                 r = min(r, comm->cellsize_min[dd->dim[di]]);
2141             }
2142             if (comm->bBondComm)
2143             {
2144                 r = max(r, comm->cutoff_mbody);
2145             }
2146             else
2147             {
2148                 r = min(r, comm->cutoff);
2149             }
2150         }
2151     }
2152
2153     return r;
2154 }
2155
2156 real dd_cutoff_twobody(gmx_domdec_t *dd)
2157 {
2158     real r_mb;
2159
2160     r_mb = dd_cutoff_mbody(dd);
2161
2162     return max(dd->comm->cutoff, r_mb);
2163 }
2164
2165
2166 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
2167 {
2168     int nc, ntot;
2169
2170     nc   = dd->nc[dd->comm->cartpmedim];
2171     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2172     copy_ivec(coord, coord_pme);
2173     coord_pme[dd->comm->cartpmedim] =
2174         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2175 }
2176
2177 static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
2178 {
2179     /* Here we assign a PME node to communicate with this DD node
2180      * by assuming that the major index of both is x.
2181      * We add cr->npmenodes/2 to obtain an even distribution.
2182      */
2183     return (ddindex*npme + npme/2)/ndd;
2184 }
2185
2186 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
2187 {
2188     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
2189 }
2190
2191 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
2192 {
2193     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
2194 }
2195
2196 static int *dd_pmenodes(t_commrec *cr)
2197 {
2198     int *pmenodes;
2199     int  n, i, p0, p1;
2200
2201     snew(pmenodes, cr->npmenodes);
2202     n = 0;
2203     for (i = 0; i < cr->dd->nnodes; i++)
2204     {
2205         p0 = cr_ddindex2pmeindex(cr, i);
2206         p1 = cr_ddindex2pmeindex(cr, i+1);
2207         if (i+1 == cr->dd->nnodes || p1 > p0)
2208         {
2209             if (debug)
2210             {
2211                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
2212             }
2213             pmenodes[n] = i + 1 + n;
2214             n++;
2215         }
2216     }
2217
2218     return pmenodes;
2219 }
2220
2221 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
2222 {
2223     gmx_domdec_t *dd;
2224     ivec          coords, coords_pme, nc;
2225     int           slab;
2226
2227     dd = cr->dd;
2228     /*
2229        if (dd->comm->bCartesian) {
2230        gmx_ddindex2xyz(dd->nc,ddindex,coords);
2231        dd_coords2pmecoords(dd,coords,coords_pme);
2232        copy_ivec(dd->ntot,nc);
2233        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2234        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2235
2236        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2237        } else {
2238        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2239        }
2240      */
2241     coords[XX] = x;
2242     coords[YY] = y;
2243     coords[ZZ] = z;
2244     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
2245
2246     return slab;
2247 }
2248
2249 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
2250 {
2251     gmx_domdec_comm_t *comm;
2252     ivec               coords;
2253     int                ddindex, nodeid = -1;
2254
2255     comm = cr->dd->comm;
2256
2257     coords[XX] = x;
2258     coords[YY] = y;
2259     coords[ZZ] = z;
2260     if (comm->bCartesianPP_PME)
2261     {
2262 #ifdef GMX_MPI
2263         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
2264 #endif
2265     }
2266     else
2267     {
2268         ddindex = dd_index(cr->dd->nc, coords);
2269         if (comm->bCartesianPP)
2270         {
2271             nodeid = comm->ddindex2simnodeid[ddindex];
2272         }
2273         else
2274         {
2275             if (comm->pmenodes)
2276             {
2277                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
2278             }
2279             else
2280             {
2281                 nodeid = ddindex;
2282             }
2283         }
2284     }
2285
2286     return nodeid;
2287 }
2288
2289 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
2290 {
2291     gmx_domdec_t      *dd;
2292     gmx_domdec_comm_t *comm;
2293     ivec               coord, coord_pme;
2294     int                i;
2295     int                pmenode = -1;
2296
2297     dd   = cr->dd;
2298     comm = dd->comm;
2299
2300     /* This assumes a uniform x domain decomposition grid cell size */
2301     if (comm->bCartesianPP_PME)
2302     {
2303 #ifdef GMX_MPI
2304         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2305         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2306         {
2307             /* This is a PP node */
2308             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2309             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2310         }
2311 #endif
2312     }
2313     else if (comm->bCartesianPP)
2314     {
2315         if (sim_nodeid < dd->nnodes)
2316         {
2317             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2318         }
2319     }
2320     else
2321     {
2322         /* This assumes DD cells with identical x coordinates
2323          * are numbered sequentially.
2324          */
2325         if (dd->comm->pmenodes == NULL)
2326         {
2327             if (sim_nodeid < dd->nnodes)
2328             {
2329                 /* The DD index equals the nodeid */
2330                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2331             }
2332         }
2333         else
2334         {
2335             i = 0;
2336             while (sim_nodeid > dd->comm->pmenodes[i])
2337             {
2338                 i++;
2339             }
2340             if (sim_nodeid < dd->comm->pmenodes[i])
2341             {
2342                 pmenode = dd->comm->pmenodes[i];
2343             }
2344         }
2345     }
2346
2347     return pmenode;
2348 }
2349
2350 void get_pme_nnodes(const gmx_domdec_t *dd,
2351                     int *npmenodes_x, int *npmenodes_y)
2352 {
2353     if (dd != NULL)
2354     {
2355         *npmenodes_x = dd->comm->npmenodes_x;
2356         *npmenodes_y = dd->comm->npmenodes_y;
2357     }
2358     else
2359     {
2360         *npmenodes_x = 1;
2361         *npmenodes_y = 1;
2362     }
2363 }
2364
2365 gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
2366 {
2367     gmx_bool bPMEOnlyNode;
2368
2369     if (DOMAINDECOMP(cr))
2370     {
2371         bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
2372     }
2373     else
2374     {
2375         bPMEOnlyNode = FALSE;
2376     }
2377
2378     return bPMEOnlyNode;
2379 }
2380
2381 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2382                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2383 {
2384     gmx_domdec_t *dd;
2385     int           x, y, z;
2386     ivec          coord, coord_pme;
2387
2388     dd = cr->dd;
2389
2390     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2391
2392     *nmy_ddnodes = 0;
2393     for (x = 0; x < dd->nc[XX]; x++)
2394     {
2395         for (y = 0; y < dd->nc[YY]; y++)
2396         {
2397             for (z = 0; z < dd->nc[ZZ]; z++)
2398             {
2399                 if (dd->comm->bCartesianPP_PME)
2400                 {
2401                     coord[XX] = x;
2402                     coord[YY] = y;
2403                     coord[ZZ] = z;
2404                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2405                     if (dd->ci[XX] == coord_pme[XX] &&
2406                         dd->ci[YY] == coord_pme[YY] &&
2407                         dd->ci[ZZ] == coord_pme[ZZ])
2408                     {
2409                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2410                     }
2411                 }
2412                 else
2413                 {
2414                     /* The slab corresponds to the nodeid in the PME group */
2415                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2416                     {
2417                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2418                     }
2419                 }
2420             }
2421         }
2422     }
2423
2424     /* The last PP-only node is the peer node */
2425     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2426
2427     if (debug)
2428     {
2429         fprintf(debug, "Receive coordinates from PP nodes:");
2430         for (x = 0; x < *nmy_ddnodes; x++)
2431         {
2432             fprintf(debug, " %d", (*my_ddnodes)[x]);
2433         }
2434         fprintf(debug, "\n");
2435     }
2436 }
2437
2438 static gmx_bool receive_vir_ener(t_commrec *cr)
2439 {
2440     gmx_domdec_comm_t *comm;
2441     int                pmenode, coords[DIM], rank;
2442     gmx_bool           bReceive;
2443
2444     bReceive = TRUE;
2445     if (cr->npmenodes < cr->dd->nnodes)
2446     {
2447         comm = cr->dd->comm;
2448         if (comm->bCartesianPP_PME)
2449         {
2450             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2451 #ifdef GMX_MPI
2452             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2453             coords[comm->cartpmedim]++;
2454             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2455             {
2456                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2457                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2458                 {
2459                     /* This is not the last PP node for pmenode */
2460                     bReceive = FALSE;
2461                 }
2462             }
2463 #endif
2464         }
2465         else
2466         {
2467             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2468             if (cr->sim_nodeid+1 < cr->nnodes &&
2469                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2470             {
2471                 /* This is not the last PP node for pmenode */
2472                 bReceive = FALSE;
2473             }
2474         }
2475     }
2476
2477     return bReceive;
2478 }
2479
2480 static void set_zones_ncg_home(gmx_domdec_t *dd)
2481 {
2482     gmx_domdec_zones_t *zones;
2483     int                 i;
2484
2485     zones = &dd->comm->zones;
2486
2487     zones->cg_range[0] = 0;
2488     for (i = 1; i < zones->n+1; i++)
2489     {
2490         zones->cg_range[i] = dd->ncg_home;
2491     }
2492     /* zone_ncg1[0] should always be equal to ncg_home */
2493     dd->comm->zone_ncg1[0] = dd->ncg_home;
2494 }
2495
2496 static void rebuild_cgindex(gmx_domdec_t *dd,
2497                             const int *gcgs_index, t_state *state)
2498 {
2499     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2500
2501     ind        = state->cg_gl;
2502     dd_cg_gl   = dd->index_gl;
2503     cgindex    = dd->cgindex;
2504     nat        = 0;
2505     cgindex[0] = nat;
2506     for (i = 0; i < state->ncg_gl; i++)
2507     {
2508         cgindex[i]  = nat;
2509         cg_gl       = ind[i];
2510         dd_cg_gl[i] = cg_gl;
2511         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2512     }
2513     cgindex[i] = nat;
2514
2515     dd->ncg_home = state->ncg_gl;
2516     dd->nat_home = nat;
2517
2518     set_zones_ncg_home(dd);
2519 }
2520
2521 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2522 {
2523     while (cg >= cginfo_mb->cg_end)
2524     {
2525         cginfo_mb++;
2526     }
2527
2528     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2529 }
2530
2531 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2532                           t_forcerec *fr, char *bLocalCG)
2533 {
2534     cginfo_mb_t *cginfo_mb;
2535     int         *cginfo;
2536     int          cg;
2537
2538     if (fr != NULL)
2539     {
2540         cginfo_mb = fr->cginfo_mb;
2541         cginfo    = fr->cginfo;
2542
2543         for (cg = cg0; cg < cg1; cg++)
2544         {
2545             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2546         }
2547     }
2548
2549     if (bLocalCG != NULL)
2550     {
2551         for (cg = cg0; cg < cg1; cg++)
2552         {
2553             bLocalCG[index_gl[cg]] = TRUE;
2554         }
2555     }
2556 }
2557
2558 static void make_dd_indices(gmx_domdec_t *dd,
2559                             const int *gcgs_index, int cg_start)
2560 {
2561     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2562     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2563     gmx_ga2la_t *ga2la;
2564     char        *bLocalCG;
2565     gmx_bool     bCGs;
2566
2567     bLocalCG = dd->comm->bLocalCG;
2568
2569     if (dd->nat_tot > dd->gatindex_nalloc)
2570     {
2571         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2572         srenew(dd->gatindex, dd->gatindex_nalloc);
2573     }
2574
2575     nzone      = dd->comm->zones.n;
2576     zone2cg    = dd->comm->zones.cg_range;
2577     zone_ncg1  = dd->comm->zone_ncg1;
2578     index_gl   = dd->index_gl;
2579     gatindex   = dd->gatindex;
2580     bCGs       = dd->comm->bCGs;
2581
2582     if (zone2cg[1] != dd->ncg_home)
2583     {
2584         gmx_incons("dd->ncg_zone is not up to date");
2585     }
2586
2587     /* Make the local to global and global to local atom index */
2588     a = dd->cgindex[cg_start];
2589     for (zone = 0; zone < nzone; zone++)
2590     {
2591         if (zone == 0)
2592         {
2593             cg0 = cg_start;
2594         }
2595         else
2596         {
2597             cg0 = zone2cg[zone];
2598         }
2599         cg1    = zone2cg[zone+1];
2600         cg1_p1 = cg0 + zone_ncg1[zone];
2601
2602         for (cg = cg0; cg < cg1; cg++)
2603         {
2604             zone1 = zone;
2605             if (cg >= cg1_p1)
2606             {
2607                 /* Signal that this cg is from more than one pulse away */
2608                 zone1 += nzone;
2609             }
2610             cg_gl = index_gl[cg];
2611             if (bCGs)
2612             {
2613                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2614                 {
2615                     gatindex[a] = a_gl;
2616                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2617                     a++;
2618                 }
2619             }
2620             else
2621             {
2622                 gatindex[a] = cg_gl;
2623                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2624                 a++;
2625             }
2626         }
2627     }
2628 }
2629
2630 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2631                           const char *where)
2632 {
2633     int ncg, i, ngl, nerr;
2634
2635     nerr = 0;
2636     if (bLocalCG == NULL)
2637     {
2638         return nerr;
2639     }
2640     for (i = 0; i < dd->ncg_tot; i++)
2641     {
2642         if (!bLocalCG[dd->index_gl[i]])
2643         {
2644             fprintf(stderr,
2645                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2646             nerr++;
2647         }
2648     }
2649     ngl = 0;
2650     for (i = 0; i < ncg_sys; i++)
2651     {
2652         if (bLocalCG[i])
2653         {
2654             ngl++;
2655         }
2656     }
2657     if (ngl != dd->ncg_tot)
2658     {
2659         fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2660         nerr++;
2661     }
2662
2663     return nerr;
2664 }
2665
2666 static void check_index_consistency(gmx_domdec_t *dd,
2667                                     int natoms_sys, int ncg_sys,
2668                                     const char *where)
2669 {
2670     int   nerr, ngl, i, a, cell;
2671     int  *have;
2672
2673     nerr = 0;
2674
2675     if (dd->comm->DD_debug > 1)
2676     {
2677         snew(have, natoms_sys);
2678         for (a = 0; a < dd->nat_tot; a++)
2679         {
2680             if (have[dd->gatindex[a]] > 0)
2681             {
2682                 fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2683             }
2684             else
2685             {
2686                 have[dd->gatindex[a]] = a + 1;
2687             }
2688         }
2689         sfree(have);
2690     }
2691
2692     snew(have, dd->nat_tot);
2693
2694     ngl  = 0;
2695     for (i = 0; i < natoms_sys; i++)
2696     {
2697         if (ga2la_get(dd->ga2la, i, &a, &cell))
2698         {
2699             if (a >= dd->nat_tot)
2700             {
2701                 fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2702                 nerr++;
2703             }
2704             else
2705             {
2706                 have[a] = 1;
2707                 if (dd->gatindex[a] != i)
2708                 {
2709                     fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2710                     nerr++;
2711                 }
2712             }
2713             ngl++;
2714         }
2715     }
2716     if (ngl != dd->nat_tot)
2717     {
2718         fprintf(stderr,
2719                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2720                 dd->rank, where, ngl, dd->nat_tot);
2721     }
2722     for (a = 0; a < dd->nat_tot; a++)
2723     {
2724         if (have[a] == 0)
2725         {
2726             fprintf(stderr,
2727                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2728                     dd->rank, where, a+1, dd->gatindex[a]+1);
2729         }
2730     }
2731     sfree(have);
2732
2733     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2734
2735     if (nerr > 0)
2736     {
2737         gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies",
2738                   dd->rank, where, nerr);
2739     }
2740 }
2741
2742 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2743 {
2744     int   i;
2745     char *bLocalCG;
2746
2747     if (a_start == 0)
2748     {
2749         /* Clear the whole list without searching */
2750         ga2la_clear(dd->ga2la);
2751     }
2752     else
2753     {
2754         for (i = a_start; i < dd->nat_tot; i++)
2755         {
2756             ga2la_del(dd->ga2la, dd->gatindex[i]);
2757         }
2758     }
2759
2760     bLocalCG = dd->comm->bLocalCG;
2761     if (bLocalCG)
2762     {
2763         for (i = cg_start; i < dd->ncg_tot; i++)
2764         {
2765             bLocalCG[dd->index_gl[i]] = FALSE;
2766         }
2767     }
2768
2769     dd_clear_local_vsite_indices(dd);
2770
2771     if (dd->constraints)
2772     {
2773         dd_clear_local_constraint_indices(dd);
2774     }
2775 }
2776
2777 /* This function should be used for moving the domain boudaries during DLB,
2778  * for obtaining the minimum cell size. It checks the initially set limit
2779  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2780  * and, possibly, a longer cut-off limit set for PME load balancing.
2781  */
2782 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2783 {
2784     real cellsize_min;
2785
2786     cellsize_min = comm->cellsize_min[dim];
2787
2788     if (!comm->bVacDLBNoLimit)
2789     {
2790         /* The cut-off might have changed, e.g. by PME load balacning,
2791          * from the value used to set comm->cellsize_min, so check it.
2792          */
2793         cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2794
2795         if (comm->bPMELoadBalDLBLimits)
2796         {
2797             /* Check for the cut-off limit set by the PME load balancing */
2798             cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2799         }
2800     }
2801
2802     return cellsize_min;
2803 }
2804
2805 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2806                             int dim_ind)
2807 {
2808     real grid_jump_limit;
2809
2810     /* The distance between the boundaries of cells at distance
2811      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2812      * and by the fact that cells should not be shifted by more than
2813      * half their size, such that cg's only shift by one cell
2814      * at redecomposition.
2815      */
2816     grid_jump_limit = comm->cellsize_limit;
2817     if (!comm->bVacDLBNoLimit)
2818     {
2819         if (comm->bPMELoadBalDLBLimits)
2820         {
2821             cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
2822         }
2823         grid_jump_limit = max(grid_jump_limit,
2824                               cutoff/comm->cd[dim_ind].np);
2825     }
2826
2827     return grid_jump_limit;
2828 }
2829
2830 static gmx_bool check_grid_jump(gmx_int64_t     step,
2831                                 gmx_domdec_t   *dd,
2832                                 real            cutoff,
2833                                 gmx_ddbox_t    *ddbox,
2834                                 gmx_bool        bFatal)
2835 {
2836     gmx_domdec_comm_t *comm;
2837     int                d, dim;
2838     real               limit, bfac;
2839     gmx_bool           bInvalid;
2840
2841     bInvalid = FALSE;
2842
2843     comm = dd->comm;
2844
2845     for (d = 1; d < dd->ndim; d++)
2846     {
2847         dim   = dd->dim[d];
2848         limit = grid_jump_limit(comm, cutoff, d);
2849         bfac  = ddbox->box_size[dim];
2850         if (ddbox->tric_dir[dim])
2851         {
2852             bfac *= ddbox->skew_fac[dim];
2853         }
2854         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2855                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2856         {
2857             bInvalid = TRUE;
2858
2859             if (bFatal)
2860             {
2861                 char buf[22];
2862
2863                 /* This error should never be triggered under normal
2864                  * circumstances, but you never know ...
2865                  */
2866                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2867                           gmx_step_str(step, buf),
2868                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2869             }
2870         }
2871     }
2872
2873     return bInvalid;
2874 }
2875
2876 static int dd_load_count(gmx_domdec_comm_t *comm)
2877 {
2878     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2879 }
2880
2881 static float dd_force_load(gmx_domdec_comm_t *comm)
2882 {
2883     float load;
2884
2885     if (comm->eFlop)
2886     {
2887         load = comm->flop;
2888         if (comm->eFlop > 1)
2889         {
2890             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2891         }
2892     }
2893     else
2894     {
2895         load = comm->cycl[ddCyclF];
2896         if (comm->cycl_n[ddCyclF] > 1)
2897         {
2898             /* Subtract the maximum of the last n cycle counts
2899              * to get rid of possible high counts due to other sources,
2900              * for instance system activity, that would otherwise
2901              * affect the dynamic load balancing.
2902              */
2903             load -= comm->cycl_max[ddCyclF];
2904         }
2905
2906 #ifdef GMX_MPI
2907         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2908         {
2909             float gpu_wait, gpu_wait_sum;
2910
2911             gpu_wait = comm->cycl[ddCyclWaitGPU];
2912             if (comm->cycl_n[ddCyclF] > 1)
2913             {
2914                 /* We should remove the WaitGPU time of the same MD step
2915                  * as the one with the maximum F time, since the F time
2916                  * and the wait time are not independent.
2917                  * Furthermore, the step for the max F time should be chosen
2918                  * the same on all ranks that share the same GPU.
2919                  * But to keep the code simple, we remove the average instead.
2920                  * The main reason for artificially long times at some steps
2921                  * is spurious CPU activity or MPI time, so we don't expect
2922                  * that changes in the GPU wait time matter a lot here.
2923                  */
2924                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2925             }
2926             /* Sum the wait times over the ranks that share the same GPU */
2927             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2928                           comm->mpi_comm_gpu_shared);
2929             /* Replace the wait time by the average over the ranks */
2930             load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2931         }
2932 #endif
2933     }
2934
2935     return load;
2936 }
2937
2938 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2939 {
2940     gmx_domdec_comm_t *comm;
2941     int                i;
2942
2943     comm = dd->comm;
2944
2945     snew(*dim_f, dd->nc[dim]+1);
2946     (*dim_f)[0] = 0;
2947     for (i = 1; i < dd->nc[dim]; i++)
2948     {
2949         if (comm->slb_frac[dim])
2950         {
2951             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2952         }
2953         else
2954         {
2955             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2956         }
2957     }
2958     (*dim_f)[dd->nc[dim]] = 1;
2959 }
2960
2961 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2962 {
2963     int  pmeindex, slab, nso, i;
2964     ivec xyz;
2965
2966     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2967     {
2968         ddpme->dim = YY;
2969     }
2970     else
2971     {
2972         ddpme->dim = dimind;
2973     }
2974     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2975
2976     ddpme->nslab = (ddpme->dim == 0 ?
2977                     dd->comm->npmenodes_x :
2978                     dd->comm->npmenodes_y);
2979
2980     if (ddpme->nslab <= 1)
2981     {
2982         return;
2983     }
2984
2985     nso = dd->comm->npmenodes/ddpme->nslab;
2986     /* Determine for each PME slab the PP location range for dimension dim */
2987     snew(ddpme->pp_min, ddpme->nslab);
2988     snew(ddpme->pp_max, ddpme->nslab);
2989     for (slab = 0; slab < ddpme->nslab; slab++)
2990     {
2991         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2992         ddpme->pp_max[slab] = 0;
2993     }
2994     for (i = 0; i < dd->nnodes; i++)
2995     {
2996         ddindex2xyz(dd->nc, i, xyz);
2997         /* For y only use our y/z slab.
2998          * This assumes that the PME x grid size matches the DD grid size.
2999          */
3000         if (dimind == 0 || xyz[XX] == dd->ci[XX])
3001         {
3002             pmeindex = ddindex2pmeindex(dd, i);
3003             if (dimind == 0)
3004             {
3005                 slab = pmeindex/nso;
3006             }
3007             else
3008             {
3009                 slab = pmeindex % ddpme->nslab;
3010             }
3011             ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
3012             ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
3013         }
3014     }
3015
3016     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
3017 }
3018
3019 int dd_pme_maxshift_x(gmx_domdec_t *dd)
3020 {
3021     if (dd->comm->ddpme[0].dim == XX)
3022     {
3023         return dd->comm->ddpme[0].maxshift;
3024     }
3025     else
3026     {
3027         return 0;
3028     }
3029 }
3030
3031 int dd_pme_maxshift_y(gmx_domdec_t *dd)
3032 {
3033     if (dd->comm->ddpme[0].dim == YY)
3034     {
3035         return dd->comm->ddpme[0].maxshift;
3036     }
3037     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
3038     {
3039         return dd->comm->ddpme[1].maxshift;
3040     }
3041     else
3042     {
3043         return 0;
3044     }
3045 }
3046
3047 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
3048                              gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
3049 {
3050     gmx_domdec_comm_t *comm;
3051     int                nc, ns, s;
3052     int               *xmin, *xmax;
3053     real               range, pme_boundary;
3054     int                sh;
3055
3056     comm = dd->comm;
3057     nc   = dd->nc[ddpme->dim];
3058     ns   = ddpme->nslab;
3059
3060     if (!ddpme->dim_match)
3061     {
3062         /* PP decomposition is not along dim: the worst situation */
3063         sh = ns/2;
3064     }
3065     else if (ns <= 3 || (bUniform && ns == nc))
3066     {
3067         /* The optimal situation */
3068         sh = 1;
3069     }
3070     else
3071     {
3072         /* We need to check for all pme nodes which nodes they
3073          * could possibly need to communicate with.
3074          */
3075         xmin = ddpme->pp_min;
3076         xmax = ddpme->pp_max;
3077         /* Allow for atoms to be maximally 2/3 times the cut-off
3078          * out of their DD cell. This is a reasonable balance between
3079          * between performance and support for most charge-group/cut-off
3080          * combinations.
3081          */
3082         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
3083         /* Avoid extra communication when we are exactly at a boundary */
3084         range *= 0.999;
3085
3086         sh = 1;
3087         for (s = 0; s < ns; s++)
3088         {
3089             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
3090             pme_boundary = (real)s/ns;
3091             while (sh+1 < ns &&
3092                    ((s-(sh+1) >= 0 &&
3093                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
3094                     (s-(sh+1) <  0 &&
3095                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
3096             {
3097                 sh++;
3098             }
3099             pme_boundary = (real)(s+1)/ns;
3100             while (sh+1 < ns &&
3101                    ((s+(sh+1) <  ns &&
3102                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
3103                     (s+(sh+1) >= ns &&
3104                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
3105             {
3106                 sh++;
3107             }
3108         }
3109     }
3110
3111     ddpme->maxshift = sh;
3112
3113     if (debug)
3114     {
3115         fprintf(debug, "PME slab communication range for dim %d is %d\n",
3116                 ddpme->dim, ddpme->maxshift);
3117     }
3118 }
3119
3120 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3121 {
3122     int d, dim;
3123
3124     for (d = 0; d < dd->ndim; d++)
3125     {
3126         dim = dd->dim[d];
3127         if (dim < ddbox->nboundeddim &&
3128             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3129             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3130         {
3131             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3132                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3133                       dd->nc[dim], dd->comm->cellsize_limit);
3134         }
3135     }
3136 }
3137
3138 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
3139                                   gmx_bool bMaster, ivec npulse)
3140 {
3141     gmx_domdec_comm_t *comm;
3142     int                d, j;
3143     rvec               cellsize_min;
3144     real              *cell_x, cell_dx, cellsize;
3145
3146     comm = dd->comm;
3147
3148     for (d = 0; d < DIM; d++)
3149     {
3150         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3151         npulse[d]       = 1;
3152         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3153         {
3154             /* Uniform grid */
3155             cell_dx = ddbox->box_size[d]/dd->nc[d];
3156             if (bMaster)
3157             {
3158                 for (j = 0; j < dd->nc[d]+1; j++)
3159                 {
3160                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3161                 }
3162             }
3163             else
3164             {
3165                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3166                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3167             }
3168             cellsize = cell_dx*ddbox->skew_fac[d];
3169             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3170             {
3171                 npulse[d]++;
3172             }
3173             cellsize_min[d] = cellsize;
3174         }
3175         else
3176         {
3177             /* Statically load balanced grid */
3178             /* Also when we are not doing a master distribution we determine
3179              * all cell borders in a loop to obtain identical values
3180              * to the master distribution case and to determine npulse.
3181              */
3182             if (bMaster)
3183             {
3184                 cell_x = dd->ma->cell_x[d];
3185             }
3186             else
3187             {
3188                 snew(cell_x, dd->nc[d]+1);
3189             }
3190             cell_x[0] = ddbox->box0[d];
3191             for (j = 0; j < dd->nc[d]; j++)
3192             {
3193                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
3194                 cell_x[j+1] = cell_x[j] + cell_dx;
3195                 cellsize    = cell_dx*ddbox->skew_fac[d];
3196                 while (cellsize*npulse[d] < comm->cutoff &&
3197                        npulse[d] < dd->nc[d]-1)
3198                 {
3199                     npulse[d]++;
3200                 }
3201                 cellsize_min[d] = min(cellsize_min[d], cellsize);
3202             }
3203             if (!bMaster)
3204             {
3205                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3206                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3207                 sfree(cell_x);
3208             }
3209         }
3210         /* The following limitation is to avoid that a cell would receive
3211          * some of its own home charge groups back over the periodic boundary.
3212          * Double charge groups cause trouble with the global indices.
3213          */
3214         if (d < ddbox->npbcdim &&
3215             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3216         {
3217             gmx_fatal_collective(FARGS, NULL, dd,
3218                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3219                                  dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
3220                                  comm->cutoff,
3221                                  dd->nc[d], dd->nc[d],
3222                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
3223         }
3224     }
3225
3226     if (!comm->bDynLoadBal)
3227     {
3228         copy_rvec(cellsize_min, comm->cellsize_min);
3229     }
3230
3231     for (d = 0; d < comm->npmedecompdim; d++)
3232     {
3233         set_pme_maxshift(dd, &comm->ddpme[d],
3234                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
3235                          comm->ddpme[d].slb_dim_f);
3236     }
3237 }
3238
3239
3240 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3241                                                   int d, int dim, gmx_domdec_root_t *root,
3242                                                   gmx_ddbox_t *ddbox,
3243                                                   gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
3244 {
3245     gmx_domdec_comm_t *comm;
3246     int                ncd, i, j, nmin, nmin_old;
3247     gmx_bool           bLimLo, bLimHi;
3248     real              *cell_size;
3249     real               fac, halfway, cellsize_limit_f_i, region_size;
3250     gmx_bool           bPBC, bLastHi = FALSE;
3251     int                nrange[] = {range[0], range[1]};
3252
3253     region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
3254
3255     comm = dd->comm;
3256
3257     ncd = dd->nc[dim];
3258
3259     bPBC = (dim < ddbox->npbcdim);
3260
3261     cell_size = root->buf_ncd;
3262
3263     if (debug)
3264     {
3265         fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
3266     }
3267
3268     /* First we need to check if the scaling does not make cells
3269      * smaller than the smallest allowed size.
3270      * We need to do this iteratively, since if a cell is too small,
3271      * it needs to be enlarged, which makes all the other cells smaller,
3272      * which could in turn make another cell smaller than allowed.
3273      */
3274     for (i = range[0]; i < range[1]; i++)
3275     {
3276         root->bCellMin[i] = FALSE;
3277     }
3278     nmin = 0;
3279     do
3280     {
3281         nmin_old = nmin;
3282         /* We need the total for normalization */
3283         fac = 0;
3284         for (i = range[0]; i < range[1]; i++)
3285         {
3286             if (root->bCellMin[i] == FALSE)
3287             {
3288                 fac += cell_size[i];
3289             }
3290         }
3291         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3292         /* Determine the cell boundaries */
3293         for (i = range[0]; i < range[1]; i++)
3294         {
3295             if (root->bCellMin[i] == FALSE)
3296             {
3297                 cell_size[i] *= fac;
3298                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3299                 {
3300                     cellsize_limit_f_i = 0;
3301                 }
3302                 else
3303                 {
3304                     cellsize_limit_f_i = cellsize_limit_f;
3305                 }
3306                 if (cell_size[i] < cellsize_limit_f_i)
3307                 {
3308                     root->bCellMin[i] = TRUE;
3309                     cell_size[i]      = cellsize_limit_f_i;
3310                     nmin++;
3311                 }
3312             }
3313             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3314         }
3315     }
3316     while (nmin > nmin_old);
3317
3318     i            = range[1]-1;
3319     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3320     /* For this check we should not use DD_CELL_MARGIN,
3321      * but a slightly smaller factor,
3322      * since rounding could get use below the limit.
3323      */
3324     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3325     {
3326         char buf[22];
3327         gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3328                   gmx_step_str(step, buf),
3329                   dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3330                   ncd, comm->cellsize_min[dim]);
3331     }
3332
3333     root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
3334
3335     if (!bUniform)
3336     {
3337         /* Check if the boundary did not displace more than halfway
3338          * each of the cells it bounds, as this could cause problems,
3339          * especially when the differences between cell sizes are large.
3340          * If changes are applied, they will not make cells smaller
3341          * than the cut-off, as we check all the boundaries which
3342          * might be affected by a change and if the old state was ok,
3343          * the cells will at most be shrunk back to their old size.
3344          */
3345         for (i = range[0]+1; i < range[1]; i++)
3346         {
3347             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3348             if (root->cell_f[i] < halfway)
3349             {
3350                 root->cell_f[i] = halfway;
3351                 /* Check if the change also causes shifts of the next boundaries */
3352                 for (j = i+1; j < range[1]; j++)
3353                 {
3354                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3355                     {
3356                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3357                     }
3358                 }
3359             }
3360             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3361             if (root->cell_f[i] > halfway)
3362             {
3363                 root->cell_f[i] = halfway;
3364                 /* Check if the change also causes shifts of the next boundaries */
3365                 for (j = i-1; j >= range[0]+1; j--)
3366                 {
3367                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3368                     {
3369                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3370                     }
3371                 }
3372             }
3373         }
3374     }
3375
3376     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3377     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3378      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3379      * for a and b nrange is used */
3380     if (d > 0)
3381     {
3382         /* Take care of the staggering of the cell boundaries */
3383         if (bUniform)
3384         {
3385             for (i = range[0]; i < range[1]; i++)
3386             {
3387                 root->cell_f_max0[i] = root->cell_f[i];
3388                 root->cell_f_min1[i] = root->cell_f[i+1];
3389             }
3390         }
3391         else
3392         {
3393             for (i = range[0]+1; i < range[1]; i++)
3394             {
3395                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3396                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3397                 if (bLimLo && bLimHi)
3398                 {
3399                     /* Both limits violated, try the best we can */
3400                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3401                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3402                     nrange[0]       = range[0];
3403                     nrange[1]       = i;
3404                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3405
3406                     nrange[0] = i;
3407                     nrange[1] = range[1];
3408                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3409
3410                     return;
3411                 }
3412                 else if (bLimLo)
3413                 {
3414                     /* root->cell_f[i] = root->bound_min[i]; */
3415                     nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3416                     bLastHi   = FALSE;
3417                 }
3418                 else if (bLimHi && !bLastHi)
3419                 {
3420                     bLastHi = TRUE;
3421                     if (nrange[1] < range[1])   /* found a LimLo before */
3422                     {
3423                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3424                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3425                         nrange[0] = nrange[1];
3426                     }
3427                     root->cell_f[i] = root->bound_max[i];
3428                     nrange[1]       = i;
3429                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3430                     nrange[0] = i;
3431                     nrange[1] = range[1];
3432                 }
3433             }
3434             if (nrange[1] < range[1])   /* found last a LimLo */
3435             {
3436                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3437                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3438                 nrange[0] = nrange[1];
3439                 nrange[1] = range[1];
3440                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3441             }
3442             else if (nrange[0] > range[0]) /* found at least one LimHi */
3443             {
3444                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3445             }
3446         }
3447     }
3448 }
3449
3450
3451 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3452                                        int d, int dim, gmx_domdec_root_t *root,
3453                                        gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3454                                        gmx_bool bUniform, gmx_int64_t step)
3455 {
3456     gmx_domdec_comm_t *comm;
3457     int                ncd, d1, i, j, pos;
3458     real              *cell_size;
3459     real               load_aver, load_i, imbalance, change, change_max, sc;
3460     real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3461     real               change_limit;
3462     real               relax = 0.5;
3463     gmx_bool           bPBC;
3464     int                range[] = { 0, 0 };
3465
3466     comm = dd->comm;
3467
3468     /* Convert the maximum change from the input percentage to a fraction */
3469     change_limit = comm->dlb_scale_lim*0.01;
3470
3471     ncd = dd->nc[dim];
3472
3473     bPBC = (dim < ddbox->npbcdim);
3474
3475     cell_size = root->buf_ncd;
3476
3477     /* Store the original boundaries */
3478     for (i = 0; i < ncd+1; i++)
3479     {
3480         root->old_cell_f[i] = root->cell_f[i];
3481     }
3482     if (bUniform)
3483     {
3484         for (i = 0; i < ncd; i++)
3485         {
3486             cell_size[i] = 1.0/ncd;
3487         }
3488     }
3489     else if (dd_load_count(comm))
3490     {
3491         load_aver  = comm->load[d].sum_m/ncd;
3492         change_max = 0;
3493         for (i = 0; i < ncd; i++)
3494         {
3495             /* Determine the relative imbalance of cell i */
3496             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3497             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3498             /* Determine the change of the cell size using underrelaxation */
3499             change     = -relax*imbalance;
3500             change_max = max(change_max, max(change, -change));
3501         }
3502         /* Limit the amount of scaling.
3503          * We need to use the same rescaling for all cells in one row,
3504          * otherwise the load balancing might not converge.
3505          */
3506         sc = relax;
3507         if (change_max > change_limit)
3508         {
3509             sc *= change_limit/change_max;
3510         }
3511         for (i = 0; i < ncd; i++)
3512         {
3513             /* Determine the relative imbalance of cell i */
3514             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3515             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3516             /* Determine the change of the cell size using underrelaxation */
3517             change       = -sc*imbalance;
3518             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3519         }
3520     }
3521
3522     cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3523     cellsize_limit_f *= DD_CELL_MARGIN;
3524     dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3525     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3526     if (ddbox->tric_dir[dim])
3527     {
3528         cellsize_limit_f /= ddbox->skew_fac[dim];
3529         dist_min_f       /= ddbox->skew_fac[dim];
3530     }
3531     if (bDynamicBox && d > 0)
3532     {
3533         dist_min_f *= DD_PRES_SCALE_MARGIN;
3534     }
3535     if (d > 0 && !bUniform)
3536     {
3537         /* Make sure that the grid is not shifted too much */
3538         for (i = 1; i < ncd; i++)
3539         {
3540             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3541             {
3542                 gmx_incons("Inconsistent DD boundary staggering limits!");
3543             }
3544             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3545             space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3546             if (space > 0)
3547             {
3548                 root->bound_min[i] += 0.5*space;
3549             }
3550             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3551             space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3552             if (space < 0)
3553             {
3554                 root->bound_max[i] += 0.5*space;
3555             }
3556             if (debug)
3557             {
3558                 fprintf(debug,
3559                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3560                         d, i,
3561                         root->cell_f_max0[i-1] + dist_min_f,
3562                         root->bound_min[i], root->cell_f[i], root->bound_max[i],
3563                         root->cell_f_min1[i] - dist_min_f);
3564             }
3565         }
3566     }
3567     range[1]          = ncd;
3568     root->cell_f[0]   = 0;
3569     root->cell_f[ncd] = 1;
3570     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3571
3572
3573     /* After the checks above, the cells should obey the cut-off
3574      * restrictions, but it does not hurt to check.
3575      */
3576     for (i = 0; i < ncd; i++)
3577     {
3578         if (debug)
3579         {
3580             fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
3581                     dim, i, root->cell_f[i], root->cell_f[i+1]);
3582         }
3583
3584         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3585             root->cell_f[i+1] - root->cell_f[i] <
3586             cellsize_limit_f/DD_CELL_MARGIN)
3587         {
3588             char buf[22];
3589             fprintf(stderr,
3590                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3591                     gmx_step_str(step, buf), dim2char(dim), i,
3592                     (root->cell_f[i+1] - root->cell_f[i])
3593                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3594         }
3595     }
3596
3597     pos = ncd + 1;
3598     /* Store the cell boundaries of the lower dimensions at the end */
3599     for (d1 = 0; d1 < d; d1++)
3600     {
3601         root->cell_f[pos++] = comm->cell_f0[d1];
3602         root->cell_f[pos++] = comm->cell_f1[d1];
3603     }
3604
3605     if (d < comm->npmedecompdim)
3606     {
3607         /* The master determines the maximum shift for
3608          * the coordinate communication between separate PME nodes.
3609          */
3610         set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3611     }
3612     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3613     if (d >= 1)
3614     {
3615         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3616     }
3617 }
3618
3619 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3620                                              gmx_ddbox_t *ddbox, int dimind)
3621 {
3622     gmx_domdec_comm_t *comm;
3623     int                dim;
3624
3625     comm = dd->comm;
3626
3627     /* Set the cell dimensions */
3628     dim                = dd->dim[dimind];
3629     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3630     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3631     if (dim >= ddbox->nboundeddim)
3632     {
3633         comm->cell_x0[dim] += ddbox->box0[dim];
3634         comm->cell_x1[dim] += ddbox->box0[dim];
3635     }
3636 }
3637
3638 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3639                                          int d, int dim, real *cell_f_row,
3640                                          gmx_ddbox_t *ddbox)
3641 {
3642     gmx_domdec_comm_t *comm;
3643     int                d1, dim1, pos;
3644
3645     comm = dd->comm;
3646
3647 #ifdef GMX_MPI
3648     /* Each node would only need to know two fractions,
3649      * but it is probably cheaper to broadcast the whole array.
3650      */
3651     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3652               0, comm->mpi_comm_load[d]);
3653 #endif
3654     /* Copy the fractions for this dimension from the buffer */
3655     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3656     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3657     /* The whole array was communicated, so set the buffer position */
3658     pos = dd->nc[dim] + 1;
3659     for (d1 = 0; d1 <= d; d1++)
3660     {
3661         if (d1 < d)
3662         {
3663             /* Copy the cell fractions of the lower dimensions */
3664             comm->cell_f0[d1] = cell_f_row[pos++];
3665             comm->cell_f1[d1] = cell_f_row[pos++];
3666         }
3667         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3668     }
3669     /* Convert the communicated shift from float to int */
3670     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3671     if (d >= 1)
3672     {
3673         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3674     }
3675 }
3676
3677 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3678                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3679                                          gmx_bool bUniform, gmx_int64_t step)
3680 {
3681     gmx_domdec_comm_t *comm;
3682     int                d, dim, d1;
3683     gmx_bool           bRowMember, bRowRoot;
3684     real              *cell_f_row;
3685
3686     comm = dd->comm;
3687
3688     for (d = 0; d < dd->ndim; d++)
3689     {
3690         dim        = dd->dim[d];
3691         bRowMember = TRUE;
3692         bRowRoot   = TRUE;
3693         for (d1 = d; d1 < dd->ndim; d1++)
3694         {
3695             if (dd->ci[dd->dim[d1]] > 0)
3696             {
3697                 if (d1 != d)
3698                 {
3699                     bRowMember = FALSE;
3700                 }
3701                 bRowRoot = FALSE;
3702             }
3703         }
3704         if (bRowMember)
3705         {
3706             if (bRowRoot)
3707             {
3708                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3709                                            ddbox, bDynamicBox, bUniform, step);
3710                 cell_f_row = comm->root[d]->cell_f;
3711             }
3712             else
3713             {
3714                 cell_f_row = comm->cell_f_row;
3715             }
3716             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3717         }
3718     }
3719 }
3720
3721 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3722 {
3723     int d;
3724
3725     /* This function assumes the box is static and should therefore
3726      * not be called when the box has changed since the last
3727      * call to dd_partition_system.
3728      */
3729     for (d = 0; d < dd->ndim; d++)
3730     {
3731         relative_to_absolute_cell_bounds(dd, ddbox, d);
3732     }
3733 }
3734
3735
3736
3737 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3738                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3739                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3740                                   gmx_wallcycle_t wcycle)
3741 {
3742     gmx_domdec_comm_t *comm;
3743     int                dim;
3744
3745     comm = dd->comm;
3746
3747     if (bDoDLB)
3748     {
3749         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3750         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3751         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3752     }
3753     else if (bDynamicBox)
3754     {
3755         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3756     }
3757
3758     /* Set the dimensions for which no DD is used */
3759     for (dim = 0; dim < DIM; dim++)
3760     {
3761         if (dd->nc[dim] == 1)
3762         {
3763             comm->cell_x0[dim] = 0;
3764             comm->cell_x1[dim] = ddbox->box_size[dim];
3765             if (dim >= ddbox->nboundeddim)
3766             {
3767                 comm->cell_x0[dim] += ddbox->box0[dim];
3768                 comm->cell_x1[dim] += ddbox->box0[dim];
3769             }
3770         }
3771     }
3772 }
3773
3774 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3775 {
3776     int                    d, np, i;
3777     gmx_domdec_comm_dim_t *cd;
3778
3779     for (d = 0; d < dd->ndim; d++)
3780     {
3781         cd = &dd->comm->cd[d];
3782         np = npulse[dd->dim[d]];
3783         if (np > cd->np_nalloc)
3784         {
3785             if (debug)
3786             {
3787                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3788                         dim2char(dd->dim[d]), np);
3789             }
3790             if (DDMASTER(dd) && cd->np_nalloc > 0)
3791             {
3792                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3793             }
3794             srenew(cd->ind, np);
3795             for (i = cd->np_nalloc; i < np; i++)
3796             {
3797                 cd->ind[i].index  = NULL;
3798                 cd->ind[i].nalloc = 0;
3799             }
3800             cd->np_nalloc = np;
3801         }
3802         cd->np = np;
3803     }
3804 }
3805
3806
3807 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3808                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3809                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3810                               gmx_wallcycle_t wcycle)
3811 {
3812     gmx_domdec_comm_t *comm;
3813     int                d;
3814     ivec               npulse;
3815
3816     comm = dd->comm;
3817
3818     /* Copy the old cell boundaries for the cg displacement check */
3819     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3820     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3821
3822     if (comm->bDynLoadBal)
3823     {
3824         if (DDMASTER(dd))
3825         {
3826             check_box_size(dd, ddbox);
3827         }
3828         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3829     }
3830     else
3831     {
3832         set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse);
3833         realloc_comm_ind(dd, npulse);
3834     }
3835
3836     if (debug)
3837     {
3838         for (d = 0; d < DIM; d++)
3839         {
3840             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3841                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3842         }
3843     }
3844 }
3845
3846 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3847                                   gmx_ddbox_t *ddbox,
3848                                   rvec cell_ns_x0, rvec cell_ns_x1,
3849                                   gmx_int64_t step)
3850 {
3851     gmx_domdec_comm_t *comm;
3852     int                dim_ind, dim;
3853
3854     comm = dd->comm;
3855
3856     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3857     {
3858         dim = dd->dim[dim_ind];
3859
3860         /* Without PBC we don't have restrictions on the outer cells */
3861         if (!(dim >= ddbox->npbcdim &&
3862               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3863             comm->bDynLoadBal &&
3864             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3865             comm->cellsize_min[dim])
3866         {
3867             char buf[22];
3868             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3869                       gmx_step_str(step, buf), dim2char(dim),
3870                       comm->cell_x1[dim] - comm->cell_x0[dim],
3871                       ddbox->skew_fac[dim],
3872                       dd->comm->cellsize_min[dim],
3873                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3874         }
3875     }
3876
3877     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3878     {
3879         /* Communicate the boundaries and update cell_ns_x0/1 */
3880         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3881         if (dd->bGridJump && dd->ndim > 1)
3882         {
3883             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3884         }
3885     }
3886 }
3887
3888 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3889 {
3890     if (YY < npbcdim)
3891     {
3892         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3893     }
3894     else
3895     {
3896         tcm[YY][XX] = 0;
3897     }
3898     if (ZZ < npbcdim)
3899     {
3900         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3901         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3902     }
3903     else
3904     {
3905         tcm[ZZ][XX] = 0;
3906         tcm[ZZ][YY] = 0;
3907     }
3908 }
3909
3910 static void check_screw_box(matrix box)
3911 {
3912     /* Mathematical limitation */
3913     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3914     {
3915         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3916     }
3917
3918     /* Limitation due to the asymmetry of the eighth shell method */
3919     if (box[ZZ][YY] != 0)
3920     {
3921         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3922     }
3923 }
3924
3925 static void distribute_cg(FILE *fplog, gmx_int64_t step,
3926                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3927                           gmx_domdec_t *dd)
3928 {
3929     gmx_domdec_master_t *ma;
3930     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3931     int                  i, icg, j, k, k0, k1, d, npbcdim;
3932     matrix               tcm;
3933     rvec                 box_size, cg_cm;
3934     ivec                 ind;
3935     real                 nrcg, inv_ncg, pos_d;
3936     atom_id             *cgindex;
3937     gmx_bool             bUnbounded, bScrew;
3938
3939     ma = dd->ma;
3940
3941     if (tmp_ind == NULL)
3942     {
3943         snew(tmp_nalloc, dd->nnodes);
3944         snew(tmp_ind, dd->nnodes);
3945         for (i = 0; i < dd->nnodes; i++)
3946         {
3947             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3948             snew(tmp_ind[i], tmp_nalloc[i]);
3949         }
3950     }
3951
3952     /* Clear the count */
3953     for (i = 0; i < dd->nnodes; i++)
3954     {
3955         ma->ncg[i] = 0;
3956         ma->nat[i] = 0;
3957     }
3958
3959     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3960
3961     cgindex = cgs->index;
3962
3963     /* Compute the center of geometry for all charge groups */
3964     for (icg = 0; icg < cgs->nr; icg++)
3965     {
3966         k0      = cgindex[icg];
3967         k1      = cgindex[icg+1];
3968         nrcg    = k1 - k0;
3969         if (nrcg == 1)
3970         {
3971             copy_rvec(pos[k0], cg_cm);
3972         }
3973         else
3974         {
3975             inv_ncg = 1.0/nrcg;
3976
3977             clear_rvec(cg_cm);
3978             for (k = k0; (k < k1); k++)
3979             {
3980                 rvec_inc(cg_cm, pos[k]);
3981             }
3982             for (d = 0; (d < DIM); d++)
3983             {
3984                 cg_cm[d] *= inv_ncg;
3985             }
3986         }
3987         /* Put the charge group in the box and determine the cell index */
3988         for (d = DIM-1; d >= 0; d--)
3989         {
3990             pos_d = cg_cm[d];
3991             if (d < dd->npbcdim)
3992             {
3993                 bScrew = (dd->bScrewPBC && d == XX);
3994                 if (tric_dir[d] && dd->nc[d] > 1)
3995                 {
3996                     /* Use triclinic coordintates for this dimension */
3997                     for (j = d+1; j < DIM; j++)
3998                     {
3999                         pos_d += cg_cm[j]*tcm[j][d];
4000                     }
4001                 }
4002                 while (pos_d >= box[d][d])
4003                 {
4004                     pos_d -= box[d][d];
4005                     rvec_dec(cg_cm, box[d]);
4006                     if (bScrew)
4007                     {
4008                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
4009                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
4010                     }
4011                     for (k = k0; (k < k1); k++)
4012                     {
4013                         rvec_dec(pos[k], box[d]);
4014                         if (bScrew)
4015                         {
4016                             pos[k][YY] = box[YY][YY] - pos[k][YY];
4017                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
4018                         }
4019                     }
4020                 }
4021                 while (pos_d < 0)
4022                 {
4023                     pos_d += box[d][d];
4024                     rvec_inc(cg_cm, box[d]);
4025                     if (bScrew)
4026                     {
4027                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
4028                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
4029                     }
4030                     for (k = k0; (k < k1); k++)
4031                     {
4032                         rvec_inc(pos[k], box[d]);
4033                         if (bScrew)
4034                         {
4035                             pos[k][YY] = box[YY][YY] - pos[k][YY];
4036                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
4037                         }
4038                     }
4039                 }
4040             }
4041             /* This could be done more efficiently */
4042             ind[d] = 0;
4043             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
4044             {
4045                 ind[d]++;
4046             }
4047         }
4048         i = dd_index(dd->nc, ind);
4049         if (ma->ncg[i] == tmp_nalloc[i])
4050         {
4051             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
4052             srenew(tmp_ind[i], tmp_nalloc[i]);
4053         }
4054         tmp_ind[i][ma->ncg[i]] = icg;
4055         ma->ncg[i]++;
4056         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
4057     }
4058
4059     k1 = 0;
4060     for (i = 0; i < dd->nnodes; i++)
4061     {
4062         ma->index[i] = k1;
4063         for (k = 0; k < ma->ncg[i]; k++)
4064         {
4065             ma->cg[k1++] = tmp_ind[i][k];
4066         }
4067     }
4068     ma->index[dd->nnodes] = k1;
4069
4070     for (i = 0; i < dd->nnodes; i++)
4071     {
4072         sfree(tmp_ind[i]);
4073     }
4074     sfree(tmp_ind);
4075     sfree(tmp_nalloc);
4076
4077     if (fplog)
4078     {
4079         char buf[22];
4080         fprintf(fplog, "Charge group distribution at step %s:",
4081                 gmx_step_str(step, buf));
4082         for (i = 0; i < dd->nnodes; i++)
4083         {
4084             fprintf(fplog, " %d", ma->ncg[i]);
4085         }
4086         fprintf(fplog, "\n");
4087     }
4088 }
4089
4090 static void get_cg_distribution(FILE *fplog, gmx_int64_t step, gmx_domdec_t *dd,
4091                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
4092                                 rvec pos[])
4093 {
4094     gmx_domdec_master_t *ma = NULL;
4095     ivec                 npulse;
4096     int                  i, cg_gl;
4097     int                 *ibuf, buf2[2] = { 0, 0 };
4098     gmx_bool             bMaster = DDMASTER(dd);
4099     if (bMaster)
4100     {
4101         ma = dd->ma;
4102
4103         if (dd->bScrewPBC)
4104         {
4105             check_screw_box(box);
4106         }
4107
4108         set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse);
4109
4110         distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
4111         for (i = 0; i < dd->nnodes; i++)
4112         {
4113             ma->ibuf[2*i]   = ma->ncg[i];
4114             ma->ibuf[2*i+1] = ma->nat[i];
4115         }
4116         ibuf = ma->ibuf;
4117     }
4118     else
4119     {
4120         ibuf = NULL;
4121     }
4122     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
4123
4124     dd->ncg_home = buf2[0];
4125     dd->nat_home = buf2[1];
4126     dd->ncg_tot  = dd->ncg_home;
4127     dd->nat_tot  = dd->nat_home;
4128     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4129     {
4130         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4131         srenew(dd->index_gl, dd->cg_nalloc);
4132         srenew(dd->cgindex, dd->cg_nalloc+1);
4133     }
4134     if (bMaster)
4135     {
4136         for (i = 0; i < dd->nnodes; i++)
4137         {
4138             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
4139             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4140         }
4141     }
4142
4143     dd_scatterv(dd,
4144                 DDMASTER(dd) ? ma->ibuf : NULL,
4145                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
4146                 DDMASTER(dd) ? ma->cg : NULL,
4147                 dd->ncg_home*sizeof(int), dd->index_gl);
4148
4149     /* Determine the home charge group sizes */
4150     dd->cgindex[0] = 0;
4151     for (i = 0; i < dd->ncg_home; i++)
4152     {
4153         cg_gl            = dd->index_gl[i];
4154         dd->cgindex[i+1] =
4155             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4156     }
4157
4158     if (debug)
4159     {
4160         fprintf(debug, "Home charge groups:\n");
4161         for (i = 0; i < dd->ncg_home; i++)
4162         {
4163             fprintf(debug, " %d", dd->index_gl[i]);
4164             if (i % 10 == 9)
4165             {
4166                 fprintf(debug, "\n");
4167             }
4168         }
4169         fprintf(debug, "\n");
4170     }
4171 }
4172
4173 static int compact_and_copy_vec_at(int ncg, int *move,
4174                                    int *cgindex,
4175                                    int nvec, int vec,
4176                                    rvec *src, gmx_domdec_comm_t *comm,
4177                                    gmx_bool bCompact)
4178 {
4179     int m, icg, i, i0, i1, nrcg;
4180     int home_pos;
4181     int pos_vec[DIM*2];
4182
4183     home_pos = 0;
4184
4185     for (m = 0; m < DIM*2; m++)
4186     {
4187         pos_vec[m] = 0;
4188     }
4189
4190     i0 = 0;
4191     for (icg = 0; icg < ncg; icg++)
4192     {
4193         i1 = cgindex[icg+1];
4194         m  = move[icg];
4195         if (m == -1)
4196         {
4197             if (bCompact)
4198             {
4199                 /* Compact the home array in place */
4200                 for (i = i0; i < i1; i++)
4201                 {
4202                     copy_rvec(src[i], src[home_pos++]);
4203                 }
4204             }
4205         }
4206         else
4207         {
4208             /* Copy to the communication buffer */
4209             nrcg        = i1 - i0;
4210             pos_vec[m] += 1 + vec*nrcg;
4211             for (i = i0; i < i1; i++)
4212             {
4213                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
4214             }
4215             pos_vec[m] += (nvec - vec - 1)*nrcg;
4216         }
4217         if (!bCompact)
4218         {
4219             home_pos += i1 - i0;
4220         }
4221         i0 = i1;
4222     }
4223
4224     return home_pos;
4225 }
4226
4227 static int compact_and_copy_vec_cg(int ncg, int *move,
4228                                    int *cgindex,
4229                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
4230                                    gmx_bool bCompact)
4231 {
4232     int m, icg, i0, i1, nrcg;
4233     int home_pos;
4234     int pos_vec[DIM*2];
4235
4236     home_pos = 0;
4237
4238     for (m = 0; m < DIM*2; m++)
4239     {
4240         pos_vec[m] = 0;
4241     }
4242
4243     i0 = 0;
4244     for (icg = 0; icg < ncg; icg++)
4245     {
4246         i1 = cgindex[icg+1];
4247         m  = move[icg];
4248         if (m == -1)
4249         {
4250             if (bCompact)
4251             {
4252                 /* Compact the home array in place */
4253                 copy_rvec(src[icg], src[home_pos++]);
4254             }
4255         }
4256         else
4257         {
4258             nrcg = i1 - i0;
4259             /* Copy to the communication buffer */
4260             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
4261             pos_vec[m] += 1 + nrcg*nvec;
4262         }
4263         i0 = i1;
4264     }
4265     if (!bCompact)
4266     {
4267         home_pos = ncg;
4268     }
4269
4270     return home_pos;
4271 }
4272
4273 static int compact_ind(int ncg, int *move,
4274                        int *index_gl, int *cgindex,
4275                        int *gatindex,
4276                        gmx_ga2la_t ga2la, char *bLocalCG,
4277                        int *cginfo)
4278 {
4279     int cg, nat, a0, a1, a, a_gl;
4280     int home_pos;
4281
4282     home_pos = 0;
4283     nat      = 0;
4284     for (cg = 0; cg < ncg; cg++)
4285     {
4286         a0 = cgindex[cg];
4287         a1 = cgindex[cg+1];
4288         if (move[cg] == -1)
4289         {
4290             /* Compact the home arrays in place.
4291              * Anything that can be done here avoids access to global arrays.
4292              */
4293             cgindex[home_pos] = nat;
4294             for (a = a0; a < a1; a++)
4295             {
4296                 a_gl          = gatindex[a];
4297                 gatindex[nat] = a_gl;
4298                 /* The cell number stays 0, so we don't need to set it */
4299                 ga2la_change_la(ga2la, a_gl, nat);
4300                 nat++;
4301             }
4302             index_gl[home_pos] = index_gl[cg];
4303             cginfo[home_pos]   = cginfo[cg];
4304             /* The charge group remains local, so bLocalCG does not change */
4305             home_pos++;
4306         }
4307         else
4308         {
4309             /* Clear the global indices */
4310             for (a = a0; a < a1; a++)
4311             {
4312                 ga2la_del(ga2la, gatindex[a]);
4313             }
4314             if (bLocalCG)
4315             {
4316                 bLocalCG[index_gl[cg]] = FALSE;
4317             }
4318         }
4319     }
4320     cgindex[home_pos] = nat;
4321
4322     return home_pos;
4323 }
4324
4325 static void clear_and_mark_ind(int ncg, int *move,
4326                                int *index_gl, int *cgindex, int *gatindex,
4327                                gmx_ga2la_t ga2la, char *bLocalCG,
4328                                int *cell_index)
4329 {
4330     int cg, a0, a1, a;
4331
4332     for (cg = 0; cg < ncg; cg++)
4333     {
4334         if (move[cg] >= 0)
4335         {
4336             a0 = cgindex[cg];
4337             a1 = cgindex[cg+1];
4338             /* Clear the global indices */
4339             for (a = a0; a < a1; a++)
4340             {
4341                 ga2la_del(ga2la, gatindex[a]);
4342             }
4343             if (bLocalCG)
4344             {
4345                 bLocalCG[index_gl[cg]] = FALSE;
4346             }
4347             /* Signal that this cg has moved using the ns cell index.
4348              * Here we set it to -1. fill_grid will change it
4349              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4350              */
4351             cell_index[cg] = -1;
4352         }
4353     }
4354 }
4355
4356 static void print_cg_move(FILE *fplog,
4357                           gmx_domdec_t *dd,
4358                           gmx_int64_t step, int cg, int dim, int dir,
4359                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4360                           rvec cm_old, rvec cm_new, real pos_d)
4361 {
4362     gmx_domdec_comm_t *comm;
4363     char               buf[22];
4364
4365     comm = dd->comm;
4366
4367     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4368     if (bHaveLimitdAndCMOld)
4369     {
4370         fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4371                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4372     }
4373     else
4374     {
4375         fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4376                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4377     }
4378     fprintf(fplog, "distance out of cell %f\n",
4379             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4380     if (bHaveLimitdAndCMOld)
4381     {
4382         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4383                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4384     }
4385     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4386             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4387     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4388             dim2char(dim),
4389             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4390     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4391             dim2char(dim),
4392             comm->cell_x0[dim], comm->cell_x1[dim]);
4393 }
4394
4395 static void cg_move_error(FILE *fplog,
4396                           gmx_domdec_t *dd,
4397                           gmx_int64_t step, int cg, int dim, int dir,
4398                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4399                           rvec cm_old, rvec cm_new, real pos_d)
4400 {
4401     if (fplog)
4402     {
4403         print_cg_move(fplog, dd, step, cg, dim, dir,
4404                       bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4405     }
4406     print_cg_move(stderr, dd, step, cg, dim, dir,
4407                   bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4408     gmx_fatal(FARGS,
4409               "A charge group moved too far between two domain decomposition steps\n"
4410               "This usually means that your system is not well equilibrated");
4411 }
4412
4413 static void rotate_state_atom(t_state *state, int a)
4414 {
4415     int est;
4416
4417     for (est = 0; est < estNR; est++)
4418     {
4419         if (EST_DISTR(est) && (state->flags & (1<<est)))
4420         {
4421             switch (est)
4422             {
4423                 case estX:
4424                     /* Rotate the complete state; for a rectangular box only */
4425                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4426                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4427                     break;
4428                 case estV:
4429                     state->v[a][YY] = -state->v[a][YY];
4430                     state->v[a][ZZ] = -state->v[a][ZZ];
4431                     break;
4432                 case estSDX:
4433                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4434                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4435                     break;
4436                 case estCGP:
4437                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4438                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4439                     break;
4440                 case estDISRE_INITF:
4441                 case estDISRE_RM3TAV:
4442                 case estORIRE_INITF:
4443                 case estORIRE_DTAV:
4444                     /* These are distances, so not affected by rotation */
4445                     break;
4446                 default:
4447                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4448             }
4449         }
4450     }
4451 }
4452
4453 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4454 {
4455     if (natoms > comm->moved_nalloc)
4456     {
4457         /* Contents should be preserved here */
4458         comm->moved_nalloc = over_alloc_dd(natoms);
4459         srenew(comm->moved, comm->moved_nalloc);
4460     }
4461
4462     return comm->moved;
4463 }
4464
4465 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4466                          gmx_domdec_t *dd,
4467                          t_state *state,
4468                          ivec tric_dir, matrix tcm,
4469                          rvec cell_x0, rvec cell_x1,
4470                          rvec limitd, rvec limit0, rvec limit1,
4471                          const int *cgindex,
4472                          int cg_start, int cg_end,
4473                          rvec *cg_cm,
4474                          int *move)
4475 {
4476     int      npbcdim;
4477     int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4478     int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4479     int      flag;
4480     gmx_bool bScrew;
4481     ivec     dev;
4482     real     inv_ncg, pos_d;
4483     rvec     cm_new;
4484
4485     npbcdim = dd->npbcdim;
4486
4487     for (cg = cg_start; cg < cg_end; cg++)
4488     {
4489         k0   = cgindex[cg];
4490         k1   = cgindex[cg+1];
4491         nrcg = k1 - k0;
4492         if (nrcg == 1)
4493         {
4494             copy_rvec(state->x[k0], cm_new);
4495         }
4496         else
4497         {
4498             inv_ncg = 1.0/nrcg;
4499
4500             clear_rvec(cm_new);
4501             for (k = k0; (k < k1); k++)
4502             {
4503                 rvec_inc(cm_new, state->x[k]);
4504             }
4505             for (d = 0; (d < DIM); d++)
4506             {
4507                 cm_new[d] = inv_ncg*cm_new[d];
4508             }
4509         }
4510
4511         clear_ivec(dev);
4512         /* Do pbc and check DD cell boundary crossings */
4513         for (d = DIM-1; d >= 0; d--)
4514         {
4515             if (dd->nc[d] > 1)
4516             {
4517                 bScrew = (dd->bScrewPBC && d == XX);
4518                 /* Determine the location of this cg in lattice coordinates */
4519                 pos_d = cm_new[d];
4520                 if (tric_dir[d])
4521                 {
4522                     for (d2 = d+1; d2 < DIM; d2++)
4523                     {
4524                         pos_d += cm_new[d2]*tcm[d2][d];
4525                     }
4526                 }
4527                 /* Put the charge group in the triclinic unit-cell */
4528                 if (pos_d >= cell_x1[d])
4529                 {
4530                     if (pos_d >= limit1[d])
4531                     {
4532                         cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
4533                                       cg_cm[cg], cm_new, pos_d);
4534                     }
4535                     dev[d] = 1;
4536                     if (dd->ci[d] == dd->nc[d] - 1)
4537                     {
4538                         rvec_dec(cm_new, state->box[d]);
4539                         if (bScrew)
4540                         {
4541                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4542                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4543                         }
4544                         for (k = k0; (k < k1); k++)
4545                         {
4546                             rvec_dec(state->x[k], state->box[d]);
4547                             if (bScrew)
4548                             {
4549                                 rotate_state_atom(state, k);
4550                             }
4551                         }
4552                     }
4553                 }
4554                 else if (pos_d < cell_x0[d])
4555                 {
4556                     if (pos_d < limit0[d])
4557                     {
4558                         cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
4559                                       cg_cm[cg], cm_new, pos_d);
4560                     }
4561                     dev[d] = -1;
4562                     if (dd->ci[d] == 0)
4563                     {
4564                         rvec_inc(cm_new, state->box[d]);
4565                         if (bScrew)
4566                         {
4567                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4568                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4569                         }
4570                         for (k = k0; (k < k1); k++)
4571                         {
4572                             rvec_inc(state->x[k], state->box[d]);
4573                             if (bScrew)
4574                             {
4575                                 rotate_state_atom(state, k);
4576                             }
4577                         }
4578                     }
4579                 }
4580             }
4581             else if (d < npbcdim)
4582             {
4583                 /* Put the charge group in the rectangular unit-cell */
4584                 while (cm_new[d] >= state->box[d][d])
4585                 {
4586                     rvec_dec(cm_new, state->box[d]);
4587                     for (k = k0; (k < k1); k++)
4588                     {
4589                         rvec_dec(state->x[k], state->box[d]);
4590                     }
4591                 }
4592                 while (cm_new[d] < 0)
4593                 {
4594                     rvec_inc(cm_new, state->box[d]);
4595                     for (k = k0; (k < k1); k++)
4596                     {
4597                         rvec_inc(state->x[k], state->box[d]);
4598                     }
4599                 }
4600             }
4601         }
4602
4603         copy_rvec(cm_new, cg_cm[cg]);
4604
4605         /* Determine where this cg should go */
4606         flag = 0;
4607         mc   = -1;
4608         for (d = 0; d < dd->ndim; d++)
4609         {
4610             dim = dd->dim[d];
4611             if (dev[dim] == 1)
4612             {
4613                 flag |= DD_FLAG_FW(d);
4614                 if (mc == -1)
4615                 {
4616                     mc = d*2;
4617                 }
4618             }
4619             else if (dev[dim] == -1)
4620             {
4621                 flag |= DD_FLAG_BW(d);
4622                 if (mc == -1)
4623                 {
4624                     if (dd->nc[dim] > 2)
4625                     {
4626                         mc = d*2 + 1;
4627                     }
4628                     else
4629                     {
4630                         mc = d*2;
4631                     }
4632                 }
4633             }
4634         }
4635         /* Temporarily store the flag in move */
4636         move[cg] = mc + flag;
4637     }
4638 }
4639
4640 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4641                                gmx_domdec_t *dd, ivec tric_dir,
4642                                t_state *state, rvec **f,
4643                                t_forcerec *fr,
4644                                gmx_bool bCompact,
4645                                t_nrnb *nrnb,
4646                                int *ncg_stay_home,
4647                                int *ncg_moved)
4648 {
4649     int               *move;
4650     int                npbcdim;
4651     int                ncg[DIM*2], nat[DIM*2];
4652     int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4653     int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4654     int                sbuf[2], rbuf[2];
4655     int                home_pos_cg, home_pos_at, buf_pos;
4656     int                flag;
4657     gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4658     gmx_bool           bScrew;
4659     ivec               dev;
4660     real               inv_ncg, pos_d;
4661     matrix             tcm;
4662     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
4663     atom_id           *cgindex;
4664     cginfo_mb_t       *cginfo_mb;
4665     gmx_domdec_comm_t *comm;
4666     int               *moved;
4667     int                nthread, thread;
4668
4669     if (dd->bScrewPBC)
4670     {
4671         check_screw_box(state->box);
4672     }
4673
4674     comm  = dd->comm;
4675     if (fr->cutoff_scheme == ecutsGROUP)
4676     {
4677         cg_cm = fr->cg_cm;
4678     }
4679
4680     for (i = 0; i < estNR; i++)
4681     {
4682         if (EST_DISTR(i))
4683         {
4684             switch (i)
4685             {
4686                 case estX: /* Always present */ break;
4687                 case estV:   bV   = (state->flags & (1<<i)); break;
4688                 case estSDX: bSDX = (state->flags & (1<<i)); break;
4689                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4690                 case estLD_RNG:
4691                 case estLD_RNGI:
4692                 case estDISRE_INITF:
4693                 case estDISRE_RM3TAV:
4694                 case estORIRE_INITF:
4695                 case estORIRE_DTAV:
4696                     /* No processing required */
4697                     break;
4698                 default:
4699                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4700             }
4701         }
4702     }
4703
4704     if (dd->ncg_tot > comm->nalloc_int)
4705     {
4706         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4707         srenew(comm->buf_int, comm->nalloc_int);
4708     }
4709     move = comm->buf_int;
4710
4711     /* Clear the count */
4712     for (c = 0; c < dd->ndim*2; c++)
4713     {
4714         ncg[c] = 0;
4715         nat[c] = 0;
4716     }
4717
4718     npbcdim = dd->npbcdim;
4719
4720     for (d = 0; (d < DIM); d++)
4721     {
4722         limitd[d] = dd->comm->cellsize_min[d];
4723         if (d >= npbcdim && dd->ci[d] == 0)
4724         {
4725             cell_x0[d] = -GMX_FLOAT_MAX;
4726         }
4727         else
4728         {
4729             cell_x0[d] = comm->cell_x0[d];
4730         }
4731         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4732         {
4733             cell_x1[d] = GMX_FLOAT_MAX;
4734         }
4735         else
4736         {
4737             cell_x1[d] = comm->cell_x1[d];
4738         }
4739         if (d < npbcdim)
4740         {
4741             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4742             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4743         }
4744         else
4745         {
4746             /* We check after communication if a charge group moved
4747              * more than one cell. Set the pre-comm check limit to float_max.
4748              */
4749             limit0[d] = -GMX_FLOAT_MAX;
4750             limit1[d] =  GMX_FLOAT_MAX;
4751         }
4752     }
4753
4754     make_tric_corr_matrix(npbcdim, state->box, tcm);
4755
4756     cgindex = dd->cgindex;
4757
4758     nthread = gmx_omp_nthreads_get(emntDomdec);
4759
4760     /* Compute the center of geometry for all home charge groups
4761      * and put them in the box and determine where they should go.
4762      */
4763 #pragma omp parallel for num_threads(nthread) schedule(static)
4764     for (thread = 0; thread < nthread; thread++)
4765     {
4766         calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4767                      cell_x0, cell_x1, limitd, limit0, limit1,
4768                      cgindex,
4769                      ( thread   *dd->ncg_home)/nthread,
4770                      ((thread+1)*dd->ncg_home)/nthread,
4771                      fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4772                      move);
4773     }
4774
4775     for (cg = 0; cg < dd->ncg_home; cg++)
4776     {
4777         if (move[cg] >= 0)
4778         {
4779             mc       = move[cg];
4780             flag     = mc & ~DD_FLAG_NRCG;
4781             mc       = mc & DD_FLAG_NRCG;
4782             move[cg] = mc;
4783
4784             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4785             {
4786                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4787                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4788             }
4789             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4790             /* We store the cg size in the lower 16 bits
4791              * and the place where the charge group should go
4792              * in the next 6 bits. This saves some communication volume.
4793              */
4794             nrcg = cgindex[cg+1] - cgindex[cg];
4795             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4796             ncg[mc] += 1;
4797             nat[mc] += nrcg;
4798         }
4799     }
4800
4801     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4802     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4803
4804     *ncg_moved = 0;
4805     for (i = 0; i < dd->ndim*2; i++)
4806     {
4807         *ncg_moved += ncg[i];
4808     }
4809
4810     nvec = 1;
4811     if (bV)
4812     {
4813         nvec++;
4814     }
4815     if (bSDX)
4816     {
4817         nvec++;
4818     }
4819     if (bCGP)
4820     {
4821         nvec++;
4822     }
4823
4824     /* Make sure the communication buffers are large enough */
4825     for (mc = 0; mc < dd->ndim*2; mc++)
4826     {
4827         nvr = ncg[mc] + nat[mc]*nvec;
4828         if (nvr > comm->cgcm_state_nalloc[mc])
4829         {
4830             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4831             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4832         }
4833     }
4834
4835     switch (fr->cutoff_scheme)
4836     {
4837         case ecutsGROUP:
4838             /* Recalculating cg_cm might be cheaper than communicating,
4839              * but that could give rise to rounding issues.
4840              */
4841             home_pos_cg =
4842                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4843                                         nvec, cg_cm, comm, bCompact);
4844             break;
4845         case ecutsVERLET:
4846             /* Without charge groups we send the moved atom coordinates
4847              * over twice. This is so the code below can be used without
4848              * many conditionals for both for with and without charge groups.
4849              */
4850             home_pos_cg =
4851                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4852                                         nvec, state->x, comm, FALSE);
4853             if (bCompact)
4854             {
4855                 home_pos_cg -= *ncg_moved;
4856             }
4857             break;
4858         default:
4859             gmx_incons("unimplemented");
4860             home_pos_cg = 0;
4861     }
4862
4863     vec         = 0;
4864     home_pos_at =
4865         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4866                                 nvec, vec++, state->x, comm, bCompact);
4867     if (bV)
4868     {
4869         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4870                                 nvec, vec++, state->v, comm, bCompact);
4871     }
4872     if (bSDX)
4873     {
4874         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4875                                 nvec, vec++, state->sd_X, comm, bCompact);
4876     }
4877     if (bCGP)
4878     {
4879         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4880                                 nvec, vec++, state->cg_p, comm, bCompact);
4881     }
4882
4883     if (bCompact)
4884     {
4885         compact_ind(dd->ncg_home, move,
4886                     dd->index_gl, dd->cgindex, dd->gatindex,
4887                     dd->ga2la, comm->bLocalCG,
4888                     fr->cginfo);
4889     }
4890     else
4891     {
4892         if (fr->cutoff_scheme == ecutsVERLET)
4893         {
4894             moved = get_moved(comm, dd->ncg_home);
4895
4896             for (k = 0; k < dd->ncg_home; k++)
4897             {
4898                 moved[k] = 0;
4899             }
4900         }
4901         else
4902         {
4903             moved = fr->ns.grid->cell_index;
4904         }
4905
4906         clear_and_mark_ind(dd->ncg_home, move,
4907                            dd->index_gl, dd->cgindex, dd->gatindex,
4908                            dd->ga2la, comm->bLocalCG,
4909                            moved);
4910     }
4911
4912     cginfo_mb = fr->cginfo_mb;
4913
4914     *ncg_stay_home = home_pos_cg;
4915     for (d = 0; d < dd->ndim; d++)
4916     {
4917         dim      = dd->dim[d];
4918         ncg_recv = 0;
4919         nat_recv = 0;
4920         nvr      = 0;
4921         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4922         {
4923             cdd = d*2 + dir;
4924             /* Communicate the cg and atom counts */
4925             sbuf[0] = ncg[cdd];
4926             sbuf[1] = nat[cdd];
4927             if (debug)
4928             {
4929                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4930                         d, dir, sbuf[0], sbuf[1]);
4931             }
4932             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4933
4934             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4935             {
4936                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4937                 srenew(comm->buf_int, comm->nalloc_int);
4938             }
4939
4940             /* Communicate the charge group indices, sizes and flags */
4941             dd_sendrecv_int(dd, d, dir,
4942                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4943                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4944
4945             nvs = ncg[cdd] + nat[cdd]*nvec;
4946             i   = rbuf[0]  + rbuf[1] *nvec;
4947             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4948
4949             /* Communicate cgcm and state */
4950             dd_sendrecv_rvec(dd, d, dir,
4951                              comm->cgcm_state[cdd], nvs,
4952                              comm->vbuf.v+nvr, i);
4953             ncg_recv += rbuf[0];
4954             nat_recv += rbuf[1];
4955             nvr      += i;
4956         }
4957
4958         /* Process the received charge groups */
4959         buf_pos = 0;
4960         for (cg = 0; cg < ncg_recv; cg++)
4961         {
4962             flag = comm->buf_int[cg*DD_CGIBS+1];
4963
4964             if (dim >= npbcdim && dd->nc[dim] > 2)
4965             {
4966                 /* No pbc in this dim and more than one domain boundary.
4967                  * We do a separate check if a charge group didn't move too far.
4968                  */
4969                 if (((flag & DD_FLAG_FW(d)) &&
4970                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4971                     ((flag & DD_FLAG_BW(d)) &&
4972                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4973                 {
4974                     cg_move_error(fplog, dd, step, cg, dim,
4975                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4976                                   FALSE, 0,
4977                                   comm->vbuf.v[buf_pos],
4978                                   comm->vbuf.v[buf_pos],
4979                                   comm->vbuf.v[buf_pos][dim]);
4980                 }
4981             }
4982
4983             mc = -1;
4984             if (d < dd->ndim-1)
4985             {
4986                 /* Check which direction this cg should go */
4987                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4988                 {
4989                     if (dd->bGridJump)
4990                     {
4991                         /* The cell boundaries for dimension d2 are not equal
4992                          * for each cell row of the lower dimension(s),
4993                          * therefore we might need to redetermine where
4994                          * this cg should go.
4995                          */
4996                         dim2 = dd->dim[d2];
4997                         /* If this cg crosses the box boundary in dimension d2
4998                          * we can use the communicated flag, so we do not
4999                          * have to worry about pbc.
5000                          */
5001                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
5002                                (flag & DD_FLAG_FW(d2))) ||
5003                               (dd->ci[dim2] == 0 &&
5004                                (flag & DD_FLAG_BW(d2)))))
5005                         {
5006                             /* Clear the two flags for this dimension */
5007                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
5008                             /* Determine the location of this cg
5009                              * in lattice coordinates
5010                              */
5011                             pos_d = comm->vbuf.v[buf_pos][dim2];
5012                             if (tric_dir[dim2])
5013                             {
5014                                 for (d3 = dim2+1; d3 < DIM; d3++)
5015                                 {
5016                                     pos_d +=
5017                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
5018                                 }
5019                             }
5020                             /* Check of we are not at the box edge.
5021                              * pbc is only handled in the first step above,
5022                              * but this check could move over pbc while
5023                              * the first step did not due to different rounding.
5024                              */
5025                             if (pos_d >= cell_x1[dim2] &&
5026                                 dd->ci[dim2] != dd->nc[dim2]-1)
5027                             {
5028                                 flag |= DD_FLAG_FW(d2);
5029                             }
5030                             else if (pos_d < cell_x0[dim2] &&
5031                                      dd->ci[dim2] != 0)
5032                             {
5033                                 flag |= DD_FLAG_BW(d2);
5034                             }
5035                             comm->buf_int[cg*DD_CGIBS+1] = flag;
5036                         }
5037                     }
5038                     /* Set to which neighboring cell this cg should go */
5039                     if (flag & DD_FLAG_FW(d2))
5040                     {
5041                         mc = d2*2;
5042                     }
5043                     else if (flag & DD_FLAG_BW(d2))
5044                     {
5045                         if (dd->nc[dd->dim[d2]] > 2)
5046                         {
5047                             mc = d2*2+1;
5048                         }
5049                         else
5050                         {
5051                             mc = d2*2;
5052                         }
5053                     }
5054                 }
5055             }
5056
5057             nrcg = flag & DD_FLAG_NRCG;
5058             if (mc == -1)
5059             {
5060                 if (home_pos_cg+1 > dd->cg_nalloc)
5061                 {
5062                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
5063                     srenew(dd->index_gl, dd->cg_nalloc);
5064                     srenew(dd->cgindex, dd->cg_nalloc+1);
5065                 }
5066                 /* Set the global charge group index and size */
5067                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
5068                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
5069                 /* Copy the state from the buffer */
5070                 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
5071                 if (fr->cutoff_scheme == ecutsGROUP)
5072                 {
5073                     cg_cm = fr->cg_cm;
5074                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
5075                 }
5076                 buf_pos++;
5077
5078                 /* Set the cginfo */
5079                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
5080                                                    dd->index_gl[home_pos_cg]);
5081                 if (comm->bLocalCG)
5082                 {
5083                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
5084                 }
5085
5086                 if (home_pos_at+nrcg > state->nalloc)
5087                 {
5088                     dd_realloc_state(state, f, home_pos_at+nrcg);
5089                 }
5090                 for (i = 0; i < nrcg; i++)
5091                 {
5092                     copy_rvec(comm->vbuf.v[buf_pos++],
5093                               state->x[home_pos_at+i]);
5094                 }
5095                 if (bV)
5096                 {
5097                     for (i = 0; i < nrcg; i++)
5098                     {
5099                         copy_rvec(comm->vbuf.v[buf_pos++],
5100                                   state->v[home_pos_at+i]);
5101                     }
5102                 }
5103                 if (bSDX)
5104                 {
5105                     for (i = 0; i < nrcg; i++)
5106                     {
5107                         copy_rvec(comm->vbuf.v[buf_pos++],
5108                                   state->sd_X[home_pos_at+i]);
5109                     }
5110                 }
5111                 if (bCGP)
5112                 {
5113                     for (i = 0; i < nrcg; i++)
5114                     {
5115                         copy_rvec(comm->vbuf.v[buf_pos++],
5116                                   state->cg_p[home_pos_at+i]);
5117                     }
5118                 }
5119                 home_pos_cg += 1;
5120                 home_pos_at += nrcg;
5121             }
5122             else
5123             {
5124                 /* Reallocate the buffers if necessary  */
5125                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
5126                 {
5127                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
5128                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
5129                 }
5130                 nvr = ncg[mc] + nat[mc]*nvec;
5131                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
5132                 {
5133                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
5134                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
5135                 }
5136                 /* Copy from the receive to the send buffers */
5137                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
5138                        comm->buf_int + cg*DD_CGIBS,
5139                        DD_CGIBS*sizeof(int));
5140                 memcpy(comm->cgcm_state[mc][nvr],
5141                        comm->vbuf.v[buf_pos],
5142                        (1+nrcg*nvec)*sizeof(rvec));
5143                 buf_pos += 1 + nrcg*nvec;
5144                 ncg[mc] += 1;
5145                 nat[mc] += nrcg;
5146             }
5147         }
5148     }
5149
5150     /* With sorting (!bCompact) the indices are now only partially up to date
5151      * and ncg_home and nat_home are not the real count, since there are
5152      * "holes" in the arrays for the charge groups that moved to neighbors.
5153      */
5154     if (fr->cutoff_scheme == ecutsVERLET)
5155     {
5156         moved = get_moved(comm, home_pos_cg);
5157
5158         for (i = dd->ncg_home; i < home_pos_cg; i++)
5159         {
5160             moved[i] = 0;
5161         }
5162     }
5163     dd->ncg_home = home_pos_cg;
5164     dd->nat_home = home_pos_at;
5165
5166     if (debug)
5167     {
5168         fprintf(debug,
5169                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5170                 *ncg_moved, dd->ncg_home-*ncg_moved);
5171
5172     }
5173 }
5174
5175 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
5176 {
5177     dd->comm->cycl[ddCycl] += cycles;
5178     dd->comm->cycl_n[ddCycl]++;
5179     if (cycles > dd->comm->cycl_max[ddCycl])
5180     {
5181         dd->comm->cycl_max[ddCycl] = cycles;
5182     }
5183 }
5184
5185 static double force_flop_count(t_nrnb *nrnb)
5186 {
5187     int         i;
5188     double      sum;
5189     const char *name;
5190
5191     sum = 0;
5192     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
5193     {
5194         /* To get closer to the real timings, we half the count
5195          * for the normal loops and again half it for water loops.
5196          */
5197         name = nrnb_str(i);
5198         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5199         {
5200             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5201         }
5202         else
5203         {
5204             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5205         }
5206     }
5207     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
5208     {
5209         name = nrnb_str(i);
5210         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5211         {
5212             sum += nrnb->n[i]*cost_nrnb(i);
5213         }
5214     }
5215     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
5216     {
5217         sum += nrnb->n[i]*cost_nrnb(i);
5218     }
5219
5220     return sum;
5221 }
5222
5223 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
5224 {
5225     if (dd->comm->eFlop)
5226     {
5227         dd->comm->flop -= force_flop_count(nrnb);
5228     }
5229 }
5230 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
5231 {
5232     if (dd->comm->eFlop)
5233     {
5234         dd->comm->flop += force_flop_count(nrnb);
5235         dd->comm->flop_n++;
5236     }
5237 }
5238
5239 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5240 {
5241     int i;
5242
5243     for (i = 0; i < ddCyclNr; i++)
5244     {
5245         dd->comm->cycl[i]     = 0;
5246         dd->comm->cycl_n[i]   = 0;
5247         dd->comm->cycl_max[i] = 0;
5248     }
5249     dd->comm->flop   = 0;
5250     dd->comm->flop_n = 0;
5251 }
5252
5253 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
5254 {
5255     gmx_domdec_comm_t *comm;
5256     gmx_domdec_load_t *load;
5257     gmx_domdec_root_t *root = NULL;
5258     int                d, dim, cid, i, pos;
5259     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
5260     gmx_bool           bSepPME;
5261
5262     if (debug)
5263     {
5264         fprintf(debug, "get_load_distribution start\n");
5265     }
5266
5267     wallcycle_start(wcycle, ewcDDCOMMLOAD);
5268
5269     comm = dd->comm;
5270
5271     bSepPME = (dd->pme_nodeid >= 0);
5272
5273     for (d = dd->ndim-1; d >= 0; d--)
5274     {
5275         dim = dd->dim[d];
5276         /* Check if we participate in the communication in this dimension */
5277         if (d == dd->ndim-1 ||
5278             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5279         {
5280             load = &comm->load[d];
5281             if (dd->bGridJump)
5282             {
5283                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5284             }
5285             pos = 0;
5286             if (d == dd->ndim-1)
5287             {
5288                 sbuf[pos++] = dd_force_load(comm);
5289                 sbuf[pos++] = sbuf[0];
5290                 if (dd->bGridJump)
5291                 {
5292                     sbuf[pos++] = sbuf[0];
5293                     sbuf[pos++] = cell_frac;
5294                     if (d > 0)
5295                     {
5296                         sbuf[pos++] = comm->cell_f_max0[d];
5297                         sbuf[pos++] = comm->cell_f_min1[d];
5298                     }
5299                 }
5300                 if (bSepPME)
5301                 {
5302                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5303                     sbuf[pos++] = comm->cycl[ddCyclPME];
5304                 }
5305             }
5306             else
5307             {
5308                 sbuf[pos++] = comm->load[d+1].sum;
5309                 sbuf[pos++] = comm->load[d+1].max;
5310                 if (dd->bGridJump)
5311                 {
5312                     sbuf[pos++] = comm->load[d+1].sum_m;
5313                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5314                     sbuf[pos++] = comm->load[d+1].flags;
5315                     if (d > 0)
5316                     {
5317                         sbuf[pos++] = comm->cell_f_max0[d];
5318                         sbuf[pos++] = comm->cell_f_min1[d];
5319                     }
5320                 }
5321                 if (bSepPME)
5322                 {
5323                     sbuf[pos++] = comm->load[d+1].mdf;
5324                     sbuf[pos++] = comm->load[d+1].pme;
5325                 }
5326             }
5327             load->nload = pos;
5328             /* Communicate a row in DD direction d.
5329              * The communicators are setup such that the root always has rank 0.
5330              */
5331 #ifdef GMX_MPI
5332             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5333                        load->load, load->nload*sizeof(float), MPI_BYTE,
5334                        0, comm->mpi_comm_load[d]);
5335 #endif
5336             if (dd->ci[dim] == dd->master_ci[dim])
5337             {
5338                 /* We are the root, process this row */
5339                 if (comm->bDynLoadBal)
5340                 {
5341                     root = comm->root[d];
5342                 }
5343                 load->sum      = 0;
5344                 load->max      = 0;
5345                 load->sum_m    = 0;
5346                 load->cvol_min = 1;
5347                 load->flags    = 0;
5348                 load->mdf      = 0;
5349                 load->pme      = 0;
5350                 pos            = 0;
5351                 for (i = 0; i < dd->nc[dim]; i++)
5352                 {
5353                     load->sum += load->load[pos++];
5354                     load->max  = max(load->max, load->load[pos]);
5355                     pos++;
5356                     if (dd->bGridJump)
5357                     {
5358                         if (root->bLimited)
5359                         {
5360                             /* This direction could not be load balanced properly,
5361                              * therefore we need to use the maximum iso the average load.
5362                              */
5363                             load->sum_m = max(load->sum_m, load->load[pos]);
5364                         }
5365                         else
5366                         {
5367                             load->sum_m += load->load[pos];
5368                         }
5369                         pos++;
5370                         load->cvol_min = min(load->cvol_min, load->load[pos]);
5371                         pos++;
5372                         if (d < dd->ndim-1)
5373                         {
5374                             load->flags = (int)(load->load[pos++] + 0.5);
5375                         }
5376                         if (d > 0)
5377                         {
5378                             root->cell_f_max0[i] = load->load[pos++];
5379                             root->cell_f_min1[i] = load->load[pos++];
5380                         }
5381                     }
5382                     if (bSepPME)
5383                     {
5384                         load->mdf = max(load->mdf, load->load[pos]);
5385                         pos++;
5386                         load->pme = max(load->pme, load->load[pos]);
5387                         pos++;
5388                     }
5389                 }
5390                 if (comm->bDynLoadBal && root->bLimited)
5391                 {
5392                     load->sum_m *= dd->nc[dim];
5393                     load->flags |= (1<<d);
5394                 }
5395             }
5396         }
5397     }
5398
5399     if (DDMASTER(dd))
5400     {
5401         comm->nload      += dd_load_count(comm);
5402         comm->load_step  += comm->cycl[ddCyclStep];
5403         comm->load_sum   += comm->load[0].sum;
5404         comm->load_max   += comm->load[0].max;
5405         if (comm->bDynLoadBal)
5406         {
5407             for (d = 0; d < dd->ndim; d++)
5408             {
5409                 if (comm->load[0].flags & (1<<d))
5410                 {
5411                     comm->load_lim[d]++;
5412                 }
5413             }
5414         }
5415         if (bSepPME)
5416         {
5417             comm->load_mdf += comm->load[0].mdf;
5418             comm->load_pme += comm->load[0].pme;
5419         }
5420     }
5421
5422     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5423
5424     if (debug)
5425     {
5426         fprintf(debug, "get_load_distribution finished\n");
5427     }
5428 }
5429
5430 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5431 {
5432     /* Return the relative performance loss on the total run time
5433      * due to the force calculation load imbalance.
5434      */
5435     if (dd->comm->nload > 0)
5436     {
5437         return
5438             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5439             (dd->comm->load_step*dd->nnodes);
5440     }
5441     else
5442     {
5443         return 0;
5444     }
5445 }
5446
5447 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5448 {
5449     char               buf[STRLEN];
5450     int                npp, npme, nnodes, d, limp;
5451     float              imbal, pme_f_ratio, lossf, lossp = 0;
5452     gmx_bool           bLim;
5453     gmx_domdec_comm_t *comm;
5454
5455     comm = dd->comm;
5456     if (DDMASTER(dd) && comm->nload > 0)
5457     {
5458         npp    = dd->nnodes;
5459         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5460         nnodes = npp + npme;
5461         imbal  = comm->load_max*npp/comm->load_sum - 1;
5462         lossf  = dd_force_imb_perf_loss(dd);
5463         sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5464         fprintf(fplog, "%s", buf);
5465         fprintf(stderr, "\n");
5466         fprintf(stderr, "%s", buf);
5467         sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5468         fprintf(fplog, "%s", buf);
5469         fprintf(stderr, "%s", buf);
5470         bLim = FALSE;
5471         if (comm->bDynLoadBal)
5472         {
5473             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5474             for (d = 0; d < dd->ndim; d++)
5475             {
5476                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5477                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5478                 if (limp >= 50)
5479                 {
5480                     bLim = TRUE;
5481                 }
5482             }
5483             sprintf(buf+strlen(buf), "\n");
5484             fprintf(fplog, "%s", buf);
5485             fprintf(stderr, "%s", buf);
5486         }
5487         if (npme > 0)
5488         {
5489             pme_f_ratio = comm->load_pme/comm->load_mdf;
5490             lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
5491             if (lossp <= 0)
5492             {
5493                 lossp *= (float)npme/(float)nnodes;
5494             }
5495             else
5496             {
5497                 lossp *= (float)npp/(float)nnodes;
5498             }
5499             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5500             fprintf(fplog, "%s", buf);
5501             fprintf(stderr, "%s", buf);
5502             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5503             fprintf(fplog, "%s", buf);
5504             fprintf(stderr, "%s", buf);
5505         }
5506         fprintf(fplog, "\n");
5507         fprintf(stderr, "\n");
5508
5509         if (lossf >= DD_PERF_LOSS)
5510         {
5511             sprintf(buf,
5512                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5513                     "      in the domain decomposition.\n", lossf*100);
5514             if (!comm->bDynLoadBal)
5515             {
5516                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5517             }
5518             else if (bLim)
5519             {
5520                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5521             }
5522             fprintf(fplog, "%s\n", buf);
5523             fprintf(stderr, "%s\n", buf);
5524         }
5525         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5526         {
5527             sprintf(buf,
5528                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5529                     "      had %s work to do than the PP nodes.\n"
5530                     "      You might want to %s the number of PME nodes\n"
5531                     "      or %s the cut-off and the grid spacing.\n",
5532                     fabs(lossp*100),
5533                     (lossp < 0) ? "less"     : "more",
5534                     (lossp < 0) ? "decrease" : "increase",
5535                     (lossp < 0) ? "decrease" : "increase");
5536             fprintf(fplog, "%s\n", buf);
5537             fprintf(stderr, "%s\n", buf);
5538         }
5539     }
5540 }
5541
5542 static float dd_vol_min(gmx_domdec_t *dd)
5543 {
5544     return dd->comm->load[0].cvol_min*dd->nnodes;
5545 }
5546
5547 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5548 {
5549     return dd->comm->load[0].flags;
5550 }
5551
5552 static float dd_f_imbal(gmx_domdec_t *dd)
5553 {
5554     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5555 }
5556
5557 float dd_pme_f_ratio(gmx_domdec_t *dd)
5558 {
5559     if (dd->comm->cycl_n[ddCyclPME] > 0)
5560     {
5561         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5562     }
5563     else
5564     {
5565         return -1.0;
5566     }
5567 }
5568
5569 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5570 {
5571     int  flags, d;
5572     char buf[22];
5573
5574     flags = dd_load_flags(dd);
5575     if (flags)
5576     {
5577         fprintf(fplog,
5578                 "DD  load balancing is limited by minimum cell size in dimension");
5579         for (d = 0; d < dd->ndim; d++)
5580         {
5581             if (flags & (1<<d))
5582             {
5583                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5584             }
5585         }
5586         fprintf(fplog, "\n");
5587     }
5588     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5589     if (dd->comm->bDynLoadBal)
5590     {
5591         fprintf(fplog, "  vol min/aver %5.3f%c",
5592                 dd_vol_min(dd), flags ? '!' : ' ');
5593     }
5594     fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5595     if (dd->comm->cycl_n[ddCyclPME])
5596     {
5597         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5598     }
5599     fprintf(fplog, "\n\n");
5600 }
5601
5602 static void dd_print_load_verbose(gmx_domdec_t *dd)
5603 {
5604     if (dd->comm->bDynLoadBal)
5605     {
5606         fprintf(stderr, "vol %4.2f%c ",
5607                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5608     }
5609     fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5610     if (dd->comm->cycl_n[ddCyclPME])
5611     {
5612         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5613     }
5614 }
5615
5616 #ifdef GMX_MPI
5617 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5618 {
5619     MPI_Comm           c_row;
5620     int                dim, i, rank;
5621     ivec               loc_c;
5622     gmx_domdec_root_t *root;
5623     gmx_bool           bPartOfGroup = FALSE;
5624
5625     dim = dd->dim[dim_ind];
5626     copy_ivec(loc, loc_c);
5627     for (i = 0; i < dd->nc[dim]; i++)
5628     {
5629         loc_c[dim] = i;
5630         rank       = dd_index(dd->nc, loc_c);
5631         if (rank == dd->rank)
5632         {
5633             /* This process is part of the group */
5634             bPartOfGroup = TRUE;
5635         }
5636     }
5637     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5638                    &c_row);
5639     if (bPartOfGroup)
5640     {
5641         dd->comm->mpi_comm_load[dim_ind] = c_row;
5642         if (dd->comm->eDLB != edlbNO)
5643         {
5644             if (dd->ci[dim] == dd->master_ci[dim])
5645             {
5646                 /* This is the root process of this row */
5647                 snew(dd->comm->root[dim_ind], 1);
5648                 root = dd->comm->root[dim_ind];
5649                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5650                 snew(root->old_cell_f, dd->nc[dim]+1);
5651                 snew(root->bCellMin, dd->nc[dim]);
5652                 if (dim_ind > 0)
5653                 {
5654                     snew(root->cell_f_max0, dd->nc[dim]);
5655                     snew(root->cell_f_min1, dd->nc[dim]);
5656                     snew(root->bound_min, dd->nc[dim]);
5657                     snew(root->bound_max, dd->nc[dim]);
5658                 }
5659                 snew(root->buf_ncd, dd->nc[dim]);
5660             }
5661             else
5662             {
5663                 /* This is not a root process, we only need to receive cell_f */
5664                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5665             }
5666         }
5667         if (dd->ci[dim] == dd->master_ci[dim])
5668         {
5669             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5670         }
5671     }
5672 }
5673 #endif
5674
5675 void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
5676                                    const gmx_hw_info_t gmx_unused *hwinfo,
5677                                    const gmx_hw_opt_t  gmx_unused *hw_opt)
5678 {
5679 #ifdef GMX_MPI
5680     int           physicalnode_id_hash;
5681     int           gpu_id;
5682     gmx_domdec_t *dd;
5683     MPI_Comm      mpi_comm_pp_physicalnode;
5684
5685     if (!(cr->duty & DUTY_PP) ||
5686         hw_opt->gpu_opt.ncuda_dev_use == 0)
5687     {
5688         /* Only PP nodes (currently) use GPUs.
5689          * If we don't have GPUs, there are no resources to share.
5690          */
5691         return;
5692     }
5693
5694     physicalnode_id_hash = gmx_physicalnode_id_hash();
5695
5696     gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5697
5698     dd = cr->dd;
5699
5700     if (debug)
5701     {
5702         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5703         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5704                 dd->rank, physicalnode_id_hash, gpu_id);
5705     }
5706     /* Split the PP communicator over the physical nodes */
5707     /* TODO: See if we should store this (before), as it's also used for
5708      * for the nodecomm summution.
5709      */
5710     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5711                    &mpi_comm_pp_physicalnode);
5712     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5713                    &dd->comm->mpi_comm_gpu_shared);
5714     MPI_Comm_free(&mpi_comm_pp_physicalnode);
5715     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5716
5717     if (debug)
5718     {
5719         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5720     }
5721
5722     /* Note that some ranks could share a GPU, while others don't */
5723
5724     if (dd->comm->nrank_gpu_shared == 1)
5725     {
5726         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5727     }
5728 #endif
5729 }
5730
5731 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5732 {
5733 #ifdef GMX_MPI
5734     int  dim0, dim1, i, j;
5735     ivec loc;
5736
5737     if (debug)
5738     {
5739         fprintf(debug, "Making load communicators\n");
5740     }
5741
5742     snew(dd->comm->load, dd->ndim);
5743     snew(dd->comm->mpi_comm_load, dd->ndim);
5744
5745     clear_ivec(loc);
5746     make_load_communicator(dd, 0, loc);
5747     if (dd->ndim > 1)
5748     {
5749         dim0 = dd->dim[0];
5750         for (i = 0; i < dd->nc[dim0]; i++)
5751         {
5752             loc[dim0] = i;
5753             make_load_communicator(dd, 1, loc);
5754         }
5755     }
5756     if (dd->ndim > 2)
5757     {
5758         dim0 = dd->dim[0];
5759         for (i = 0; i < dd->nc[dim0]; i++)
5760         {
5761             loc[dim0] = i;
5762             dim1      = dd->dim[1];
5763             for (j = 0; j < dd->nc[dim1]; j++)
5764             {
5765                 loc[dim1] = j;
5766                 make_load_communicator(dd, 2, loc);
5767             }
5768         }
5769     }
5770
5771     if (debug)
5772     {
5773         fprintf(debug, "Finished making load communicators\n");
5774     }
5775 #endif
5776 }
5777
5778 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5779 {
5780     gmx_bool                bZYX;
5781     int                     d, dim, i, j, m;
5782     ivec                    tmp, s;
5783     int                     nzone, nzonep;
5784     ivec                    dd_zp[DD_MAXIZONE];
5785     gmx_domdec_zones_t     *zones;
5786     gmx_domdec_ns_ranges_t *izone;
5787
5788     for (d = 0; d < dd->ndim; d++)
5789     {
5790         dim = dd->dim[d];
5791         copy_ivec(dd->ci, tmp);
5792         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5793         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5794         copy_ivec(dd->ci, tmp);
5795         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5796         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5797         if (debug)
5798         {
5799             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5800                     dd->rank, dim,
5801                     dd->neighbor[d][0],
5802                     dd->neighbor[d][1]);
5803         }
5804     }
5805
5806     if (fplog)
5807     {
5808         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5809                 dd->ndim,
5810                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5811                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5812     }
5813     switch (dd->ndim)
5814     {
5815         case 3:
5816             nzone  = dd_z3n;
5817             nzonep = dd_zp3n;
5818             for (i = 0; i < nzonep; i++)
5819             {
5820                 copy_ivec(dd_zp3[i], dd_zp[i]);
5821             }
5822             break;
5823         case 2:
5824             nzone  = dd_z2n;
5825             nzonep = dd_zp2n;
5826             for (i = 0; i < nzonep; i++)
5827             {
5828                 copy_ivec(dd_zp2[i], dd_zp[i]);
5829             }
5830             break;
5831         case 1:
5832             nzone  = dd_z1n;
5833             nzonep = dd_zp1n;
5834             for (i = 0; i < nzonep; i++)
5835             {
5836                 copy_ivec(dd_zp1[i], dd_zp[i]);
5837             }
5838             break;
5839         default:
5840             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5841             nzone  = 0;
5842             nzonep = 0;
5843     }
5844
5845     zones = &dd->comm->zones;
5846
5847     for (i = 0; i < nzone; i++)
5848     {
5849         m = 0;
5850         clear_ivec(zones->shift[i]);
5851         for (d = 0; d < dd->ndim; d++)
5852         {
5853             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5854         }
5855     }
5856
5857     zones->n = nzone;
5858     for (i = 0; i < nzone; i++)
5859     {
5860         for (d = 0; d < DIM; d++)
5861         {
5862             s[d] = dd->ci[d] - zones->shift[i][d];
5863             if (s[d] < 0)
5864             {
5865                 s[d] += dd->nc[d];
5866             }
5867             else if (s[d] >= dd->nc[d])
5868             {
5869                 s[d] -= dd->nc[d];
5870             }
5871         }
5872     }
5873     zones->nizone = nzonep;
5874     for (i = 0; i < zones->nizone; i++)
5875     {
5876         if (dd_zp[i][0] != i)
5877         {
5878             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5879         }
5880         izone     = &zones->izone[i];
5881         izone->j0 = dd_zp[i][1];
5882         izone->j1 = dd_zp[i][2];
5883         for (dim = 0; dim < DIM; dim++)
5884         {
5885             if (dd->nc[dim] == 1)
5886             {
5887                 /* All shifts should be allowed */
5888                 izone->shift0[dim] = -1;
5889                 izone->shift1[dim] = 1;
5890             }
5891             else
5892             {
5893                 /*
5894                    izone->shift0[d] = 0;
5895                    izone->shift1[d] = 0;
5896                    for(j=izone->j0; j<izone->j1; j++) {
5897                    if (dd->shift[j][d] > dd->shift[i][d])
5898                    izone->shift0[d] = -1;
5899                    if (dd->shift[j][d] < dd->shift[i][d])
5900                    izone->shift1[d] = 1;
5901                    }
5902                  */
5903
5904                 int shift_diff;
5905
5906                 /* Assume the shift are not more than 1 cell */
5907                 izone->shift0[dim] = 1;
5908                 izone->shift1[dim] = -1;
5909                 for (j = izone->j0; j < izone->j1; j++)
5910                 {
5911                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5912                     if (shift_diff < izone->shift0[dim])
5913                     {
5914                         izone->shift0[dim] = shift_diff;
5915                     }
5916                     if (shift_diff > izone->shift1[dim])
5917                     {
5918                         izone->shift1[dim] = shift_diff;
5919                     }
5920                 }
5921             }
5922         }
5923     }
5924
5925     if (dd->comm->eDLB != edlbNO)
5926     {
5927         snew(dd->comm->root, dd->ndim);
5928     }
5929
5930     if (dd->comm->bRecordLoad)
5931     {
5932         make_load_communicators(dd);
5933     }
5934 }
5935
5936 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
5937 {
5938     gmx_domdec_t      *dd;
5939     gmx_domdec_comm_t *comm;
5940     int                i, rank, *buf;
5941     ivec               periods;
5942 #ifdef GMX_MPI
5943     MPI_Comm           comm_cart;
5944 #endif
5945
5946     dd   = cr->dd;
5947     comm = dd->comm;
5948
5949 #ifdef GMX_MPI
5950     if (comm->bCartesianPP)
5951     {
5952         /* Set up cartesian communication for the particle-particle part */
5953         if (fplog)
5954         {
5955             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5956                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5957         }
5958
5959         for (i = 0; i < DIM; i++)
5960         {
5961             periods[i] = TRUE;
5962         }
5963         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5964                         &comm_cart);
5965         /* We overwrite the old communicator with the new cartesian one */
5966         cr->mpi_comm_mygroup = comm_cart;
5967     }
5968
5969     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5970     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5971
5972     if (comm->bCartesianPP_PME)
5973     {
5974         /* Since we want to use the original cartesian setup for sim,
5975          * and not the one after split, we need to make an index.
5976          */
5977         snew(comm->ddindex2ddnodeid, dd->nnodes);
5978         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5979         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5980         /* Get the rank of the DD master,
5981          * above we made sure that the master node is a PP node.
5982          */
5983         if (MASTER(cr))
5984         {
5985             rank = dd->rank;
5986         }
5987         else
5988         {
5989             rank = 0;
5990         }
5991         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5992     }
5993     else if (comm->bCartesianPP)
5994     {
5995         if (cr->npmenodes == 0)
5996         {
5997             /* The PP communicator is also
5998              * the communicator for this simulation
5999              */
6000             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
6001         }
6002         cr->nodeid = dd->rank;
6003
6004         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
6005
6006         /* We need to make an index to go from the coordinates
6007          * to the nodeid of this simulation.
6008          */
6009         snew(comm->ddindex2simnodeid, dd->nnodes);
6010         snew(buf, dd->nnodes);
6011         if (cr->duty & DUTY_PP)
6012         {
6013             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
6014         }
6015         /* Communicate the ddindex to simulation nodeid index */
6016         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
6017                       cr->mpi_comm_mysim);
6018         sfree(buf);
6019
6020         /* Determine the master coordinates and rank.
6021          * The DD master should be the same node as the master of this sim.
6022          */
6023         for (i = 0; i < dd->nnodes; i++)
6024         {
6025             if (comm->ddindex2simnodeid[i] == 0)
6026             {
6027                 ddindex2xyz(dd->nc, i, dd->master_ci);
6028                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
6029             }
6030         }
6031         if (debug)
6032         {
6033             fprintf(debug, "The master rank is %d\n", dd->masterrank);
6034         }
6035     }
6036     else
6037     {
6038         /* No Cartesian communicators */
6039         /* We use the rank in dd->comm->all as DD index */
6040         ddindex2xyz(dd->nc, dd->rank, dd->ci);
6041         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
6042         dd->masterrank = 0;
6043         clear_ivec(dd->master_ci);
6044     }
6045 #endif
6046
6047     if (fplog)
6048     {
6049         fprintf(fplog,
6050                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
6051                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6052     }
6053     if (debug)
6054     {
6055         fprintf(debug,
6056                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
6057                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6058     }
6059 }
6060
6061 static void receive_ddindex2simnodeid(t_commrec *cr)
6062 {
6063     gmx_domdec_t      *dd;
6064
6065     gmx_domdec_comm_t *comm;
6066     int               *buf;
6067
6068     dd   = cr->dd;
6069     comm = dd->comm;
6070
6071 #ifdef GMX_MPI
6072     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
6073     {
6074         snew(comm->ddindex2simnodeid, dd->nnodes);
6075         snew(buf, dd->nnodes);
6076         if (cr->duty & DUTY_PP)
6077         {
6078             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
6079         }
6080 #ifdef GMX_MPI
6081         /* Communicate the ddindex to simulation nodeid index */
6082         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
6083                       cr->mpi_comm_mysim);
6084 #endif
6085         sfree(buf);
6086     }
6087 #endif
6088 }
6089
6090 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
6091                                                      int ncg, int natoms)
6092 {
6093     gmx_domdec_master_t *ma;
6094     int                  i;
6095
6096     snew(ma, 1);
6097
6098     snew(ma->ncg, dd->nnodes);
6099     snew(ma->index, dd->nnodes+1);
6100     snew(ma->cg, ncg);
6101     snew(ma->nat, dd->nnodes);
6102     snew(ma->ibuf, dd->nnodes*2);
6103     snew(ma->cell_x, DIM);
6104     for (i = 0; i < DIM; i++)
6105     {
6106         snew(ma->cell_x[i], dd->nc[i]+1);
6107     }
6108
6109     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
6110     {
6111         ma->vbuf = NULL;
6112     }
6113     else
6114     {
6115         snew(ma->vbuf, natoms);
6116     }
6117
6118     return ma;
6119 }
6120
6121 static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
6122                                int gmx_unused reorder)
6123 {
6124     gmx_domdec_t      *dd;
6125     gmx_domdec_comm_t *comm;
6126     int                i, rank;
6127     gmx_bool           bDiv[DIM];
6128     ivec               periods;
6129 #ifdef GMX_MPI
6130     MPI_Comm           comm_cart;
6131 #endif
6132
6133     dd   = cr->dd;
6134     comm = dd->comm;
6135
6136     if (comm->bCartesianPP)
6137     {
6138         for (i = 1; i < DIM; i++)
6139         {
6140             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
6141         }
6142         if (bDiv[YY] || bDiv[ZZ])
6143         {
6144             comm->bCartesianPP_PME = TRUE;
6145             /* If we have 2D PME decomposition, which is always in x+y,
6146              * we stack the PME only nodes in z.
6147              * Otherwise we choose the direction that provides the thinnest slab
6148              * of PME only nodes as this will have the least effect
6149              * on the PP communication.
6150              * But for the PME communication the opposite might be better.
6151              */
6152             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
6153                              !bDiv[YY] ||
6154                              dd->nc[YY] > dd->nc[ZZ]))
6155             {
6156                 comm->cartpmedim = ZZ;
6157             }
6158             else
6159             {
6160                 comm->cartpmedim = YY;
6161             }
6162             comm->ntot[comm->cartpmedim]
6163                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
6164         }
6165         else if (fplog)
6166         {
6167             fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
6168             fprintf(fplog,
6169                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
6170         }
6171     }
6172
6173 #ifdef GMX_MPI
6174     if (comm->bCartesianPP_PME)
6175     {
6176         if (fplog)
6177         {
6178             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
6179         }
6180
6181         for (i = 0; i < DIM; i++)
6182         {
6183             periods[i] = TRUE;
6184         }
6185         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
6186                         &comm_cart);
6187
6188         MPI_Comm_rank(comm_cart, &rank);
6189         if (MASTERNODE(cr) && rank != 0)
6190         {
6191             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
6192         }
6193
6194         /* With this assigment we loose the link to the original communicator
6195          * which will usually be MPI_COMM_WORLD, unless have multisim.
6196          */
6197         cr->mpi_comm_mysim = comm_cart;
6198         cr->sim_nodeid     = rank;
6199
6200         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
6201
6202         if (fplog)
6203         {
6204             fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
6205                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6206         }
6207
6208         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
6209         {
6210             cr->duty = DUTY_PP;
6211         }
6212         if (cr->npmenodes == 0 ||
6213             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
6214         {
6215             cr->duty = DUTY_PME;
6216         }
6217
6218         /* Split the sim communicator into PP and PME only nodes */
6219         MPI_Comm_split(cr->mpi_comm_mysim,
6220                        cr->duty,
6221                        dd_index(comm->ntot, dd->ci),
6222                        &cr->mpi_comm_mygroup);
6223     }
6224     else
6225     {
6226         switch (dd_node_order)
6227         {
6228             case ddnoPP_PME:
6229                 if (fplog)
6230                 {
6231                     fprintf(fplog, "Order of the nodes: PP first, PME last\n");
6232                 }
6233                 break;
6234             case ddnoINTERLEAVE:
6235                 /* Interleave the PP-only and PME-only nodes,
6236                  * as on clusters with dual-core machines this will double
6237                  * the communication bandwidth of the PME processes
6238                  * and thus speed up the PP <-> PME and inter PME communication.
6239                  */
6240                 if (fplog)
6241                 {
6242                     fprintf(fplog, "Interleaving PP and PME nodes\n");
6243                 }
6244                 comm->pmenodes = dd_pmenodes(cr);
6245                 break;
6246             case ddnoCARTESIAN:
6247                 break;
6248             default:
6249                 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
6250         }
6251
6252         if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6253         {
6254             cr->duty = DUTY_PME;
6255         }
6256         else
6257         {
6258             cr->duty = DUTY_PP;
6259         }
6260
6261         /* Split the sim communicator into PP and PME only nodes */
6262         MPI_Comm_split(cr->mpi_comm_mysim,
6263                        cr->duty,
6264                        cr->nodeid,
6265                        &cr->mpi_comm_mygroup);
6266         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6267     }
6268 #endif
6269
6270     if (fplog)
6271     {
6272         fprintf(fplog, "This is a %s only node\n\n",
6273                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6274     }
6275 }
6276
6277 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6278 {
6279     gmx_domdec_t      *dd;
6280     gmx_domdec_comm_t *comm;
6281     int                CartReorder;
6282
6283     dd   = cr->dd;
6284     comm = dd->comm;
6285
6286     copy_ivec(dd->nc, comm->ntot);
6287
6288     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6289     comm->bCartesianPP_PME = FALSE;
6290
6291     /* Reorder the nodes by default. This might change the MPI ranks.
6292      * Real reordering is only supported on very few architectures,
6293      * Blue Gene is one of them.
6294      */
6295     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6296
6297     if (cr->npmenodes > 0)
6298     {
6299         /* Split the communicator into a PP and PME part */
6300         split_communicator(fplog, cr, dd_node_order, CartReorder);
6301         if (comm->bCartesianPP_PME)
6302         {
6303             /* We (possibly) reordered the nodes in split_communicator,
6304              * so it is no longer required in make_pp_communicator.
6305              */
6306             CartReorder = FALSE;
6307         }
6308     }
6309     else
6310     {
6311         /* All nodes do PP and PME */
6312 #ifdef GMX_MPI
6313         /* We do not require separate communicators */
6314         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6315 #endif
6316     }
6317
6318     if (cr->duty & DUTY_PP)
6319     {
6320         /* Copy or make a new PP communicator */
6321         make_pp_communicator(fplog, cr, CartReorder);
6322     }
6323     else
6324     {
6325         receive_ddindex2simnodeid(cr);
6326     }
6327
6328     if (!(cr->duty & DUTY_PME))
6329     {
6330         /* Set up the commnuication to our PME node */
6331         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6332         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6333         if (debug)
6334         {
6335             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6336                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6337         }
6338     }
6339     else
6340     {
6341         dd->pme_nodeid = -1;
6342     }
6343
6344     if (DDMASTER(dd))
6345     {
6346         dd->ma = init_gmx_domdec_master_t(dd,
6347                                           comm->cgs_gl.nr,
6348                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6349     }
6350 }
6351
6352 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6353 {
6354     real  *slb_frac, tot;
6355     int    i, n;
6356     double dbl;
6357
6358     slb_frac = NULL;
6359     if (nc > 1 && size_string != NULL)
6360     {
6361         if (fplog)
6362         {
6363             fprintf(fplog, "Using static load balancing for the %s direction\n",
6364                     dir);
6365         }
6366         snew(slb_frac, nc);
6367         tot = 0;
6368         for (i = 0; i < nc; i++)
6369         {
6370             dbl = 0;
6371             sscanf(size_string, "%lf%n", &dbl, &n);
6372             if (dbl == 0)
6373             {
6374                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6375             }
6376             slb_frac[i]  = dbl;
6377             size_string += n;
6378             tot         += slb_frac[i];
6379         }
6380         /* Normalize */
6381         if (fplog)
6382         {
6383             fprintf(fplog, "Relative cell sizes:");
6384         }
6385         for (i = 0; i < nc; i++)
6386         {
6387             slb_frac[i] /= tot;
6388             if (fplog)
6389             {
6390                 fprintf(fplog, " %5.3f", slb_frac[i]);
6391             }
6392         }
6393         if (fplog)
6394         {
6395             fprintf(fplog, "\n");
6396         }
6397     }
6398
6399     return slb_frac;
6400 }
6401
6402 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6403 {
6404     int                  n, nmol, ftype;
6405     gmx_mtop_ilistloop_t iloop;
6406     t_ilist             *il;
6407
6408     n     = 0;
6409     iloop = gmx_mtop_ilistloop_init(mtop);
6410     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6411     {
6412         for (ftype = 0; ftype < F_NRE; ftype++)
6413         {
6414             if ((interaction_function[ftype].flags & IF_BOND) &&
6415                 NRAL(ftype) >  2)
6416             {
6417                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6418             }
6419         }
6420     }
6421
6422     return n;
6423 }
6424
6425 static int dd_nst_env(FILE *fplog, const char *env_var, int def)
6426 {
6427     char *val;
6428     int   nst;
6429
6430     nst = def;
6431     val = getenv(env_var);
6432     if (val)
6433     {
6434         if (sscanf(val, "%d", &nst) <= 0)
6435         {
6436             nst = 1;
6437         }
6438         if (fplog)
6439         {
6440             fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6441                     env_var, val, nst);
6442         }
6443     }
6444
6445     return nst;
6446 }
6447
6448 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6449 {
6450     if (MASTER(cr))
6451     {
6452         fprintf(stderr, "\n%s\n", warn_string);
6453     }
6454     if (fplog)
6455     {
6456         fprintf(fplog, "\n%s\n", warn_string);
6457     }
6458 }
6459
6460 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6461                                   t_inputrec *ir, FILE *fplog)
6462 {
6463     if (ir->ePBC == epbcSCREW &&
6464         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6465     {
6466         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6467     }
6468
6469     if (ir->ns_type == ensSIMPLE)
6470     {
6471         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6472     }
6473
6474     if (ir->nstlist == 0)
6475     {
6476         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6477     }
6478
6479     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6480     {
6481         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6482     }
6483 }
6484
6485 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6486 {
6487     int  di, d;
6488     real r;
6489
6490     r = ddbox->box_size[XX];
6491     for (di = 0; di < dd->ndim; di++)
6492     {
6493         d = dd->dim[di];
6494         /* Check using the initial average cell size */
6495         r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6496     }
6497
6498     return r;
6499 }
6500
6501 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6502                              const char *dlb_opt, gmx_bool bRecordLoad,
6503                              unsigned long Flags, t_inputrec *ir)
6504 {
6505     gmx_domdec_t *dd;
6506     int           eDLB = -1;
6507     char          buf[STRLEN];
6508
6509     switch (dlb_opt[0])
6510     {
6511         case 'a': eDLB = edlbAUTO; break;
6512         case 'n': eDLB = edlbNO;   break;
6513         case 'y': eDLB = edlbYES;  break;
6514         default: gmx_incons("Unknown dlb_opt");
6515     }
6516
6517     if (Flags & MD_RERUN)
6518     {
6519         return edlbNO;
6520     }
6521
6522     if (!EI_DYNAMICS(ir->eI))
6523     {
6524         if (eDLB == edlbYES)
6525         {
6526             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6527             dd_warning(cr, fplog, buf);
6528         }
6529
6530         return edlbNO;
6531     }
6532
6533     if (!bRecordLoad)
6534     {
6535         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6536
6537         return edlbNO;
6538     }
6539
6540     if (Flags & MD_REPRODUCIBLE)
6541     {
6542         switch (eDLB)
6543         {
6544             case edlbNO:
6545                 break;
6546             case edlbAUTO:
6547                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6548                 eDLB = edlbNO;
6549                 break;
6550             case edlbYES:
6551                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6552                 break;
6553             default:
6554                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
6555                 break;
6556         }
6557     }
6558
6559     return eDLB;
6560 }
6561
6562 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6563 {
6564     int dim;
6565
6566     dd->ndim = 0;
6567     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6568     {
6569         /* Decomposition order z,y,x */
6570         if (fplog)
6571         {
6572             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6573         }
6574         for (dim = DIM-1; dim >= 0; dim--)
6575         {
6576             if (dd->nc[dim] > 1)
6577             {
6578                 dd->dim[dd->ndim++] = dim;
6579             }
6580         }
6581     }
6582     else
6583     {
6584         /* Decomposition order x,y,z */
6585         for (dim = 0; dim < DIM; dim++)
6586         {
6587             if (dd->nc[dim] > 1)
6588             {
6589                 dd->dim[dd->ndim++] = dim;
6590             }
6591         }
6592     }
6593 }
6594
6595 static gmx_domdec_comm_t *init_dd_comm()
6596 {
6597     gmx_domdec_comm_t *comm;
6598     int                i;
6599
6600     snew(comm, 1);
6601     snew(comm->cggl_flag, DIM*2);
6602     snew(comm->cgcm_state, DIM*2);
6603     for (i = 0; i < DIM*2; i++)
6604     {
6605         comm->cggl_flag_nalloc[i]  = 0;
6606         comm->cgcm_state_nalloc[i] = 0;
6607     }
6608
6609     comm->nalloc_int = 0;
6610     comm->buf_int    = NULL;
6611
6612     vec_rvec_init(&comm->vbuf);
6613
6614     comm->n_load_have    = 0;
6615     comm->n_load_collect = 0;
6616
6617     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6618     {
6619         comm->sum_nat[i] = 0;
6620     }
6621     comm->ndecomp   = 0;
6622     comm->nload     = 0;
6623     comm->load_step = 0;
6624     comm->load_sum  = 0;
6625     comm->load_max  = 0;
6626     clear_ivec(comm->load_lim);
6627     comm->load_mdf  = 0;
6628     comm->load_pme  = 0;
6629
6630     return comm;
6631 }
6632
6633 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6634                                         unsigned long Flags,
6635                                         ivec nc,
6636                                         real comm_distance_min, real rconstr,
6637                                         const char *dlb_opt, real dlb_scale,
6638                                         const char *sizex, const char *sizey, const char *sizez,
6639                                         gmx_mtop_t *mtop, t_inputrec *ir,
6640                                         matrix box, rvec *x,
6641                                         gmx_ddbox_t *ddbox,
6642                                         int *npme_x, int *npme_y)
6643 {
6644     gmx_domdec_t      *dd;
6645     gmx_domdec_comm_t *comm;
6646     int                recload;
6647     int                d, i, j;
6648     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6649     gmx_bool           bC;
6650     char               buf[STRLEN];
6651
6652     if (fplog)
6653     {
6654         fprintf(fplog,
6655                 "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
6656     }
6657
6658     snew(dd, 1);
6659
6660     dd->comm = init_dd_comm();
6661     comm     = dd->comm;
6662     snew(comm->cggl_flag, DIM*2);
6663     snew(comm->cgcm_state, DIM*2);
6664
6665     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6666     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6667
6668     dd->bSendRecv2      = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0);
6669     comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10);
6670     comm->eFlop         = dd_nst_env(fplog, "GMX_DLB_FLOP", 0);
6671     recload             = dd_nst_env(fplog, "GMX_DD_LOAD", 1);
6672     comm->nstSortCG     = dd_nst_env(fplog, "GMX_DD_SORT", 1);
6673     comm->nstDDDump     = dd_nst_env(fplog, "GMX_DD_DUMP", 0);
6674     comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0);
6675     comm->DD_debug      = dd_nst_env(fplog, "GMX_DD_DEBUG", 0);
6676
6677     dd->pme_recv_f_alloc = 0;
6678     dd->pme_recv_f_buf   = NULL;
6679
6680     if (dd->bSendRecv2 && fplog)
6681     {
6682         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6683     }
6684     if (comm->eFlop)
6685     {
6686         if (fplog)
6687         {
6688             fprintf(fplog, "Will load balance based on FLOP count\n");
6689         }
6690         if (comm->eFlop > 1)
6691         {
6692             srand(1+cr->nodeid);
6693         }
6694         comm->bRecordLoad = TRUE;
6695     }
6696     else
6697     {
6698         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6699
6700     }
6701
6702     /* Initialize to GPU share count to 0, might change later */
6703     comm->nrank_gpu_shared = 0;
6704
6705     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6706
6707     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6708     if (fplog)
6709     {
6710         fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
6711     }
6712     dd->bGridJump              = comm->bDynLoadBal;
6713     comm->bPMELoadBalDLBLimits = FALSE;
6714
6715     if (comm->nstSortCG)
6716     {
6717         if (fplog)
6718         {
6719             if (comm->nstSortCG == 1)
6720             {
6721                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6722             }
6723             else
6724             {
6725                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6726                         comm->nstSortCG);
6727             }
6728         }
6729         snew(comm->sort, 1);
6730     }
6731     else
6732     {
6733         if (fplog)
6734         {
6735             fprintf(fplog, "Will not sort the charge groups\n");
6736         }
6737     }
6738
6739     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6740
6741     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6742     if (comm->bInterCGBondeds)
6743     {
6744         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6745     }
6746     else
6747     {
6748         comm->bInterCGMultiBody = FALSE;
6749     }
6750
6751     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6752     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6753
6754     if (ir->rlistlong == 0)
6755     {
6756         /* Set the cut-off to some very large value,
6757          * so we don't need if statements everywhere in the code.
6758          * We use sqrt, since the cut-off is squared in some places.
6759          */
6760         comm->cutoff   = GMX_CUTOFF_INF;
6761     }
6762     else
6763     {
6764         comm->cutoff   = ir->rlistlong;
6765     }
6766     comm->cutoff_mbody = 0;
6767
6768     comm->cellsize_limit = 0;
6769     comm->bBondComm      = FALSE;
6770
6771     if (comm->bInterCGBondeds)
6772     {
6773         if (comm_distance_min > 0)
6774         {
6775             comm->cutoff_mbody = comm_distance_min;
6776             if (Flags & MD_DDBONDCOMM)
6777             {
6778                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6779             }
6780             else
6781             {
6782                 comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
6783             }
6784             r_bonded_limit = comm->cutoff_mbody;
6785         }
6786         else if (ir->bPeriodicMols)
6787         {
6788             /* Can not easily determine the required cut-off */
6789             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6790             comm->cutoff_mbody = comm->cutoff/2;
6791             r_bonded_limit     = comm->cutoff_mbody;
6792         }
6793         else
6794         {
6795             if (MASTER(cr))
6796             {
6797                 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6798                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6799             }
6800             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6801             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6802
6803             /* We use an initial margin of 10% for the minimum cell size,
6804              * except when we are just below the non-bonded cut-off.
6805              */
6806             if (Flags & MD_DDBONDCOMM)
6807             {
6808                 if (max(r_2b, r_mb) > comm->cutoff)
6809                 {
6810                     r_bonded        = max(r_2b, r_mb);
6811                     r_bonded_limit  = 1.1*r_bonded;
6812                     comm->bBondComm = TRUE;
6813                 }
6814                 else
6815                 {
6816                     r_bonded       = r_mb;
6817                     r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
6818                 }
6819                 /* We determine cutoff_mbody later */
6820             }
6821             else
6822             {
6823                 /* No special bonded communication,
6824                  * simply increase the DD cut-off.
6825                  */
6826                 r_bonded_limit     = 1.1*max(r_2b, r_mb);
6827                 comm->cutoff_mbody = r_bonded_limit;
6828                 comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
6829             }
6830         }
6831         comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
6832         if (fplog)
6833         {
6834             fprintf(fplog,
6835                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6836                     comm->cellsize_limit);
6837         }
6838     }
6839
6840     if (dd->bInterCGcons && rconstr <= 0)
6841     {
6842         /* There is a cell size limit due to the constraints (P-LINCS) */
6843         rconstr = constr_r_max(fplog, mtop, ir);
6844         if (fplog)
6845         {
6846             fprintf(fplog,
6847                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6848                     rconstr);
6849             if (rconstr > comm->cellsize_limit)
6850             {
6851                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6852             }
6853         }
6854     }
6855     else if (rconstr > 0 && fplog)
6856     {
6857         /* Here we do not check for dd->bInterCGcons,
6858          * because one can also set a cell size limit for virtual sites only
6859          * and at this point we don't know yet if there are intercg v-sites.
6860          */
6861         fprintf(fplog,
6862                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6863                 rconstr);
6864     }
6865     comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
6866
6867     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6868
6869     if (nc[XX] > 0)
6870     {
6871         copy_ivec(nc, dd->nc);
6872         set_dd_dim(fplog, dd);
6873         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6874
6875         if (cr->npmenodes == -1)
6876         {
6877             cr->npmenodes = 0;
6878         }
6879         acs = average_cellsize_min(dd, ddbox);
6880         if (acs < comm->cellsize_limit)
6881         {
6882             if (fplog)
6883             {
6884                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6885             }
6886             gmx_fatal_collective(FARGS, cr, NULL,
6887                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6888                                  acs, comm->cellsize_limit);
6889         }
6890     }
6891     else
6892     {
6893         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6894
6895         /* We need to choose the optimal DD grid and possibly PME nodes */
6896         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6897                                comm->eDLB != edlbNO, dlb_scale,
6898                                comm->cellsize_limit, comm->cutoff,
6899                                comm->bInterCGBondeds);
6900
6901         if (dd->nc[XX] == 0)
6902         {
6903             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6904             sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
6905                     !bC ? "-rdd" : "-rcon",
6906                     comm->eDLB != edlbNO ? " or -dds" : "",
6907                     bC ? " or your LINCS settings" : "");
6908
6909             gmx_fatal_collective(FARGS, cr, NULL,
6910                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6911                                  "%s\n"
6912                                  "Look in the log file for details on the domain decomposition",
6913                                  cr->nnodes-cr->npmenodes, limit, buf);
6914         }
6915         set_dd_dim(fplog, dd);
6916     }
6917
6918     if (fplog)
6919     {
6920         fprintf(fplog,
6921                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6922                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6923     }
6924
6925     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6926     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6927     {
6928         gmx_fatal_collective(FARGS, cr, NULL,
6929                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6930                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6931     }
6932     if (cr->npmenodes > dd->nnodes)
6933     {
6934         gmx_fatal_collective(FARGS, cr, NULL,
6935                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6936     }
6937     if (cr->npmenodes > 0)
6938     {
6939         comm->npmenodes = cr->npmenodes;
6940     }
6941     else
6942     {
6943         comm->npmenodes = dd->nnodes;
6944     }
6945
6946     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6947     {
6948         /* The following choices should match those
6949          * in comm_cost_est in domdec_setup.c.
6950          * Note that here the checks have to take into account
6951          * that the decomposition might occur in a different order than xyz
6952          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6953          * in which case they will not match those in comm_cost_est,
6954          * but since that is mainly for testing purposes that's fine.
6955          */
6956         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6957             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6958             getenv("GMX_PMEONEDD") == NULL)
6959         {
6960             comm->npmedecompdim = 2;
6961             comm->npmenodes_x   = dd->nc[XX];
6962             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6963         }
6964         else
6965         {
6966             /* In case nc is 1 in both x and y we could still choose to
6967              * decompose pme in y instead of x, but we use x for simplicity.
6968              */
6969             comm->npmedecompdim = 1;
6970             if (dd->dim[0] == YY)
6971             {
6972                 comm->npmenodes_x = 1;
6973                 comm->npmenodes_y = comm->npmenodes;
6974             }
6975             else
6976             {
6977                 comm->npmenodes_x = comm->npmenodes;
6978                 comm->npmenodes_y = 1;
6979             }
6980         }
6981         if (fplog)
6982         {
6983             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6984                     comm->npmenodes_x, comm->npmenodes_y, 1);
6985         }
6986     }
6987     else
6988     {
6989         comm->npmedecompdim = 0;
6990         comm->npmenodes_x   = 0;
6991         comm->npmenodes_y   = 0;
6992     }
6993
6994     /* Technically we don't need both of these,
6995      * but it simplifies code not having to recalculate it.
6996      */
6997     *npme_x = comm->npmenodes_x;
6998     *npme_y = comm->npmenodes_y;
6999
7000     snew(comm->slb_frac, DIM);
7001     if (comm->eDLB == edlbNO)
7002     {
7003         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
7004         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
7005         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
7006     }
7007
7008     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
7009     {
7010         if (comm->bBondComm || comm->eDLB != edlbNO)
7011         {
7012             /* Set the bonded communication distance to halfway
7013              * the minimum and the maximum,
7014              * since the extra communication cost is nearly zero.
7015              */
7016             acs                = average_cellsize_min(dd, ddbox);
7017             comm->cutoff_mbody = 0.5*(r_bonded + acs);
7018             if (comm->eDLB != edlbNO)
7019             {
7020                 /* Check if this does not limit the scaling */
7021                 comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
7022             }
7023             if (!comm->bBondComm)
7024             {
7025                 /* Without bBondComm do not go beyond the n.b. cut-off */
7026                 comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
7027                 if (comm->cellsize_limit >= comm->cutoff)
7028                 {
7029                     /* We don't loose a lot of efficieny
7030                      * when increasing it to the n.b. cut-off.
7031                      * It can even be slightly faster, because we need
7032                      * less checks for the communication setup.
7033                      */
7034                     comm->cutoff_mbody = comm->cutoff;
7035                 }
7036             }
7037             /* Check if we did not end up below our original limit */
7038             comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
7039
7040             if (comm->cutoff_mbody > comm->cellsize_limit)
7041             {
7042                 comm->cellsize_limit = comm->cutoff_mbody;
7043             }
7044         }
7045         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
7046     }
7047
7048     if (debug)
7049     {
7050         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
7051                 "cellsize limit %f\n",
7052                 comm->bBondComm, comm->cellsize_limit);
7053     }
7054
7055     if (MASTER(cr))
7056     {
7057         check_dd_restrictions(cr, dd, ir, fplog);
7058     }
7059
7060     comm->partition_step = INT_MIN;
7061     dd->ddp_count        = 0;
7062
7063     clear_dd_cycle_counts(dd);
7064
7065     return dd;
7066 }
7067
7068 static void set_dlb_limits(gmx_domdec_t *dd)
7069
7070 {
7071     int d;
7072
7073     for (d = 0; d < dd->ndim; d++)
7074     {
7075         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
7076         dd->comm->cellsize_min[dd->dim[d]] =
7077             dd->comm->cellsize_min_dlb[dd->dim[d]];
7078     }
7079 }
7080
7081
7082 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
7083 {
7084     gmx_domdec_t      *dd;
7085     gmx_domdec_comm_t *comm;
7086     real               cellsize_min;
7087     int                d, nc, i;
7088     char               buf[STRLEN];
7089
7090     dd   = cr->dd;
7091     comm = dd->comm;
7092
7093     if (fplog)
7094     {
7095         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
7096     }
7097
7098     cellsize_min = comm->cellsize_min[dd->dim[0]];
7099     for (d = 1; d < dd->ndim; d++)
7100     {
7101         cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
7102     }
7103
7104     if (cellsize_min < comm->cellsize_limit*1.05)
7105     {
7106         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
7107
7108         /* Change DLB from "auto" to "no". */
7109         comm->eDLB = edlbNO;
7110
7111         return;
7112     }
7113
7114     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
7115     comm->bDynLoadBal = TRUE;
7116     dd->bGridJump     = TRUE;
7117
7118     set_dlb_limits(dd);
7119
7120     /* We can set the required cell size info here,
7121      * so we do not need to communicate this.
7122      * The grid is completely uniform.
7123      */
7124     for (d = 0; d < dd->ndim; d++)
7125     {
7126         if (comm->root[d])
7127         {
7128             comm->load[d].sum_m = comm->load[d].sum;
7129
7130             nc = dd->nc[dd->dim[d]];
7131             for (i = 0; i < nc; i++)
7132             {
7133                 comm->root[d]->cell_f[i]    = i/(real)nc;
7134                 if (d > 0)
7135                 {
7136                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
7137                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
7138                 }
7139             }
7140             comm->root[d]->cell_f[nc] = 1.0;
7141         }
7142     }
7143 }
7144
7145 static char *init_bLocalCG(gmx_mtop_t *mtop)
7146 {
7147     int   ncg, cg;
7148     char *bLocalCG;
7149
7150     ncg = ncg_mtop(mtop);
7151     snew(bLocalCG, ncg);
7152     for (cg = 0; cg < ncg; cg++)
7153     {
7154         bLocalCG[cg] = FALSE;
7155     }
7156
7157     return bLocalCG;
7158 }
7159
7160 void dd_init_bondeds(FILE *fplog,
7161                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
7162                      gmx_vsite_t *vsite,
7163                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
7164 {
7165     gmx_domdec_comm_t *comm;
7166     gmx_bool           bBondComm;
7167     int                d;
7168
7169     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
7170
7171     comm = dd->comm;
7172
7173     if (comm->bBondComm)
7174     {
7175         /* Communicate atoms beyond the cut-off for bonded interactions */
7176         comm = dd->comm;
7177
7178         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
7179
7180         comm->bLocalCG = init_bLocalCG(mtop);
7181     }
7182     else
7183     {
7184         /* Only communicate atoms based on cut-off */
7185         comm->cglink   = NULL;
7186         comm->bLocalCG = NULL;
7187     }
7188 }
7189
7190 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
7191                               t_inputrec *ir,
7192                               gmx_bool bDynLoadBal, real dlb_scale,
7193                               gmx_ddbox_t *ddbox)
7194 {
7195     gmx_domdec_comm_t *comm;
7196     int                d;
7197     ivec               np;
7198     real               limit, shrink;
7199     char               buf[64];
7200
7201     if (fplog == NULL)
7202     {
7203         return;
7204     }
7205
7206     comm = dd->comm;
7207
7208     if (bDynLoadBal)
7209     {
7210         fprintf(fplog, "The maximum number of communication pulses is:");
7211         for (d = 0; d < dd->ndim; d++)
7212         {
7213             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
7214         }
7215         fprintf(fplog, "\n");
7216         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
7217         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
7218         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
7219         for (d = 0; d < DIM; d++)
7220         {
7221             if (dd->nc[d] > 1)
7222             {
7223                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7224                 {
7225                     shrink = 0;
7226                 }
7227                 else
7228                 {
7229                     shrink =
7230                         comm->cellsize_min_dlb[d]/
7231                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7232                 }
7233                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
7234             }
7235         }
7236         fprintf(fplog, "\n");
7237     }
7238     else
7239     {
7240         set_dd_cell_sizes_slb(dd, ddbox, FALSE, np);
7241         fprintf(fplog, "The initial number of communication pulses is:");
7242         for (d = 0; d < dd->ndim; d++)
7243         {
7244             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
7245         }
7246         fprintf(fplog, "\n");
7247         fprintf(fplog, "The initial domain decomposition cell size is:");
7248         for (d = 0; d < DIM; d++)
7249         {
7250             if (dd->nc[d] > 1)
7251             {
7252                 fprintf(fplog, " %c %.2f nm",
7253                         dim2char(d), dd->comm->cellsize_min[d]);
7254             }
7255         }
7256         fprintf(fplog, "\n\n");
7257     }
7258
7259     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7260     {
7261         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7262         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7263                 "non-bonded interactions", "", comm->cutoff);
7264
7265         if (bDynLoadBal)
7266         {
7267             limit = dd->comm->cellsize_limit;
7268         }
7269         else
7270         {
7271             if (dynamic_dd_box(ddbox, ir))
7272             {
7273                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7274             }
7275             limit = dd->comm->cellsize_min[XX];
7276             for (d = 1; d < DIM; d++)
7277             {
7278                 limit = min(limit, dd->comm->cellsize_min[d]);
7279             }
7280         }
7281
7282         if (comm->bInterCGBondeds)
7283         {
7284             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7285                     "two-body bonded interactions", "(-rdd)",
7286                     max(comm->cutoff, comm->cutoff_mbody));
7287             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7288                     "multi-body bonded interactions", "(-rdd)",
7289                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
7290         }
7291         if (dd->vsite_comm)
7292         {
7293             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7294                     "virtual site constructions", "(-rcon)", limit);
7295         }
7296         if (dd->constraint_comm)
7297         {
7298             sprintf(buf, "atoms separated by up to %d constraints",
7299                     1+ir->nProjOrder);
7300             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7301                     buf, "(-rcon)", limit);
7302         }
7303         fprintf(fplog, "\n");
7304     }
7305
7306     fflush(fplog);
7307 }
7308
7309 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7310                                 real               dlb_scale,
7311                                 const t_inputrec  *ir,
7312                                 const gmx_ddbox_t *ddbox)
7313 {
7314     gmx_domdec_comm_t *comm;
7315     int                d, dim, npulse, npulse_d_max, npulse_d;
7316     gmx_bool           bNoCutOff;
7317
7318     comm = dd->comm;
7319
7320     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7321
7322     /* Determine the maximum number of comm. pulses in one dimension */
7323
7324     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7325
7326     /* Determine the maximum required number of grid pulses */
7327     if (comm->cellsize_limit >= comm->cutoff)
7328     {
7329         /* Only a single pulse is required */
7330         npulse = 1;
7331     }
7332     else if (!bNoCutOff && comm->cellsize_limit > 0)
7333     {
7334         /* We round down slightly here to avoid overhead due to the latency
7335          * of extra communication calls when the cut-off
7336          * would be only slightly longer than the cell size.
7337          * Later cellsize_limit is redetermined,
7338          * so we can not miss interactions due to this rounding.
7339          */
7340         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7341     }
7342     else
7343     {
7344         /* There is no cell size limit */
7345         npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7346     }
7347
7348     if (!bNoCutOff && npulse > 1)
7349     {
7350         /* See if we can do with less pulses, based on dlb_scale */
7351         npulse_d_max = 0;
7352         for (d = 0; d < dd->ndim; d++)
7353         {
7354             dim      = dd->dim[d];
7355             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7356                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7357             npulse_d_max = max(npulse_d_max, npulse_d);
7358         }
7359         npulse = min(npulse, npulse_d_max);
7360     }
7361
7362     /* This env var can override npulse */
7363     d = dd_nst_env(debug, "GMX_DD_NPULSE", 0);
7364     if (d > 0)
7365     {
7366         npulse = d;
7367     }
7368
7369     comm->maxpulse       = 1;
7370     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7371     for (d = 0; d < dd->ndim; d++)
7372     {
7373         comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
7374         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7375         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7376         comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
7377         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7378         {
7379             comm->bVacDLBNoLimit = FALSE;
7380         }
7381     }
7382
7383     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7384     if (!comm->bVacDLBNoLimit)
7385     {
7386         comm->cellsize_limit = max(comm->cellsize_limit,
7387                                    comm->cutoff/comm->maxpulse);
7388     }
7389     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7390     /* Set the minimum cell size for each DD dimension */
7391     for (d = 0; d < dd->ndim; d++)
7392     {
7393         if (comm->bVacDLBNoLimit ||
7394             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7395         {
7396             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7397         }
7398         else
7399         {
7400             comm->cellsize_min_dlb[dd->dim[d]] =
7401                 comm->cutoff/comm->cd[d].np_dlb;
7402         }
7403     }
7404     if (comm->cutoff_mbody <= 0)
7405     {
7406         comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
7407     }
7408     if (comm->bDynLoadBal)
7409     {
7410         set_dlb_limits(dd);
7411     }
7412 }
7413
7414 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7415 {
7416     /* If each molecule is a single charge group
7417      * or we use domain decomposition for each periodic dimension,
7418      * we do not need to take pbc into account for the bonded interactions.
7419      */
7420     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7421             !(dd->nc[XX] > 1 &&
7422               dd->nc[YY] > 1 &&
7423               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7424 }
7425
7426 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7427                        t_inputrec *ir, gmx_ddbox_t *ddbox)
7428 {
7429     gmx_domdec_comm_t *comm;
7430     int                natoms_tot;
7431     real               vol_frac;
7432
7433     comm = dd->comm;
7434
7435     /* Initialize the thread data.
7436      * This can not be done in init_domain_decomposition,
7437      * as the numbers of threads is determined later.
7438      */
7439     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7440     if (comm->nth > 1)
7441     {
7442         snew(comm->dth, comm->nth);
7443     }
7444
7445     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7446     {
7447         init_ddpme(dd, &comm->ddpme[0], 0);
7448         if (comm->npmedecompdim >= 2)
7449         {
7450             init_ddpme(dd, &comm->ddpme[1], 1);
7451         }
7452     }
7453     else
7454     {
7455         comm->npmenodes = 0;
7456         if (dd->pme_nodeid >= 0)
7457         {
7458             gmx_fatal_collective(FARGS, NULL, dd,
7459                                  "Can not have separate PME nodes without PME electrostatics");
7460         }
7461     }
7462
7463     if (debug)
7464     {
7465         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7466     }
7467     if (comm->eDLB != edlbNO)
7468     {
7469         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7470     }
7471
7472     print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
7473     if (comm->eDLB == edlbAUTO)
7474     {
7475         if (fplog)
7476         {
7477             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7478         }
7479         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7480     }
7481
7482     if (ir->ePBC == epbcNONE)
7483     {
7484         vol_frac = 1 - 1/(double)dd->nnodes;
7485     }
7486     else
7487     {
7488         vol_frac =
7489             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7490     }
7491     if (debug)
7492     {
7493         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7494     }
7495     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7496
7497     dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
7498 }
7499
7500 static gmx_bool test_dd_cutoff(t_commrec *cr,
7501                                t_state *state, t_inputrec *ir,
7502                                real cutoff_req)
7503 {
7504     gmx_domdec_t *dd;
7505     gmx_ddbox_t   ddbox;
7506     int           d, dim, np;
7507     real          inv_cell_size;
7508     int           LocallyLimited;
7509
7510     dd = cr->dd;
7511
7512     set_ddbox(dd, FALSE, cr, ir, state->box,
7513               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7514
7515     LocallyLimited = 0;
7516
7517     for (d = 0; d < dd->ndim; d++)
7518     {
7519         dim = dd->dim[d];
7520
7521         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7522         if (dynamic_dd_box(&ddbox, ir))
7523         {
7524             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7525         }
7526
7527         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7528
7529         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7530             dd->comm->cd[d].np_dlb > 0)
7531         {
7532             if (np > dd->comm->cd[d].np_dlb)
7533             {
7534                 return FALSE;
7535             }
7536
7537             /* If a current local cell size is smaller than the requested
7538              * cut-off, we could still fix it, but this gets very complicated.
7539              * Without fixing here, we might actually need more checks.
7540              */
7541             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7542             {
7543                 LocallyLimited = 1;
7544             }
7545         }
7546     }
7547
7548     if (dd->comm->eDLB != edlbNO)
7549     {
7550         /* If DLB is not active yet, we don't need to check the grid jumps.
7551          * Actually we shouldn't, because then the grid jump data is not set.
7552          */
7553         if (dd->comm->bDynLoadBal &&
7554             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7555         {
7556             LocallyLimited = 1;
7557         }
7558
7559         gmx_sumi(1, &LocallyLimited, cr);
7560
7561         if (LocallyLimited > 0)
7562         {
7563             return FALSE;
7564         }
7565     }
7566
7567     return TRUE;
7568 }
7569
7570 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7571                           real cutoff_req)
7572 {
7573     gmx_bool bCutoffAllowed;
7574
7575     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7576
7577     if (bCutoffAllowed)
7578     {
7579         cr->dd->comm->cutoff = cutoff_req;
7580     }
7581
7582     return bCutoffAllowed;
7583 }
7584
7585 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7586 {
7587     gmx_domdec_comm_t *comm;
7588
7589     comm = cr->dd->comm;
7590
7591     /* Turn on the DLB limiting (might have been on already) */
7592     comm->bPMELoadBalDLBLimits = TRUE;
7593
7594     /* Change the cut-off limit */
7595     comm->PMELoadBal_max_cutoff = comm->cutoff;
7596 }
7597
7598 static void merge_cg_buffers(int ncell,
7599                              gmx_domdec_comm_dim_t *cd, int pulse,
7600                              int  *ncg_cell,
7601                              int  *index_gl, int  *recv_i,
7602                              rvec *cg_cm,    rvec *recv_vr,
7603                              int *cgindex,
7604                              cginfo_mb_t *cginfo_mb, int *cginfo)
7605 {
7606     gmx_domdec_ind_t *ind, *ind_p;
7607     int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
7608     int               shift, shift_at;
7609
7610     ind = &cd->ind[pulse];
7611
7612     /* First correct the already stored data */
7613     shift = ind->nrecv[ncell];
7614     for (cell = ncell-1; cell >= 0; cell--)
7615     {
7616         shift -= ind->nrecv[cell];
7617         if (shift > 0)
7618         {
7619             /* Move the cg's present from previous grid pulses */
7620             cg0                = ncg_cell[ncell+cell];
7621             cg1                = ncg_cell[ncell+cell+1];
7622             cgindex[cg1+shift] = cgindex[cg1];
7623             for (cg = cg1-1; cg >= cg0; cg--)
7624             {
7625                 index_gl[cg+shift] = index_gl[cg];
7626                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7627                 cgindex[cg+shift] = cgindex[cg];
7628                 cginfo[cg+shift]  = cginfo[cg];
7629             }
7630             /* Correct the already stored send indices for the shift */
7631             for (p = 1; p <= pulse; p++)
7632             {
7633                 ind_p = &cd->ind[p];
7634                 cg0   = 0;
7635                 for (c = 0; c < cell; c++)
7636                 {
7637                     cg0 += ind_p->nsend[c];
7638                 }
7639                 cg1 = cg0 + ind_p->nsend[cell];
7640                 for (cg = cg0; cg < cg1; cg++)
7641                 {
7642                     ind_p->index[cg] += shift;
7643                 }
7644             }
7645         }
7646     }
7647
7648     /* Merge in the communicated buffers */
7649     shift    = 0;
7650     shift_at = 0;
7651     cg0      = 0;
7652     for (cell = 0; cell < ncell; cell++)
7653     {
7654         cg1 = ncg_cell[ncell+cell+1] + shift;
7655         if (shift_at > 0)
7656         {
7657             /* Correct the old cg indices */
7658             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7659             {
7660                 cgindex[cg+1] += shift_at;
7661             }
7662         }
7663         for (cg = 0; cg < ind->nrecv[cell]; cg++)
7664         {
7665             /* Copy this charge group from the buffer */
7666             index_gl[cg1] = recv_i[cg0];
7667             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7668             /* Add it to the cgindex */
7669             cg_gl          = index_gl[cg1];
7670             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
7671             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
7672             cgindex[cg1+1] = cgindex[cg1] + nat;
7673             cg0++;
7674             cg1++;
7675             shift_at += nat;
7676         }
7677         shift                 += ind->nrecv[cell];
7678         ncg_cell[ncell+cell+1] = cg1;
7679     }
7680 }
7681
7682 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7683                                int nzone, int cg0, const int *cgindex)
7684 {
7685     int cg, zone, p;
7686
7687     /* Store the atom block boundaries for easy copying of communication buffers
7688      */
7689     cg = cg0;
7690     for (zone = 0; zone < nzone; zone++)
7691     {
7692         for (p = 0; p < cd->np; p++)
7693         {
7694             cd->ind[p].cell2at0[zone] = cgindex[cg];
7695             cg += cd->ind[p].nrecv[zone];
7696             cd->ind[p].cell2at1[zone] = cgindex[cg];
7697         }
7698     }
7699 }
7700
7701 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7702 {
7703     int      i;
7704     gmx_bool bMiss;
7705
7706     bMiss = FALSE;
7707     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7708     {
7709         if (!bLocalCG[link->a[i]])
7710         {
7711             bMiss = TRUE;
7712         }
7713     }
7714
7715     return bMiss;
7716 }
7717
7718 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7719 typedef struct {
7720     real c[DIM][4]; /* the corners for the non-bonded communication */
7721     real cr0;       /* corner for rounding */
7722     real cr1[4];    /* corners for rounding */
7723     real bc[DIM];   /* corners for bounded communication */
7724     real bcr1;      /* corner for rounding for bonded communication */
7725 } dd_corners_t;
7726
7727 /* Determine the corners of the domain(s) we are communicating with */
7728 static void
7729 set_dd_corners(const gmx_domdec_t *dd,
7730                int dim0, int dim1, int dim2,
7731                gmx_bool bDistMB,
7732                dd_corners_t *c)
7733 {
7734     const gmx_domdec_comm_t  *comm;
7735     const gmx_domdec_zones_t *zones;
7736     int i, j;
7737
7738     comm = dd->comm;
7739
7740     zones = &comm->zones;
7741
7742     /* Keep the compiler happy */
7743     c->cr0  = 0;
7744     c->bcr1 = 0;
7745
7746     /* The first dimension is equal for all cells */
7747     c->c[0][0] = comm->cell_x0[dim0];
7748     if (bDistMB)
7749     {
7750         c->bc[0] = c->c[0][0];
7751     }
7752     if (dd->ndim >= 2)
7753     {
7754         dim1 = dd->dim[1];
7755         /* This cell row is only seen from the first row */
7756         c->c[1][0] = comm->cell_x0[dim1];
7757         /* All rows can see this row */
7758         c->c[1][1] = comm->cell_x0[dim1];
7759         if (dd->bGridJump)
7760         {
7761             c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7762             if (bDistMB)
7763             {
7764                 /* For the multi-body distance we need the maximum */
7765                 c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7766             }
7767         }
7768         /* Set the upper-right corner for rounding */
7769         c->cr0 = comm->cell_x1[dim0];
7770
7771         if (dd->ndim >= 3)
7772         {
7773             dim2 = dd->dim[2];
7774             for (j = 0; j < 4; j++)
7775             {
7776                 c->c[2][j] = comm->cell_x0[dim2];
7777             }
7778             if (dd->bGridJump)
7779             {
7780                 /* Use the maximum of the i-cells that see a j-cell */
7781                 for (i = 0; i < zones->nizone; i++)
7782                 {
7783                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7784                     {
7785                         if (j >= 4)
7786                         {
7787                             c->c[2][j-4] =
7788                                 max(c->c[2][j-4],
7789                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7790                         }
7791                     }
7792                 }
7793                 if (bDistMB)
7794                 {
7795                     /* For the multi-body distance we need the maximum */
7796                     c->bc[2] = comm->cell_x0[dim2];
7797                     for (i = 0; i < 2; i++)
7798                     {
7799                         for (j = 0; j < 2; j++)
7800                         {
7801                             c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
7802                         }
7803                     }
7804                 }
7805             }
7806
7807             /* Set the upper-right corner for rounding */
7808             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7809              * Only cell (0,0,0) can see cell 7 (1,1,1)
7810              */
7811             c->cr1[0] = comm->cell_x1[dim1];
7812             c->cr1[3] = comm->cell_x1[dim1];
7813             if (dd->bGridJump)
7814             {
7815                 c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7816                 if (bDistMB)
7817                 {
7818                     /* For the multi-body distance we need the maximum */
7819                     c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7820                 }
7821             }
7822         }
7823     }
7824 }
7825
7826 /* Determine which cg's we need to send in this pulse from this zone */
7827 static void
7828 get_zone_pulse_cgs(gmx_domdec_t *dd,
7829                    int zonei, int zone,
7830                    int cg0, int cg1,
7831                    const int *index_gl,
7832                    const int *cgindex,
7833                    int dim, int dim_ind,
7834                    int dim0, int dim1, int dim2,
7835                    real r_comm2, real r_bcomm2,
7836                    matrix box,
7837                    ivec tric_dist,
7838                    rvec *normal,
7839                    real skew_fac2_d, real skew_fac_01,
7840                    rvec *v_d, rvec *v_0, rvec *v_1,
7841                    const dd_corners_t *c,
7842                    rvec sf2_round,
7843                    gmx_bool bDistBonded,
7844                    gmx_bool bBondComm,
7845                    gmx_bool bDist2B,
7846                    gmx_bool bDistMB,
7847                    rvec *cg_cm,
7848                    int *cginfo,
7849                    gmx_domdec_ind_t *ind,
7850                    int **ibuf, int *ibuf_nalloc,
7851                    vec_rvec_t *vbuf,
7852                    int *nsend_ptr,
7853                    int *nat_ptr,
7854                    int *nsend_z_ptr)
7855 {
7856     gmx_domdec_comm_t *comm;
7857     gmx_bool           bScrew;
7858     gmx_bool           bDistMB_pulse;
7859     int                cg, i;
7860     real               r2, rb2, r, tric_sh;
7861     rvec               rn, rb;
7862     int                dimd;
7863     int                nsend_z, nsend, nat;
7864
7865     comm = dd->comm;
7866
7867     bScrew = (dd->bScrewPBC && dim == XX);
7868
7869     bDistMB_pulse = (bDistMB && bDistBonded);
7870
7871     nsend_z = 0;
7872     nsend   = *nsend_ptr;
7873     nat     = *nat_ptr;
7874
7875     for (cg = cg0; cg < cg1; cg++)
7876     {
7877         r2  = 0;
7878         rb2 = 0;
7879         if (tric_dist[dim_ind] == 0)
7880         {
7881             /* Rectangular direction, easy */
7882             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7883             if (r > 0)
7884             {
7885                 r2 += r*r;
7886             }
7887             if (bDistMB_pulse)
7888             {
7889                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7890                 if (r > 0)
7891                 {
7892                     rb2 += r*r;
7893                 }
7894             }
7895             /* Rounding gives at most a 16% reduction
7896              * in communicated atoms
7897              */
7898             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7899             {
7900                 r = cg_cm[cg][dim0] - c->cr0;
7901                 /* This is the first dimension, so always r >= 0 */
7902                 r2 += r*r;
7903                 if (bDistMB_pulse)
7904                 {
7905                     rb2 += r*r;
7906                 }
7907             }
7908             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7909             {
7910                 r = cg_cm[cg][dim1] - c->cr1[zone];
7911                 if (r > 0)
7912                 {
7913                     r2 += r*r;
7914                 }
7915                 if (bDistMB_pulse)
7916                 {
7917                     r = cg_cm[cg][dim1] - c->bcr1;
7918                     if (r > 0)
7919                     {
7920                         rb2 += r*r;
7921                     }
7922                 }
7923             }
7924         }
7925         else
7926         {
7927             /* Triclinic direction, more complicated */
7928             clear_rvec(rn);
7929             clear_rvec(rb);
7930             /* Rounding, conservative as the skew_fac multiplication
7931              * will slightly underestimate the distance.
7932              */
7933             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7934             {
7935                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7936                 for (i = dim0+1; i < DIM; i++)
7937                 {
7938                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7939                 }
7940                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7941                 if (bDistMB_pulse)
7942                 {
7943                     rb[dim0] = rn[dim0];
7944                     rb2      = r2;
7945                 }
7946                 /* Take care that the cell planes along dim0 might not
7947                  * be orthogonal to those along dim1 and dim2.
7948                  */
7949                 for (i = 1; i <= dim_ind; i++)
7950                 {
7951                     dimd = dd->dim[i];
7952                     if (normal[dim0][dimd] > 0)
7953                     {
7954                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7955                         if (bDistMB_pulse)
7956                         {
7957                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7958                         }
7959                     }
7960                 }
7961             }
7962             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7963             {
7964                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7965                 tric_sh   = 0;
7966                 for (i = dim1+1; i < DIM; i++)
7967                 {
7968                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7969                 }
7970                 rn[dim1] += tric_sh;
7971                 if (rn[dim1] > 0)
7972                 {
7973                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7974                     /* Take care of coupling of the distances
7975                      * to the planes along dim0 and dim1 through dim2.
7976                      */
7977                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7978                     /* Take care that the cell planes along dim1
7979                      * might not be orthogonal to that along dim2.
7980                      */
7981                     if (normal[dim1][dim2] > 0)
7982                     {
7983                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7984                     }
7985                 }
7986                 if (bDistMB_pulse)
7987                 {
7988                     rb[dim1] +=
7989                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7990                     if (rb[dim1] > 0)
7991                     {
7992                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7993                         /* Take care of coupling of the distances
7994                          * to the planes along dim0 and dim1 through dim2.
7995                          */
7996                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7997                         /* Take care that the cell planes along dim1
7998                          * might not be orthogonal to that along dim2.
7999                          */
8000                         if (normal[dim1][dim2] > 0)
8001                         {
8002                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
8003                         }
8004                     }
8005                 }
8006             }
8007             /* The distance along the communication direction */
8008             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
8009             tric_sh  = 0;
8010             for (i = dim+1; i < DIM; i++)
8011             {
8012                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
8013             }
8014             rn[dim] += tric_sh;
8015             if (rn[dim] > 0)
8016             {
8017                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
8018                 /* Take care of coupling of the distances
8019                  * to the planes along dim0 and dim1 through dim2.
8020                  */
8021                 if (dim_ind == 1 && zonei == 1)
8022                 {
8023                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
8024                 }
8025             }
8026             if (bDistMB_pulse)
8027             {
8028                 clear_rvec(rb);
8029                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
8030                 if (rb[dim] > 0)
8031                 {
8032                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
8033                     /* Take care of coupling of the distances
8034                      * to the planes along dim0 and dim1 through dim2.
8035                      */
8036                     if (dim_ind == 1 && zonei == 1)
8037                     {
8038                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
8039                     }
8040                 }
8041             }
8042         }
8043
8044         if (r2 < r_comm2 ||
8045             (bDistBonded &&
8046              ((bDistMB && rb2 < r_bcomm2) ||
8047               (bDist2B && r2  < r_bcomm2)) &&
8048              (!bBondComm ||
8049               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
8050                missing_link(comm->cglink, index_gl[cg],
8051                             comm->bLocalCG)))))
8052         {
8053             /* Make an index to the local charge groups */
8054             if (nsend+1 > ind->nalloc)
8055             {
8056                 ind->nalloc = over_alloc_large(nsend+1);
8057                 srenew(ind->index, ind->nalloc);
8058             }
8059             if (nsend+1 > *ibuf_nalloc)
8060             {
8061                 *ibuf_nalloc = over_alloc_large(nsend+1);
8062                 srenew(*ibuf, *ibuf_nalloc);
8063             }
8064             ind->index[nsend] = cg;
8065             (*ibuf)[nsend]    = index_gl[cg];
8066             nsend_z++;
8067             vec_rvec_check_alloc(vbuf, nsend+1);
8068
8069             if (dd->ci[dim] == 0)
8070             {
8071                 /* Correct cg_cm for pbc */
8072                 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
8073                 if (bScrew)
8074                 {
8075                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
8076                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
8077                 }
8078             }
8079             else
8080             {
8081                 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
8082             }
8083             nsend++;
8084             nat += cgindex[cg+1] - cgindex[cg];
8085         }
8086     }
8087
8088     *nsend_ptr   = nsend;
8089     *nat_ptr     = nat;
8090     *nsend_z_ptr = nsend_z;
8091 }
8092
8093 static void setup_dd_communication(gmx_domdec_t *dd,
8094                                    matrix box, gmx_ddbox_t *ddbox,
8095                                    t_forcerec *fr, t_state *state, rvec **f)
8096 {
8097     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
8098     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
8099     int                    c, i, j, cg, cg_gl, nrcg;
8100     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
8101     gmx_domdec_comm_t     *comm;
8102     gmx_domdec_zones_t    *zones;
8103     gmx_domdec_comm_dim_t *cd;
8104     gmx_domdec_ind_t      *ind;
8105     cginfo_mb_t           *cginfo_mb;
8106     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
8107     real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
8108     dd_corners_t           corners;
8109     ivec                   tric_dist;
8110     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
8111     real                   skew_fac2_d, skew_fac_01;
8112     rvec                   sf2_round;
8113     int                    nsend, nat;
8114     int                    th;
8115
8116     if (debug)
8117     {
8118         fprintf(debug, "Setting up DD communication\n");
8119     }
8120
8121     comm  = dd->comm;
8122
8123     switch (fr->cutoff_scheme)
8124     {
8125         case ecutsGROUP:
8126             cg_cm = fr->cg_cm;
8127             break;
8128         case ecutsVERLET:
8129             cg_cm = state->x;
8130             break;
8131         default:
8132             gmx_incons("unimplemented");
8133             cg_cm = NULL;
8134     }
8135
8136     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8137     {
8138         dim = dd->dim[dim_ind];
8139
8140         /* Check if we need to use triclinic distances */
8141         tric_dist[dim_ind] = 0;
8142         for (i = 0; i <= dim_ind; i++)
8143         {
8144             if (ddbox->tric_dir[dd->dim[i]])
8145             {
8146                 tric_dist[dim_ind] = 1;
8147             }
8148         }
8149     }
8150
8151     bBondComm = comm->bBondComm;
8152
8153     /* Do we need to determine extra distances for multi-body bondeds? */
8154     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8155
8156     /* Do we need to determine extra distances for only two-body bondeds? */
8157     bDist2B = (bBondComm && !bDistMB);
8158
8159     r_comm2  = sqr(comm->cutoff);
8160     r_bcomm2 = sqr(comm->cutoff_mbody);
8161
8162     if (debug)
8163     {
8164         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
8165     }
8166
8167     zones = &comm->zones;
8168
8169     dim0 = dd->dim[0];
8170     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8171     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8172
8173     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8174
8175     /* Triclinic stuff */
8176     normal      = ddbox->normal;
8177     skew_fac_01 = 0;
8178     if (dd->ndim >= 2)
8179     {
8180         v_0 = ddbox->v[dim0];
8181         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8182         {
8183             /* Determine the coupling coefficient for the distances
8184              * to the cell planes along dim0 and dim1 through dim2.
8185              * This is required for correct rounding.
8186              */
8187             skew_fac_01 =
8188                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8189             if (debug)
8190             {
8191                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8192             }
8193         }
8194     }
8195     if (dd->ndim >= 3)
8196     {
8197         v_1 = ddbox->v[dim1];
8198     }
8199
8200     zone_cg_range = zones->cg_range;
8201     index_gl      = dd->index_gl;
8202     cgindex       = dd->cgindex;
8203     cginfo_mb     = fr->cginfo_mb;
8204
8205     zone_cg_range[0]   = 0;
8206     zone_cg_range[1]   = dd->ncg_home;
8207     comm->zone_ncg1[0] = dd->ncg_home;
8208     pos_cg             = dd->ncg_home;
8209
8210     nat_tot = dd->nat_home;
8211     nzone   = 1;
8212     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8213     {
8214         dim = dd->dim[dim_ind];
8215         cd  = &comm->cd[dim_ind];
8216
8217         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8218         {
8219             /* No pbc in this dimension, the first node should not comm. */
8220             nzone_send = 0;
8221         }
8222         else
8223         {
8224             nzone_send = nzone;
8225         }
8226
8227         v_d         = ddbox->v[dim];
8228         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8229
8230         cd->bInPlace = TRUE;
8231         for (p = 0; p < cd->np; p++)
8232         {
8233             /* Only atoms communicated in the first pulse are used
8234              * for multi-body bonded interactions or for bBondComm.
8235              */
8236             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8237
8238             ind   = &cd->ind[p];
8239             nsend = 0;
8240             nat   = 0;
8241             for (zone = 0; zone < nzone_send; zone++)
8242             {
8243                 if (tric_dist[dim_ind] && dim_ind > 0)
8244                 {
8245                     /* Determine slightly more optimized skew_fac's
8246                      * for rounding.
8247                      * This reduces the number of communicated atoms
8248                      * by about 10% for 3D DD of rhombic dodecahedra.
8249                      */
8250                     for (dimd = 0; dimd < dim; dimd++)
8251                     {
8252                         sf2_round[dimd] = 1;
8253                         if (ddbox->tric_dir[dimd])
8254                         {
8255                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8256                             {
8257                                 /* If we are shifted in dimension i
8258                                  * and the cell plane is tilted forward
8259                                  * in dimension i, skip this coupling.
8260                                  */
8261                                 if (!(zones->shift[nzone+zone][i] &&
8262                                       ddbox->v[dimd][i][dimd] >= 0))
8263                                 {
8264                                     sf2_round[dimd] +=
8265                                         sqr(ddbox->v[dimd][i][dimd]);
8266                                 }
8267                             }
8268                             sf2_round[dimd] = 1/sf2_round[dimd];
8269                         }
8270                     }
8271                 }
8272
8273                 zonei = zone_perm[dim_ind][zone];
8274                 if (p == 0)
8275                 {
8276                     /* Here we permutate the zones to obtain a convenient order
8277                      * for neighbor searching
8278                      */
8279                     cg0 = zone_cg_range[zonei];
8280                     cg1 = zone_cg_range[zonei+1];
8281                 }
8282                 else
8283                 {
8284                     /* Look only at the cg's received in the previous grid pulse
8285                      */
8286                     cg1 = zone_cg_range[nzone+zone+1];
8287                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8288                 }
8289
8290 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8291                 for (th = 0; th < comm->nth; th++)
8292                 {
8293                     gmx_domdec_ind_t *ind_p;
8294                     int             **ibuf_p, *ibuf_nalloc_p;
8295                     vec_rvec_t       *vbuf_p;
8296                     int              *nsend_p, *nat_p;
8297                     int              *nsend_zone_p;
8298                     int               cg0_th, cg1_th;
8299
8300                     if (th == 0)
8301                     {
8302                         /* Thread 0 writes in the comm buffers */
8303                         ind_p         = ind;
8304                         ibuf_p        = &comm->buf_int;
8305                         ibuf_nalloc_p = &comm->nalloc_int;
8306                         vbuf_p        = &comm->vbuf;
8307                         nsend_p       = &nsend;
8308                         nat_p         = &nat;
8309                         nsend_zone_p  = &ind->nsend[zone];
8310                     }
8311                     else
8312                     {
8313                         /* Other threads write into temp buffers */
8314                         ind_p         = &comm->dth[th].ind;
8315                         ibuf_p        = &comm->dth[th].ibuf;
8316                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8317                         vbuf_p        = &comm->dth[th].vbuf;
8318                         nsend_p       = &comm->dth[th].nsend;
8319                         nat_p         = &comm->dth[th].nat;
8320                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8321
8322                         comm->dth[th].nsend      = 0;
8323                         comm->dth[th].nat        = 0;
8324                         comm->dth[th].nsend_zone = 0;
8325                     }
8326
8327                     if (comm->nth == 1)
8328                     {
8329                         cg0_th = cg0;
8330                         cg1_th = cg1;
8331                     }
8332                     else
8333                     {
8334                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8335                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8336                     }
8337
8338                     /* Get the cg's for this pulse in this zone */
8339                     get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8340                                        index_gl, cgindex,
8341                                        dim, dim_ind, dim0, dim1, dim2,
8342                                        r_comm2, r_bcomm2,
8343                                        box, tric_dist,
8344                                        normal, skew_fac2_d, skew_fac_01,
8345                                        v_d, v_0, v_1, &corners, sf2_round,
8346                                        bDistBonded, bBondComm,
8347                                        bDist2B, bDistMB,
8348                                        cg_cm, fr->cginfo,
8349                                        ind_p,
8350                                        ibuf_p, ibuf_nalloc_p,
8351                                        vbuf_p,
8352                                        nsend_p, nat_p,
8353                                        nsend_zone_p);
8354                 }
8355
8356                 /* Append data of threads>=1 to the communication buffers */
8357                 for (th = 1; th < comm->nth; th++)
8358                 {
8359                     dd_comm_setup_work_t *dth;
8360                     int                   i, ns1;
8361
8362                     dth = &comm->dth[th];
8363
8364                     ns1 = nsend + dth->nsend_zone;
8365                     if (ns1 > ind->nalloc)
8366                     {
8367                         ind->nalloc = over_alloc_dd(ns1);
8368                         srenew(ind->index, ind->nalloc);
8369                     }
8370                     if (ns1 > comm->nalloc_int)
8371                     {
8372                         comm->nalloc_int = over_alloc_dd(ns1);
8373                         srenew(comm->buf_int, comm->nalloc_int);
8374                     }
8375                     if (ns1 > comm->vbuf.nalloc)
8376                     {
8377                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8378                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8379                     }
8380
8381                     for (i = 0; i < dth->nsend_zone; i++)
8382                     {
8383                         ind->index[nsend]    = dth->ind.index[i];
8384                         comm->buf_int[nsend] = dth->ibuf[i];
8385                         copy_rvec(dth->vbuf.v[i],
8386                                   comm->vbuf.v[nsend]);
8387                         nsend++;
8388                     }
8389                     nat              += dth->nat;
8390                     ind->nsend[zone] += dth->nsend_zone;
8391                 }
8392             }
8393             /* Clear the counts in case we do not have pbc */
8394             for (zone = nzone_send; zone < nzone; zone++)
8395             {
8396                 ind->nsend[zone] = 0;
8397             }
8398             ind->nsend[nzone]   = nsend;
8399             ind->nsend[nzone+1] = nat;
8400             /* Communicate the number of cg's and atoms to receive */
8401             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8402                             ind->nsend, nzone+2,
8403                             ind->nrecv, nzone+2);
8404
8405             /* The rvec buffer is also required for atom buffers of size nsend
8406              * in dd_move_x and dd_move_f.
8407              */
8408             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8409
8410             if (p > 0)
8411             {
8412                 /* We can receive in place if only the last zone is not empty */
8413                 for (zone = 0; zone < nzone-1; zone++)
8414                 {
8415                     if (ind->nrecv[zone] > 0)
8416                     {
8417                         cd->bInPlace = FALSE;
8418                     }
8419                 }
8420                 if (!cd->bInPlace)
8421                 {
8422                     /* The int buffer is only required here for the cg indices */
8423                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8424                     {
8425                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8426                         srenew(comm->buf_int2, comm->nalloc_int2);
8427                     }
8428                     /* The rvec buffer is also required for atom buffers
8429                      * of size nrecv in dd_move_x and dd_move_f.
8430                      */
8431                     i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8432                     vec_rvec_check_alloc(&comm->vbuf2, i);
8433                 }
8434             }
8435
8436             /* Make space for the global cg indices */
8437             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8438                 || dd->cg_nalloc == 0)
8439             {
8440                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8441                 srenew(index_gl, dd->cg_nalloc);
8442                 srenew(cgindex, dd->cg_nalloc+1);
8443             }
8444             /* Communicate the global cg indices */
8445             if (cd->bInPlace)
8446             {
8447                 recv_i = index_gl + pos_cg;
8448             }
8449             else
8450             {
8451                 recv_i = comm->buf_int2;
8452             }
8453             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8454                             comm->buf_int, nsend,
8455                             recv_i,        ind->nrecv[nzone]);
8456
8457             /* Make space for cg_cm */
8458             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8459             if (fr->cutoff_scheme == ecutsGROUP)
8460             {
8461                 cg_cm = fr->cg_cm;
8462             }
8463             else
8464             {
8465                 cg_cm = state->x;
8466             }
8467             /* Communicate cg_cm */
8468             if (cd->bInPlace)
8469             {
8470                 recv_vr = cg_cm + pos_cg;
8471             }
8472             else
8473             {
8474                 recv_vr = comm->vbuf2.v;
8475             }
8476             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8477                              comm->vbuf.v, nsend,
8478                              recv_vr,      ind->nrecv[nzone]);
8479
8480             /* Make the charge group index */
8481             if (cd->bInPlace)
8482             {
8483                 zone = (p == 0 ? 0 : nzone - 1);
8484                 while (zone < nzone)
8485                 {
8486                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8487                     {
8488                         cg_gl              = index_gl[pos_cg];
8489                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8490                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8491                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8492                         if (bBondComm)
8493                         {
8494                             /* Update the charge group presence,
8495                              * so we can use it in the next pass of the loop.
8496                              */
8497                             comm->bLocalCG[cg_gl] = TRUE;
8498                         }
8499                         pos_cg++;
8500                     }
8501                     if (p == 0)
8502                     {
8503                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8504                     }
8505                     zone++;
8506                     zone_cg_range[nzone+zone] = pos_cg;
8507                 }
8508             }
8509             else
8510             {
8511                 /* This part of the code is never executed with bBondComm. */
8512                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8513                                  index_gl, recv_i, cg_cm, recv_vr,
8514                                  cgindex, fr->cginfo_mb, fr->cginfo);
8515                 pos_cg += ind->nrecv[nzone];
8516             }
8517             nat_tot += ind->nrecv[nzone+1];
8518         }
8519         if (!cd->bInPlace)
8520         {
8521             /* Store the atom block for easy copying of communication buffers */
8522             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8523         }
8524         nzone += nzone;
8525     }
8526     dd->index_gl = index_gl;
8527     dd->cgindex  = cgindex;
8528
8529     dd->ncg_tot          = zone_cg_range[zones->n];
8530     dd->nat_tot          = nat_tot;
8531     comm->nat[ddnatHOME] = dd->nat_home;
8532     for (i = ddnatZONE; i < ddnatNR; i++)
8533     {
8534         comm->nat[i] = dd->nat_tot;
8535     }
8536
8537     if (!bBondComm)
8538     {
8539         /* We don't need to update cginfo, since that was alrady done above.
8540          * So we pass NULL for the forcerec.
8541          */
8542         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8543                       NULL, comm->bLocalCG);
8544     }
8545
8546     if (debug)
8547     {
8548         fprintf(debug, "Finished setting up DD communication, zones:");
8549         for (c = 0; c < zones->n; c++)
8550         {
8551             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8552         }
8553         fprintf(debug, "\n");
8554     }
8555 }
8556
8557 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8558 {
8559     int c;
8560
8561     for (c = 0; c < zones->nizone; c++)
8562     {
8563         zones->izone[c].cg1  = zones->cg_range[c+1];
8564         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8565         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8566     }
8567 }
8568
8569 static void set_zones_size(gmx_domdec_t *dd,
8570                            matrix box, const gmx_ddbox_t *ddbox,
8571                            int zone_start, int zone_end)
8572 {
8573     gmx_domdec_comm_t  *comm;
8574     gmx_domdec_zones_t *zones;
8575     gmx_bool            bDistMB;
8576     int                 z, zi, zj0, zj1, d, dim;
8577     real                rcs, rcmbs;
8578     int                 i, j;
8579     real                size_j, add_tric;
8580     real                vol;
8581
8582     comm = dd->comm;
8583
8584     zones = &comm->zones;
8585
8586     /* Do we need to determine extra distances for multi-body bondeds? */
8587     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8588
8589     for (z = zone_start; z < zone_end; z++)
8590     {
8591         /* Copy cell limits to zone limits.
8592          * Valid for non-DD dims and non-shifted dims.
8593          */
8594         copy_rvec(comm->cell_x0, zones->size[z].x0);
8595         copy_rvec(comm->cell_x1, zones->size[z].x1);
8596     }
8597
8598     for (d = 0; d < dd->ndim; d++)
8599     {
8600         dim = dd->dim[d];
8601
8602         for (z = 0; z < zones->n; z++)
8603         {
8604             /* With a staggered grid we have different sizes
8605              * for non-shifted dimensions.
8606              */
8607             if (dd->bGridJump && zones->shift[z][dim] == 0)
8608             {
8609                 if (d == 1)
8610                 {
8611                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8612                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8613                 }
8614                 else if (d == 2)
8615                 {
8616                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8617                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8618                 }
8619             }
8620         }
8621
8622         rcs   = comm->cutoff;
8623         rcmbs = comm->cutoff_mbody;
8624         if (ddbox->tric_dir[dim])
8625         {
8626             rcs   /= ddbox->skew_fac[dim];
8627             rcmbs /= ddbox->skew_fac[dim];
8628         }
8629
8630         /* Set the lower limit for the shifted zone dimensions */
8631         for (z = zone_start; z < zone_end; z++)
8632         {
8633             if (zones->shift[z][dim] > 0)
8634             {
8635                 dim = dd->dim[d];
8636                 if (!dd->bGridJump || d == 0)
8637                 {
8638                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8639                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8640                 }
8641                 else
8642                 {
8643                     /* Here we take the lower limit of the zone from
8644                      * the lowest domain of the zone below.
8645                      */
8646                     if (z < 4)
8647                     {
8648                         zones->size[z].x0[dim] =
8649                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8650                     }
8651                     else
8652                     {
8653                         if (d == 1)
8654                         {
8655                             zones->size[z].x0[dim] =
8656                                 zones->size[zone_perm[2][z-4]].x0[dim];
8657                         }
8658                         else
8659                         {
8660                             zones->size[z].x0[dim] =
8661                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8662                         }
8663                     }
8664                     /* A temporary limit, is updated below */
8665                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8666
8667                     if (bDistMB)
8668                     {
8669                         for (zi = 0; zi < zones->nizone; zi++)
8670                         {
8671                             if (zones->shift[zi][dim] == 0)
8672                             {
8673                                 /* This takes the whole zone into account.
8674                                  * With multiple pulses this will lead
8675                                  * to a larger zone then strictly necessary.
8676                                  */
8677                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8678                                                              zones->size[zi].x1[dim]+rcmbs);
8679                             }
8680                         }
8681                     }
8682                 }
8683             }
8684         }
8685
8686         /* Loop over the i-zones to set the upper limit of each
8687          * j-zone they see.
8688          */
8689         for (zi = 0; zi < zones->nizone; zi++)
8690         {
8691             if (zones->shift[zi][dim] == 0)
8692             {
8693                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8694                 {
8695                     if (zones->shift[z][dim] > 0)
8696                     {
8697                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8698                                                      zones->size[zi].x1[dim]+rcs);
8699                     }
8700                 }
8701             }
8702         }
8703     }
8704
8705     for (z = zone_start; z < zone_end; z++)
8706     {
8707         /* Initialization only required to keep the compiler happy */
8708         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8709         int  nc, c;
8710
8711         /* To determine the bounding box for a zone we need to find
8712          * the extreme corners of 4, 2 or 1 corners.
8713          */
8714         nc = 1 << (ddbox->npbcdim - 1);
8715
8716         for (c = 0; c < nc; c++)
8717         {
8718             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8719             corner[XX] = 0;
8720             if ((c & 1) == 0)
8721             {
8722                 corner[YY] = zones->size[z].x0[YY];
8723             }
8724             else
8725             {
8726                 corner[YY] = zones->size[z].x1[YY];
8727             }
8728             if ((c & 2) == 0)
8729             {
8730                 corner[ZZ] = zones->size[z].x0[ZZ];
8731             }
8732             else
8733             {
8734                 corner[ZZ] = zones->size[z].x1[ZZ];
8735             }
8736             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8737             {
8738                 /* With 1D domain decomposition the cg's are not in
8739                  * the triclinic box, but triclinic x-y and rectangular y-z.
8740                  * Shift y back, so it will later end up at 0.
8741                  */
8742                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8743             }
8744             /* Apply the triclinic couplings */
8745             for (i = YY; i < ddbox->npbcdim; i++)
8746             {
8747                 for (j = XX; j < i; j++)
8748                 {
8749                     corner[j] += corner[i]*box[i][j]/box[i][i];
8750                 }
8751             }
8752             if (c == 0)
8753             {
8754                 copy_rvec(corner, corner_min);
8755                 copy_rvec(corner, corner_max);
8756             }
8757             else
8758             {
8759                 for (i = 0; i < DIM; i++)
8760                 {
8761                     corner_min[i] = min(corner_min[i], corner[i]);
8762                     corner_max[i] = max(corner_max[i], corner[i]);
8763                 }
8764             }
8765         }
8766         /* Copy the extreme cornes without offset along x */
8767         for (i = 0; i < DIM; i++)
8768         {
8769             zones->size[z].bb_x0[i] = corner_min[i];
8770             zones->size[z].bb_x1[i] = corner_max[i];
8771         }
8772         /* Add the offset along x */
8773         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8774         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8775     }
8776
8777     if (zone_start == 0)
8778     {
8779         vol = 1;
8780         for (dim = 0; dim < DIM; dim++)
8781         {
8782             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8783         }
8784         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8785     }
8786
8787     if (debug)
8788     {
8789         for (z = zone_start; z < zone_end; z++)
8790         {
8791             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8792                     z,
8793                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8794                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8795                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8796             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8797                     z,
8798                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8799                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8800                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8801         }
8802     }
8803 }
8804
8805 static int comp_cgsort(const void *a, const void *b)
8806 {
8807     int           comp;
8808
8809     gmx_cgsort_t *cga, *cgb;
8810     cga = (gmx_cgsort_t *)a;
8811     cgb = (gmx_cgsort_t *)b;
8812
8813     comp = cga->nsc - cgb->nsc;
8814     if (comp == 0)
8815     {
8816         comp = cga->ind_gl - cgb->ind_gl;
8817     }
8818
8819     return comp;
8820 }
8821
8822 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8823                          int *a, int *buf)
8824 {
8825     int i;
8826
8827     /* Order the data */
8828     for (i = 0; i < n; i++)
8829     {
8830         buf[i] = a[sort[i].ind];
8831     }
8832
8833     /* Copy back to the original array */
8834     for (i = 0; i < n; i++)
8835     {
8836         a[i] = buf[i];
8837     }
8838 }
8839
8840 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8841                          rvec *v, rvec *buf)
8842 {
8843     int i;
8844
8845     /* Order the data */
8846     for (i = 0; i < n; i++)
8847     {
8848         copy_rvec(v[sort[i].ind], buf[i]);
8849     }
8850
8851     /* Copy back to the original array */
8852     for (i = 0; i < n; i++)
8853     {
8854         copy_rvec(buf[i], v[i]);
8855     }
8856 }
8857
8858 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8859                            rvec *v, rvec *buf)
8860 {
8861     int a, atot, cg, cg0, cg1, i;
8862
8863     if (cgindex == NULL)
8864     {
8865         /* Avoid the useless loop of the atoms within a cg */
8866         order_vec_cg(ncg, sort, v, buf);
8867
8868         return;
8869     }
8870
8871     /* Order the data */
8872     a = 0;
8873     for (cg = 0; cg < ncg; cg++)
8874     {
8875         cg0 = cgindex[sort[cg].ind];
8876         cg1 = cgindex[sort[cg].ind+1];
8877         for (i = cg0; i < cg1; i++)
8878         {
8879             copy_rvec(v[i], buf[a]);
8880             a++;
8881         }
8882     }
8883     atot = a;
8884
8885     /* Copy back to the original array */
8886     for (a = 0; a < atot; a++)
8887     {
8888         copy_rvec(buf[a], v[a]);
8889     }
8890 }
8891
8892 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8893                          int nsort_new, gmx_cgsort_t *sort_new,
8894                          gmx_cgsort_t *sort1)
8895 {
8896     int i1, i2, i_new;
8897
8898     /* The new indices are not very ordered, so we qsort them */
8899     gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8900
8901     /* sort2 is already ordered, so now we can merge the two arrays */
8902     i1    = 0;
8903     i2    = 0;
8904     i_new = 0;
8905     while (i2 < nsort2 || i_new < nsort_new)
8906     {
8907         if (i2 == nsort2)
8908         {
8909             sort1[i1++] = sort_new[i_new++];
8910         }
8911         else if (i_new == nsort_new)
8912         {
8913             sort1[i1++] = sort2[i2++];
8914         }
8915         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8916                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8917                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8918         {
8919             sort1[i1++] = sort2[i2++];
8920         }
8921         else
8922         {
8923             sort1[i1++] = sort_new[i_new++];
8924         }
8925     }
8926 }
8927
8928 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8929 {
8930     gmx_domdec_sort_t *sort;
8931     gmx_cgsort_t      *cgsort, *sort_i;
8932     int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
8933     int                sort_last, sort_skip;
8934
8935     sort = dd->comm->sort;
8936
8937     a = fr->ns.grid->cell_index;
8938
8939     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8940
8941     if (ncg_home_old >= 0)
8942     {
8943         /* The charge groups that remained in the same ns grid cell
8944          * are completely ordered. So we can sort efficiently by sorting
8945          * the charge groups that did move into the stationary list.
8946          */
8947         ncg_new   = 0;
8948         nsort2    = 0;
8949         nsort_new = 0;
8950         for (i = 0; i < dd->ncg_home; i++)
8951         {
8952             /* Check if this cg did not move to another node */
8953             if (a[i] < moved)
8954             {
8955                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8956                 {
8957                     /* This cg is new on this node or moved ns grid cell */
8958                     if (nsort_new >= sort->sort_new_nalloc)
8959                     {
8960                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8961                         srenew(sort->sort_new, sort->sort_new_nalloc);
8962                     }
8963                     sort_i = &(sort->sort_new[nsort_new++]);
8964                 }
8965                 else
8966                 {
8967                     /* This cg did not move */
8968                     sort_i = &(sort->sort2[nsort2++]);
8969                 }
8970                 /* Sort on the ns grid cell indices
8971                  * and the global topology index.
8972                  * index_gl is irrelevant with cell ns,
8973                  * but we set it here anyhow to avoid a conditional.
8974                  */
8975                 sort_i->nsc    = a[i];
8976                 sort_i->ind_gl = dd->index_gl[i];
8977                 sort_i->ind    = i;
8978                 ncg_new++;
8979             }
8980         }
8981         if (debug)
8982         {
8983             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8984                     nsort2, nsort_new);
8985         }
8986         /* Sort efficiently */
8987         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8988                      sort->sort);
8989     }
8990     else
8991     {
8992         cgsort  = sort->sort;
8993         ncg_new = 0;
8994         for (i = 0; i < dd->ncg_home; i++)
8995         {
8996             /* Sort on the ns grid cell indices
8997              * and the global topology index
8998              */
8999             cgsort[i].nsc    = a[i];
9000             cgsort[i].ind_gl = dd->index_gl[i];
9001             cgsort[i].ind    = i;
9002             if (cgsort[i].nsc < moved)
9003             {
9004                 ncg_new++;
9005             }
9006         }
9007         if (debug)
9008         {
9009             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
9010         }
9011         /* Determine the order of the charge groups using qsort */
9012         gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
9013     }
9014
9015     return ncg_new;
9016 }
9017
9018 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
9019 {
9020     gmx_cgsort_t *sort;
9021     int           ncg_new, i, *a, na;
9022
9023     sort = dd->comm->sort->sort;
9024
9025     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
9026
9027     ncg_new = 0;
9028     for (i = 0; i < na; i++)
9029     {
9030         if (a[i] >= 0)
9031         {
9032             sort[ncg_new].ind = a[i];
9033             ncg_new++;
9034         }
9035     }
9036
9037     return ncg_new;
9038 }
9039
9040 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
9041                           int ncg_home_old)
9042 {
9043     gmx_domdec_sort_t *sort;
9044     gmx_cgsort_t      *cgsort, *sort_i;
9045     int               *cgindex;
9046     int                ncg_new, i, *ibuf, cgsize;
9047     rvec              *vbuf;
9048
9049     sort = dd->comm->sort;
9050
9051     if (dd->ncg_home > sort->sort_nalloc)
9052     {
9053         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
9054         srenew(sort->sort, sort->sort_nalloc);
9055         srenew(sort->sort2, sort->sort_nalloc);
9056     }
9057     cgsort = sort->sort;
9058
9059     switch (fr->cutoff_scheme)
9060     {
9061         case ecutsGROUP:
9062             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
9063             break;
9064         case ecutsVERLET:
9065             ncg_new = dd_sort_order_nbnxn(dd, fr);
9066             break;
9067         default:
9068             gmx_incons("unimplemented");
9069             ncg_new = 0;
9070     }
9071
9072     /* We alloc with the old size, since cgindex is still old */
9073     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
9074     vbuf = dd->comm->vbuf.v;
9075
9076     if (dd->comm->bCGs)
9077     {
9078         cgindex = dd->cgindex;
9079     }
9080     else
9081     {
9082         cgindex = NULL;
9083     }
9084
9085     /* Remove the charge groups which are no longer at home here */
9086     dd->ncg_home = ncg_new;
9087     if (debug)
9088     {
9089         fprintf(debug, "Set the new home charge group count to %d\n",
9090                 dd->ncg_home);
9091     }
9092
9093     /* Reorder the state */
9094     for (i = 0; i < estNR; i++)
9095     {
9096         if (EST_DISTR(i) && (state->flags & (1<<i)))
9097         {
9098             switch (i)
9099             {
9100                 case estX:
9101                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
9102                     break;
9103                 case estV:
9104                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
9105                     break;
9106                 case estSDX:
9107                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
9108                     break;
9109                 case estCGP:
9110                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
9111                     break;
9112                 case estLD_RNG:
9113                 case estLD_RNGI:
9114                 case estDISRE_INITF:
9115                 case estDISRE_RM3TAV:
9116                 case estORIRE_INITF:
9117                 case estORIRE_DTAV:
9118                     /* No ordering required */
9119                     break;
9120                 default:
9121                     gmx_incons("Unknown state entry encountered in dd_sort_state");
9122                     break;
9123             }
9124         }
9125     }
9126     if (fr->cutoff_scheme == ecutsGROUP)
9127     {
9128         /* Reorder cgcm */
9129         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9130     }
9131
9132     if (dd->ncg_home+1 > sort->ibuf_nalloc)
9133     {
9134         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9135         srenew(sort->ibuf, sort->ibuf_nalloc);
9136     }
9137     ibuf = sort->ibuf;
9138     /* Reorder the global cg index */
9139     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9140     /* Reorder the cginfo */
9141     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9142     /* Rebuild the local cg index */
9143     if (dd->comm->bCGs)
9144     {
9145         ibuf[0] = 0;
9146         for (i = 0; i < dd->ncg_home; i++)
9147         {
9148             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9149             ibuf[i+1] = ibuf[i] + cgsize;
9150         }
9151         for (i = 0; i < dd->ncg_home+1; i++)
9152         {
9153             dd->cgindex[i] = ibuf[i];
9154         }
9155     }
9156     else
9157     {
9158         for (i = 0; i < dd->ncg_home+1; i++)
9159         {
9160             dd->cgindex[i] = i;
9161         }
9162     }
9163     /* Set the home atom number */
9164     dd->nat_home = dd->cgindex[dd->ncg_home];
9165
9166     if (fr->cutoff_scheme == ecutsVERLET)
9167     {
9168         /* The atoms are now exactly in grid order, update the grid order */
9169         nbnxn_set_atomorder(fr->nbv->nbs);
9170     }
9171     else
9172     {
9173         /* Copy the sorted ns cell indices back to the ns grid struct */
9174         for (i = 0; i < dd->ncg_home; i++)
9175         {
9176             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9177         }
9178         fr->ns.grid->nr = dd->ncg_home;
9179     }
9180 }
9181
9182 static void add_dd_statistics(gmx_domdec_t *dd)
9183 {
9184     gmx_domdec_comm_t *comm;
9185     int                ddnat;
9186
9187     comm = dd->comm;
9188
9189     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9190     {
9191         comm->sum_nat[ddnat-ddnatZONE] +=
9192             comm->nat[ddnat] - comm->nat[ddnat-1];
9193     }
9194     comm->ndecomp++;
9195 }
9196
9197 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9198 {
9199     gmx_domdec_comm_t *comm;
9200     int                ddnat;
9201
9202     comm = dd->comm;
9203
9204     /* Reset all the statistics and counters for total run counting */
9205     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9206     {
9207         comm->sum_nat[ddnat-ddnatZONE] = 0;
9208     }
9209     comm->ndecomp   = 0;
9210     comm->nload     = 0;
9211     comm->load_step = 0;
9212     comm->load_sum  = 0;
9213     comm->load_max  = 0;
9214     clear_ivec(comm->load_lim);
9215     comm->load_mdf = 0;
9216     comm->load_pme = 0;
9217 }
9218
9219 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9220 {
9221     gmx_domdec_comm_t *comm;
9222     int                ddnat;
9223     double             av;
9224
9225     comm = cr->dd->comm;
9226
9227     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9228
9229     if (fplog == NULL)
9230     {
9231         return;
9232     }
9233
9234     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9235
9236     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9237     {
9238         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9239         switch (ddnat)
9240         {
9241             case ddnatZONE:
9242                 fprintf(fplog,
9243                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9244                         2, av);
9245                 break;
9246             case ddnatVSITE:
9247                 if (cr->dd->vsite_comm)
9248                 {
9249                     fprintf(fplog,
9250                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9251                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9252                             av);
9253                 }
9254                 break;
9255             case ddnatCON:
9256                 if (cr->dd->constraint_comm)
9257                 {
9258                     fprintf(fplog,
9259                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9260                             1 + ir->nLincsIter, av);
9261                 }
9262                 break;
9263             default:
9264                 gmx_incons(" Unknown type for DD statistics");
9265         }
9266     }
9267     fprintf(fplog, "\n");
9268
9269     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9270     {
9271         print_dd_load_av(fplog, cr->dd);
9272     }
9273 }
9274
9275 void dd_partition_system(FILE                *fplog,
9276                          gmx_int64_t          step,
9277                          t_commrec           *cr,
9278                          gmx_bool             bMasterState,
9279                          int                  nstglobalcomm,
9280                          t_state             *state_global,
9281                          gmx_mtop_t          *top_global,
9282                          t_inputrec          *ir,
9283                          t_state             *state_local,
9284                          rvec               **f,
9285                          t_mdatoms           *mdatoms,
9286                          gmx_localtop_t      *top_local,
9287                          t_forcerec          *fr,
9288                          gmx_vsite_t         *vsite,
9289                          gmx_shellfc_t        shellfc,
9290                          gmx_constr_t         constr,
9291                          t_nrnb              *nrnb,
9292                          gmx_wallcycle_t      wcycle,
9293                          gmx_bool             bVerbose)
9294 {
9295     gmx_domdec_t      *dd;
9296     gmx_domdec_comm_t *comm;
9297     gmx_ddbox_t        ddbox = {0};
9298     t_block           *cgs_gl;
9299     gmx_int64_t        step_pcoupl;
9300     rvec               cell_ns_x0, cell_ns_x1;
9301     int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9302     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
9303     gmx_bool           bRedist, bSortCG, bResortAll;
9304     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9305     real               grid_density;
9306     char               sbuf[22];
9307
9308     dd   = cr->dd;
9309     comm = dd->comm;
9310
9311     bBoxChanged = (bMasterState || DEFORM(*ir));
9312     if (ir->epc != epcNO)
9313     {
9314         /* With nstpcouple > 1 pressure coupling happens.
9315          * one step after calculating the pressure.
9316          * Box scaling happens at the end of the MD step,
9317          * after the DD partitioning.
9318          * We therefore have to do DLB in the first partitioning
9319          * after an MD step where P-coupling occured.
9320          * We need to determine the last step in which p-coupling occurred.
9321          * MRS -- need to validate this for vv?
9322          */
9323         n = ir->nstpcouple;
9324         if (n == 1)
9325         {
9326             step_pcoupl = step - 1;
9327         }
9328         else
9329         {
9330             step_pcoupl = ((step - 1)/n)*n + 1;
9331         }
9332         if (step_pcoupl >= comm->partition_step)
9333         {
9334             bBoxChanged = TRUE;
9335         }
9336     }
9337
9338     bNStGlobalComm = (step % nstglobalcomm == 0);
9339
9340     if (!comm->bDynLoadBal)
9341     {
9342         bDoDLB = FALSE;
9343     }
9344     else
9345     {
9346         /* Should we do dynamic load balacing this step?
9347          * Since it requires (possibly expensive) global communication,
9348          * we might want to do DLB less frequently.
9349          */
9350         if (bBoxChanged || ir->epc != epcNO)
9351         {
9352             bDoDLB = bBoxChanged;
9353         }
9354         else
9355         {
9356             bDoDLB = bNStGlobalComm;
9357         }
9358     }
9359
9360     /* Check if we have recorded loads on the nodes */
9361     if (comm->bRecordLoad && dd_load_count(comm))
9362     {
9363         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9364         {
9365             /* Check if we should use DLB at the second partitioning
9366              * and every 100 partitionings,
9367              * so the extra communication cost is negligible.
9368              */
9369             n         = max(100, nstglobalcomm);
9370             bCheckDLB = (comm->n_load_collect == 0 ||
9371                          comm->n_load_have % n == n-1);
9372         }
9373         else
9374         {
9375             bCheckDLB = FALSE;
9376         }
9377
9378         /* Print load every nstlog, first and last step to the log file */
9379         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9380                     comm->n_load_collect == 0 ||
9381                     (ir->nsteps >= 0 &&
9382                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9383
9384         /* Avoid extra communication due to verbose screen output
9385          * when nstglobalcomm is set.
9386          */
9387         if (bDoDLB || bLogLoad || bCheckDLB ||
9388             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9389         {
9390             get_load_distribution(dd, wcycle);
9391             if (DDMASTER(dd))
9392             {
9393                 if (bLogLoad)
9394                 {
9395                     dd_print_load(fplog, dd, step-1);
9396                 }
9397                 if (bVerbose)
9398                 {
9399                     dd_print_load_verbose(dd);
9400                 }
9401             }
9402             comm->n_load_collect++;
9403
9404             if (bCheckDLB)
9405             {
9406                 /* Since the timings are node dependent, the master decides */
9407                 if (DDMASTER(dd))
9408                 {
9409                     bTurnOnDLB =
9410                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9411                     if (debug)
9412                     {
9413                         fprintf(debug, "step %s, imb loss %f\n",
9414                                 gmx_step_str(step, sbuf),
9415                                 dd_force_imb_perf_loss(dd));
9416                     }
9417                 }
9418                 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
9419                 if (bTurnOnDLB)
9420                 {
9421                     turn_on_dlb(fplog, cr, step);
9422                     bDoDLB = TRUE;
9423                 }
9424             }
9425         }
9426         comm->n_load_have++;
9427     }
9428
9429     cgs_gl = &comm->cgs_gl;
9430
9431     bRedist = FALSE;
9432     if (bMasterState)
9433     {
9434         /* Clear the old state */
9435         clear_dd_indices(dd, 0, 0);
9436         ncgindex_set = 0;
9437
9438         set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9439                   TRUE, cgs_gl, state_global->x, &ddbox);
9440
9441         get_cg_distribution(fplog, step, dd, cgs_gl,
9442                             state_global->box, &ddbox, state_global->x);
9443
9444         dd_distribute_state(dd, cgs_gl,
9445                             state_global, state_local, f);
9446
9447         dd_make_local_cgs(dd, &top_local->cgs);
9448
9449         /* Ensure that we have space for the new distribution */
9450         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9451
9452         if (fr->cutoff_scheme == ecutsGROUP)
9453         {
9454             calc_cgcm(fplog, 0, dd->ncg_home,
9455                       &top_local->cgs, state_local->x, fr->cg_cm);
9456         }
9457
9458         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9459
9460         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9461     }
9462     else if (state_local->ddp_count != dd->ddp_count)
9463     {
9464         if (state_local->ddp_count > dd->ddp_count)
9465         {
9466             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9467         }
9468
9469         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9470         {
9471             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9472         }
9473
9474         /* Clear the old state */
9475         clear_dd_indices(dd, 0, 0);
9476
9477         /* Build the new indices */
9478         rebuild_cgindex(dd, cgs_gl->index, state_local);
9479         make_dd_indices(dd, cgs_gl->index, 0);
9480         ncgindex_set = dd->ncg_home;
9481
9482         if (fr->cutoff_scheme == ecutsGROUP)
9483         {
9484             /* Redetermine the cg COMs */
9485             calc_cgcm(fplog, 0, dd->ncg_home,
9486                       &top_local->cgs, state_local->x, fr->cg_cm);
9487         }
9488
9489         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9490
9491         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9492
9493         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9494                   TRUE, &top_local->cgs, state_local->x, &ddbox);
9495
9496         bRedist = comm->bDynLoadBal;
9497     }
9498     else
9499     {
9500         /* We have the full state, only redistribute the cgs */
9501
9502         /* Clear the non-home indices */
9503         clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9504         ncgindex_set = 0;
9505
9506         /* Avoid global communication for dim's without pbc and -gcom */
9507         if (!bNStGlobalComm)
9508         {
9509             copy_rvec(comm->box0, ddbox.box0    );
9510             copy_rvec(comm->box_size, ddbox.box_size);
9511         }
9512         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9513                   bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
9514
9515         bBoxChanged = TRUE;
9516         bRedist     = TRUE;
9517     }
9518     /* For dim's without pbc and -gcom */
9519     copy_rvec(ddbox.box0, comm->box0    );
9520     copy_rvec(ddbox.box_size, comm->box_size);
9521
9522     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9523                       step, wcycle);
9524
9525     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9526     {
9527         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9528     }
9529
9530     /* Check if we should sort the charge groups */
9531     if (comm->nstSortCG > 0)
9532     {
9533         bSortCG = (bMasterState ||
9534                    (bRedist && (step % comm->nstSortCG == 0)));
9535     }
9536     else
9537     {
9538         bSortCG = FALSE;
9539     }
9540
9541     ncg_home_old = dd->ncg_home;
9542
9543     ncg_moved = 0;
9544     if (bRedist)
9545     {
9546         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9547
9548         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9549                            state_local, f, fr,
9550                            !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9551
9552         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9553     }
9554
9555     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9556                           dd, &ddbox,
9557                           &comm->cell_x0, &comm->cell_x1,
9558                           dd->ncg_home, fr->cg_cm,
9559                           cell_ns_x0, cell_ns_x1, &grid_density);
9560
9561     if (bBoxChanged)
9562     {
9563         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9564     }
9565
9566     switch (fr->cutoff_scheme)
9567     {
9568         case ecutsGROUP:
9569             copy_ivec(fr->ns.grid->n, ncells_old);
9570             grid_first(fplog, fr->ns.grid, dd, &ddbox,
9571                        state_local->box, cell_ns_x0, cell_ns_x1,
9572                        fr->rlistlong, grid_density);
9573             break;
9574         case ecutsVERLET:
9575             nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9576             break;
9577         default:
9578             gmx_incons("unimplemented");
9579     }
9580     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9581     copy_ivec(ddbox.tric_dir, comm->tric_dir);
9582
9583     if (bSortCG)
9584     {
9585         wallcycle_sub_start(wcycle, ewcsDD_GRID);
9586
9587         /* Sort the state on charge group position.
9588          * This enables exact restarts from this step.
9589          * It also improves performance by about 15% with larger numbers
9590          * of atoms per node.
9591          */
9592
9593         /* Fill the ns grid with the home cell,
9594          * so we can sort with the indices.
9595          */
9596         set_zones_ncg_home(dd);
9597
9598         switch (fr->cutoff_scheme)
9599         {
9600             case ecutsVERLET:
9601                 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9602
9603                 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9604                                   0,
9605                                   comm->zones.size[0].bb_x0,
9606                                   comm->zones.size[0].bb_x1,
9607                                   0, dd->ncg_home,
9608                                   comm->zones.dens_zone0,
9609                                   fr->cginfo,
9610                                   state_local->x,
9611                                   ncg_moved, bRedist ? comm->moved : NULL,
9612                                   fr->nbv->grp[eintLocal].kernel_type,
9613                                   fr->nbv->grp[eintLocal].nbat);
9614
9615                 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9616                 break;
9617             case ecutsGROUP:
9618                 fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
9619                           0, dd->ncg_home, fr->cg_cm);
9620
9621                 copy_ivec(fr->ns.grid->n, ncells_new);
9622                 break;
9623             default:
9624                 gmx_incons("unimplemented");
9625         }
9626
9627         bResortAll = bMasterState;
9628
9629         /* Check if we can user the old order and ns grid cell indices
9630          * of the charge groups to sort the charge groups efficiently.
9631          */
9632         if (ncells_new[XX] != ncells_old[XX] ||
9633             ncells_new[YY] != ncells_old[YY] ||
9634             ncells_new[ZZ] != ncells_old[ZZ])
9635         {
9636             bResortAll = TRUE;
9637         }
9638
9639         if (debug)
9640         {
9641             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9642                     gmx_step_str(step, sbuf), dd->ncg_home);
9643         }
9644         dd_sort_state(dd, fr->cg_cm, fr, state_local,
9645                       bResortAll ? -1 : ncg_home_old);
9646         /* Rebuild all the indices */
9647         ga2la_clear(dd->ga2la);
9648         ncgindex_set = 0;
9649
9650         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9651     }
9652
9653     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9654
9655     /* Setup up the communication and communicate the coordinates */
9656     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9657
9658     /* Set the indices */
9659     make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9660
9661     /* Set the charge group boundaries for neighbor searching */
9662     set_cg_boundaries(&comm->zones);
9663
9664     if (fr->cutoff_scheme == ecutsVERLET)
9665     {
9666         set_zones_size(dd, state_local->box, &ddbox,
9667                        bSortCG ? 1 : 0, comm->zones.n);
9668     }
9669
9670     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9671
9672     /*
9673        write_dd_pdb("dd_home",step,"dump",top_global,cr,
9674                  -1,state_local->x,state_local->box);
9675      */
9676
9677     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9678
9679     /* Extract a local topology from the global topology */
9680     for (i = 0; i < dd->ndim; i++)
9681     {
9682         np[dd->dim[i]] = comm->cd[i].np;
9683     }
9684     dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9685                       comm->cellsize_min, np,
9686                       fr,
9687                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
9688                       vsite, top_global, top_local);
9689
9690     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9691
9692     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9693
9694     /* Set up the special atom communication */
9695     n = comm->nat[ddnatZONE];
9696     for (i = ddnatZONE+1; i < ddnatNR; i++)
9697     {
9698         switch (i)
9699         {
9700             case ddnatVSITE:
9701                 if (vsite && vsite->n_intercg_vsite)
9702                 {
9703                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
9704                 }
9705                 break;
9706             case ddnatCON:
9707                 if (dd->bInterCGcons || dd->bInterCGsettles)
9708                 {
9709                     /* Only for inter-cg constraints we need special code */
9710                     n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9711                                                   constr, ir->nProjOrder,
9712                                                   top_local->idef.il);
9713                 }
9714                 break;
9715             default:
9716                 gmx_incons("Unknown special atom type setup");
9717         }
9718         comm->nat[i] = n;
9719     }
9720
9721     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9722
9723     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9724
9725     /* Make space for the extra coordinates for virtual site
9726      * or constraint communication.
9727      */
9728     state_local->natoms = comm->nat[ddnatNR-1];
9729     if (state_local->natoms > state_local->nalloc)
9730     {
9731         dd_realloc_state(state_local, f, state_local->natoms);
9732     }
9733
9734     if (fr->bF_NoVirSum)
9735     {
9736         if (vsite && vsite->n_intercg_vsite)
9737         {
9738             nat_f_novirsum = comm->nat[ddnatVSITE];
9739         }
9740         else
9741         {
9742             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9743             {
9744                 nat_f_novirsum = dd->nat_tot;
9745             }
9746             else
9747             {
9748                 nat_f_novirsum = dd->nat_home;
9749             }
9750         }
9751     }
9752     else
9753     {
9754         nat_f_novirsum = 0;
9755     }
9756
9757     /* Set the number of atoms required for the force calculation.
9758      * Forces need to be constrained when using a twin-range setup
9759      * or with energy minimization. For simple simulations we could
9760      * avoid some allocation, zeroing and copying, but this is
9761      * probably not worth the complications ande checking.
9762      */
9763     forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9764                         dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9765
9766     /* We make the all mdatoms up to nat_tot_con.
9767      * We could save some work by only setting invmass
9768      * between nat_tot and nat_tot_con.
9769      */
9770     /* This call also sets the new number of home particles to dd->nat_home */
9771     atoms2md(top_global, ir,
9772              comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms);
9773
9774     /* Now we have the charges we can sort the FE interactions */
9775     dd_sort_local_top(dd, mdatoms, top_local);
9776
9777     if (vsite != NULL)
9778     {
9779         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9780         split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
9781     }
9782
9783     if (shellfc)
9784     {
9785         /* Make the local shell stuff, currently no communication is done */
9786         make_local_shells(cr, mdatoms, shellfc);
9787     }
9788
9789     if (ir->implicit_solvent)
9790     {
9791         make_local_gb(cr, fr->born, ir->gb_algorithm);
9792     }
9793
9794     setup_bonded_threading(fr, &top_local->idef);
9795
9796     if (!(cr->duty & DUTY_PME))
9797     {
9798         /* Send the charges and/or c6/sigmas to our PME only node */
9799         gmx_pme_send_parameters(cr, mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
9800                                 mdatoms->chargeA, mdatoms->chargeB,
9801                                 mdatoms->c6A, mdatoms->c6B,
9802                                 mdatoms->sigmaA, mdatoms->sigmaB,
9803                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9804     }
9805
9806     if (constr)
9807     {
9808         set_constraints(constr, top_local, ir, mdatoms, cr);
9809     }
9810
9811     if (ir->ePull != epullNO)
9812     {
9813         /* Update the local pull groups */
9814         dd_make_local_pull_groups(dd, ir->pull, mdatoms);
9815     }
9816
9817     if (ir->bRot)
9818     {
9819         /* Update the local rotation groups */
9820         dd_make_local_rotation_groups(dd, ir->rot);
9821     }
9822
9823
9824     add_dd_statistics(dd);
9825
9826     /* Make sure we only count the cycles for this DD partitioning */
9827     clear_dd_cycle_counts(dd);
9828
9829     /* Because the order of the atoms might have changed since
9830      * the last vsite construction, we need to communicate the constructing
9831      * atom coordinates again (for spreading the forces this MD step).
9832      */
9833     dd_move_x_vsites(dd, state_local->box, state_local->x);
9834
9835     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9836
9837     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9838     {
9839         dd_move_x(dd, state_local->box, state_local->x);
9840         write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9841                      -1, state_local->x, state_local->box);
9842     }
9843
9844     /* Store the partitioning step */
9845     comm->partition_step = step;
9846
9847     /* Increase the DD partitioning counter */
9848     dd->ddp_count++;
9849     /* The state currently matches this DD partitioning count, store it */
9850     state_local->ddp_count = dd->ddp_count;
9851     if (bMasterState)
9852     {
9853         /* The DD master node knows the complete cg distribution,
9854          * store the count so we can possibly skip the cg info communication.
9855          */
9856         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9857     }
9858
9859     if (comm->DD_debug > 0)
9860     {
9861         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9862         check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9863                                 "after partitioning");
9864     }
9865 }