src/mdlib/domdec.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2008
   5  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   6  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   7  * others, as listed in the AUTHORS file in the top-level source
   8  * directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 #ifdef HAVE_CONFIG_H
  38 #include <config.h>
  39 #endif
  40
  41 #include <stdio.h>
  42 #include <time.h>
  43 #include <math.h>
  44 #include <string.h>
  45 #include <stdlib.h>
  46 #include "typedefs.h"
  47 #include "smalloc.h"
  48 #include "gmx_fatal.h"
  49 #include "gmx_fatal_collective.h"
  50 #include "vec.h"
  51 #include "domdec.h"
  52 #include "domdec_network.h"
  53 #include "nrnb.h"
  54 #include "pbc.h"
  55 #include "chargegroup.h"
  56 #include "constr.h"
  57 #include "mdatoms.h"
  58 #include "names.h"
  59 #include "pdbio.h"
  60 #include "futil.h"
  61 #include "force.h"
  62 #include "pme.h"
  63 #include "pull.h"
  64 #include "pull_rotation.h"
  65 #include "gmx_wallcycle.h"
  66 #include "mdrun.h"
  67 #include "nsgrid.h"
  68 #include "shellfc.h"
  69 #include "mtop_util.h"
  70 #include "gmxfio.h"
  71 #include "gmx_ga2la.h"
  72 #include "gmx_sort.h"
  73 #include "nbnxn_search.h"
  74 #include "bondf.h"
  75 #include "gmx_omp_nthreads.h"
  76 #include "gpu_utils.h"
  77
  78 #ifdef GMX_LIB_MPI
  79 #include <mpi.h>
  80 #endif
  81 #ifdef GMX_THREAD_MPI
  82 #include "tmpi.h"
  83 #endif
  84
  85 #define DDRANK(dd, rank)    (rank)
  86 #define DDMASTERRANK(dd)   (dd->masterrank)
  87
  88 typedef struct gmx_domdec_master
  89 {
  90     /* The cell boundaries */
  91     real **cell_x;
  92     /* The global charge group division */
  93     int   *ncg;    /* Number of home charge groups for each node */
  94     int   *index;  /* Index of nnodes+1 into cg */
  95     int   *cg;     /* Global charge group index */
  96     int   *nat;    /* Number of home atoms for each node. */
  97     int   *ibuf;   /* Buffer for communication */
  98     rvec  *vbuf;   /* Buffer for state scattering and gathering */
  99 } gmx_domdec_master_t;
 100
 101 typedef struct
 102 {
 103     /* The numbers of charge groups to send and receive for each cell
 104      * that requires communication, the last entry contains the total
 105      * number of atoms that needs to be communicated.
 106      */
 107     int  nsend[DD_MAXIZONE+2];
 108     int  nrecv[DD_MAXIZONE+2];
 109     /* The charge groups to send */
 110     int *index;
 111     int  nalloc;
 112     /* The atom range for non-in-place communication */
 113     int  cell2at0[DD_MAXIZONE];
 114     int  cell2at1[DD_MAXIZONE];
 115 } gmx_domdec_ind_t;
 116
 117 typedef struct
 118 {
 119     int               np;       /* Number of grid pulses in this dimension */
 120     int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 121     gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 122     int               np_nalloc;
 123     gmx_bool          bInPlace; /* Can we communicate in place?            */
 124 } gmx_domdec_comm_dim_t;
 125
 126 typedef struct
 127 {
 128     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 129     real     *cell_f;      /* State var.: cell boundaries, box relative      */
 130     real     *old_cell_f;  /* Temp. var.: old cell size                      */
 131     real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 132     real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 133     real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 134     real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 135     gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 136     real     *buf_ncd;     /* Temp. var.                                     */
 137 } gmx_domdec_root_t;
 138
 139 #define DD_NLOAD_MAX 9
 140
 141 /* Here floats are accurate enough, since these variables
 142  * only influence the load balancing, not the actual MD results.
 143  */
 144 typedef struct
 145 {
 146     int    nload;
 147     float *load;
 148     float  sum;
 149     float  max;
 150     float  sum_m;
 151     float  cvol_min;
 152     float  mdf;
 153     float  pme;
 154     int    flags;
 155 } gmx_domdec_load_t;
 156
 157 typedef struct
 158 {
 159     int  nsc;
 160     int  ind_gl;
 161     int  ind;
 162 } gmx_cgsort_t;
 163
 164 typedef struct
 165 {
 166     gmx_cgsort_t *sort;
 167     gmx_cgsort_t *sort2;
 168     int           sort_nalloc;
 169     gmx_cgsort_t *sort_new;
 170     int           sort_new_nalloc;
 171     int          *ibuf;
 172     int           ibuf_nalloc;
 173 } gmx_domdec_sort_t;
 174
 175 typedef struct
 176 {
 177     rvec *v;
 178     int   nalloc;
 179 } vec_rvec_t;
 180
 181 /* This enum determines the order of the coordinates.
 182  * ddnatHOME and ddnatZONE should be first and second,
 183  * the others can be ordered as wanted.
 184  */
 185 enum {
 186     ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 187 };
 188
 189 enum {
 190     edlbAUTO, edlbNO, edlbYES, edlbNR
 191 };
 192 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 193
 194 typedef struct
 195 {
 196     int      dim;       /* The dimension                                          */
 197     gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 198     int      nslab;     /* The number of PME slabs in this dimension              */
 199     real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 200     int     *pp_min;    /* The minimum pp node location, size nslab               */
 201     int     *pp_max;    /* The maximum pp node location,size nslab                */
 202     int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 203 } gmx_ddpme_t;
 204
 205 typedef struct
 206 {
 207     real min0;    /* The minimum bottom of this zone                        */
 208     real max1;    /* The maximum top of this zone                           */
 209     real min1;    /* The minimum top of this zone                           */
 210     real mch0;    /* The maximum bottom communicaton height for this zone   */
 211     real mch1;    /* The maximum top communicaton height for this zone      */
 212     real p1_0;    /* The bottom value of the first cell in this zone        */
 213     real p1_1;    /* The top value of the first cell in this zone           */
 214 } gmx_ddzone_t;
 215
 216 typedef struct
 217 {
 218     gmx_domdec_ind_t ind;
 219     int             *ibuf;
 220     int              ibuf_nalloc;
 221     vec_rvec_t       vbuf;
 222     int              nsend;
 223     int              nat;
 224     int              nsend_zone;
 225 } dd_comm_setup_work_t;
 226
 227 typedef struct gmx_domdec_comm
 228 {
 229     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 230      * unless stated otherwise.
 231      */
 232
 233     /* The number of decomposition dimensions for PME, 0: no PME */
 234     int         npmedecompdim;
 235     /* The number of nodes doing PME (PP/PME or only PME) */
 236     int         npmenodes;
 237     int         npmenodes_x;
 238     int         npmenodes_y;
 239     /* The communication setup including the PME only nodes */
 240     gmx_bool    bCartesianPP_PME;
 241     ivec        ntot;
 242     int         cartpmedim;
 243     int        *pmenodes;          /* size npmenodes                         */
 244     int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 245                                     * but with bCartesianPP_PME              */
 246     gmx_ddpme_t ddpme[2];
 247
 248     /* The DD particle-particle nodes only */
 249     gmx_bool bCartesianPP;
 250     int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 251
 252     /* The global charge groups */
 253     t_block cgs_gl;
 254
 255     /* Should we sort the cgs */
 256     int                nstSortCG;
 257     gmx_domdec_sort_t *sort;
 258
 259     /* Are there charge groups? */
 260     gmx_bool bCGs;
 261
 262     /* Are there bonded and multi-body interactions between charge groups? */
 263     gmx_bool bInterCGBondeds;
 264     gmx_bool bInterCGMultiBody;
 265
 266     /* Data for the optional bonded interaction atom communication range */
 267     gmx_bool  bBondComm;
 268     t_blocka *cglink;
 269     char     *bLocalCG;
 270
 271     /* The DLB option */
 272     int      eDLB;
 273     /* Are we actually using DLB? */
 274     gmx_bool bDynLoadBal;
 275
 276     /* Cell sizes for static load balancing, first index cartesian */
 277     real **slb_frac;
 278
 279     /* The width of the communicated boundaries */
 280     real     cutoff_mbody;
 281     real     cutoff;
 282     /* The minimum cell size (including triclinic correction) */
 283     rvec     cellsize_min;
 284     /* For dlb, for use with edlbAUTO */
 285     rvec     cellsize_min_dlb;
 286     /* The lower limit for the DD cell size with DLB */
 287     real     cellsize_limit;
 288     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 289     gmx_bool bVacDLBNoLimit;
 290
 291     /* With PME load balancing we set limits on DLB */
 292     gmx_bool bPMELoadBalDLBLimits;
 293     /* DLB needs to take into account that we want to allow this maximum
 294      * cut-off (for PME load balancing), this could limit cell boundaries.
 295      */
 296     real PMELoadBal_max_cutoff;
 297
 298     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 299     ivec tric_dir;
 300     /* box0 and box_size are required with dim's without pbc and -gcom */
 301     rvec box0;
 302     rvec box_size;
 303
 304     /* The cell boundaries */
 305     rvec cell_x0;
 306     rvec cell_x1;
 307
 308     /* The old location of the cell boundaries, to check cg displacements */
 309     rvec old_cell_x0;
 310     rvec old_cell_x1;
 311
 312     /* The communication setup and charge group boundaries for the zones */
 313     gmx_domdec_zones_t zones;
 314
 315     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 316      * cell boundaries of neighboring cells for dynamic load balancing.
 317      */
 318     gmx_ddzone_t zone_d1[2];
 319     gmx_ddzone_t zone_d2[2][2];
 320
 321     /* The coordinate/force communication setup and indices */
 322     gmx_domdec_comm_dim_t cd[DIM];
 323     /* The maximum number of cells to communicate with in one dimension */
 324     int                   maxpulse;
 325
 326     /* Which cg distribution is stored on the master node */
 327     int master_cg_ddp_count;
 328
 329     /* The number of cg's received from the direct neighbors */
 330     int  zone_ncg1[DD_MAXZONE];
 331
 332     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 333     int  nat[ddnatNR];
 334
 335     /* Array for signalling if atoms have moved to another domain */
 336     int  *moved;
 337     int   moved_nalloc;
 338
 339     /* Communication buffer for general use */
 340     int  *buf_int;
 341     int   nalloc_int;
 342
 343     /* Communication buffer for general use */
 344     vec_rvec_t vbuf;
 345
 346     /* Temporary storage for thread parallel communication setup */
 347     int                   nth;
 348     dd_comm_setup_work_t *dth;
 349
 350     /* Communication buffers only used with multiple grid pulses */
 351     int       *buf_int2;
 352     int        nalloc_int2;
 353     vec_rvec_t vbuf2;
 354
 355     /* Communication buffers for local redistribution */
 356     int  **cggl_flag;
 357     int    cggl_flag_nalloc[DIM*2];
 358     rvec **cgcm_state;
 359     int    cgcm_state_nalloc[DIM*2];
 360
 361     /* Cell sizes for dynamic load balancing */
 362     gmx_domdec_root_t **root;
 363     real               *cell_f_row;
 364     real                cell_f0[DIM];
 365     real                cell_f1[DIM];
 366     real                cell_f_max0[DIM];
 367     real                cell_f_min1[DIM];
 368
 369     /* Stuff for load communication */
 370     gmx_bool           bRecordLoad;
 371     gmx_domdec_load_t *load;
 372     int                nrank_gpu_shared;
 373 #ifdef GMX_MPI
 374     MPI_Comm          *mpi_comm_load;
 375     MPI_Comm           mpi_comm_gpu_shared;
 376 #endif
 377
 378     /* Maximum DLB scaling per load balancing step in percent */
 379     int dlb_scale_lim;
 380
 381     /* Cycle counters */
 382     float  cycl[ddCyclNr];
 383     int    cycl_n[ddCyclNr];
 384     float  cycl_max[ddCyclNr];
 385     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 386     int    eFlop;
 387     double flop;
 388     int    flop_n;
 389     /* Have often have did we have load measurements */
 390     int    n_load_have;
 391     /* Have often have we collected the load measurements */
 392     int    n_load_collect;
 393
 394     /* Statistics */
 395     double sum_nat[ddnatNR-ddnatZONE];
 396     int    ndecomp;
 397     int    nload;
 398     double load_step;
 399     double load_sum;
 400     double load_max;
 401     ivec   load_lim;
 402     double load_mdf;
 403     double load_pme;
 404
 405     /* The last partition step */
 406     gmx_large_int_t partition_step;
 407
 408     /* Debugging */
 409     int  nstDDDump;
 410     int  nstDDDumpGrid;
 411     int  DD_debug;
 412 } gmx_domdec_comm_t;
 413
 414 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 415 #define DD_CGIBS 2
 416
 417 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 418 #define DD_FLAG_NRCG  65535
 419 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 420 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 421
 422 /* Zone permutation required to obtain consecutive charge groups
 423  * for neighbor searching.
 424  */
 425 static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 426
 427 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 428  * components see only j zones with that component 0.
 429  */
 430
 431 /* The DD zone order */
 432 static const ivec dd_zo[DD_MAXZONE] =
 433 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 434
 435 /* The 3D setup */
 436 #define dd_z3n  8
 437 #define dd_zp3n 4
 438 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 439
 440 /* The 2D setup */
 441 #define dd_z2n  4
 442 #define dd_zp2n 2
 443 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 444
 445 /* The 1D setup */
 446 #define dd_z1n  2
 447 #define dd_zp1n 1
 448 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 449
 450 /* Factors used to avoid problems due to rounding issues */
 451 #define DD_CELL_MARGIN       1.0001
 452 #define DD_CELL_MARGIN2      1.00005
 453 /* Factor to account for pressure scaling during nstlist steps */
 454 #define DD_PRES_SCALE_MARGIN 1.02
 455
 456 /* Allowed performance loss before we DLB or warn */
 457 #define DD_PERF_LOSS 0.05
 458
 459 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 460
 461 /* Use separate MPI send and receive commands
 462  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 463  * This saves memory (and some copying for small nnodes).
 464  * For high parallelization scatter and gather calls are used.
 465  */
 466 #define GMX_DD_NNODES_SENDRECV 4
 467
 468
 469 /*
 470    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 471
 472    static void index2xyz(ivec nc,int ind,ivec xyz)
 473    {
 474    xyz[XX] = ind % nc[XX];
 475    xyz[YY] = (ind / nc[XX]) % nc[YY];
 476    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 477    }
 478  */
 479
 480 /* This order is required to minimize the coordinate communication in PME
 481  * which uses decomposition in the x direction.
 482  */
 483 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 484
 485 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 486 {
 487     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 488     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 489     xyz[ZZ] = ind % nc[ZZ];
 490 }
 491
 492 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 493 {
 494     int ddindex;
 495     int ddnodeid = -1;
 496
 497     ddindex = dd_index(dd->nc, c);
 498     if (dd->comm->bCartesianPP_PME)
 499     {
 500         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 501     }
 502     else if (dd->comm->bCartesianPP)
 503     {
 504 #ifdef GMX_MPI
 505         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 506 #endif
 507     }
 508     else
 509     {
 510         ddnodeid = ddindex;
 511     }
 512
 513     return ddnodeid;
 514 }
 515
 516 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 517 {
 518     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 519 }
 520
 521 int ddglatnr(gmx_domdec_t *dd, int i)
 522 {
 523     int atnr;
 524
 525     if (dd == NULL)
 526     {
 527         atnr = i + 1;
 528     }
 529     else
 530     {
 531         if (i >= dd->comm->nat[ddnatNR-1])
 532         {
 533             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 534         }
 535         atnr = dd->gatindex[i] + 1;
 536     }
 537
 538     return atnr;
 539 }
 540
 541 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 542 {
 543     return &dd->comm->cgs_gl;
 544 }
 545
 546 static void vec_rvec_init(vec_rvec_t *v)
 547 {
 548     v->nalloc = 0;
 549     v->v      = NULL;
 550 }
 551
 552 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 553 {
 554     if (n > v->nalloc)
 555     {
 556         v->nalloc = over_alloc_dd(n);
 557         srenew(v->v, v->nalloc);
 558     }
 559 }
 560
 561 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 562 {
 563     int i;
 564
 565     if (state->ddp_count != dd->ddp_count)
 566     {
 567         gmx_incons("The state does not the domain decomposition state");
 568     }
 569
 570     state->ncg_gl = dd->ncg_home;
 571     if (state->ncg_gl > state->cg_gl_nalloc)
 572     {
 573         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 574         srenew(state->cg_gl, state->cg_gl_nalloc);
 575     }
 576     for (i = 0; i < state->ncg_gl; i++)
 577     {
 578         state->cg_gl[i] = dd->index_gl[i];
 579     }
 580
 581     state->ddp_count_cg_gl = dd->ddp_count;
 582 }
 583
 584 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 585 {
 586     return &dd->comm->zones;
 587 }
 588
 589 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 590                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 591 {
 592     gmx_domdec_zones_t *zones;
 593     int                 izone, d, dim;
 594
 595     zones = &dd->comm->zones;
 596
 597     izone = 0;
 598     while (icg >= zones->izone[izone].cg1)
 599     {
 600         izone++;
 601     }
 602
 603     if (izone == 0)
 604     {
 605         *jcg0 = icg;
 606     }
 607     else if (izone < zones->nizone)
 608     {
 609         *jcg0 = zones->izone[izone].jcg0;
 610     }
 611     else
 612     {
 613         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 614                   icg, izone, zones->nizone);
 615     }
 616
 617     *jcg1 = zones->izone[izone].jcg1;
 618
 619     for (d = 0; d < dd->ndim; d++)
 620     {
 621         dim         = dd->dim[d];
 622         shift0[dim] = zones->izone[izone].shift0[dim];
 623         shift1[dim] = zones->izone[izone].shift1[dim];
 624         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 625         {
 626             /* A conservative approach, this can be optimized */
 627             shift0[dim] -= 1;
 628             shift1[dim] += 1;
 629         }
 630     }
 631 }
 632
 633 int dd_natoms_vsite(gmx_domdec_t *dd)
 634 {
 635     return dd->comm->nat[ddnatVSITE];
 636 }
 637
 638 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 639 {
 640     *at_start = dd->comm->nat[ddnatCON-1];
 641     *at_end   = dd->comm->nat[ddnatCON];
 642 }
 643
 644 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 645 {
 646     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 647     int                   *index, *cgindex;
 648     gmx_domdec_comm_t     *comm;
 649     gmx_domdec_comm_dim_t *cd;
 650     gmx_domdec_ind_t      *ind;
 651     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 652     gmx_bool               bPBC, bScrew;
 653
 654     comm = dd->comm;
 655
 656     cgindex = dd->cgindex;
 657
 658     buf = comm->vbuf.v;
 659
 660     nzone   = 1;
 661     nat_tot = dd->nat_home;
 662     for (d = 0; d < dd->ndim; d++)
 663     {
 664         bPBC   = (dd->ci[dd->dim[d]] == 0);
 665         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 666         if (bPBC)
 667         {
 668             copy_rvec(box[dd->dim[d]], shift);
 669         }
 670         cd = &comm->cd[d];
 671         for (p = 0; p < cd->np; p++)
 672         {
 673             ind   = &cd->ind[p];
 674             index = ind->index;
 675             n     = 0;
 676             if (!bPBC)
 677             {
 678                 for (i = 0; i < ind->nsend[nzone]; i++)
 679                 {
 680                     at0 = cgindex[index[i]];
 681                     at1 = cgindex[index[i]+1];
 682                     for (j = at0; j < at1; j++)
 683                     {
 684                         copy_rvec(x[j], buf[n]);
 685                         n++;
 686                     }
 687                 }
 688             }
 689             else if (!bScrew)
 690             {
 691                 for (i = 0; i < ind->nsend[nzone]; i++)
 692                 {
 693                     at0 = cgindex[index[i]];
 694                     at1 = cgindex[index[i]+1];
 695                     for (j = at0; j < at1; j++)
 696                     {
 697                         /* We need to shift the coordinates */
 698                         rvec_add(x[j], shift, buf[n]);
 699                         n++;
 700                     }
 701                 }
 702             }
 703             else
 704             {
 705                 for (i = 0; i < ind->nsend[nzone]; i++)
 706                 {
 707                     at0 = cgindex[index[i]];
 708                     at1 = cgindex[index[i]+1];
 709                     for (j = at0; j < at1; j++)
 710                     {
 711                         /* Shift x */
 712                         buf[n][XX] = x[j][XX] + shift[XX];
 713                         /* Rotate y and z.
 714                          * This operation requires a special shift force
 715                          * treatment, which is performed in calc_vir.
 716                          */
 717                         buf[n][YY] = box[YY][YY] - x[j][YY];
 718                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 719                         n++;
 720                     }
 721                 }
 722             }
 723
 724             if (cd->bInPlace)
 725             {
 726                 rbuf = x + nat_tot;
 727             }
 728             else
 729             {
 730                 rbuf = comm->vbuf2.v;
 731             }
 732             /* Send and receive the coordinates */
 733             dd_sendrecv_rvec(dd, d, dddirBackward,
 734                              buf,  ind->nsend[nzone+1],
 735                              rbuf, ind->nrecv[nzone+1]);
 736             if (!cd->bInPlace)
 737             {
 738                 j = 0;
 739                 for (zone = 0; zone < nzone; zone++)
 740                 {
 741                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 742                     {
 743                         copy_rvec(rbuf[j], x[i]);
 744                         j++;
 745                     }
 746                 }
 747             }
 748             nat_tot += ind->nrecv[nzone+1];
 749         }
 750         nzone += nzone;
 751     }
 752 }
 753
 754 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 755 {
 756     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 757     int                   *index, *cgindex;
 758     gmx_domdec_comm_t     *comm;
 759     gmx_domdec_comm_dim_t *cd;
 760     gmx_domdec_ind_t      *ind;
 761     rvec                  *buf, *sbuf;
 762     ivec                   vis;
 763     int                    is;
 764     gmx_bool               bPBC, bScrew;
 765
 766     comm = dd->comm;
 767
 768     cgindex = dd->cgindex;
 769
 770     buf = comm->vbuf.v;
 771
 772     n       = 0;
 773     nzone   = comm->zones.n/2;
 774     nat_tot = dd->nat_tot;
 775     for (d = dd->ndim-1; d >= 0; d--)
 776     {
 777         bPBC   = (dd->ci[dd->dim[d]] == 0);
 778         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 779         if (fshift == NULL && !bScrew)
 780         {
 781             bPBC = FALSE;
 782         }
 783         /* Determine which shift vector we need */
 784         clear_ivec(vis);
 785         vis[dd->dim[d]] = 1;
 786         is              = IVEC2IS(vis);
 787
 788         cd = &comm->cd[d];
 789         for (p = cd->np-1; p >= 0; p--)
 790         {
 791             ind      = &cd->ind[p];
 792             nat_tot -= ind->nrecv[nzone+1];
 793             if (cd->bInPlace)
 794             {
 795                 sbuf = f + nat_tot;
 796             }
 797             else
 798             {
 799                 sbuf = comm->vbuf2.v;
 800                 j    = 0;
 801                 for (zone = 0; zone < nzone; zone++)
 802                 {
 803                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 804                     {
 805                         copy_rvec(f[i], sbuf[j]);
 806                         j++;
 807                     }
 808                 }
 809             }
 810             /* Communicate the forces */
 811             dd_sendrecv_rvec(dd, d, dddirForward,
 812                              sbuf, ind->nrecv[nzone+1],
 813                              buf,  ind->nsend[nzone+1]);
 814             index = ind->index;
 815             /* Add the received forces */
 816             n = 0;
 817             if (!bPBC)
 818             {
 819                 for (i = 0; i < ind->nsend[nzone]; i++)
 820                 {
 821                     at0 = cgindex[index[i]];
 822                     at1 = cgindex[index[i]+1];
 823                     for (j = at0; j < at1; j++)
 824                     {
 825                         rvec_inc(f[j], buf[n]);
 826                         n++;
 827                     }
 828                 }
 829             }
 830             else if (!bScrew)
 831             {
 832                 for (i = 0; i < ind->nsend[nzone]; i++)
 833                 {
 834                     at0 = cgindex[index[i]];
 835                     at1 = cgindex[index[i]+1];
 836                     for (j = at0; j < at1; j++)
 837                     {
 838                         rvec_inc(f[j], buf[n]);
 839                         /* Add this force to the shift force */
 840                         rvec_inc(fshift[is], buf[n]);
 841                         n++;
 842                     }
 843                 }
 844             }
 845             else
 846             {
 847                 for (i = 0; i < ind->nsend[nzone]; i++)
 848                 {
 849                     at0 = cgindex[index[i]];
 850                     at1 = cgindex[index[i]+1];
 851                     for (j = at0; j < at1; j++)
 852                     {
 853                         /* Rotate the force */
 854                         f[j][XX] += buf[n][XX];
 855                         f[j][YY] -= buf[n][YY];
 856                         f[j][ZZ] -= buf[n][ZZ];
 857                         if (fshift)
 858                         {
 859                             /* Add this force to the shift force */
 860                             rvec_inc(fshift[is], buf[n]);
 861                         }
 862                         n++;
 863                     }
 864                 }
 865             }
 866         }
 867         nzone /= 2;
 868     }
 869 }
 870
 871 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 872 {
 873     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 874     int                   *index, *cgindex;
 875     gmx_domdec_comm_t     *comm;
 876     gmx_domdec_comm_dim_t *cd;
 877     gmx_domdec_ind_t      *ind;
 878     real                  *buf, *rbuf;
 879
 880     comm = dd->comm;
 881
 882     cgindex = dd->cgindex;
 883
 884     buf = &comm->vbuf.v[0][0];
 885
 886     nzone   = 1;
 887     nat_tot = dd->nat_home;
 888     for (d = 0; d < dd->ndim; d++)
 889     {
 890         cd = &comm->cd[d];
 891         for (p = 0; p < cd->np; p++)
 892         {
 893             ind   = &cd->ind[p];
 894             index = ind->index;
 895             n     = 0;
 896             for (i = 0; i < ind->nsend[nzone]; i++)
 897             {
 898                 at0 = cgindex[index[i]];
 899                 at1 = cgindex[index[i]+1];
 900                 for (j = at0; j < at1; j++)
 901                 {
 902                     buf[n] = v[j];
 903                     n++;
 904                 }
 905             }
 906
 907             if (cd->bInPlace)
 908             {
 909                 rbuf = v + nat_tot;
 910             }
 911             else
 912             {
 913                 rbuf = &comm->vbuf2.v[0][0];
 914             }
 915             /* Send and receive the coordinates */
 916             dd_sendrecv_real(dd, d, dddirBackward,
 917                              buf,  ind->nsend[nzone+1],
 918                              rbuf, ind->nrecv[nzone+1]);
 919             if (!cd->bInPlace)
 920             {
 921                 j = 0;
 922                 for (zone = 0; zone < nzone; zone++)
 923                 {
 924                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 925                     {
 926                         v[i] = rbuf[j];
 927                         j++;
 928                     }
 929                 }
 930             }
 931             nat_tot += ind->nrecv[nzone+1];
 932         }
 933         nzone += nzone;
 934     }
 935 }
 936
 937 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 938 {
 939     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 940     int                   *index, *cgindex;
 941     gmx_domdec_comm_t     *comm;
 942     gmx_domdec_comm_dim_t *cd;
 943     gmx_domdec_ind_t      *ind;
 944     real                  *buf, *sbuf;
 945
 946     comm = dd->comm;
 947
 948     cgindex = dd->cgindex;
 949
 950     buf = &comm->vbuf.v[0][0];
 951
 952     n       = 0;
 953     nzone   = comm->zones.n/2;
 954     nat_tot = dd->nat_tot;
 955     for (d = dd->ndim-1; d >= 0; d--)
 956     {
 957         cd = &comm->cd[d];
 958         for (p = cd->np-1; p >= 0; p--)
 959         {
 960             ind      = &cd->ind[p];
 961             nat_tot -= ind->nrecv[nzone+1];
 962             if (cd->bInPlace)
 963             {
 964                 sbuf = v + nat_tot;
 965             }
 966             else
 967             {
 968                 sbuf = &comm->vbuf2.v[0][0];
 969                 j    = 0;
 970                 for (zone = 0; zone < nzone; zone++)
 971                 {
 972                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 973                     {
 974                         sbuf[j] = v[i];
 975                         j++;
 976                     }
 977                 }
 978             }
 979             /* Communicate the forces */
 980             dd_sendrecv_real(dd, d, dddirForward,
 981                              sbuf, ind->nrecv[nzone+1],
 982                              buf,  ind->nsend[nzone+1]);
 983             index = ind->index;
 984             /* Add the received forces */
 985             n = 0;
 986             for (i = 0; i < ind->nsend[nzone]; i++)
 987             {
 988                 at0 = cgindex[index[i]];
 989                 at1 = cgindex[index[i]+1];
 990                 for (j = at0; j < at1; j++)
 991                 {
 992                     v[j] += buf[n];
 993                     n++;
 994                 }
 995             }
 996         }
 997         nzone /= 2;
 998     }
 999 }
1000
1001 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
1002 {
1003     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
1004             d, i, j,
1005             zone->min0, zone->max1,
1006             zone->mch0, zone->mch0,
1007             zone->p1_0, zone->p1_1);
1008 }
1009
1010
1011 #define DDZONECOMM_MAXZONE  5
1012 #define DDZONECOMM_BUFSIZE  3
1013
1014 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
1015                                int ddimind, int direction,
1016                                gmx_ddzone_t *buf_s, int n_s,
1017                                gmx_ddzone_t *buf_r, int n_r)
1018 {
1019 #define ZBS  DDZONECOMM_BUFSIZE
1020     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1021     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1022     int  i;
1023
1024     for (i = 0; i < n_s; i++)
1025     {
1026         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1027         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1028         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1029         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1030         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1031         vbuf_s[i*ZBS+1][2] = 0;
1032         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1033         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1034         vbuf_s[i*ZBS+2][2] = 0;
1035     }
1036
1037     dd_sendrecv_rvec(dd, ddimind, direction,
1038                      vbuf_s, n_s*ZBS,
1039                      vbuf_r, n_r*ZBS);
1040
1041     for (i = 0; i < n_r; i++)
1042     {
1043         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1044         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1045         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1046         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1047         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1048         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1049         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1050     }
1051
1052 #undef ZBS
1053 }
1054
1055 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
1056                           rvec cell_ns_x0, rvec cell_ns_x1)
1057 {
1058     int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
1059     gmx_ddzone_t      *zp;
1060     gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
1061     gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
1062     gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
1063     rvec               extr_s[2], extr_r[2];
1064     rvec               dh;
1065     real               dist_d, c = 0, det;
1066     gmx_domdec_comm_t *comm;
1067     gmx_bool           bPBC, bUse;
1068
1069     comm = dd->comm;
1070
1071     for (d = 1; d < dd->ndim; d++)
1072     {
1073         dim      = dd->dim[d];
1074         zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1075         zp->min0 = cell_ns_x0[dim];
1076         zp->max1 = cell_ns_x1[dim];
1077         zp->min1 = cell_ns_x1[dim];
1078         zp->mch0 = cell_ns_x0[dim];
1079         zp->mch1 = cell_ns_x1[dim];
1080         zp->p1_0 = cell_ns_x0[dim];
1081         zp->p1_1 = cell_ns_x1[dim];
1082     }
1083
1084     for (d = dd->ndim-2; d >= 0; d--)
1085     {
1086         dim  = dd->dim[d];
1087         bPBC = (dim < ddbox->npbcdim);
1088
1089         /* Use an rvec to store two reals */
1090         extr_s[d][0] = comm->cell_f0[d+1];
1091         extr_s[d][1] = comm->cell_f1[d+1];
1092         extr_s[d][2] = comm->cell_f1[d+1];
1093
1094         pos = 0;
1095         /* Store the extremes in the backward sending buffer,
1096          * so the get updated separately from the forward communication.
1097          */
1098         for (d1 = d; d1 < dd->ndim-1; d1++)
1099         {
1100             /* We invert the order to be able to use the same loop for buf_e */
1101             buf_s[pos].min0 = extr_s[d1][1];
1102             buf_s[pos].max1 = extr_s[d1][0];
1103             buf_s[pos].min1 = extr_s[d1][2];
1104             buf_s[pos].mch0 = 0;
1105             buf_s[pos].mch1 = 0;
1106             /* Store the cell corner of the dimension we communicate along */
1107             buf_s[pos].p1_0 = comm->cell_x0[dim];
1108             buf_s[pos].p1_1 = 0;
1109             pos++;
1110         }
1111
1112         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1113         pos++;
1114
1115         if (dd->ndim == 3 && d == 0)
1116         {
1117             buf_s[pos] = comm->zone_d2[0][1];
1118             pos++;
1119             buf_s[pos] = comm->zone_d1[0];
1120             pos++;
1121         }
1122
1123         /* We only need to communicate the extremes
1124          * in the forward direction
1125          */
1126         npulse = comm->cd[d].np;
1127         if (bPBC)
1128         {
1129             /* Take the minimum to avoid double communication */
1130             npulse_min = min(npulse, dd->nc[dim]-1-npulse);
1131         }
1132         else
1133         {
1134             /* Without PBC we should really not communicate over
1135              * the boundaries, but implementing that complicates
1136              * the communication setup and therefore we simply
1137              * do all communication, but ignore some data.
1138              */
1139             npulse_min = npulse;
1140         }
1141         for (p = 0; p < npulse_min; p++)
1142         {
1143             /* Communicate the extremes forward */
1144             bUse = (bPBC || dd->ci[dim] > 0);
1145
1146             dd_sendrecv_rvec(dd, d, dddirForward,
1147                              extr_s+d, dd->ndim-d-1,
1148                              extr_r+d, dd->ndim-d-1);
1149
1150             if (bUse)
1151             {
1152                 for (d1 = d; d1 < dd->ndim-1; d1++)
1153                 {
1154                     extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
1155                     extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
1156                     extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
1157                 }
1158             }
1159         }
1160
1161         buf_size = pos;
1162         for (p = 0; p < npulse; p++)
1163         {
1164             /* Communicate all the zone information backward */
1165             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1166
1167             dd_sendrecv_ddzone(dd, d, dddirBackward,
1168                                buf_s, buf_size,
1169                                buf_r, buf_size);
1170
1171             clear_rvec(dh);
1172             if (p > 0)
1173             {
1174                 for (d1 = d+1; d1 < dd->ndim; d1++)
1175                 {
1176                     /* Determine the decrease of maximum required
1177                      * communication height along d1 due to the distance along d,
1178                      * this avoids a lot of useless atom communication.
1179                      */
1180                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1181
1182                     if (ddbox->tric_dir[dim])
1183                     {
1184                         /* c is the off-diagonal coupling between the cell planes
1185                          * along directions d and d1.
1186                          */
1187                         c = ddbox->v[dim][dd->dim[d1]][dim];
1188                     }
1189                     else
1190                     {
1191                         c = 0;
1192                     }
1193                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1194                     if (det > 0)
1195                     {
1196                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1197                     }
1198                     else
1199                     {
1200                         /* A negative value signals out of range */
1201                         dh[d1] = -1;
1202                     }
1203                 }
1204             }
1205
1206             /* Accumulate the extremes over all pulses */
1207             for (i = 0; i < buf_size; i++)
1208             {
1209                 if (p == 0)
1210                 {
1211                     buf_e[i] = buf_r[i];
1212                 }
1213                 else
1214                 {
1215                     if (bUse)
1216                     {
1217                         buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
1218                         buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
1219                         buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
1220                     }
1221
1222                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1223                     {
1224                         d1 = 1;
1225                     }
1226                     else
1227                     {
1228                         d1 = d + 1;
1229                     }
1230                     if (bUse && dh[d1] >= 0)
1231                     {
1232                         buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
1233                         buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
1234                     }
1235                 }
1236                 /* Copy the received buffer to the send buffer,
1237                  * to pass the data through with the next pulse.
1238                  */
1239                 buf_s[i] = buf_r[i];
1240             }
1241             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1242                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1243             {
1244                 /* Store the extremes */
1245                 pos = 0;
1246
1247                 for (d1 = d; d1 < dd->ndim-1; d1++)
1248                 {
1249                     extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
1250                     extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
1251                     extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
1252                     pos++;
1253                 }
1254
1255                 if (d == 1 || (d == 0 && dd->ndim == 3))
1256                 {
1257                     for (i = d; i < 2; i++)
1258                     {
1259                         comm->zone_d2[1-d][i] = buf_e[pos];
1260                         pos++;
1261                     }
1262                 }
1263                 if (d == 0)
1264                 {
1265                     comm->zone_d1[1] = buf_e[pos];
1266                     pos++;
1267                 }
1268             }
1269         }
1270     }
1271
1272     if (dd->ndim >= 2)
1273     {
1274         dim = dd->dim[1];
1275         for (i = 0; i < 2; i++)
1276         {
1277             if (debug)
1278             {
1279                 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1280             }
1281             cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1282             cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1283         }
1284     }
1285     if (dd->ndim >= 3)
1286     {
1287         dim = dd->dim[2];
1288         for (i = 0; i < 2; i++)
1289         {
1290             for (j = 0; j < 2; j++)
1291             {
1292                 if (debug)
1293                 {
1294                     print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1295                 }
1296                 cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1297                 cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1298             }
1299         }
1300     }
1301     for (d = 1; d < dd->ndim; d++)
1302     {
1303         comm->cell_f_max0[d] = extr_s[d-1][0];
1304         comm->cell_f_min1[d] = extr_s[d-1][1];
1305         if (debug)
1306         {
1307             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1308                     d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1309         }
1310     }
1311 }
1312
1313 static void dd_collect_cg(gmx_domdec_t *dd,
1314                           t_state      *state_local)
1315 {
1316     gmx_domdec_master_t *ma = NULL;
1317     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1318     t_block             *cgs_gl;
1319
1320     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1321     {
1322         /* The master has the correct distribution */
1323         return;
1324     }
1325
1326     if (state_local->ddp_count == dd->ddp_count)
1327     {
1328         ncg_home = dd->ncg_home;
1329         cg       = dd->index_gl;
1330         nat_home = dd->nat_home;
1331     }
1332     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1333     {
1334         cgs_gl = &dd->comm->cgs_gl;
1335
1336         ncg_home = state_local->ncg_gl;
1337         cg       = state_local->cg_gl;
1338         nat_home = 0;
1339         for (i = 0; i < ncg_home; i++)
1340         {
1341             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1342         }
1343     }
1344     else
1345     {
1346         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1347     }
1348
1349     buf2[0] = dd->ncg_home;
1350     buf2[1] = dd->nat_home;
1351     if (DDMASTER(dd))
1352     {
1353         ma   = dd->ma;
1354         ibuf = ma->ibuf;
1355     }
1356     else
1357     {
1358         ibuf = NULL;
1359     }
1360     /* Collect the charge group and atom counts on the master */
1361     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1362
1363     if (DDMASTER(dd))
1364     {
1365         ma->index[0] = 0;
1366         for (i = 0; i < dd->nnodes; i++)
1367         {
1368             ma->ncg[i]     = ma->ibuf[2*i];
1369             ma->nat[i]     = ma->ibuf[2*i+1];
1370             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1371
1372         }
1373         /* Make byte counts and indices */
1374         for (i = 0; i < dd->nnodes; i++)
1375         {
1376             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1377             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1378         }
1379         if (debug)
1380         {
1381             fprintf(debug, "Initial charge group distribution: ");
1382             for (i = 0; i < dd->nnodes; i++)
1383             {
1384                 fprintf(debug, " %d", ma->ncg[i]);
1385             }
1386             fprintf(debug, "\n");
1387         }
1388     }
1389
1390     /* Collect the charge group indices on the master */
1391     dd_gatherv(dd,
1392                dd->ncg_home*sizeof(int), dd->index_gl,
1393                DDMASTER(dd) ? ma->ibuf : NULL,
1394                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1395                DDMASTER(dd) ? ma->cg : NULL);
1396
1397     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1398 }
1399
1400 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1401                                     rvec *lv, rvec *v)
1402 {
1403     gmx_domdec_master_t *ma;
1404     int                  n, i, c, a, nalloc = 0;
1405     rvec                *buf = NULL;
1406     t_block             *cgs_gl;
1407
1408     ma = dd->ma;
1409
1410     if (!DDMASTER(dd))
1411     {
1412 #ifdef GMX_MPI
1413         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1414                  dd->rank, dd->mpi_comm_all);
1415 #endif
1416     }
1417     else
1418     {
1419         /* Copy the master coordinates to the global array */
1420         cgs_gl = &dd->comm->cgs_gl;
1421
1422         n = DDMASTERRANK(dd);
1423         a = 0;
1424         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1425         {
1426             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1427             {
1428                 copy_rvec(lv[a++], v[c]);
1429             }
1430         }
1431
1432         for (n = 0; n < dd->nnodes; n++)
1433         {
1434             if (n != dd->rank)
1435             {
1436                 if (ma->nat[n] > nalloc)
1437                 {
1438                     nalloc = over_alloc_dd(ma->nat[n]);
1439                     srenew(buf, nalloc);
1440                 }
1441 #ifdef GMX_MPI
1442                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1443                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1444 #endif
1445                 a = 0;
1446                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1447                 {
1448                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1449                     {
1450                         copy_rvec(buf[a++], v[c]);
1451                     }
1452                 }
1453             }
1454         }
1455         sfree(buf);
1456     }
1457 }
1458
1459 static void get_commbuffer_counts(gmx_domdec_t *dd,
1460                                   int **counts, int **disps)
1461 {
1462     gmx_domdec_master_t *ma;
1463     int                  n;
1464
1465     ma = dd->ma;
1466
1467     /* Make the rvec count and displacment arrays */
1468     *counts  = ma->ibuf;
1469     *disps   = ma->ibuf + dd->nnodes;
1470     for (n = 0; n < dd->nnodes; n++)
1471     {
1472         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1473         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1474     }
1475 }
1476
1477 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1478                                    rvec *lv, rvec *v)
1479 {
1480     gmx_domdec_master_t *ma;
1481     int                 *rcounts = NULL, *disps = NULL;
1482     int                  n, i, c, a;
1483     rvec                *buf = NULL;
1484     t_block             *cgs_gl;
1485
1486     ma = dd->ma;
1487
1488     if (DDMASTER(dd))
1489     {
1490         get_commbuffer_counts(dd, &rcounts, &disps);
1491
1492         buf = ma->vbuf;
1493     }
1494
1495     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1496
1497     if (DDMASTER(dd))
1498     {
1499         cgs_gl = &dd->comm->cgs_gl;
1500
1501         a = 0;
1502         for (n = 0; n < dd->nnodes; n++)
1503         {
1504             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1505             {
1506                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1507                 {
1508                     copy_rvec(buf[a++], v[c]);
1509                 }
1510             }
1511         }
1512     }
1513 }
1514
1515 void dd_collect_vec(gmx_domdec_t *dd,
1516                     t_state *state_local, rvec *lv, rvec *v)
1517 {
1518     gmx_domdec_master_t *ma;
1519     int                  n, i, c, a, nalloc = 0;
1520     rvec                *buf = NULL;
1521
1522     dd_collect_cg(dd, state_local);
1523
1524     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1525     {
1526         dd_collect_vec_sendrecv(dd, lv, v);
1527     }
1528     else
1529     {
1530         dd_collect_vec_gatherv(dd, lv, v);
1531     }
1532 }
1533
1534
1535 void dd_collect_state(gmx_domdec_t *dd,
1536                       t_state *state_local, t_state *state)
1537 {
1538     int est, i, j, nh;
1539
1540     nh = state->nhchainlength;
1541
1542     if (DDMASTER(dd))
1543     {
1544         for (i = 0; i < efptNR; i++)
1545         {
1546             state->lambda[i] = state_local->lambda[i];
1547         }
1548         state->fep_state = state_local->fep_state;
1549         state->veta      = state_local->veta;
1550         state->vol0      = state_local->vol0;
1551         copy_mat(state_local->box, state->box);
1552         copy_mat(state_local->boxv, state->boxv);
1553         copy_mat(state_local->svir_prev, state->svir_prev);
1554         copy_mat(state_local->fvir_prev, state->fvir_prev);
1555         copy_mat(state_local->pres_prev, state->pres_prev);
1556
1557         for (i = 0; i < state_local->ngtc; i++)
1558         {
1559             for (j = 0; j < nh; j++)
1560             {
1561                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1562                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1563             }
1564             state->therm_integral[i] = state_local->therm_integral[i];
1565         }
1566         for (i = 0; i < state_local->nnhpres; i++)
1567         {
1568             for (j = 0; j < nh; j++)
1569             {
1570                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1571                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1572             }
1573         }
1574     }
1575     for (est = 0; est < estNR; est++)
1576     {
1577         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1578         {
1579             switch (est)
1580             {
1581                 case estX:
1582                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1583                     break;
1584                 case estV:
1585                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1586                     break;
1587                 case estSDX:
1588                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1589                     break;
1590                 case estCGP:
1591                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1592                     break;
1593                 case estLD_RNG:
1594                     if (state->nrngi == 1)
1595                     {
1596                         if (DDMASTER(dd))
1597                         {
1598                             for (i = 0; i < state_local->nrng; i++)
1599                             {
1600                                 state->ld_rng[i] = state_local->ld_rng[i];
1601                             }
1602                         }
1603                     }
1604                     else
1605                     {
1606                         dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
1607                                   state_local->ld_rng, state->ld_rng);
1608                     }
1609                     break;
1610                 case estLD_RNGI:
1611                     if (state->nrngi == 1)
1612                     {
1613                         if (DDMASTER(dd))
1614                         {
1615                             state->ld_rngi[0] = state_local->ld_rngi[0];
1616                         }
1617                     }
1618                     else
1619                     {
1620                         dd_gather(dd, sizeof(state->ld_rngi[0]),
1621                                   state_local->ld_rngi, state->ld_rngi);
1622                     }
1623                     break;
1624                 case estDISRE_INITF:
1625                 case estDISRE_RM3TAV:
1626                 case estORIRE_INITF:
1627                 case estORIRE_DTAV:
1628                     break;
1629                 default:
1630                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1631             }
1632         }
1633     }
1634 }
1635
1636 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1637 {
1638     int est;
1639
1640     if (debug)
1641     {
1642         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1643     }
1644
1645     state->nalloc = over_alloc_dd(nalloc);
1646
1647     for (est = 0; est < estNR; est++)
1648     {
1649         if (EST_DISTR(est) && (state->flags & (1<<est)))
1650         {
1651             switch (est)
1652             {
1653                 case estX:
1654                     srenew(state->x, state->nalloc);
1655                     break;
1656                 case estV:
1657                     srenew(state->v, state->nalloc);
1658                     break;
1659                 case estSDX:
1660                     srenew(state->sd_X, state->nalloc);
1661                     break;
1662                 case estCGP:
1663                     srenew(state->cg_p, state->nalloc);
1664                     break;
1665                 case estLD_RNG:
1666                 case estLD_RNGI:
1667                 case estDISRE_INITF:
1668                 case estDISRE_RM3TAV:
1669                 case estORIRE_INITF:
1670                 case estORIRE_DTAV:
1671                     /* No reallocation required */
1672                     break;
1673                 default:
1674                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1675             }
1676         }
1677     }
1678
1679     if (f != NULL)
1680     {
1681         srenew(*f, state->nalloc);
1682     }
1683 }
1684
1685 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1686                                int nalloc)
1687 {
1688     if (nalloc > fr->cg_nalloc)
1689     {
1690         if (debug)
1691         {
1692             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1693         }
1694         fr->cg_nalloc = over_alloc_dd(nalloc);
1695         srenew(fr->cginfo, fr->cg_nalloc);
1696         if (fr->cutoff_scheme == ecutsGROUP)
1697         {
1698             srenew(fr->cg_cm, fr->cg_nalloc);
1699         }
1700     }
1701     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1702     {
1703         /* We don't use charge groups, we use x in state to set up
1704          * the atom communication.
1705          */
1706         dd_realloc_state(state, f, nalloc);
1707     }
1708 }
1709
1710 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1711                                        rvec *v, rvec *lv)
1712 {
1713     gmx_domdec_master_t *ma;
1714     int                  n, i, c, a, nalloc = 0;
1715     rvec                *buf = NULL;
1716
1717     if (DDMASTER(dd))
1718     {
1719         ma  = dd->ma;
1720
1721         for (n = 0; n < dd->nnodes; n++)
1722         {
1723             if (n != dd->rank)
1724             {
1725                 if (ma->nat[n] > nalloc)
1726                 {
1727                     nalloc = over_alloc_dd(ma->nat[n]);
1728                     srenew(buf, nalloc);
1729                 }
1730                 /* Use lv as a temporary buffer */
1731                 a = 0;
1732                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1733                 {
1734                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1735                     {
1736                         copy_rvec(v[c], buf[a++]);
1737                     }
1738                 }
1739                 if (a != ma->nat[n])
1740                 {
1741                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1742                               a, ma->nat[n]);
1743                 }
1744
1745 #ifdef GMX_MPI
1746                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1747                          DDRANK(dd, n), n, dd->mpi_comm_all);
1748 #endif
1749             }
1750         }
1751         sfree(buf);
1752         n = DDMASTERRANK(dd);
1753         a = 0;
1754         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1755         {
1756             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1757             {
1758                 copy_rvec(v[c], lv[a++]);
1759             }
1760         }
1761     }
1762     else
1763     {
1764 #ifdef GMX_MPI
1765         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1766                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1767 #endif
1768     }
1769 }
1770
1771 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1772                                        rvec *v, rvec *lv)
1773 {
1774     gmx_domdec_master_t *ma;
1775     int                 *scounts = NULL, *disps = NULL;
1776     int                  n, i, c, a, nalloc = 0;
1777     rvec                *buf = NULL;
1778
1779     if (DDMASTER(dd))
1780     {
1781         ma  = dd->ma;
1782
1783         get_commbuffer_counts(dd, &scounts, &disps);
1784
1785         buf = ma->vbuf;
1786         a   = 0;
1787         for (n = 0; n < dd->nnodes; n++)
1788         {
1789             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1790             {
1791                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1792                 {
1793                     copy_rvec(v[c], buf[a++]);
1794                 }
1795             }
1796         }
1797     }
1798
1799     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1800 }
1801
1802 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1803 {
1804     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1805     {
1806         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1807     }
1808     else
1809     {
1810         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1811     }
1812 }
1813
1814 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1815 {
1816     int i;
1817     dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1818     dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1819     dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1820
1821     if (dfhist->nlambda > 0)
1822     {
1823         int nlam = dfhist->nlambda;
1824         dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1825         dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1826         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1827         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1828         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1829         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1830
1831         for (i = 0; i<nlam; i++) {
1832             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1833             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1834             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1835             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1836             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1837             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
1838         }
1839     }
1840 }
1841
1842 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1843                                 t_state *state, t_state *state_local,
1844                                 rvec **f)
1845 {
1846     int  i, j, nh;
1847
1848     nh = state->nhchainlength;
1849
1850     if (DDMASTER(dd))
1851     {
1852         for (i = 0; i < efptNR; i++)
1853         {
1854             state_local->lambda[i] = state->lambda[i];
1855         }
1856         state_local->fep_state = state->fep_state;
1857         state_local->veta      = state->veta;
1858         state_local->vol0      = state->vol0;
1859         copy_mat(state->box, state_local->box);
1860         copy_mat(state->box_rel, state_local->box_rel);
1861         copy_mat(state->boxv, state_local->boxv);
1862         copy_mat(state->svir_prev, state_local->svir_prev);
1863         copy_mat(state->fvir_prev, state_local->fvir_prev);
1864         copy_df_history(&state_local->dfhist,&state->dfhist);
1865         for (i = 0; i < state_local->ngtc; i++)
1866         {
1867             for (j = 0; j < nh; j++)
1868             {
1869                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1870                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1871             }
1872             state_local->therm_integral[i] = state->therm_integral[i];
1873         }
1874         for (i = 0; i < state_local->nnhpres; i++)
1875         {
1876             for (j = 0; j < nh; j++)
1877             {
1878                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1879                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1880             }
1881         }
1882     }
1883     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1884     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1885     dd_bcast(dd, sizeof(real), &state_local->veta);
1886     dd_bcast(dd, sizeof(real), &state_local->vol0);
1887     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1888     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1889     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1890     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1891     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1892     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1893     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1894     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1895     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1896     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1897
1898     /* communicate df_history -- required for restarting from checkpoint */
1899     dd_distribute_dfhist(dd,&state_local->dfhist);
1900
1901     if (dd->nat_home > state_local->nalloc)
1902     {
1903         dd_realloc_state(state_local, f, dd->nat_home);
1904     }
1905     for (i = 0; i < estNR; i++)
1906     {
1907         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1908         {
1909             switch (i)
1910             {
1911                 case estX:
1912                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1913                     break;
1914                 case estV:
1915                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1916                     break;
1917                 case estSDX:
1918                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1919                     break;
1920                 case estCGP:
1921                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1922                     break;
1923                 case estLD_RNG:
1924                     if (state->nrngi == 1)
1925                     {
1926                         dd_bcastc(dd,
1927                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
1928                                   state->ld_rng, state_local->ld_rng);
1929                     }
1930                     else
1931                     {
1932                         dd_scatter(dd,
1933                                    state_local->nrng*sizeof(state_local->ld_rng[0]),
1934                                    state->ld_rng, state_local->ld_rng);
1935                     }
1936                     break;
1937                 case estLD_RNGI:
1938                     if (state->nrngi == 1)
1939                     {
1940                         dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
1941                                   state->ld_rngi, state_local->ld_rngi);
1942                     }
1943                     else
1944                     {
1945                         dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
1946                                    state->ld_rngi, state_local->ld_rngi);
1947                     }
1948                     break;
1949                 case estDISRE_INITF:
1950                 case estDISRE_RM3TAV:
1951                 case estORIRE_INITF:
1952                 case estORIRE_DTAV:
1953                     /* Not implemented yet */
1954                     break;
1955                 default:
1956                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1957             }
1958         }
1959     }
1960 }
1961
1962 static char dim2char(int dim)
1963 {
1964     char c = '?';
1965
1966     switch (dim)
1967     {
1968         case XX: c = 'X'; break;
1969         case YY: c = 'Y'; break;
1970         case ZZ: c = 'Z'; break;
1971         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1972     }
1973
1974     return c;
1975 }
1976
1977 static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step,
1978                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1979 {
1980     rvec   grid_s[2], *grid_r = NULL, cx, r;
1981     char   fname[STRLEN], format[STRLEN], buf[22];
1982     FILE  *out;
1983     int    a, i, d, z, y, x;
1984     matrix tric;
1985     real   vol;
1986
1987     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1988     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1989
1990     if (DDMASTER(dd))
1991     {
1992         snew(grid_r, 2*dd->nnodes);
1993     }
1994
1995     dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
1996
1997     if (DDMASTER(dd))
1998     {
1999         for (d = 0; d < DIM; d++)
2000         {
2001             for (i = 0; i < DIM; i++)
2002             {
2003                 if (d == i)
2004                 {
2005                     tric[d][i] = 1;
2006                 }
2007                 else
2008                 {
2009                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
2010                     {
2011                         tric[d][i] = box[i][d]/box[i][i];
2012                     }
2013                     else
2014                     {
2015                         tric[d][i] = 0;
2016                     }
2017                 }
2018             }
2019         }
2020         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
2021         sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
2022         out = gmx_fio_fopen(fname, "w");
2023         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2024         a = 1;
2025         for (i = 0; i < dd->nnodes; i++)
2026         {
2027             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
2028             for (d = 0; d < DIM; d++)
2029             {
2030                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
2031             }
2032             for (z = 0; z < 2; z++)
2033             {
2034                 for (y = 0; y < 2; y++)
2035                 {
2036                     for (x = 0; x < 2; x++)
2037                     {
2038                         cx[XX] = grid_r[i*2+x][XX];
2039                         cx[YY] = grid_r[i*2+y][YY];
2040                         cx[ZZ] = grid_r[i*2+z][ZZ];
2041                         mvmul(tric, cx, r);
2042                         fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
2043                                 ' ', 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
2044                     }
2045                 }
2046             }
2047             for (d = 0; d < DIM; d++)
2048             {
2049                 for (x = 0; x < 4; x++)
2050                 {
2051                     switch (d)
2052                     {
2053                         case 0: y = 1 + i*8 + 2*x; break;
2054                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
2055                         case 2: y = 1 + i*8 + x; break;
2056                     }
2057                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
2058                 }
2059             }
2060         }
2061         gmx_fio_fclose(out);
2062         sfree(grid_r);
2063     }
2064 }
2065
2066 void write_dd_pdb(const char *fn, gmx_large_int_t step, const char *title,
2067                   gmx_mtop_t *mtop, t_commrec *cr,
2068                   int natoms, rvec x[], matrix box)
2069 {
2070     char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
2071     FILE         *out;
2072     int           i, ii, resnr, c;
2073     char         *atomname, *resname;
2074     real          b;
2075     gmx_domdec_t *dd;
2076
2077     dd = cr->dd;
2078     if (natoms == -1)
2079     {
2080         natoms = dd->comm->nat[ddnatVSITE];
2081     }
2082
2083     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
2084
2085     sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
2086     sprintf(format4, "%s%s\n", pdbformat4, "%6.2f%6.2f");
2087
2088     out = gmx_fio_fopen(fname, "w");
2089
2090     fprintf(out, "TITLE     %s\n", title);
2091     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2092     for (i = 0; i < natoms; i++)
2093     {
2094         ii = dd->gatindex[i];
2095         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
2096         if (i < dd->comm->nat[ddnatZONE])
2097         {
2098             c = 0;
2099             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2100             {
2101                 c++;
2102             }
2103             b = c;
2104         }
2105         else if (i < dd->comm->nat[ddnatVSITE])
2106         {
2107             b = dd->comm->zones.n;
2108         }
2109         else
2110         {
2111             b = dd->comm->zones.n + 1;
2112         }
2113         fprintf(out, strlen(atomname) < 4 ? format : format4,
2114                 "ATOM", (ii+1)%100000,
2115                 atomname, resname, ' ', resnr%10000, ' ',
2116                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
2117     }
2118     fprintf(out, "TER\n");
2119
2120     gmx_fio_fclose(out);
2121 }
2122
2123 real dd_cutoff_mbody(gmx_domdec_t *dd)
2124 {
2125     gmx_domdec_comm_t *comm;
2126     int                di;
2127     real               r;
2128
2129     comm = dd->comm;
2130
2131     r = -1;
2132     if (comm->bInterCGBondeds)
2133     {
2134         if (comm->cutoff_mbody > 0)
2135         {
2136             r = comm->cutoff_mbody;
2137         }
2138         else
2139         {
2140             /* cutoff_mbody=0 means we do not have DLB */
2141             r = comm->cellsize_min[dd->dim[0]];
2142             for (di = 1; di < dd->ndim; di++)
2143             {
2144                 r = min(r, comm->cellsize_min[dd->dim[di]]);
2145             }
2146             if (comm->bBondComm)
2147             {
2148                 r = max(r, comm->cutoff_mbody);
2149             }
2150             else
2151             {
2152                 r = min(r, comm->cutoff);
2153             }
2154         }
2155     }
2156
2157     return r;
2158 }
2159
2160 real dd_cutoff_twobody(gmx_domdec_t *dd)
2161 {
2162     real r_mb;
2163
2164     r_mb = dd_cutoff_mbody(dd);
2165
2166     return max(dd->comm->cutoff, r_mb);
2167 }
2168
2169
2170 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
2171 {
2172     int nc, ntot;
2173
2174     nc   = dd->nc[dd->comm->cartpmedim];
2175     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2176     copy_ivec(coord, coord_pme);
2177     coord_pme[dd->comm->cartpmedim] =
2178         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2179 }
2180
2181 static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
2182 {
2183     /* Here we assign a PME node to communicate with this DD node
2184      * by assuming that the major index of both is x.
2185      * We add cr->npmenodes/2 to obtain an even distribution.
2186      */
2187     return (ddindex*npme + npme/2)/ndd;
2188 }
2189
2190 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
2191 {
2192     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
2193 }
2194
2195 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
2196 {
2197     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
2198 }
2199
2200 static int *dd_pmenodes(t_commrec *cr)
2201 {
2202     int *pmenodes;
2203     int  n, i, p0, p1;
2204
2205     snew(pmenodes, cr->npmenodes);
2206     n = 0;
2207     for (i = 0; i < cr->dd->nnodes; i++)
2208     {
2209         p0 = cr_ddindex2pmeindex(cr, i);
2210         p1 = cr_ddindex2pmeindex(cr, i+1);
2211         if (i+1 == cr->dd->nnodes || p1 > p0)
2212         {
2213             if (debug)
2214             {
2215                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
2216             }
2217             pmenodes[n] = i + 1 + n;
2218             n++;
2219         }
2220     }
2221
2222     return pmenodes;
2223 }
2224
2225 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
2226 {
2227     gmx_domdec_t *dd;
2228     ivec          coords, coords_pme, nc;
2229     int           slab;
2230
2231     dd = cr->dd;
2232     /*
2233        if (dd->comm->bCartesian) {
2234        gmx_ddindex2xyz(dd->nc,ddindex,coords);
2235        dd_coords2pmecoords(dd,coords,coords_pme);
2236        copy_ivec(dd->ntot,nc);
2237        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2238        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2239
2240        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2241        } else {
2242        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2243        }
2244      */
2245     coords[XX] = x;
2246     coords[YY] = y;
2247     coords[ZZ] = z;
2248     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
2249
2250     return slab;
2251 }
2252
2253 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
2254 {
2255     gmx_domdec_comm_t *comm;
2256     ivec               coords;
2257     int                ddindex, nodeid = -1;
2258
2259     comm = cr->dd->comm;
2260
2261     coords[XX] = x;
2262     coords[YY] = y;
2263     coords[ZZ] = z;
2264     if (comm->bCartesianPP_PME)
2265     {
2266 #ifdef GMX_MPI
2267         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
2268 #endif
2269     }
2270     else
2271     {
2272         ddindex = dd_index(cr->dd->nc, coords);
2273         if (comm->bCartesianPP)
2274         {
2275             nodeid = comm->ddindex2simnodeid[ddindex];
2276         }
2277         else
2278         {
2279             if (comm->pmenodes)
2280             {
2281                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
2282             }
2283             else
2284             {
2285                 nodeid = ddindex;
2286             }
2287         }
2288     }
2289
2290     return nodeid;
2291 }
2292
2293 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
2294 {
2295     gmx_domdec_t      *dd;
2296     gmx_domdec_comm_t *comm;
2297     ivec               coord, coord_pme;
2298     int                i;
2299     int                pmenode = -1;
2300
2301     dd   = cr->dd;
2302     comm = dd->comm;
2303
2304     /* This assumes a uniform x domain decomposition grid cell size */
2305     if (comm->bCartesianPP_PME)
2306     {
2307 #ifdef GMX_MPI
2308         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2309         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2310         {
2311             /* This is a PP node */
2312             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2313             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2314         }
2315 #endif
2316     }
2317     else if (comm->bCartesianPP)
2318     {
2319         if (sim_nodeid < dd->nnodes)
2320         {
2321             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2322         }
2323     }
2324     else
2325     {
2326         /* This assumes DD cells with identical x coordinates
2327          * are numbered sequentially.
2328          */
2329         if (dd->comm->pmenodes == NULL)
2330         {
2331             if (sim_nodeid < dd->nnodes)
2332             {
2333                 /* The DD index equals the nodeid */
2334                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2335             }
2336         }
2337         else
2338         {
2339             i = 0;
2340             while (sim_nodeid > dd->comm->pmenodes[i])
2341             {
2342                 i++;
2343             }
2344             if (sim_nodeid < dd->comm->pmenodes[i])
2345             {
2346                 pmenode = dd->comm->pmenodes[i];
2347             }
2348         }
2349     }
2350
2351     return pmenode;
2352 }
2353
2354 void get_pme_nnodes(const gmx_domdec_t *dd,
2355                     int *npmenodes_x, int *npmenodes_y)
2356 {
2357     if (dd != NULL)
2358     {
2359         *npmenodes_x = dd->comm->npmenodes_x;
2360         *npmenodes_y = dd->comm->npmenodes_y;
2361     }
2362     else
2363     {
2364         *npmenodes_x = 1;
2365         *npmenodes_y = 1;
2366     }
2367 }
2368
2369 gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
2370 {
2371     gmx_bool bPMEOnlyNode;
2372
2373     if (DOMAINDECOMP(cr))
2374     {
2375         bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
2376     }
2377     else
2378     {
2379         bPMEOnlyNode = FALSE;
2380     }
2381
2382     return bPMEOnlyNode;
2383 }
2384
2385 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2386                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2387 {
2388     gmx_domdec_t *dd;
2389     int           x, y, z;
2390     ivec          coord, coord_pme;
2391
2392     dd = cr->dd;
2393
2394     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2395
2396     *nmy_ddnodes = 0;
2397     for (x = 0; x < dd->nc[XX]; x++)
2398     {
2399         for (y = 0; y < dd->nc[YY]; y++)
2400         {
2401             for (z = 0; z < dd->nc[ZZ]; z++)
2402             {
2403                 if (dd->comm->bCartesianPP_PME)
2404                 {
2405                     coord[XX] = x;
2406                     coord[YY] = y;
2407                     coord[ZZ] = z;
2408                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2409                     if (dd->ci[XX] == coord_pme[XX] &&
2410                         dd->ci[YY] == coord_pme[YY] &&
2411                         dd->ci[ZZ] == coord_pme[ZZ])
2412                     {
2413                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2414                     }
2415                 }
2416                 else
2417                 {
2418                     /* The slab corresponds to the nodeid in the PME group */
2419                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2420                     {
2421                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2422                     }
2423                 }
2424             }
2425         }
2426     }
2427
2428     /* The last PP-only node is the peer node */
2429     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2430
2431     if (debug)
2432     {
2433         fprintf(debug, "Receive coordinates from PP nodes:");
2434         for (x = 0; x < *nmy_ddnodes; x++)
2435         {
2436             fprintf(debug, " %d", (*my_ddnodes)[x]);
2437         }
2438         fprintf(debug, "\n");
2439     }
2440 }
2441
2442 static gmx_bool receive_vir_ener(t_commrec *cr)
2443 {
2444     gmx_domdec_comm_t *comm;
2445     int                pmenode, coords[DIM], rank;
2446     gmx_bool           bReceive;
2447
2448     bReceive = TRUE;
2449     if (cr->npmenodes < cr->dd->nnodes)
2450     {
2451         comm = cr->dd->comm;
2452         if (comm->bCartesianPP_PME)
2453         {
2454             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2455 #ifdef GMX_MPI
2456             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2457             coords[comm->cartpmedim]++;
2458             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2459             {
2460                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2461                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2462                 {
2463                     /* This is not the last PP node for pmenode */
2464                     bReceive = FALSE;
2465                 }
2466             }
2467 #endif
2468         }
2469         else
2470         {
2471             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2472             if (cr->sim_nodeid+1 < cr->nnodes &&
2473                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2474             {
2475                 /* This is not the last PP node for pmenode */
2476                 bReceive = FALSE;
2477             }
2478         }
2479     }
2480
2481     return bReceive;
2482 }
2483
2484 static void set_zones_ncg_home(gmx_domdec_t *dd)
2485 {
2486     gmx_domdec_zones_t *zones;
2487     int                 i;
2488
2489     zones = &dd->comm->zones;
2490
2491     zones->cg_range[0] = 0;
2492     for (i = 1; i < zones->n+1; i++)
2493     {
2494         zones->cg_range[i] = dd->ncg_home;
2495     }
2496     /* zone_ncg1[0] should always be equal to ncg_home */
2497     dd->comm->zone_ncg1[0] = dd->ncg_home;
2498 }
2499
2500 static void rebuild_cgindex(gmx_domdec_t *dd,
2501                             const int *gcgs_index, t_state *state)
2502 {
2503     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2504
2505     ind        = state->cg_gl;
2506     dd_cg_gl   = dd->index_gl;
2507     cgindex    = dd->cgindex;
2508     nat        = 0;
2509     cgindex[0] = nat;
2510     for (i = 0; i < state->ncg_gl; i++)
2511     {
2512         cgindex[i]  = nat;
2513         cg_gl       = ind[i];
2514         dd_cg_gl[i] = cg_gl;
2515         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2516     }
2517     cgindex[i] = nat;
2518
2519     dd->ncg_home = state->ncg_gl;
2520     dd->nat_home = nat;
2521
2522     set_zones_ncg_home(dd);
2523 }
2524
2525 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2526 {
2527     while (cg >= cginfo_mb->cg_end)
2528     {
2529         cginfo_mb++;
2530     }
2531
2532     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2533 }
2534
2535 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2536                           t_forcerec *fr, char *bLocalCG)
2537 {
2538     cginfo_mb_t *cginfo_mb;
2539     int         *cginfo;
2540     int          cg;
2541
2542     if (fr != NULL)
2543     {
2544         cginfo_mb = fr->cginfo_mb;
2545         cginfo    = fr->cginfo;
2546
2547         for (cg = cg0; cg < cg1; cg++)
2548         {
2549             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2550         }
2551     }
2552
2553     if (bLocalCG != NULL)
2554     {
2555         for (cg = cg0; cg < cg1; cg++)
2556         {
2557             bLocalCG[index_gl[cg]] = TRUE;
2558         }
2559     }
2560 }
2561
2562 static void make_dd_indices(gmx_domdec_t *dd,
2563                             const int *gcgs_index, int cg_start)
2564 {
2565     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2566     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2567     gmx_ga2la_t *ga2la;
2568     char        *bLocalCG;
2569     gmx_bool     bCGs;
2570
2571     bLocalCG = dd->comm->bLocalCG;
2572
2573     if (dd->nat_tot > dd->gatindex_nalloc)
2574     {
2575         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2576         srenew(dd->gatindex, dd->gatindex_nalloc);
2577     }
2578
2579     nzone      = dd->comm->zones.n;
2580     zone2cg    = dd->comm->zones.cg_range;
2581     zone_ncg1  = dd->comm->zone_ncg1;
2582     index_gl   = dd->index_gl;
2583     gatindex   = dd->gatindex;
2584     bCGs       = dd->comm->bCGs;
2585
2586     if (zone2cg[1] != dd->ncg_home)
2587     {
2588         gmx_incons("dd->ncg_zone is not up to date");
2589     }
2590
2591     /* Make the local to global and global to local atom index */
2592     a = dd->cgindex[cg_start];
2593     for (zone = 0; zone < nzone; zone++)
2594     {
2595         if (zone == 0)
2596         {
2597             cg0 = cg_start;
2598         }
2599         else
2600         {
2601             cg0 = zone2cg[zone];
2602         }
2603         cg1    = zone2cg[zone+1];
2604         cg1_p1 = cg0 + zone_ncg1[zone];
2605
2606         for (cg = cg0; cg < cg1; cg++)
2607         {
2608             zone1 = zone;
2609             if (cg >= cg1_p1)
2610             {
2611                 /* Signal that this cg is from more than one pulse away */
2612                 zone1 += nzone;
2613             }
2614             cg_gl = index_gl[cg];
2615             if (bCGs)
2616             {
2617                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2618                 {
2619                     gatindex[a] = a_gl;
2620                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2621                     a++;
2622                 }
2623             }
2624             else
2625             {
2626                 gatindex[a] = cg_gl;
2627                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2628                 a++;
2629             }
2630         }
2631     }
2632 }
2633
2634 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2635                           const char *where)
2636 {
2637     int ncg, i, ngl, nerr;
2638
2639     nerr = 0;
2640     if (bLocalCG == NULL)
2641     {
2642         return nerr;
2643     }
2644     for (i = 0; i < dd->ncg_tot; i++)
2645     {
2646         if (!bLocalCG[dd->index_gl[i]])
2647         {
2648             fprintf(stderr,
2649                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2650             nerr++;
2651         }
2652     }
2653     ngl = 0;
2654     for (i = 0; i < ncg_sys; i++)
2655     {
2656         if (bLocalCG[i])
2657         {
2658             ngl++;
2659         }
2660     }
2661     if (ngl != dd->ncg_tot)
2662     {
2663         fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2664         nerr++;
2665     }
2666
2667     return nerr;
2668 }
2669
2670 static void check_index_consistency(gmx_domdec_t *dd,
2671                                     int natoms_sys, int ncg_sys,
2672                                     const char *where)
2673 {
2674     int   nerr, ngl, i, a, cell;
2675     int  *have;
2676
2677     nerr = 0;
2678
2679     if (dd->comm->DD_debug > 1)
2680     {
2681         snew(have, natoms_sys);
2682         for (a = 0; a < dd->nat_tot; a++)
2683         {
2684             if (have[dd->gatindex[a]] > 0)
2685             {
2686                 fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2687             }
2688             else
2689             {
2690                 have[dd->gatindex[a]] = a + 1;
2691             }
2692         }
2693         sfree(have);
2694     }
2695
2696     snew(have, dd->nat_tot);
2697
2698     ngl  = 0;
2699     for (i = 0; i < natoms_sys; i++)
2700     {
2701         if (ga2la_get(dd->ga2la, i, &a, &cell))
2702         {
2703             if (a >= dd->nat_tot)
2704             {
2705                 fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2706                 nerr++;
2707             }
2708             else
2709             {
2710                 have[a] = 1;
2711                 if (dd->gatindex[a] != i)
2712                 {
2713                     fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2714                     nerr++;
2715                 }
2716             }
2717             ngl++;
2718         }
2719     }
2720     if (ngl != dd->nat_tot)
2721     {
2722         fprintf(stderr,
2723                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2724                 dd->rank, where, ngl, dd->nat_tot);
2725     }
2726     for (a = 0; a < dd->nat_tot; a++)
2727     {
2728         if (have[a] == 0)
2729         {
2730             fprintf(stderr,
2731                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2732                     dd->rank, where, a+1, dd->gatindex[a]+1);
2733         }
2734     }
2735     sfree(have);
2736
2737     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2738
2739     if (nerr > 0)
2740     {
2741         gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies",
2742                   dd->rank, where, nerr);
2743     }
2744 }
2745
2746 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2747 {
2748     int   i;
2749     char *bLocalCG;
2750
2751     if (a_start == 0)
2752     {
2753         /* Clear the whole list without searching */
2754         ga2la_clear(dd->ga2la);
2755     }
2756     else
2757     {
2758         for (i = a_start; i < dd->nat_tot; i++)
2759         {
2760             ga2la_del(dd->ga2la, dd->gatindex[i]);
2761         }
2762     }
2763
2764     bLocalCG = dd->comm->bLocalCG;
2765     if (bLocalCG)
2766     {
2767         for (i = cg_start; i < dd->ncg_tot; i++)
2768         {
2769             bLocalCG[dd->index_gl[i]] = FALSE;
2770         }
2771     }
2772
2773     dd_clear_local_vsite_indices(dd);
2774
2775     if (dd->constraints)
2776     {
2777         dd_clear_local_constraint_indices(dd);
2778     }
2779 }
2780
2781 /* This function should be used for moving the domain boudaries during DLB,
2782  * for obtaining the minimum cell size. It checks the initially set limit
2783  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2784  * and, possibly, a longer cut-off limit set for PME load balancing.
2785  */
2786 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2787 {
2788     real cellsize_min;
2789
2790     cellsize_min = comm->cellsize_min[dim];
2791
2792     if (!comm->bVacDLBNoLimit)
2793     {
2794         /* The cut-off might have changed, e.g. by PME load balacning,
2795          * from the value used to set comm->cellsize_min, so check it.
2796          */
2797         cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2798
2799         if (comm->bPMELoadBalDLBLimits)
2800         {
2801             /* Check for the cut-off limit set by the PME load balancing */
2802             cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2803         }
2804     }
2805
2806     return cellsize_min;
2807 }
2808
2809 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2810                             int dim_ind)
2811 {
2812     real grid_jump_limit;
2813
2814     /* The distance between the boundaries of cells at distance
2815      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2816      * and by the fact that cells should not be shifted by more than
2817      * half their size, such that cg's only shift by one cell
2818      * at redecomposition.
2819      */
2820     grid_jump_limit = comm->cellsize_limit;
2821     if (!comm->bVacDLBNoLimit)
2822     {
2823         if (comm->bPMELoadBalDLBLimits)
2824         {
2825             cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
2826         }
2827         grid_jump_limit = max(grid_jump_limit,
2828                               cutoff/comm->cd[dim_ind].np);
2829     }
2830
2831     return grid_jump_limit;
2832 }
2833
2834 static gmx_bool check_grid_jump(gmx_large_int_t step,
2835                                 gmx_domdec_t   *dd,
2836                                 real            cutoff,
2837                                 gmx_ddbox_t    *ddbox,
2838                                 gmx_bool        bFatal)
2839 {
2840     gmx_domdec_comm_t *comm;
2841     int                d, dim;
2842     real               limit, bfac;
2843     gmx_bool           bInvalid;
2844
2845     bInvalid = FALSE;
2846
2847     comm = dd->comm;
2848
2849     for (d = 1; d < dd->ndim; d++)
2850     {
2851         dim   = dd->dim[d];
2852         limit = grid_jump_limit(comm, cutoff, d);
2853         bfac  = ddbox->box_size[dim];
2854         if (ddbox->tric_dir[dim])
2855         {
2856             bfac *= ddbox->skew_fac[dim];
2857         }
2858         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2859                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2860         {
2861             bInvalid = TRUE;
2862
2863             if (bFatal)
2864             {
2865                 char buf[22];
2866
2867                 /* This error should never be triggered under normal
2868                  * circumstances, but you never know ...
2869                  */
2870                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2871                           gmx_step_str(step, buf),
2872                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2873             }
2874         }
2875     }
2876
2877     return bInvalid;
2878 }
2879
2880 static int dd_load_count(gmx_domdec_comm_t *comm)
2881 {
2882     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2883 }
2884
2885 static float dd_force_load(gmx_domdec_comm_t *comm)
2886 {
2887     float load;
2888
2889     if (comm->eFlop)
2890     {
2891         load = comm->flop;
2892         if (comm->eFlop > 1)
2893         {
2894             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2895         }
2896     }
2897     else
2898     {
2899         load = comm->cycl[ddCyclF];
2900         if (comm->cycl_n[ddCyclF] > 1)
2901         {
2902             /* Subtract the maximum of the last n cycle counts
2903              * to get rid of possible high counts due to other sources,
2904              * for instance system activity, that would otherwise
2905              * affect the dynamic load balancing.
2906              */
2907             load -= comm->cycl_max[ddCyclF];
2908         }
2909
2910 #ifdef GMX_MPI
2911         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2912         {
2913             float gpu_wait, gpu_wait_sum;
2914
2915             gpu_wait = comm->cycl[ddCyclWaitGPU];
2916             if (comm->cycl_n[ddCyclF] > 1)
2917             {
2918                 /* We should remove the WaitGPU time of the same MD step
2919                  * as the one with the maximum F time, since the F time
2920                  * and the wait time are not independent.
2921                  * Furthermore, the step for the max F time should be chosen
2922                  * the same on all ranks that share the same GPU.
2923                  * But to keep the code simple, we remove the average instead.
2924                  * The main reason for artificially long times at some steps
2925                  * is spurious CPU activity or MPI time, so we don't expect
2926                  * that changes in the GPU wait time matter a lot here.
2927                  */
2928                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2929             }
2930             /* Sum the wait times over the ranks that share the same GPU */
2931             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2932                           comm->mpi_comm_gpu_shared);
2933             /* Replace the wait time by the average over the ranks */
2934             load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2935         }
2936 #endif
2937     }
2938
2939     return load;
2940 }
2941
2942 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2943 {
2944     gmx_domdec_comm_t *comm;
2945     int                i;
2946
2947     comm = dd->comm;
2948
2949     snew(*dim_f, dd->nc[dim]+1);
2950     (*dim_f)[0] = 0;
2951     for (i = 1; i < dd->nc[dim]; i++)
2952     {
2953         if (comm->slb_frac[dim])
2954         {
2955             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2956         }
2957         else
2958         {
2959             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2960         }
2961     }
2962     (*dim_f)[dd->nc[dim]] = 1;
2963 }
2964
2965 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2966 {
2967     int  pmeindex, slab, nso, i;
2968     ivec xyz;
2969
2970     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2971     {
2972         ddpme->dim = YY;
2973     }
2974     else
2975     {
2976         ddpme->dim = dimind;
2977     }
2978     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2979
2980     ddpme->nslab = (ddpme->dim == 0 ?
2981                     dd->comm->npmenodes_x :
2982                     dd->comm->npmenodes_y);
2983
2984     if (ddpme->nslab <= 1)
2985     {
2986         return;
2987     }
2988
2989     nso = dd->comm->npmenodes/ddpme->nslab;
2990     /* Determine for each PME slab the PP location range for dimension dim */
2991     snew(ddpme->pp_min, ddpme->nslab);
2992     snew(ddpme->pp_max, ddpme->nslab);
2993     for (slab = 0; slab < ddpme->nslab; slab++)
2994     {
2995         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2996         ddpme->pp_max[slab] = 0;
2997     }
2998     for (i = 0; i < dd->nnodes; i++)
2999     {
3000         ddindex2xyz(dd->nc, i, xyz);
3001         /* For y only use our y/z slab.
3002          * This assumes that the PME x grid size matches the DD grid size.
3003          */
3004         if (dimind == 0 || xyz[XX] == dd->ci[XX])
3005         {
3006             pmeindex = ddindex2pmeindex(dd, i);
3007             if (dimind == 0)
3008             {
3009                 slab = pmeindex/nso;
3010             }
3011             else
3012             {
3013                 slab = pmeindex % ddpme->nslab;
3014             }
3015             ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
3016             ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
3017         }
3018     }
3019
3020     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
3021 }
3022
3023 int dd_pme_maxshift_x(gmx_domdec_t *dd)
3024 {
3025     if (dd->comm->ddpme[0].dim == XX)
3026     {
3027         return dd->comm->ddpme[0].maxshift;
3028     }
3029     else
3030     {
3031         return 0;
3032     }
3033 }
3034
3035 int dd_pme_maxshift_y(gmx_domdec_t *dd)
3036 {
3037     if (dd->comm->ddpme[0].dim == YY)
3038     {
3039         return dd->comm->ddpme[0].maxshift;
3040     }
3041     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
3042     {
3043         return dd->comm->ddpme[1].maxshift;
3044     }
3045     else
3046     {
3047         return 0;
3048     }
3049 }
3050
3051 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
3052                              gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
3053 {
3054     gmx_domdec_comm_t *comm;
3055     int                nc, ns, s;
3056     int               *xmin, *xmax;
3057     real               range, pme_boundary;
3058     int                sh;
3059
3060     comm = dd->comm;
3061     nc   = dd->nc[ddpme->dim];
3062     ns   = ddpme->nslab;
3063
3064     if (!ddpme->dim_match)
3065     {
3066         /* PP decomposition is not along dim: the worst situation */
3067         sh = ns/2;
3068     }
3069     else if (ns <= 3 || (bUniform && ns == nc))
3070     {
3071         /* The optimal situation */
3072         sh = 1;
3073     }
3074     else
3075     {
3076         /* We need to check for all pme nodes which nodes they
3077          * could possibly need to communicate with.
3078          */
3079         xmin = ddpme->pp_min;
3080         xmax = ddpme->pp_max;
3081         /* Allow for atoms to be maximally 2/3 times the cut-off
3082          * out of their DD cell. This is a reasonable balance between
3083          * between performance and support for most charge-group/cut-off
3084          * combinations.
3085          */
3086         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
3087         /* Avoid extra communication when we are exactly at a boundary */
3088         range *= 0.999;
3089
3090         sh = 1;
3091         for (s = 0; s < ns; s++)
3092         {
3093             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
3094             pme_boundary = (real)s/ns;
3095             while (sh+1 < ns &&
3096                    ((s-(sh+1) >= 0 &&
3097                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
3098                     (s-(sh+1) <  0 &&
3099                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
3100             {
3101                 sh++;
3102             }
3103             pme_boundary = (real)(s+1)/ns;
3104             while (sh+1 < ns &&
3105                    ((s+(sh+1) <  ns &&
3106                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
3107                     (s+(sh+1) >= ns &&
3108                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
3109             {
3110                 sh++;
3111             }
3112         }
3113     }
3114
3115     ddpme->maxshift = sh;
3116
3117     if (debug)
3118     {
3119         fprintf(debug, "PME slab communication range for dim %d is %d\n",
3120                 ddpme->dim, ddpme->maxshift);
3121     }
3122 }
3123
3124 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3125 {
3126     int d, dim;
3127
3128     for (d = 0; d < dd->ndim; d++)
3129     {
3130         dim = dd->dim[d];
3131         if (dim < ddbox->nboundeddim &&
3132             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3133             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3134         {
3135             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3136                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3137                       dd->nc[dim], dd->comm->cellsize_limit);
3138         }
3139     }
3140 }
3141
3142 enum { setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY };
3143
3144 /* Set the domain boundaries. Use for static (or no) load balancing,
3145  * and also for the starting state for dynamic load balancing.
3146  * setmode determine if and where the boundaries are stored, use enum above.
3147  * Returns the number communication pulses in npulse.
3148  */
3149 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
3150                                   int setmode, ivec npulse)
3151 {
3152     gmx_domdec_comm_t *comm;
3153     int                d, j;
3154     rvec               cellsize_min;
3155     real              *cell_x, cell_dx, cellsize;
3156
3157     comm = dd->comm;
3158
3159     for (d = 0; d < DIM; d++)
3160     {
3161         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3162         npulse[d]       = 1;
3163         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3164         {
3165             /* Uniform grid */
3166             cell_dx = ddbox->box_size[d]/dd->nc[d];
3167             switch (setmode)
3168             {
3169                 case setcellsizeslbMASTER:
3170                     for (j = 0; j < dd->nc[d]+1; j++)
3171                     {
3172                         dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3173                     }
3174                     break;
3175                 case setcellsizeslbLOCAL:
3176                     comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3177                     comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3178                     break;
3179                 default:
3180                     break;
3181             }
3182             cellsize = cell_dx*ddbox->skew_fac[d];
3183             while (cellsize*npulse[d] < comm->cutoff)
3184             {
3185                 npulse[d]++;
3186             }
3187             cellsize_min[d] = cellsize;
3188         }
3189         else
3190         {
3191             /* Statically load balanced grid */
3192             /* Also when we are not doing a master distribution we determine
3193              * all cell borders in a loop to obtain identical values
3194              * to the master distribution case and to determine npulse.
3195              */
3196             if (setmode == setcellsizeslbMASTER)
3197             {
3198                 cell_x = dd->ma->cell_x[d];
3199             }
3200             else
3201             {
3202                 snew(cell_x, dd->nc[d]+1);
3203             }
3204             cell_x[0] = ddbox->box0[d];
3205             for (j = 0; j < dd->nc[d]; j++)
3206             {
3207                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
3208                 cell_x[j+1] = cell_x[j] + cell_dx;
3209                 cellsize    = cell_dx*ddbox->skew_fac[d];
3210                 while (cellsize*npulse[d] < comm->cutoff &&
3211                        npulse[d] < dd->nc[d]-1)
3212                 {
3213                     npulse[d]++;
3214                 }
3215                 cellsize_min[d] = min(cellsize_min[d], cellsize);
3216             }
3217             if (setmode == setcellsizeslbLOCAL)
3218             {
3219                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3220                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3221             }
3222             if (setmode != setcellsizeslbMASTER)
3223             {
3224                 sfree(cell_x);
3225             }
3226         }
3227         /* The following limitation is to avoid that a cell would receive
3228          * some of its own home charge groups back over the periodic boundary.
3229          * Double charge groups cause trouble with the global indices.
3230          */
3231         if (d < ddbox->npbcdim &&
3232             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3233         {
3234             char error_string[STRLEN];
3235
3236             sprintf(error_string,
3237                      "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3238                     dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
3239                     comm->cutoff,
3240                     dd->nc[d], dd->nc[d],
3241                     dd->nnodes > dd->nc[d] ? "cells" : "processors");
3242
3243             if (setmode == setcellsizeslbLOCAL)
3244             {
3245                 gmx_fatal_collective(FARGS, NULL, dd, error_string);
3246             }
3247             else
3248             {
3249                 gmx_fatal(FARGS, error_string);
3250             }
3251         }
3252     }
3253
3254     if (!comm->bDynLoadBal)
3255     {
3256         copy_rvec(cellsize_min, comm->cellsize_min);
3257     }
3258
3259     for (d = 0; d < comm->npmedecompdim; d++)
3260     {
3261         set_pme_maxshift(dd, &comm->ddpme[d],
3262                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
3263                          comm->ddpme[d].slb_dim_f);
3264     }
3265 }
3266
3267
3268 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3269                                                   int d, int dim, gmx_domdec_root_t *root,
3270                                                   gmx_ddbox_t *ddbox,
3271                                                   gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[])
3272 {
3273     gmx_domdec_comm_t *comm;
3274     int                ncd, i, j, nmin, nmin_old;
3275     gmx_bool           bLimLo, bLimHi;
3276     real              *cell_size;
3277     real               fac, halfway, cellsize_limit_f_i, region_size;
3278     gmx_bool           bPBC, bLastHi = FALSE;
3279     int                nrange[] = {range[0], range[1]};
3280
3281     region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
3282
3283     comm = dd->comm;
3284
3285     ncd = dd->nc[dim];
3286
3287     bPBC = (dim < ddbox->npbcdim);
3288
3289     cell_size = root->buf_ncd;
3290
3291     if (debug)
3292     {
3293         fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
3294     }
3295
3296     /* First we need to check if the scaling does not make cells
3297      * smaller than the smallest allowed size.
3298      * We need to do this iteratively, since if a cell is too small,
3299      * it needs to be enlarged, which makes all the other cells smaller,
3300      * which could in turn make another cell smaller than allowed.
3301      */
3302     for (i = range[0]; i < range[1]; i++)
3303     {
3304         root->bCellMin[i] = FALSE;
3305     }
3306     nmin = 0;
3307     do
3308     {
3309         nmin_old = nmin;
3310         /* We need the total for normalization */
3311         fac = 0;
3312         for (i = range[0]; i < range[1]; i++)
3313         {
3314             if (root->bCellMin[i] == FALSE)
3315             {
3316                 fac += cell_size[i];
3317             }
3318         }
3319         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3320         /* Determine the cell boundaries */
3321         for (i = range[0]; i < range[1]; i++)
3322         {
3323             if (root->bCellMin[i] == FALSE)
3324             {
3325                 cell_size[i] *= fac;
3326                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3327                 {
3328                     cellsize_limit_f_i = 0;
3329                 }
3330                 else
3331                 {
3332                     cellsize_limit_f_i = cellsize_limit_f;
3333                 }
3334                 if (cell_size[i] < cellsize_limit_f_i)
3335                 {
3336                     root->bCellMin[i] = TRUE;
3337                     cell_size[i]      = cellsize_limit_f_i;
3338                     nmin++;
3339                 }
3340             }
3341             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3342         }
3343     }
3344     while (nmin > nmin_old);
3345
3346     i            = range[1]-1;
3347     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3348     /* For this check we should not use DD_CELL_MARGIN,
3349      * but a slightly smaller factor,
3350      * since rounding could get use below the limit.
3351      */
3352     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3353     {
3354         char buf[22];
3355         gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3356                   gmx_step_str(step, buf),
3357                   dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3358                   ncd, comm->cellsize_min[dim]);
3359     }
3360
3361     root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
3362
3363     if (!bUniform)
3364     {
3365         /* Check if the boundary did not displace more than halfway
3366          * each of the cells it bounds, as this could cause problems,
3367          * especially when the differences between cell sizes are large.
3368          * If changes are applied, they will not make cells smaller
3369          * than the cut-off, as we check all the boundaries which
3370          * might be affected by a change and if the old state was ok,
3371          * the cells will at most be shrunk back to their old size.
3372          */
3373         for (i = range[0]+1; i < range[1]; i++)
3374         {
3375             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3376             if (root->cell_f[i] < halfway)
3377             {
3378                 root->cell_f[i] = halfway;
3379                 /* Check if the change also causes shifts of the next boundaries */
3380                 for (j = i+1; j < range[1]; j++)
3381                 {
3382                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3383                     {
3384                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3385                     }
3386                 }
3387             }
3388             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3389             if (root->cell_f[i] > halfway)
3390             {
3391                 root->cell_f[i] = halfway;
3392                 /* Check if the change also causes shifts of the next boundaries */
3393                 for (j = i-1; j >= range[0]+1; j--)
3394                 {
3395                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3396                     {
3397                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3398                     }
3399                 }
3400             }
3401         }
3402     }
3403
3404     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3405     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3406      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3407      * for a and b nrange is used */
3408     if (d > 0)
3409     {
3410         /* Take care of the staggering of the cell boundaries */
3411         if (bUniform)
3412         {
3413             for (i = range[0]; i < range[1]; i++)
3414             {
3415                 root->cell_f_max0[i] = root->cell_f[i];
3416                 root->cell_f_min1[i] = root->cell_f[i+1];
3417             }
3418         }
3419         else
3420         {
3421             for (i = range[0]+1; i < range[1]; i++)
3422             {
3423                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3424                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3425                 if (bLimLo && bLimHi)
3426                 {
3427                     /* Both limits violated, try the best we can */
3428                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3429                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3430                     nrange[0]       = range[0];
3431                     nrange[1]       = i;
3432                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3433
3434                     nrange[0] = i;
3435                     nrange[1] = range[1];
3436                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3437
3438                     return;
3439                 }
3440                 else if (bLimLo)
3441                 {
3442                     /* root->cell_f[i] = root->bound_min[i]; */
3443                     nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3444                     bLastHi   = FALSE;
3445                 }
3446                 else if (bLimHi && !bLastHi)
3447                 {
3448                     bLastHi = TRUE;
3449                     if (nrange[1] < range[1])   /* found a LimLo before */
3450                     {
3451                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3452                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3453                         nrange[0] = nrange[1];
3454                     }
3455                     root->cell_f[i] = root->bound_max[i];
3456                     nrange[1]       = i;
3457                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3458                     nrange[0] = i;
3459                     nrange[1] = range[1];
3460                 }
3461             }
3462             if (nrange[1] < range[1])   /* found last a LimLo */
3463             {
3464                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3465                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3466                 nrange[0] = nrange[1];
3467                 nrange[1] = range[1];
3468                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3469             }
3470             else if (nrange[0] > range[0]) /* found at least one LimHi */
3471             {
3472                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3473             }
3474         }
3475     }
3476 }
3477
3478
3479 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3480                                        int d, int dim, gmx_domdec_root_t *root,
3481                                        gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3482                                        gmx_bool bUniform, gmx_large_int_t step)
3483 {
3484     gmx_domdec_comm_t *comm;
3485     int                ncd, d1, i, j, pos;
3486     real              *cell_size;
3487     real               load_aver, load_i, imbalance, change, change_max, sc;
3488     real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3489     real               change_limit;
3490     real               relax = 0.5;
3491     gmx_bool           bPBC;
3492     int                range[] = { 0, 0 };
3493
3494     comm = dd->comm;
3495
3496     /* Convert the maximum change from the input percentage to a fraction */
3497     change_limit = comm->dlb_scale_lim*0.01;
3498
3499     ncd = dd->nc[dim];
3500
3501     bPBC = (dim < ddbox->npbcdim);
3502
3503     cell_size = root->buf_ncd;
3504
3505     /* Store the original boundaries */
3506     for (i = 0; i < ncd+1; i++)
3507     {
3508         root->old_cell_f[i] = root->cell_f[i];
3509     }
3510     if (bUniform)
3511     {
3512         for (i = 0; i < ncd; i++)
3513         {
3514             cell_size[i] = 1.0/ncd;
3515         }
3516     }
3517     else if (dd_load_count(comm))
3518     {
3519         load_aver  = comm->load[d].sum_m/ncd;
3520         change_max = 0;
3521         for (i = 0; i < ncd; i++)
3522         {
3523             /* Determine the relative imbalance of cell i */
3524             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3525             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3526             /* Determine the change of the cell size using underrelaxation */
3527             change     = -relax*imbalance;
3528             change_max = max(change_max, max(change, -change));
3529         }
3530         /* Limit the amount of scaling.
3531          * We need to use the same rescaling for all cells in one row,
3532          * otherwise the load balancing might not converge.
3533          */
3534         sc = relax;
3535         if (change_max > change_limit)
3536         {
3537             sc *= change_limit/change_max;
3538         }
3539         for (i = 0; i < ncd; i++)
3540         {
3541             /* Determine the relative imbalance of cell i */
3542             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3543             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3544             /* Determine the change of the cell size using underrelaxation */
3545             change       = -sc*imbalance;
3546             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3547         }
3548     }
3549
3550     cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3551     cellsize_limit_f *= DD_CELL_MARGIN;
3552     dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3553     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3554     if (ddbox->tric_dir[dim])
3555     {
3556         cellsize_limit_f /= ddbox->skew_fac[dim];
3557         dist_min_f       /= ddbox->skew_fac[dim];
3558     }
3559     if (bDynamicBox && d > 0)
3560     {
3561         dist_min_f *= DD_PRES_SCALE_MARGIN;
3562     }
3563     if (d > 0 && !bUniform)
3564     {
3565         /* Make sure that the grid is not shifted too much */
3566         for (i = 1; i < ncd; i++)
3567         {
3568             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3569             {
3570                 gmx_incons("Inconsistent DD boundary staggering limits!");
3571             }
3572             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3573             space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3574             if (space > 0)
3575             {
3576                 root->bound_min[i] += 0.5*space;
3577             }
3578             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3579             space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3580             if (space < 0)
3581             {
3582                 root->bound_max[i] += 0.5*space;
3583             }
3584             if (debug)
3585             {
3586                 fprintf(debug,
3587                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3588                         d, i,
3589                         root->cell_f_max0[i-1] + dist_min_f,
3590                         root->bound_min[i], root->cell_f[i], root->bound_max[i],
3591                         root->cell_f_min1[i] - dist_min_f);
3592             }
3593         }
3594     }
3595     range[1]          = ncd;
3596     root->cell_f[0]   = 0;
3597     root->cell_f[ncd] = 1;
3598     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3599
3600
3601     /* After the checks above, the cells should obey the cut-off
3602      * restrictions, but it does not hurt to check.
3603      */
3604     for (i = 0; i < ncd; i++)
3605     {
3606         if (debug)
3607         {
3608             fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
3609                     dim, i, root->cell_f[i], root->cell_f[i+1]);
3610         }
3611
3612         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3613             root->cell_f[i+1] - root->cell_f[i] <
3614             cellsize_limit_f/DD_CELL_MARGIN)
3615         {
3616             char buf[22];
3617             fprintf(stderr,
3618                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3619                     gmx_step_str(step, buf), dim2char(dim), i,
3620                     (root->cell_f[i+1] - root->cell_f[i])
3621                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3622         }
3623     }
3624
3625     pos = ncd + 1;
3626     /* Store the cell boundaries of the lower dimensions at the end */
3627     for (d1 = 0; d1 < d; d1++)
3628     {
3629         root->cell_f[pos++] = comm->cell_f0[d1];
3630         root->cell_f[pos++] = comm->cell_f1[d1];
3631     }
3632
3633     if (d < comm->npmedecompdim)
3634     {
3635         /* The master determines the maximum shift for
3636          * the coordinate communication between separate PME nodes.
3637          */
3638         set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3639     }
3640     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3641     if (d >= 1)
3642     {
3643         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3644     }
3645 }
3646
3647 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3648                                              gmx_ddbox_t *ddbox, int dimind)
3649 {
3650     gmx_domdec_comm_t *comm;
3651     int                dim;
3652
3653     comm = dd->comm;
3654
3655     /* Set the cell dimensions */
3656     dim                = dd->dim[dimind];
3657     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3658     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3659     if (dim >= ddbox->nboundeddim)
3660     {
3661         comm->cell_x0[dim] += ddbox->box0[dim];
3662         comm->cell_x1[dim] += ddbox->box0[dim];
3663     }
3664 }
3665
3666 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3667                                          int d, int dim, real *cell_f_row,
3668                                          gmx_ddbox_t *ddbox)
3669 {
3670     gmx_domdec_comm_t *comm;
3671     int                d1, dim1, pos;
3672
3673     comm = dd->comm;
3674
3675 #ifdef GMX_MPI
3676     /* Each node would only need to know two fractions,
3677      * but it is probably cheaper to broadcast the whole array.
3678      */
3679     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3680               0, comm->mpi_comm_load[d]);
3681 #endif
3682     /* Copy the fractions for this dimension from the buffer */
3683     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3684     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3685     /* The whole array was communicated, so set the buffer position */
3686     pos = dd->nc[dim] + 1;
3687     for (d1 = 0; d1 <= d; d1++)
3688     {
3689         if (d1 < d)
3690         {
3691             /* Copy the cell fractions of the lower dimensions */
3692             comm->cell_f0[d1] = cell_f_row[pos++];
3693             comm->cell_f1[d1] = cell_f_row[pos++];
3694         }
3695         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3696     }
3697     /* Convert the communicated shift from float to int */
3698     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3699     if (d >= 1)
3700     {
3701         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3702     }
3703 }
3704
3705 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3706                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3707                                          gmx_bool bUniform, gmx_large_int_t step)
3708 {
3709     gmx_domdec_comm_t *comm;
3710     int                d, dim, d1;
3711     gmx_bool           bRowMember, bRowRoot;
3712     real              *cell_f_row;
3713
3714     comm = dd->comm;
3715
3716     for (d = 0; d < dd->ndim; d++)
3717     {
3718         dim        = dd->dim[d];
3719         bRowMember = TRUE;
3720         bRowRoot   = TRUE;
3721         for (d1 = d; d1 < dd->ndim; d1++)
3722         {
3723             if (dd->ci[dd->dim[d1]] > 0)
3724             {
3725                 if (d1 > d)
3726                 {
3727                     bRowMember = FALSE;
3728                 }
3729                 bRowRoot = FALSE;
3730             }
3731         }
3732         if (bRowMember)
3733         {
3734             if (bRowRoot)
3735             {
3736                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3737                                            ddbox, bDynamicBox, bUniform, step);
3738                 cell_f_row = comm->root[d]->cell_f;
3739             }
3740             else
3741             {
3742                 cell_f_row = comm->cell_f_row;
3743             }
3744             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3745         }
3746     }
3747 }
3748
3749 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3750 {
3751     int d;
3752
3753     /* This function assumes the box is static and should therefore
3754      * not be called when the box has changed since the last
3755      * call to dd_partition_system.
3756      */
3757     for (d = 0; d < dd->ndim; d++)
3758     {
3759         relative_to_absolute_cell_bounds(dd, ddbox, d);
3760     }
3761 }
3762
3763
3764
3765 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3766                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3767                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3768                                   gmx_wallcycle_t wcycle)
3769 {
3770     gmx_domdec_comm_t *comm;
3771     int                dim;
3772
3773     comm = dd->comm;
3774
3775     if (bDoDLB)
3776     {
3777         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3778         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3779         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3780     }
3781     else if (bDynamicBox)
3782     {
3783         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3784     }
3785
3786     /* Set the dimensions for which no DD is used */
3787     for (dim = 0; dim < DIM; dim++)
3788     {
3789         if (dd->nc[dim] == 1)
3790         {
3791             comm->cell_x0[dim] = 0;
3792             comm->cell_x1[dim] = ddbox->box_size[dim];
3793             if (dim >= ddbox->nboundeddim)
3794             {
3795                 comm->cell_x0[dim] += ddbox->box0[dim];
3796                 comm->cell_x1[dim] += ddbox->box0[dim];
3797             }
3798         }
3799     }
3800 }
3801
3802 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3803 {
3804     int                    d, np, i;
3805     gmx_domdec_comm_dim_t *cd;
3806
3807     for (d = 0; d < dd->ndim; d++)
3808     {
3809         cd = &dd->comm->cd[d];
3810         np = npulse[dd->dim[d]];
3811         if (np > cd->np_nalloc)
3812         {
3813             if (debug)
3814             {
3815                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3816                         dim2char(dd->dim[d]), np);
3817             }
3818             if (DDMASTER(dd) && cd->np_nalloc > 0)
3819             {
3820                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3821             }
3822             srenew(cd->ind, np);
3823             for (i = cd->np_nalloc; i < np; i++)
3824             {
3825                 cd->ind[i].index  = NULL;
3826                 cd->ind[i].nalloc = 0;
3827             }
3828             cd->np_nalloc = np;
3829         }
3830         cd->np = np;
3831     }
3832 }
3833
3834
3835 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3836                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3837                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3838                               gmx_wallcycle_t wcycle)
3839 {
3840     gmx_domdec_comm_t *comm;
3841     int                d;
3842     ivec               npulse;
3843
3844     comm = dd->comm;
3845
3846     /* Copy the old cell boundaries for the cg displacement check */
3847     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3848     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3849
3850     if (comm->bDynLoadBal)
3851     {
3852         if (DDMASTER(dd))
3853         {
3854             check_box_size(dd, ddbox);
3855         }
3856         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3857     }
3858     else
3859     {
3860         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3861         realloc_comm_ind(dd, npulse);
3862     }
3863
3864     if (debug)
3865     {
3866         for (d = 0; d < DIM; d++)
3867         {
3868             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3869                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3870         }
3871     }
3872 }
3873
3874 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3875                                   gmx_ddbox_t *ddbox,
3876                                   rvec cell_ns_x0, rvec cell_ns_x1,
3877                                   gmx_large_int_t step)
3878 {
3879     gmx_domdec_comm_t *comm;
3880     int                dim_ind, dim;
3881
3882     comm = dd->comm;
3883
3884     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3885     {
3886         dim = dd->dim[dim_ind];
3887
3888         /* Without PBC we don't have restrictions on the outer cells */
3889         if (!(dim >= ddbox->npbcdim &&
3890               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3891             comm->bDynLoadBal &&
3892             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3893             comm->cellsize_min[dim])
3894         {
3895             char buf[22];
3896             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3897                       gmx_step_str(step, buf), dim2char(dim),
3898                       comm->cell_x1[dim] - comm->cell_x0[dim],
3899                       ddbox->skew_fac[dim],
3900                       dd->comm->cellsize_min[dim],
3901                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3902         }
3903     }
3904
3905     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3906     {
3907         /* Communicate the boundaries and update cell_ns_x0/1 */
3908         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3909         if (dd->bGridJump && dd->ndim > 1)
3910         {
3911             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3912         }
3913     }
3914 }
3915
3916 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3917 {
3918     if (YY < npbcdim)
3919     {
3920         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3921     }
3922     else
3923     {
3924         tcm[YY][XX] = 0;
3925     }
3926     if (ZZ < npbcdim)
3927     {
3928         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3929         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3930     }
3931     else
3932     {
3933         tcm[ZZ][XX] = 0;
3934         tcm[ZZ][YY] = 0;
3935     }
3936 }
3937
3938 static void check_screw_box(matrix box)
3939 {
3940     /* Mathematical limitation */
3941     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3942     {
3943         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3944     }
3945
3946     /* Limitation due to the asymmetry of the eighth shell method */
3947     if (box[ZZ][YY] != 0)
3948     {
3949         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3950     }
3951 }
3952
3953 static void distribute_cg(FILE *fplog, gmx_large_int_t step,
3954                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3955                           gmx_domdec_t *dd)
3956 {
3957     gmx_domdec_master_t *ma;
3958     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3959     int                  i, icg, j, k, k0, k1, d, npbcdim;
3960     matrix               tcm;
3961     rvec                 box_size, cg_cm;
3962     ivec                 ind;
3963     real                 nrcg, inv_ncg, pos_d;
3964     atom_id             *cgindex;
3965     gmx_bool             bUnbounded, bScrew;
3966
3967     ma = dd->ma;
3968
3969     if (tmp_ind == NULL)
3970     {
3971         snew(tmp_nalloc, dd->nnodes);
3972         snew(tmp_ind, dd->nnodes);
3973         for (i = 0; i < dd->nnodes; i++)
3974         {
3975             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3976             snew(tmp_ind[i], tmp_nalloc[i]);
3977         }
3978     }
3979
3980     /* Clear the count */
3981     for (i = 0; i < dd->nnodes; i++)
3982     {
3983         ma->ncg[i] = 0;
3984         ma->nat[i] = 0;
3985     }
3986
3987     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3988
3989     cgindex = cgs->index;
3990
3991     /* Compute the center of geometry for all charge groups */
3992     for (icg = 0; icg < cgs->nr; icg++)
3993     {
3994         k0      = cgindex[icg];
3995         k1      = cgindex[icg+1];
3996         nrcg    = k1 - k0;
3997         if (nrcg == 1)
3998         {
3999             copy_rvec(pos[k0], cg_cm);
4000         }
4001         else
4002         {
4003             inv_ncg = 1.0/nrcg;
4004
4005             clear_rvec(cg_cm);
4006             for (k = k0; (k < k1); k++)
4007             {
4008                 rvec_inc(cg_cm, pos[k]);
4009             }
4010             for (d = 0; (d < DIM); d++)
4011             {
4012                 cg_cm[d] *= inv_ncg;
4013             }
4014         }
4015         /* Put the charge group in the box and determine the cell index */
4016         for (d = DIM-1; d >= 0; d--)
4017         {
4018             pos_d = cg_cm[d];
4019             if (d < dd->npbcdim)
4020             {
4021                 bScrew = (dd->bScrewPBC && d == XX);
4022                 if (tric_dir[d] && dd->nc[d] > 1)
4023                 {
4024                     /* Use triclinic coordintates for this dimension */
4025                     for (j = d+1; j < DIM; j++)
4026                     {
4027                         pos_d += cg_cm[j]*tcm[j][d];
4028                     }
4029                 }
4030                 while (pos_d >= box[d][d])
4031                 {
4032                     pos_d -= box[d][d];
4033                     rvec_dec(cg_cm, box[d]);
4034                     if (bScrew)
4035                     {
4036                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
4037                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
4038                     }
4039                     for (k = k0; (k < k1); k++)
4040                     {
4041                         rvec_dec(pos[k], box[d]);
4042                         if (bScrew)
4043                         {
4044                             pos[k][YY] = box[YY][YY] - pos[k][YY];
4045                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
4046                         }
4047                     }
4048                 }
4049                 while (pos_d < 0)
4050                 {
4051                     pos_d += box[d][d];
4052                     rvec_inc(cg_cm, box[d]);
4053                     if (bScrew)
4054                     {
4055                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
4056                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
4057                     }
4058                     for (k = k0; (k < k1); k++)
4059                     {
4060                         rvec_inc(pos[k], box[d]);
4061                         if (bScrew)
4062                         {
4063                             pos[k][YY] = box[YY][YY] - pos[k][YY];
4064                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
4065                         }
4066                     }
4067                 }
4068             }
4069             /* This could be done more efficiently */
4070             ind[d] = 0;
4071             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
4072             {
4073                 ind[d]++;
4074             }
4075         }
4076         i = dd_index(dd->nc, ind);
4077         if (ma->ncg[i] == tmp_nalloc[i])
4078         {
4079             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
4080             srenew(tmp_ind[i], tmp_nalloc[i]);
4081         }
4082         tmp_ind[i][ma->ncg[i]] = icg;
4083         ma->ncg[i]++;
4084         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
4085     }
4086
4087     k1 = 0;
4088     for (i = 0; i < dd->nnodes; i++)
4089     {
4090         ma->index[i] = k1;
4091         for (k = 0; k < ma->ncg[i]; k++)
4092         {
4093             ma->cg[k1++] = tmp_ind[i][k];
4094         }
4095     }
4096     ma->index[dd->nnodes] = k1;
4097
4098     for (i = 0; i < dd->nnodes; i++)
4099     {
4100         sfree(tmp_ind[i]);
4101     }
4102     sfree(tmp_ind);
4103     sfree(tmp_nalloc);
4104
4105     if (fplog)
4106     {
4107         char buf[22];
4108         fprintf(fplog, "Charge group distribution at step %s:",
4109                 gmx_step_str(step, buf));
4110         for (i = 0; i < dd->nnodes; i++)
4111         {
4112             fprintf(fplog, " %d", ma->ncg[i]);
4113         }
4114         fprintf(fplog, "\n");
4115     }
4116 }
4117
4118 static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd,
4119                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
4120                                 rvec pos[])
4121 {
4122     gmx_domdec_master_t *ma = NULL;
4123     ivec                 npulse;
4124     int                  i, cg_gl;
4125     int                 *ibuf, buf2[2] = { 0, 0 };
4126     gmx_bool             bMaster = DDMASTER(dd);
4127
4128     if (bMaster)
4129     {
4130         ma = dd->ma;
4131
4132         if (dd->bScrewPBC)
4133         {
4134             check_screw_box(box);
4135         }
4136
4137         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
4138
4139         distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
4140         for (i = 0; i < dd->nnodes; i++)
4141         {
4142             ma->ibuf[2*i]   = ma->ncg[i];
4143             ma->ibuf[2*i+1] = ma->nat[i];
4144         }
4145         ibuf = ma->ibuf;
4146     }
4147     else
4148     {
4149         ibuf = NULL;
4150     }
4151     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
4152
4153     dd->ncg_home = buf2[0];
4154     dd->nat_home = buf2[1];
4155     dd->ncg_tot  = dd->ncg_home;
4156     dd->nat_tot  = dd->nat_home;
4157     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4158     {
4159         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4160         srenew(dd->index_gl, dd->cg_nalloc);
4161         srenew(dd->cgindex, dd->cg_nalloc+1);
4162     }
4163     if (bMaster)
4164     {
4165         for (i = 0; i < dd->nnodes; i++)
4166         {
4167             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
4168             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4169         }
4170     }
4171
4172     dd_scatterv(dd,
4173                 DDMASTER(dd) ? ma->ibuf : NULL,
4174                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
4175                 DDMASTER(dd) ? ma->cg : NULL,
4176                 dd->ncg_home*sizeof(int), dd->index_gl);
4177
4178     /* Determine the home charge group sizes */
4179     dd->cgindex[0] = 0;
4180     for (i = 0; i < dd->ncg_home; i++)
4181     {
4182         cg_gl            = dd->index_gl[i];
4183         dd->cgindex[i+1] =
4184             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4185     }
4186
4187     if (debug)
4188     {
4189         fprintf(debug, "Home charge groups:\n");
4190         for (i = 0; i < dd->ncg_home; i++)
4191         {
4192             fprintf(debug, " %d", dd->index_gl[i]);
4193             if (i % 10 == 9)
4194             {
4195                 fprintf(debug, "\n");
4196             }
4197         }
4198         fprintf(debug, "\n");
4199     }
4200 }
4201
4202 static int compact_and_copy_vec_at(int ncg, int *move,
4203                                    int *cgindex,
4204                                    int nvec, int vec,
4205                                    rvec *src, gmx_domdec_comm_t *comm,
4206                                    gmx_bool bCompact)
4207 {
4208     int m, icg, i, i0, i1, nrcg;
4209     int home_pos;
4210     int pos_vec[DIM*2];
4211
4212     home_pos = 0;
4213
4214     for (m = 0; m < DIM*2; m++)
4215     {
4216         pos_vec[m] = 0;
4217     }
4218
4219     i0 = 0;
4220     for (icg = 0; icg < ncg; icg++)
4221     {
4222         i1 = cgindex[icg+1];
4223         m  = move[icg];
4224         if (m == -1)
4225         {
4226             if (bCompact)
4227             {
4228                 /* Compact the home array in place */
4229                 for (i = i0; i < i1; i++)
4230                 {
4231                     copy_rvec(src[i], src[home_pos++]);
4232                 }
4233             }
4234         }
4235         else
4236         {
4237             /* Copy to the communication buffer */
4238             nrcg        = i1 - i0;
4239             pos_vec[m] += 1 + vec*nrcg;
4240             for (i = i0; i < i1; i++)
4241             {
4242                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
4243             }
4244             pos_vec[m] += (nvec - vec - 1)*nrcg;
4245         }
4246         if (!bCompact)
4247         {
4248             home_pos += i1 - i0;
4249         }
4250         i0 = i1;
4251     }
4252
4253     return home_pos;
4254 }
4255
4256 static int compact_and_copy_vec_cg(int ncg, int *move,
4257                                    int *cgindex,
4258                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
4259                                    gmx_bool bCompact)
4260 {
4261     int m, icg, i0, i1, nrcg;
4262     int home_pos;
4263     int pos_vec[DIM*2];
4264
4265     home_pos = 0;
4266
4267     for (m = 0; m < DIM*2; m++)
4268     {
4269         pos_vec[m] = 0;
4270     }
4271
4272     i0 = 0;
4273     for (icg = 0; icg < ncg; icg++)
4274     {
4275         i1 = cgindex[icg+1];
4276         m  = move[icg];
4277         if (m == -1)
4278         {
4279             if (bCompact)
4280             {
4281                 /* Compact the home array in place */
4282                 copy_rvec(src[icg], src[home_pos++]);
4283             }
4284         }
4285         else
4286         {
4287             nrcg = i1 - i0;
4288             /* Copy to the communication buffer */
4289             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
4290             pos_vec[m] += 1 + nrcg*nvec;
4291         }
4292         i0 = i1;
4293     }
4294     if (!bCompact)
4295     {
4296         home_pos = ncg;
4297     }
4298
4299     return home_pos;
4300 }
4301
4302 static int compact_ind(int ncg, int *move,
4303                        int *index_gl, int *cgindex,
4304                        int *gatindex,
4305                        gmx_ga2la_t ga2la, char *bLocalCG,
4306                        int *cginfo)
4307 {
4308     int cg, nat, a0, a1, a, a_gl;
4309     int home_pos;
4310
4311     home_pos = 0;
4312     nat      = 0;
4313     for (cg = 0; cg < ncg; cg++)
4314     {
4315         a0 = cgindex[cg];
4316         a1 = cgindex[cg+1];
4317         if (move[cg] == -1)
4318         {
4319             /* Compact the home arrays in place.
4320              * Anything that can be done here avoids access to global arrays.
4321              */
4322             cgindex[home_pos] = nat;
4323             for (a = a0; a < a1; a++)
4324             {
4325                 a_gl          = gatindex[a];
4326                 gatindex[nat] = a_gl;
4327                 /* The cell number stays 0, so we don't need to set it */
4328                 ga2la_change_la(ga2la, a_gl, nat);
4329                 nat++;
4330             }
4331             index_gl[home_pos] = index_gl[cg];
4332             cginfo[home_pos]   = cginfo[cg];
4333             /* The charge group remains local, so bLocalCG does not change */
4334             home_pos++;
4335         }
4336         else
4337         {
4338             /* Clear the global indices */
4339             for (a = a0; a < a1; a++)
4340             {
4341                 ga2la_del(ga2la, gatindex[a]);
4342             }
4343             if (bLocalCG)
4344             {
4345                 bLocalCG[index_gl[cg]] = FALSE;
4346             }
4347         }
4348     }
4349     cgindex[home_pos] = nat;
4350
4351     return home_pos;
4352 }
4353
4354 static void clear_and_mark_ind(int ncg, int *move,
4355                                int *index_gl, int *cgindex, int *gatindex,
4356                                gmx_ga2la_t ga2la, char *bLocalCG,
4357                                int *cell_index)
4358 {
4359     int cg, a0, a1, a;
4360
4361     for (cg = 0; cg < ncg; cg++)
4362     {
4363         if (move[cg] >= 0)
4364         {
4365             a0 = cgindex[cg];
4366             a1 = cgindex[cg+1];
4367             /* Clear the global indices */
4368             for (a = a0; a < a1; a++)
4369             {
4370                 ga2la_del(ga2la, gatindex[a]);
4371             }
4372             if (bLocalCG)
4373             {
4374                 bLocalCG[index_gl[cg]] = FALSE;
4375             }
4376             /* Signal that this cg has moved using the ns cell index.
4377              * Here we set it to -1. fill_grid will change it
4378              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4379              */
4380             cell_index[cg] = -1;
4381         }
4382     }
4383 }
4384
4385 static void print_cg_move(FILE *fplog,
4386                           gmx_domdec_t *dd,
4387                           gmx_large_int_t step, int cg, int dim, int dir,
4388                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4389                           rvec cm_old, rvec cm_new, real pos_d)
4390 {
4391     gmx_domdec_comm_t *comm;
4392     char               buf[22];
4393
4394     comm = dd->comm;
4395
4396     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4397     if (bHaveLimitdAndCMOld)
4398     {
4399         fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4400                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4401     }
4402     else
4403     {
4404         fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4405                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4406     }
4407     fprintf(fplog, "distance out of cell %f\n",
4408             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4409     if (bHaveLimitdAndCMOld)
4410     {
4411         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4412                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4413     }
4414     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4415             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4416     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4417             dim2char(dim),
4418             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4419     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4420             dim2char(dim),
4421             comm->cell_x0[dim], comm->cell_x1[dim]);
4422 }
4423
4424 static void cg_move_error(FILE *fplog,
4425                           gmx_domdec_t *dd,
4426                           gmx_large_int_t step, int cg, int dim, int dir,
4427                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4428                           rvec cm_old, rvec cm_new, real pos_d)
4429 {
4430     if (fplog)
4431     {
4432         print_cg_move(fplog, dd, step, cg, dim, dir,
4433                       bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4434     }
4435     print_cg_move(stderr, dd, step, cg, dim, dir,
4436                   bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4437     gmx_fatal(FARGS,
4438               "A charge group moved too far between two domain decomposition steps\n"
4439               "This usually means that your system is not well equilibrated");
4440 }
4441
4442 static void rotate_state_atom(t_state *state, int a)
4443 {
4444     int est;
4445
4446     for (est = 0; est < estNR; est++)
4447     {
4448         if (EST_DISTR(est) && (state->flags & (1<<est)))
4449         {
4450             switch (est)
4451             {
4452                 case estX:
4453                     /* Rotate the complete state; for a rectangular box only */
4454                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4455                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4456                     break;
4457                 case estV:
4458                     state->v[a][YY] = -state->v[a][YY];
4459                     state->v[a][ZZ] = -state->v[a][ZZ];
4460                     break;
4461                 case estSDX:
4462                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4463                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4464                     break;
4465                 case estCGP:
4466                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4467                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4468                     break;
4469                 case estDISRE_INITF:
4470                 case estDISRE_RM3TAV:
4471                 case estORIRE_INITF:
4472                 case estORIRE_DTAV:
4473                     /* These are distances, so not affected by rotation */
4474                     break;
4475                 default:
4476                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4477             }
4478         }
4479     }
4480 }
4481
4482 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4483 {
4484     if (natoms > comm->moved_nalloc)
4485     {
4486         /* Contents should be preserved here */
4487         comm->moved_nalloc = over_alloc_dd(natoms);
4488         srenew(comm->moved, comm->moved_nalloc);
4489     }
4490
4491     return comm->moved;
4492 }
4493
4494 static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
4495                          gmx_domdec_t *dd,
4496                          t_state *state,
4497                          ivec tric_dir, matrix tcm,
4498                          rvec cell_x0, rvec cell_x1,
4499                          rvec limitd, rvec limit0, rvec limit1,
4500                          const int *cgindex,
4501                          int cg_start, int cg_end,
4502                          rvec *cg_cm,
4503                          int *move)
4504 {
4505     int      npbcdim;
4506     int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4507     int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4508     int      flag;
4509     gmx_bool bScrew;
4510     ivec     dev;
4511     real     inv_ncg, pos_d;
4512     rvec     cm_new;
4513
4514     npbcdim = dd->npbcdim;
4515
4516     for (cg = cg_start; cg < cg_end; cg++)
4517     {
4518         k0   = cgindex[cg];
4519         k1   = cgindex[cg+1];
4520         nrcg = k1 - k0;
4521         if (nrcg == 1)
4522         {
4523             copy_rvec(state->x[k0], cm_new);
4524         }
4525         else
4526         {
4527             inv_ncg = 1.0/nrcg;
4528
4529             clear_rvec(cm_new);
4530             for (k = k0; (k < k1); k++)
4531             {
4532                 rvec_inc(cm_new, state->x[k]);
4533             }
4534             for (d = 0; (d < DIM); d++)
4535             {
4536                 cm_new[d] = inv_ncg*cm_new[d];
4537             }
4538         }
4539
4540         clear_ivec(dev);
4541         /* Do pbc and check DD cell boundary crossings */
4542         for (d = DIM-1; d >= 0; d--)
4543         {
4544             if (dd->nc[d] > 1)
4545             {
4546                 bScrew = (dd->bScrewPBC && d == XX);
4547                 /* Determine the location of this cg in lattice coordinates */
4548                 pos_d = cm_new[d];
4549                 if (tric_dir[d])
4550                 {
4551                     for (d2 = d+1; d2 < DIM; d2++)
4552                     {
4553                         pos_d += cm_new[d2]*tcm[d2][d];
4554                     }
4555                 }
4556                 /* Put the charge group in the triclinic unit-cell */
4557                 if (pos_d >= cell_x1[d])
4558                 {
4559                     if (pos_d >= limit1[d])
4560                     {
4561                         cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
4562                                       cg_cm[cg], cm_new, pos_d);
4563                     }
4564                     dev[d] = 1;
4565                     if (dd->ci[d] == dd->nc[d] - 1)
4566                     {
4567                         rvec_dec(cm_new, state->box[d]);
4568                         if (bScrew)
4569                         {
4570                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4571                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4572                         }
4573                         for (k = k0; (k < k1); k++)
4574                         {
4575                             rvec_dec(state->x[k], state->box[d]);
4576                             if (bScrew)
4577                             {
4578                                 rotate_state_atom(state, k);
4579                             }
4580                         }
4581                     }
4582                 }
4583                 else if (pos_d < cell_x0[d])
4584                 {
4585                     if (pos_d < limit0[d])
4586                     {
4587                         cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
4588                                       cg_cm[cg], cm_new, pos_d);
4589                     }
4590                     dev[d] = -1;
4591                     if (dd->ci[d] == 0)
4592                     {
4593                         rvec_inc(cm_new, state->box[d]);
4594                         if (bScrew)
4595                         {
4596                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4597                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4598                         }
4599                         for (k = k0; (k < k1); k++)
4600                         {
4601                             rvec_inc(state->x[k], state->box[d]);
4602                             if (bScrew)
4603                             {
4604                                 rotate_state_atom(state, k);
4605                             }
4606                         }
4607                     }
4608                 }
4609             }
4610             else if (d < npbcdim)
4611             {
4612                 /* Put the charge group in the rectangular unit-cell */
4613                 while (cm_new[d] >= state->box[d][d])
4614                 {
4615                     rvec_dec(cm_new, state->box[d]);
4616                     for (k = k0; (k < k1); k++)
4617                     {
4618                         rvec_dec(state->x[k], state->box[d]);
4619                     }
4620                 }
4621                 while (cm_new[d] < 0)
4622                 {
4623                     rvec_inc(cm_new, state->box[d]);
4624                     for (k = k0; (k < k1); k++)
4625                     {
4626                         rvec_inc(state->x[k], state->box[d]);
4627                     }
4628                 }
4629             }
4630         }
4631
4632         copy_rvec(cm_new, cg_cm[cg]);
4633
4634         /* Determine where this cg should go */
4635         flag = 0;
4636         mc   = -1;
4637         for (d = 0; d < dd->ndim; d++)
4638         {
4639             dim = dd->dim[d];
4640             if (dev[dim] == 1)
4641             {
4642                 flag |= DD_FLAG_FW(d);
4643                 if (mc == -1)
4644                 {
4645                     mc = d*2;
4646                 }
4647             }
4648             else if (dev[dim] == -1)
4649             {
4650                 flag |= DD_FLAG_BW(d);
4651                 if (mc == -1)
4652                 {
4653                     if (dd->nc[dim] > 2)
4654                     {
4655                         mc = d*2 + 1;
4656                     }
4657                     else
4658                     {
4659                         mc = d*2;
4660                     }
4661                 }
4662             }
4663         }
4664         /* Temporarily store the flag in move */
4665         move[cg] = mc + flag;
4666     }
4667 }
4668
4669 static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step,
4670                                gmx_domdec_t *dd, ivec tric_dir,
4671                                t_state *state, rvec **f,
4672                                t_forcerec *fr, t_mdatoms *md,
4673                                gmx_bool bCompact,
4674                                t_nrnb *nrnb,
4675                                int *ncg_stay_home,
4676                                int *ncg_moved)
4677 {
4678     int               *move;
4679     int                npbcdim;
4680     int                ncg[DIM*2], nat[DIM*2];
4681     int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4682     int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4683     int                sbuf[2], rbuf[2];
4684     int                home_pos_cg, home_pos_at, buf_pos;
4685     int                flag;
4686     gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4687     gmx_bool           bScrew;
4688     ivec               dev;
4689     real               inv_ncg, pos_d;
4690     matrix             tcm;
4691     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
4692     atom_id           *cgindex;
4693     cginfo_mb_t       *cginfo_mb;
4694     gmx_domdec_comm_t *comm;
4695     int               *moved;
4696     int                nthread, thread;
4697
4698     if (dd->bScrewPBC)
4699     {
4700         check_screw_box(state->box);
4701     }
4702
4703     comm  = dd->comm;
4704     if (fr->cutoff_scheme == ecutsGROUP)
4705     {
4706         cg_cm = fr->cg_cm;
4707     }
4708
4709     for (i = 0; i < estNR; i++)
4710     {
4711         if (EST_DISTR(i))
4712         {
4713             switch (i)
4714             {
4715                 case estX: /* Always present */ break;
4716                 case estV:   bV   = (state->flags & (1<<i)); break;
4717                 case estSDX: bSDX = (state->flags & (1<<i)); break;
4718                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4719                 case estLD_RNG:
4720                 case estLD_RNGI:
4721                 case estDISRE_INITF:
4722                 case estDISRE_RM3TAV:
4723                 case estORIRE_INITF:
4724                 case estORIRE_DTAV:
4725                     /* No processing required */
4726                     break;
4727                 default:
4728                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4729             }
4730         }
4731     }
4732
4733     if (dd->ncg_tot > comm->nalloc_int)
4734     {
4735         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4736         srenew(comm->buf_int, comm->nalloc_int);
4737     }
4738     move = comm->buf_int;
4739
4740     /* Clear the count */
4741     for (c = 0; c < dd->ndim*2; c++)
4742     {
4743         ncg[c] = 0;
4744         nat[c] = 0;
4745     }
4746
4747     npbcdim = dd->npbcdim;
4748
4749     for (d = 0; (d < DIM); d++)
4750     {
4751         limitd[d] = dd->comm->cellsize_min[d];
4752         if (d >= npbcdim && dd->ci[d] == 0)
4753         {
4754             cell_x0[d] = -GMX_FLOAT_MAX;
4755         }
4756         else
4757         {
4758             cell_x0[d] = comm->cell_x0[d];
4759         }
4760         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4761         {
4762             cell_x1[d] = GMX_FLOAT_MAX;
4763         }
4764         else
4765         {
4766             cell_x1[d] = comm->cell_x1[d];
4767         }
4768         if (d < npbcdim)
4769         {
4770             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4771             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4772         }
4773         else
4774         {
4775             /* We check after communication if a charge group moved
4776              * more than one cell. Set the pre-comm check limit to float_max.
4777              */
4778             limit0[d] = -GMX_FLOAT_MAX;
4779             limit1[d] =  GMX_FLOAT_MAX;
4780         }
4781     }
4782
4783     make_tric_corr_matrix(npbcdim, state->box, tcm);
4784
4785     cgindex = dd->cgindex;
4786
4787     nthread = gmx_omp_nthreads_get(emntDomdec);
4788
4789     /* Compute the center of geometry for all home charge groups
4790      * and put them in the box and determine where they should go.
4791      */
4792 #pragma omp parallel for num_threads(nthread) schedule(static)
4793     for (thread = 0; thread < nthread; thread++)
4794     {
4795         calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4796                      cell_x0, cell_x1, limitd, limit0, limit1,
4797                      cgindex,
4798                      ( thread   *dd->ncg_home)/nthread,
4799                      ((thread+1)*dd->ncg_home)/nthread,
4800                      fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4801                      move);
4802     }
4803
4804     for (cg = 0; cg < dd->ncg_home; cg++)
4805     {
4806         if (move[cg] >= 0)
4807         {
4808             mc       = move[cg];
4809             flag     = mc & ~DD_FLAG_NRCG;
4810             mc       = mc & DD_FLAG_NRCG;
4811             move[cg] = mc;
4812
4813             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4814             {
4815                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4816                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4817             }
4818             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4819             /* We store the cg size in the lower 16 bits
4820              * and the place where the charge group should go
4821              * in the next 6 bits. This saves some communication volume.
4822              */
4823             nrcg = cgindex[cg+1] - cgindex[cg];
4824             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4825             ncg[mc] += 1;
4826             nat[mc] += nrcg;
4827         }
4828     }
4829
4830     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4831     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4832
4833     *ncg_moved = 0;
4834     for (i = 0; i < dd->ndim*2; i++)
4835     {
4836         *ncg_moved += ncg[i];
4837     }
4838
4839     nvec = 1;
4840     if (bV)
4841     {
4842         nvec++;
4843     }
4844     if (bSDX)
4845     {
4846         nvec++;
4847     }
4848     if (bCGP)
4849     {
4850         nvec++;
4851     }
4852
4853     /* Make sure the communication buffers are large enough */
4854     for (mc = 0; mc < dd->ndim*2; mc++)
4855     {
4856         nvr = ncg[mc] + nat[mc]*nvec;
4857         if (nvr > comm->cgcm_state_nalloc[mc])
4858         {
4859             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4860             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4861         }
4862     }
4863
4864     switch (fr->cutoff_scheme)
4865     {
4866         case ecutsGROUP:
4867             /* Recalculating cg_cm might be cheaper than communicating,
4868              * but that could give rise to rounding issues.
4869              */
4870             home_pos_cg =
4871                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4872                                         nvec, cg_cm, comm, bCompact);
4873             break;
4874         case ecutsVERLET:
4875             /* Without charge groups we send the moved atom coordinates
4876              * over twice. This is so the code below can be used without
4877              * many conditionals for both for with and without charge groups.
4878              */
4879             home_pos_cg =
4880                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4881                                         nvec, state->x, comm, FALSE);
4882             if (bCompact)
4883             {
4884                 home_pos_cg -= *ncg_moved;
4885             }
4886             break;
4887         default:
4888             gmx_incons("unimplemented");
4889             home_pos_cg = 0;
4890     }
4891
4892     vec         = 0;
4893     home_pos_at =
4894         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4895                                 nvec, vec++, state->x, comm, bCompact);
4896     if (bV)
4897     {
4898         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4899                                 nvec, vec++, state->v, comm, bCompact);
4900     }
4901     if (bSDX)
4902     {
4903         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4904                                 nvec, vec++, state->sd_X, comm, bCompact);
4905     }
4906     if (bCGP)
4907     {
4908         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4909                                 nvec, vec++, state->cg_p, comm, bCompact);
4910     }
4911
4912     if (bCompact)
4913     {
4914         compact_ind(dd->ncg_home, move,
4915                     dd->index_gl, dd->cgindex, dd->gatindex,
4916                     dd->ga2la, comm->bLocalCG,
4917                     fr->cginfo);
4918     }
4919     else
4920     {
4921         if (fr->cutoff_scheme == ecutsVERLET)
4922         {
4923             moved = get_moved(comm, dd->ncg_home);
4924
4925             for (k = 0; k < dd->ncg_home; k++)
4926             {
4927                 moved[k] = 0;
4928             }
4929         }
4930         else
4931         {
4932             moved = fr->ns.grid->cell_index;
4933         }
4934
4935         clear_and_mark_ind(dd->ncg_home, move,
4936                            dd->index_gl, dd->cgindex, dd->gatindex,
4937                            dd->ga2la, comm->bLocalCG,
4938                            moved);
4939     }
4940
4941     cginfo_mb = fr->cginfo_mb;
4942
4943     *ncg_stay_home = home_pos_cg;
4944     for (d = 0; d < dd->ndim; d++)
4945     {
4946         dim      = dd->dim[d];
4947         ncg_recv = 0;
4948         nat_recv = 0;
4949         nvr      = 0;
4950         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4951         {
4952             cdd = d*2 + dir;
4953             /* Communicate the cg and atom counts */
4954             sbuf[0] = ncg[cdd];
4955             sbuf[1] = nat[cdd];
4956             if (debug)
4957             {
4958                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4959                         d, dir, sbuf[0], sbuf[1]);
4960             }
4961             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4962
4963             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4964             {
4965                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4966                 srenew(comm->buf_int, comm->nalloc_int);
4967             }
4968
4969             /* Communicate the charge group indices, sizes and flags */
4970             dd_sendrecv_int(dd, d, dir,
4971                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4972                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4973
4974             nvs = ncg[cdd] + nat[cdd]*nvec;
4975             i   = rbuf[0]  + rbuf[1] *nvec;
4976             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4977
4978             /* Communicate cgcm and state */
4979             dd_sendrecv_rvec(dd, d, dir,
4980                              comm->cgcm_state[cdd], nvs,
4981                              comm->vbuf.v+nvr, i);
4982             ncg_recv += rbuf[0];
4983             nat_recv += rbuf[1];
4984             nvr      += i;
4985         }
4986
4987         /* Process the received charge groups */
4988         buf_pos = 0;
4989         for (cg = 0; cg < ncg_recv; cg++)
4990         {
4991             flag = comm->buf_int[cg*DD_CGIBS+1];
4992
4993             if (dim >= npbcdim && dd->nc[dim] > 2)
4994             {
4995                 /* No pbc in this dim and more than one domain boundary.
4996                  * We do a separate check if a charge group didn't move too far.
4997                  */
4998                 if (((flag & DD_FLAG_FW(d)) &&
4999                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
5000                     ((flag & DD_FLAG_BW(d)) &&
5001                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
5002                 {
5003                     cg_move_error(fplog, dd, step, cg, dim,
5004                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
5005                                   FALSE, 0,
5006                                   comm->vbuf.v[buf_pos],
5007                                   comm->vbuf.v[buf_pos],
5008                                   comm->vbuf.v[buf_pos][dim]);
5009                 }
5010             }
5011
5012             mc = -1;
5013             if (d < dd->ndim-1)
5014             {
5015                 /* Check which direction this cg should go */
5016                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
5017                 {
5018                     if (dd->bGridJump)
5019                     {
5020                         /* The cell boundaries for dimension d2 are not equal
5021                          * for each cell row of the lower dimension(s),
5022                          * therefore we might need to redetermine where
5023                          * this cg should go.
5024                          */
5025                         dim2 = dd->dim[d2];
5026                         /* If this cg crosses the box boundary in dimension d2
5027                          * we can use the communicated flag, so we do not
5028                          * have to worry about pbc.
5029                          */
5030                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
5031                                (flag & DD_FLAG_FW(d2))) ||
5032                               (dd->ci[dim2] == 0 &&
5033                                (flag & DD_FLAG_BW(d2)))))
5034                         {
5035                             /* Clear the two flags for this dimension */
5036                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
5037                             /* Determine the location of this cg
5038                              * in lattice coordinates
5039                              */
5040                             pos_d = comm->vbuf.v[buf_pos][dim2];
5041                             if (tric_dir[dim2])
5042                             {
5043                                 for (d3 = dim2+1; d3 < DIM; d3++)
5044                                 {
5045                                     pos_d +=
5046                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
5047                                 }
5048                             }
5049                             /* Check of we are not at the box edge.
5050                              * pbc is only handled in the first step above,
5051                              * but this check could move over pbc while
5052                              * the first step did not due to different rounding.
5053                              */
5054                             if (pos_d >= cell_x1[dim2] &&
5055                                 dd->ci[dim2] != dd->nc[dim2]-1)
5056                             {
5057                                 flag |= DD_FLAG_FW(d2);
5058                             }
5059                             else if (pos_d < cell_x0[dim2] &&
5060                                      dd->ci[dim2] != 0)
5061                             {
5062                                 flag |= DD_FLAG_BW(d2);
5063                             }
5064                             comm->buf_int[cg*DD_CGIBS+1] = flag;
5065                         }
5066                     }
5067                     /* Set to which neighboring cell this cg should go */
5068                     if (flag & DD_FLAG_FW(d2))
5069                     {
5070                         mc = d2*2;
5071                     }
5072                     else if (flag & DD_FLAG_BW(d2))
5073                     {
5074                         if (dd->nc[dd->dim[d2]] > 2)
5075                         {
5076                             mc = d2*2+1;
5077                         }
5078                         else
5079                         {
5080                             mc = d2*2;
5081                         }
5082                     }
5083                 }
5084             }
5085
5086             nrcg = flag & DD_FLAG_NRCG;
5087             if (mc == -1)
5088             {
5089                 if (home_pos_cg+1 > dd->cg_nalloc)
5090                 {
5091                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
5092                     srenew(dd->index_gl, dd->cg_nalloc);
5093                     srenew(dd->cgindex, dd->cg_nalloc+1);
5094                 }
5095                 /* Set the global charge group index and size */
5096                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
5097                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
5098                 /* Copy the state from the buffer */
5099                 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
5100                 if (fr->cutoff_scheme == ecutsGROUP)
5101                 {
5102                     cg_cm = fr->cg_cm;
5103                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
5104                 }
5105                 buf_pos++;
5106
5107                 /* Set the cginfo */
5108                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
5109                                                    dd->index_gl[home_pos_cg]);
5110                 if (comm->bLocalCG)
5111                 {
5112                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
5113                 }
5114
5115                 if (home_pos_at+nrcg > state->nalloc)
5116                 {
5117                     dd_realloc_state(state, f, home_pos_at+nrcg);
5118                 }
5119                 for (i = 0; i < nrcg; i++)
5120                 {
5121                     copy_rvec(comm->vbuf.v[buf_pos++],
5122                               state->x[home_pos_at+i]);
5123                 }
5124                 if (bV)
5125                 {
5126                     for (i = 0; i < nrcg; i++)
5127                     {
5128                         copy_rvec(comm->vbuf.v[buf_pos++],
5129                                   state->v[home_pos_at+i]);
5130                     }
5131                 }
5132                 if (bSDX)
5133                 {
5134                     for (i = 0; i < nrcg; i++)
5135                     {
5136                         copy_rvec(comm->vbuf.v[buf_pos++],
5137                                   state->sd_X[home_pos_at+i]);
5138                     }
5139                 }
5140                 if (bCGP)
5141                 {
5142                     for (i = 0; i < nrcg; i++)
5143                     {
5144                         copy_rvec(comm->vbuf.v[buf_pos++],
5145                                   state->cg_p[home_pos_at+i]);
5146                     }
5147                 }
5148                 home_pos_cg += 1;
5149                 home_pos_at += nrcg;
5150             }
5151             else
5152             {
5153                 /* Reallocate the buffers if necessary  */
5154                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
5155                 {
5156                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
5157                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
5158                 }
5159                 nvr = ncg[mc] + nat[mc]*nvec;
5160                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
5161                 {
5162                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
5163                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
5164                 }
5165                 /* Copy from the receive to the send buffers */
5166                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
5167                        comm->buf_int + cg*DD_CGIBS,
5168                        DD_CGIBS*sizeof(int));
5169                 memcpy(comm->cgcm_state[mc][nvr],
5170                        comm->vbuf.v[buf_pos],
5171                        (1+nrcg*nvec)*sizeof(rvec));
5172                 buf_pos += 1 + nrcg*nvec;
5173                 ncg[mc] += 1;
5174                 nat[mc] += nrcg;
5175             }
5176         }
5177     }
5178
5179     /* With sorting (!bCompact) the indices are now only partially up to date
5180      * and ncg_home and nat_home are not the real count, since there are
5181      * "holes" in the arrays for the charge groups that moved to neighbors.
5182      */
5183     if (fr->cutoff_scheme == ecutsVERLET)
5184     {
5185         moved = get_moved(comm, home_pos_cg);
5186
5187         for (i = dd->ncg_home; i < home_pos_cg; i++)
5188         {
5189             moved[i] = 0;
5190         }
5191     }
5192     dd->ncg_home = home_pos_cg;
5193     dd->nat_home = home_pos_at;
5194
5195     if (debug)
5196     {
5197         fprintf(debug,
5198                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5199                 *ncg_moved, dd->ncg_home-*ncg_moved);
5200
5201     }
5202 }
5203
5204 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
5205 {
5206     dd->comm->cycl[ddCycl] += cycles;
5207     dd->comm->cycl_n[ddCycl]++;
5208     if (cycles > dd->comm->cycl_max[ddCycl])
5209     {
5210         dd->comm->cycl_max[ddCycl] = cycles;
5211     }
5212 }
5213
5214 static double force_flop_count(t_nrnb *nrnb)
5215 {
5216     int         i;
5217     double      sum;
5218     const char *name;
5219
5220     sum = 0;
5221     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
5222     {
5223         /* To get closer to the real timings, we half the count
5224          * for the normal loops and again half it for water loops.
5225          */
5226         name = nrnb_str(i);
5227         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5228         {
5229             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5230         }
5231         else
5232         {
5233             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5234         }
5235     }
5236     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
5237     {
5238         name = nrnb_str(i);
5239         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5240         {
5241             sum += nrnb->n[i]*cost_nrnb(i);
5242         }
5243     }
5244     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
5245     {
5246         sum += nrnb->n[i]*cost_nrnb(i);
5247     }
5248
5249     return sum;
5250 }
5251
5252 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
5253 {
5254     if (dd->comm->eFlop)
5255     {
5256         dd->comm->flop -= force_flop_count(nrnb);
5257     }
5258 }
5259 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
5260 {
5261     if (dd->comm->eFlop)
5262     {
5263         dd->comm->flop += force_flop_count(nrnb);
5264         dd->comm->flop_n++;
5265     }
5266 }
5267
5268 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5269 {
5270     int i;
5271
5272     for (i = 0; i < ddCyclNr; i++)
5273     {
5274         dd->comm->cycl[i]     = 0;
5275         dd->comm->cycl_n[i]   = 0;
5276         dd->comm->cycl_max[i] = 0;
5277     }
5278     dd->comm->flop   = 0;
5279     dd->comm->flop_n = 0;
5280 }
5281
5282 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
5283 {
5284     gmx_domdec_comm_t *comm;
5285     gmx_domdec_load_t *load;
5286     gmx_domdec_root_t *root = NULL;
5287     int                d, dim, cid, i, pos;
5288     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
5289     gmx_bool           bSepPME;
5290
5291     if (debug)
5292     {
5293         fprintf(debug, "get_load_distribution start\n");
5294     }
5295
5296     wallcycle_start(wcycle, ewcDDCOMMLOAD);
5297
5298     comm = dd->comm;
5299
5300     bSepPME = (dd->pme_nodeid >= 0);
5301
5302     for (d = dd->ndim-1; d >= 0; d--)
5303     {
5304         dim = dd->dim[d];
5305         /* Check if we participate in the communication in this dimension */
5306         if (d == dd->ndim-1 ||
5307             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5308         {
5309             load = &comm->load[d];
5310             if (dd->bGridJump)
5311             {
5312                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5313             }
5314             pos = 0;
5315             if (d == dd->ndim-1)
5316             {
5317                 sbuf[pos++] = dd_force_load(comm);
5318                 sbuf[pos++] = sbuf[0];
5319                 if (dd->bGridJump)
5320                 {
5321                     sbuf[pos++] = sbuf[0];
5322                     sbuf[pos++] = cell_frac;
5323                     if (d > 0)
5324                     {
5325                         sbuf[pos++] = comm->cell_f_max0[d];
5326                         sbuf[pos++] = comm->cell_f_min1[d];
5327                     }
5328                 }
5329                 if (bSepPME)
5330                 {
5331                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5332                     sbuf[pos++] = comm->cycl[ddCyclPME];
5333                 }
5334             }
5335             else
5336             {
5337                 sbuf[pos++] = comm->load[d+1].sum;
5338                 sbuf[pos++] = comm->load[d+1].max;
5339                 if (dd->bGridJump)
5340                 {
5341                     sbuf[pos++] = comm->load[d+1].sum_m;
5342                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5343                     sbuf[pos++] = comm->load[d+1].flags;
5344                     if (d > 0)
5345                     {
5346                         sbuf[pos++] = comm->cell_f_max0[d];
5347                         sbuf[pos++] = comm->cell_f_min1[d];
5348                     }
5349                 }
5350                 if (bSepPME)
5351                 {
5352                     sbuf[pos++] = comm->load[d+1].mdf;
5353                     sbuf[pos++] = comm->load[d+1].pme;
5354                 }
5355             }
5356             load->nload = pos;
5357             /* Communicate a row in DD direction d.
5358              * The communicators are setup such that the root always has rank 0.
5359              */
5360 #ifdef GMX_MPI
5361             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5362                        load->load, load->nload*sizeof(float), MPI_BYTE,
5363                        0, comm->mpi_comm_load[d]);
5364 #endif
5365             if (dd->ci[dim] == dd->master_ci[dim])
5366             {
5367                 /* We are the root, process this row */
5368                 if (comm->bDynLoadBal)
5369                 {
5370                     root = comm->root[d];
5371                 }
5372                 load->sum      = 0;
5373                 load->max      = 0;
5374                 load->sum_m    = 0;
5375                 load->cvol_min = 1;
5376                 load->flags    = 0;
5377                 load->mdf      = 0;
5378                 load->pme      = 0;
5379                 pos            = 0;
5380                 for (i = 0; i < dd->nc[dim]; i++)
5381                 {
5382                     load->sum += load->load[pos++];
5383                     load->max  = max(load->max, load->load[pos]);
5384                     pos++;
5385                     if (dd->bGridJump)
5386                     {
5387                         if (root->bLimited)
5388                         {
5389                             /* This direction could not be load balanced properly,
5390                              * therefore we need to use the maximum iso the average load.
5391                              */
5392                             load->sum_m = max(load->sum_m, load->load[pos]);
5393                         }
5394                         else
5395                         {
5396                             load->sum_m += load->load[pos];
5397                         }
5398                         pos++;
5399                         load->cvol_min = min(load->cvol_min, load->load[pos]);
5400                         pos++;
5401                         if (d < dd->ndim-1)
5402                         {
5403                             load->flags = (int)(load->load[pos++] + 0.5);
5404                         }
5405                         if (d > 0)
5406                         {
5407                             root->cell_f_max0[i] = load->load[pos++];
5408                             root->cell_f_min1[i] = load->load[pos++];
5409                         }
5410                     }
5411                     if (bSepPME)
5412                     {
5413                         load->mdf = max(load->mdf, load->load[pos]);
5414                         pos++;
5415                         load->pme = max(load->pme, load->load[pos]);
5416                         pos++;
5417                     }
5418                 }
5419                 if (comm->bDynLoadBal && root->bLimited)
5420                 {
5421                     load->sum_m *= dd->nc[dim];
5422                     load->flags |= (1<<d);
5423                 }
5424             }
5425         }
5426     }
5427
5428     if (DDMASTER(dd))
5429     {
5430         comm->nload      += dd_load_count(comm);
5431         comm->load_step  += comm->cycl[ddCyclStep];
5432         comm->load_sum   += comm->load[0].sum;
5433         comm->load_max   += comm->load[0].max;
5434         if (comm->bDynLoadBal)
5435         {
5436             for (d = 0; d < dd->ndim; d++)
5437             {
5438                 if (comm->load[0].flags & (1<<d))
5439                 {
5440                     comm->load_lim[d]++;
5441                 }
5442             }
5443         }
5444         if (bSepPME)
5445         {
5446             comm->load_mdf += comm->load[0].mdf;
5447             comm->load_pme += comm->load[0].pme;
5448         }
5449     }
5450
5451     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5452
5453     if (debug)
5454     {
5455         fprintf(debug, "get_load_distribution finished\n");
5456     }
5457 }
5458
5459 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5460 {
5461     /* Return the relative performance loss on the total run time
5462      * due to the force calculation load imbalance.
5463      */
5464     if (dd->comm->nload > 0)
5465     {
5466         return
5467             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5468             (dd->comm->load_step*dd->nnodes);
5469     }
5470     else
5471     {
5472         return 0;
5473     }
5474 }
5475
5476 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5477 {
5478     char               buf[STRLEN];
5479     int                npp, npme, nnodes, d, limp;
5480     float              imbal, pme_f_ratio, lossf, lossp = 0;
5481     gmx_bool           bLim;
5482     gmx_domdec_comm_t *comm;
5483
5484     comm = dd->comm;
5485     if (DDMASTER(dd) && comm->nload > 0)
5486     {
5487         npp    = dd->nnodes;
5488         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5489         nnodes = npp + npme;
5490         imbal  = comm->load_max*npp/comm->load_sum - 1;
5491         lossf  = dd_force_imb_perf_loss(dd);
5492         sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5493         fprintf(fplog, "%s", buf);
5494         fprintf(stderr, "\n");
5495         fprintf(stderr, "%s", buf);
5496         sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5497         fprintf(fplog, "%s", buf);
5498         fprintf(stderr, "%s", buf);
5499         bLim = FALSE;
5500         if (comm->bDynLoadBal)
5501         {
5502             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5503             for (d = 0; d < dd->ndim; d++)
5504             {
5505                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5506                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5507                 if (limp >= 50)
5508                 {
5509                     bLim = TRUE;
5510                 }
5511             }
5512             sprintf(buf+strlen(buf), "\n");
5513             fprintf(fplog, "%s", buf);
5514             fprintf(stderr, "%s", buf);
5515         }
5516         if (npme > 0)
5517         {
5518             pme_f_ratio = comm->load_pme/comm->load_mdf;
5519             lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
5520             if (lossp <= 0)
5521             {
5522                 lossp *= (float)npme/(float)nnodes;
5523             }
5524             else
5525             {
5526                 lossp *= (float)npp/(float)nnodes;
5527             }
5528             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5529             fprintf(fplog, "%s", buf);
5530             fprintf(stderr, "%s", buf);
5531             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5532             fprintf(fplog, "%s", buf);
5533             fprintf(stderr, "%s", buf);
5534         }
5535         fprintf(fplog, "\n");
5536         fprintf(stderr, "\n");
5537
5538         if (lossf >= DD_PERF_LOSS)
5539         {
5540             sprintf(buf,
5541                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5542                     "      in the domain decomposition.\n", lossf*100);
5543             if (!comm->bDynLoadBal)
5544             {
5545                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5546             }
5547             else if (bLim)
5548             {
5549                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5550             }
5551             fprintf(fplog, "%s\n", buf);
5552             fprintf(stderr, "%s\n", buf);
5553         }
5554         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5555         {
5556             sprintf(buf,
5557                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5558                     "      had %s work to do than the PP nodes.\n"
5559                     "      You might want to %s the number of PME nodes\n"
5560                     "      or %s the cut-off and the grid spacing.\n",
5561                     fabs(lossp*100),
5562                     (lossp < 0) ? "less"     : "more",
5563                     (lossp < 0) ? "decrease" : "increase",
5564                     (lossp < 0) ? "decrease" : "increase");
5565             fprintf(fplog, "%s\n", buf);
5566             fprintf(stderr, "%s\n", buf);
5567         }
5568     }
5569 }
5570
5571 static float dd_vol_min(gmx_domdec_t *dd)
5572 {
5573     return dd->comm->load[0].cvol_min*dd->nnodes;
5574 }
5575
5576 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5577 {
5578     return dd->comm->load[0].flags;
5579 }
5580
5581 static float dd_f_imbal(gmx_domdec_t *dd)
5582 {
5583     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5584 }
5585
5586 float dd_pme_f_ratio(gmx_domdec_t *dd)
5587 {
5588     if (dd->comm->cycl_n[ddCyclPME] > 0)
5589     {
5590         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5591     }
5592     else
5593     {
5594         return -1.0;
5595     }
5596 }
5597
5598 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step)
5599 {
5600     int  flags, d;
5601     char buf[22];
5602
5603     flags = dd_load_flags(dd);
5604     if (flags)
5605     {
5606         fprintf(fplog,
5607                 "DD  load balancing is limited by minimum cell size in dimension");
5608         for (d = 0; d < dd->ndim; d++)
5609         {
5610             if (flags & (1<<d))
5611             {
5612                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5613             }
5614         }
5615         fprintf(fplog, "\n");
5616     }
5617     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5618     if (dd->comm->bDynLoadBal)
5619     {
5620         fprintf(fplog, "  vol min/aver %5.3f%c",
5621                 dd_vol_min(dd), flags ? '!' : ' ');
5622     }
5623     fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5624     if (dd->comm->cycl_n[ddCyclPME])
5625     {
5626         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5627     }
5628     fprintf(fplog, "\n\n");
5629 }
5630
5631 static void dd_print_load_verbose(gmx_domdec_t *dd)
5632 {
5633     if (dd->comm->bDynLoadBal)
5634     {
5635         fprintf(stderr, "vol %4.2f%c ",
5636                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5637     }
5638     fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5639     if (dd->comm->cycl_n[ddCyclPME])
5640     {
5641         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5642     }
5643 }
5644
5645 #ifdef GMX_MPI
5646 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5647 {
5648     MPI_Comm           c_row;
5649     int                dim, i, rank;
5650     ivec               loc_c;
5651     gmx_domdec_root_t *root;
5652     gmx_bool           bPartOfGroup = FALSE;
5653
5654     dim = dd->dim[dim_ind];
5655     copy_ivec(loc, loc_c);
5656     for (i = 0; i < dd->nc[dim]; i++)
5657     {
5658         loc_c[dim] = i;
5659         rank       = dd_index(dd->nc, loc_c);
5660         if (rank == dd->rank)
5661         {
5662             /* This process is part of the group */
5663             bPartOfGroup = TRUE;
5664         }
5665     }
5666     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5667                    &c_row);
5668     if (bPartOfGroup)
5669     {
5670         dd->comm->mpi_comm_load[dim_ind] = c_row;
5671         if (dd->comm->eDLB != edlbNO)
5672         {
5673             if (dd->ci[dim] == dd->master_ci[dim])
5674             {
5675                 /* This is the root process of this row */
5676                 snew(dd->comm->root[dim_ind], 1);
5677                 root = dd->comm->root[dim_ind];
5678                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5679                 snew(root->old_cell_f, dd->nc[dim]+1);
5680                 snew(root->bCellMin, dd->nc[dim]);
5681                 if (dim_ind > 0)
5682                 {
5683                     snew(root->cell_f_max0, dd->nc[dim]);
5684                     snew(root->cell_f_min1, dd->nc[dim]);
5685                     snew(root->bound_min, dd->nc[dim]);
5686                     snew(root->bound_max, dd->nc[dim]);
5687                 }
5688                 snew(root->buf_ncd, dd->nc[dim]);
5689             }
5690             else
5691             {
5692                 /* This is not a root process, we only need to receive cell_f */
5693                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5694             }
5695         }
5696         if (dd->ci[dim] == dd->master_ci[dim])
5697         {
5698             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5699         }
5700     }
5701 }
5702 #endif
5703
5704 void dd_setup_dlb_resource_sharing(t_commrec *cr,
5705                                    const gmx_hw_info_t *hwinfo,
5706                                    const gmx_hw_opt_t *hw_opt)
5707 {
5708 #ifdef GMX_MPI
5709     int           physicalnode_id_hash;
5710     int           gpu_id;
5711     gmx_domdec_t *dd;
5712     MPI_Comm      mpi_comm_pp_physicalnode;
5713
5714     if (!(cr->duty & DUTY_PP) ||
5715         hw_opt->gpu_opt.ncuda_dev_use == 0)
5716     {
5717         /* Only PP nodes (currently) use GPUs.
5718          * If we don't have GPUs, there are no resources to share.
5719          */
5720         return;
5721     }
5722
5723     physicalnode_id_hash = gmx_physicalnode_id_hash();
5724
5725     gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5726
5727     dd = cr->dd;
5728
5729     if (debug)
5730     {
5731         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5732         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5733                 dd->rank, physicalnode_id_hash, gpu_id);
5734     }
5735     /* Split the PP communicator over the physical nodes */
5736     /* TODO: See if we should store this (before), as it's also used for
5737      * for the nodecomm summution.
5738      */
5739     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5740                    &mpi_comm_pp_physicalnode);
5741     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5742                    &dd->comm->mpi_comm_gpu_shared);
5743     MPI_Comm_free(&mpi_comm_pp_physicalnode);
5744     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5745
5746     if (debug)
5747     {
5748         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5749     }
5750
5751     /* Note that some ranks could share a GPU, while others don't */
5752
5753     if (dd->comm->nrank_gpu_shared == 1)
5754     {
5755         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5756     }
5757 #endif
5758 }
5759
5760 static void make_load_communicators(gmx_domdec_t *dd)
5761 {
5762 #ifdef GMX_MPI
5763     int  dim0, dim1, i, j;
5764     ivec loc;
5765
5766     if (debug)
5767     {
5768         fprintf(debug, "Making load communicators\n");
5769     }
5770
5771     snew(dd->comm->load, dd->ndim);
5772     snew(dd->comm->mpi_comm_load, dd->ndim);
5773
5774     clear_ivec(loc);
5775     make_load_communicator(dd, 0, loc);
5776     if (dd->ndim > 1)
5777     {
5778         dim0 = dd->dim[0];
5779         for (i = 0; i < dd->nc[dim0]; i++)
5780         {
5781             loc[dim0] = i;
5782             make_load_communicator(dd, 1, loc);
5783         }
5784     }
5785     if (dd->ndim > 2)
5786     {
5787         dim0 = dd->dim[0];
5788         for (i = 0; i < dd->nc[dim0]; i++)
5789         {
5790             loc[dim0] = i;
5791             dim1      = dd->dim[1];
5792             for (j = 0; j < dd->nc[dim1]; j++)
5793             {
5794                 loc[dim1] = j;
5795                 make_load_communicator(dd, 2, loc);
5796             }
5797         }
5798     }
5799
5800     if (debug)
5801     {
5802         fprintf(debug, "Finished making load communicators\n");
5803     }
5804 #endif
5805 }
5806
5807 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5808 {
5809     gmx_bool                bZYX;
5810     int                     d, dim, i, j, m;
5811     ivec                    tmp, s;
5812     int                     nzone, nzonep;
5813     ivec                    dd_zp[DD_MAXIZONE];
5814     gmx_domdec_zones_t     *zones;
5815     gmx_domdec_ns_ranges_t *izone;
5816
5817     for (d = 0; d < dd->ndim; d++)
5818     {
5819         dim = dd->dim[d];
5820         copy_ivec(dd->ci, tmp);
5821         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5822         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5823         copy_ivec(dd->ci, tmp);
5824         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5825         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5826         if (debug)
5827         {
5828             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5829                     dd->rank, dim,
5830                     dd->neighbor[d][0],
5831                     dd->neighbor[d][1]);
5832         }
5833     }
5834
5835     if (fplog)
5836     {
5837         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5838                 dd->ndim,
5839                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5840                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5841     }
5842     switch (dd->ndim)
5843     {
5844         case 3:
5845             nzone  = dd_z3n;
5846             nzonep = dd_zp3n;
5847             for (i = 0; i < nzonep; i++)
5848             {
5849                 copy_ivec(dd_zp3[i], dd_zp[i]);
5850             }
5851             break;
5852         case 2:
5853             nzone  = dd_z2n;
5854             nzonep = dd_zp2n;
5855             for (i = 0; i < nzonep; i++)
5856             {
5857                 copy_ivec(dd_zp2[i], dd_zp[i]);
5858             }
5859             break;
5860         case 1:
5861             nzone  = dd_z1n;
5862             nzonep = dd_zp1n;
5863             for (i = 0; i < nzonep; i++)
5864             {
5865                 copy_ivec(dd_zp1[i], dd_zp[i]);
5866             }
5867             break;
5868         default:
5869             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5870             nzone  = 0;
5871             nzonep = 0;
5872     }
5873
5874     zones = &dd->comm->zones;
5875
5876     for (i = 0; i < nzone; i++)
5877     {
5878         m = 0;
5879         clear_ivec(zones->shift[i]);
5880         for (d = 0; d < dd->ndim; d++)
5881         {
5882             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5883         }
5884     }
5885
5886     zones->n = nzone;
5887     for (i = 0; i < nzone; i++)
5888     {
5889         for (d = 0; d < DIM; d++)
5890         {
5891             s[d] = dd->ci[d] - zones->shift[i][d];
5892             if (s[d] < 0)
5893             {
5894                 s[d] += dd->nc[d];
5895             }
5896             else if (s[d] >= dd->nc[d])
5897             {
5898                 s[d] -= dd->nc[d];
5899             }
5900         }
5901     }
5902     zones->nizone = nzonep;
5903     for (i = 0; i < zones->nizone; i++)
5904     {
5905         if (dd_zp[i][0] != i)
5906         {
5907             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5908         }
5909         izone     = &zones->izone[i];
5910         izone->j0 = dd_zp[i][1];
5911         izone->j1 = dd_zp[i][2];
5912         for (dim = 0; dim < DIM; dim++)
5913         {
5914             if (dd->nc[dim] == 1)
5915             {
5916                 /* All shifts should be allowed */
5917                 izone->shift0[dim] = -1;
5918                 izone->shift1[dim] = 1;
5919             }
5920             else
5921             {
5922                 /*
5923                    izone->shift0[d] = 0;
5924                    izone->shift1[d] = 0;
5925                    for(j=izone->j0; j<izone->j1; j++) {
5926                    if (dd->shift[j][d] > dd->shift[i][d])
5927                    izone->shift0[d] = -1;
5928                    if (dd->shift[j][d] < dd->shift[i][d])
5929                    izone->shift1[d] = 1;
5930                    }
5931                  */
5932
5933                 int shift_diff;
5934
5935                 /* Assume the shift are not more than 1 cell */
5936                 izone->shift0[dim] = 1;
5937                 izone->shift1[dim] = -1;
5938                 for (j = izone->j0; j < izone->j1; j++)
5939                 {
5940                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5941                     if (shift_diff < izone->shift0[dim])
5942                     {
5943                         izone->shift0[dim] = shift_diff;
5944                     }
5945                     if (shift_diff > izone->shift1[dim])
5946                     {
5947                         izone->shift1[dim] = shift_diff;
5948                     }
5949                 }
5950             }
5951         }
5952     }
5953
5954     if (dd->comm->eDLB != edlbNO)
5955     {
5956         snew(dd->comm->root, dd->ndim);
5957     }
5958
5959     if (dd->comm->bRecordLoad)
5960     {
5961         make_load_communicators(dd);
5962     }
5963 }
5964
5965 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int reorder)
5966 {
5967     gmx_domdec_t      *dd;
5968     gmx_domdec_comm_t *comm;
5969     int                i, rank, *buf;
5970     ivec               periods;
5971 #ifdef GMX_MPI
5972     MPI_Comm           comm_cart;
5973 #endif
5974
5975     dd   = cr->dd;
5976     comm = dd->comm;
5977
5978 #ifdef GMX_MPI
5979     if (comm->bCartesianPP)
5980     {
5981         /* Set up cartesian communication for the particle-particle part */
5982         if (fplog)
5983         {
5984             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5985                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5986         }
5987
5988         for (i = 0; i < DIM; i++)
5989         {
5990             periods[i] = TRUE;
5991         }
5992         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5993                         &comm_cart);
5994         /* We overwrite the old communicator with the new cartesian one */
5995         cr->mpi_comm_mygroup = comm_cart;
5996     }
5997
5998     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5999     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
6000
6001     if (comm->bCartesianPP_PME)
6002     {
6003         /* Since we want to use the original cartesian setup for sim,
6004          * and not the one after split, we need to make an index.
6005          */
6006         snew(comm->ddindex2ddnodeid, dd->nnodes);
6007         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
6008         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
6009         /* Get the rank of the DD master,
6010          * above we made sure that the master node is a PP node.
6011          */
6012         if (MASTER(cr))
6013         {
6014             rank = dd->rank;
6015         }
6016         else
6017         {
6018             rank = 0;
6019         }
6020         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
6021     }
6022     else if (comm->bCartesianPP)
6023     {
6024         if (cr->npmenodes == 0)
6025         {
6026             /* The PP communicator is also
6027              * the communicator for this simulation
6028              */
6029             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
6030         }
6031         cr->nodeid = dd->rank;
6032
6033         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
6034
6035         /* We need to make an index to go from the coordinates
6036          * to the nodeid of this simulation.
6037          */
6038         snew(comm->ddindex2simnodeid, dd->nnodes);
6039         snew(buf, dd->nnodes);
6040         if (cr->duty & DUTY_PP)
6041         {
6042             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
6043         }
6044         /* Communicate the ddindex to simulation nodeid index */
6045         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
6046                       cr->mpi_comm_mysim);
6047         sfree(buf);
6048
6049         /* Determine the master coordinates and rank.
6050          * The DD master should be the same node as the master of this sim.
6051          */
6052         for (i = 0; i < dd->nnodes; i++)
6053         {
6054             if (comm->ddindex2simnodeid[i] == 0)
6055             {
6056                 ddindex2xyz(dd->nc, i, dd->master_ci);
6057                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
6058             }
6059         }
6060         if (debug)
6061         {
6062             fprintf(debug, "The master rank is %d\n", dd->masterrank);
6063         }
6064     }
6065     else
6066     {
6067         /* No Cartesian communicators */
6068         /* We use the rank in dd->comm->all as DD index */
6069         ddindex2xyz(dd->nc, dd->rank, dd->ci);
6070         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
6071         dd->masterrank = 0;
6072         clear_ivec(dd->master_ci);
6073     }
6074 #endif
6075
6076     if (fplog)
6077     {
6078         fprintf(fplog,
6079                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
6080                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6081     }
6082     if (debug)
6083     {
6084         fprintf(debug,
6085                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
6086                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6087     }
6088 }
6089
6090 static void receive_ddindex2simnodeid(t_commrec *cr)
6091 {
6092     gmx_domdec_t      *dd;
6093
6094     gmx_domdec_comm_t *comm;
6095     int               *buf;
6096
6097     dd   = cr->dd;
6098     comm = dd->comm;
6099
6100 #ifdef GMX_MPI
6101     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
6102     {
6103         snew(comm->ddindex2simnodeid, dd->nnodes);
6104         snew(buf, dd->nnodes);
6105         if (cr->duty & DUTY_PP)
6106         {
6107             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
6108         }
6109 #ifdef GMX_MPI
6110         /* Communicate the ddindex to simulation nodeid index */
6111         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
6112                       cr->mpi_comm_mysim);
6113 #endif
6114         sfree(buf);
6115     }
6116 #endif
6117 }
6118
6119 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
6120                                                      int ncg, int natoms)
6121 {
6122     gmx_domdec_master_t *ma;
6123     int                  i;
6124
6125     snew(ma, 1);
6126
6127     snew(ma->ncg, dd->nnodes);
6128     snew(ma->index, dd->nnodes+1);
6129     snew(ma->cg, ncg);
6130     snew(ma->nat, dd->nnodes);
6131     snew(ma->ibuf, dd->nnodes*2);
6132     snew(ma->cell_x, DIM);
6133     for (i = 0; i < DIM; i++)
6134     {
6135         snew(ma->cell_x[i], dd->nc[i]+1);
6136     }
6137
6138     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
6139     {
6140         ma->vbuf = NULL;
6141     }
6142     else
6143     {
6144         snew(ma->vbuf, natoms);
6145     }
6146
6147     return ma;
6148 }
6149
6150 static void split_communicator(FILE *fplog, t_commrec *cr, int dd_node_order,
6151                                int reorder)
6152 {
6153     gmx_domdec_t      *dd;
6154     gmx_domdec_comm_t *comm;
6155     int                i, rank;
6156     gmx_bool           bDiv[DIM];
6157     ivec               periods;
6158 #ifdef GMX_MPI
6159     MPI_Comm           comm_cart;
6160 #endif
6161
6162     dd   = cr->dd;
6163     comm = dd->comm;
6164
6165     if (comm->bCartesianPP)
6166     {
6167         for (i = 1; i < DIM; i++)
6168         {
6169             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
6170         }
6171         if (bDiv[YY] || bDiv[ZZ])
6172         {
6173             comm->bCartesianPP_PME = TRUE;
6174             /* If we have 2D PME decomposition, which is always in x+y,
6175              * we stack the PME only nodes in z.
6176              * Otherwise we choose the direction that provides the thinnest slab
6177              * of PME only nodes as this will have the least effect
6178              * on the PP communication.
6179              * But for the PME communication the opposite might be better.
6180              */
6181             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
6182                              !bDiv[YY] ||
6183                              dd->nc[YY] > dd->nc[ZZ]))
6184             {
6185                 comm->cartpmedim = ZZ;
6186             }
6187             else
6188             {
6189                 comm->cartpmedim = YY;
6190             }
6191             comm->ntot[comm->cartpmedim]
6192                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
6193         }
6194         else if (fplog)
6195         {
6196             fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
6197             fprintf(fplog,
6198                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
6199         }
6200     }
6201
6202 #ifdef GMX_MPI
6203     if (comm->bCartesianPP_PME)
6204     {
6205         if (fplog)
6206         {
6207             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
6208         }
6209
6210         for (i = 0; i < DIM; i++)
6211         {
6212             periods[i] = TRUE;
6213         }
6214         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
6215                         &comm_cart);
6216
6217         MPI_Comm_rank(comm_cart, &rank);
6218         if (MASTERNODE(cr) && rank != 0)
6219         {
6220             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
6221         }
6222
6223         /* With this assigment we loose the link to the original communicator
6224          * which will usually be MPI_COMM_WORLD, unless have multisim.
6225          */
6226         cr->mpi_comm_mysim = comm_cart;
6227         cr->sim_nodeid     = rank;
6228
6229         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
6230
6231         if (fplog)
6232         {
6233             fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
6234                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6235         }
6236
6237         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
6238         {
6239             cr->duty = DUTY_PP;
6240         }
6241         if (cr->npmenodes == 0 ||
6242             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
6243         {
6244             cr->duty = DUTY_PME;
6245         }
6246
6247         /* Split the sim communicator into PP and PME only nodes */
6248         MPI_Comm_split(cr->mpi_comm_mysim,
6249                        cr->duty,
6250                        dd_index(comm->ntot, dd->ci),
6251                        &cr->mpi_comm_mygroup);
6252     }
6253     else
6254     {
6255         switch (dd_node_order)
6256         {
6257             case ddnoPP_PME:
6258                 if (fplog)
6259                 {
6260                     fprintf(fplog, "Order of the nodes: PP first, PME last\n");
6261                 }
6262                 break;
6263             case ddnoINTERLEAVE:
6264                 /* Interleave the PP-only and PME-only nodes,
6265                  * as on clusters with dual-core machines this will double
6266                  * the communication bandwidth of the PME processes
6267                  * and thus speed up the PP <-> PME and inter PME communication.
6268                  */
6269                 if (fplog)
6270                 {
6271                     fprintf(fplog, "Interleaving PP and PME nodes\n");
6272                 }
6273                 comm->pmenodes = dd_pmenodes(cr);
6274                 break;
6275             case ddnoCARTESIAN:
6276                 break;
6277             default:
6278                 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
6279         }
6280
6281         if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6282         {
6283             cr->duty = DUTY_PME;
6284         }
6285         else
6286         {
6287             cr->duty = DUTY_PP;
6288         }
6289
6290         /* Split the sim communicator into PP and PME only nodes */
6291         MPI_Comm_split(cr->mpi_comm_mysim,
6292                        cr->duty,
6293                        cr->nodeid,
6294                        &cr->mpi_comm_mygroup);
6295         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6296     }
6297 #endif
6298
6299     if (fplog)
6300     {
6301         fprintf(fplog, "This is a %s only node\n\n",
6302                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6303     }
6304 }
6305
6306 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6307 {
6308     gmx_domdec_t      *dd;
6309     gmx_domdec_comm_t *comm;
6310     int                CartReorder;
6311
6312     dd   = cr->dd;
6313     comm = dd->comm;
6314
6315     copy_ivec(dd->nc, comm->ntot);
6316
6317     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6318     comm->bCartesianPP_PME = FALSE;
6319
6320     /* Reorder the nodes by default. This might change the MPI ranks.
6321      * Real reordering is only supported on very few architectures,
6322      * Blue Gene is one of them.
6323      */
6324     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6325
6326     if (cr->npmenodes > 0)
6327     {
6328         /* Split the communicator into a PP and PME part */
6329         split_communicator(fplog, cr, dd_node_order, CartReorder);
6330         if (comm->bCartesianPP_PME)
6331         {
6332             /* We (possibly) reordered the nodes in split_communicator,
6333              * so it is no longer required in make_pp_communicator.
6334              */
6335             CartReorder = FALSE;
6336         }
6337     }
6338     else
6339     {
6340         /* All nodes do PP and PME */
6341 #ifdef GMX_MPI
6342         /* We do not require separate communicators */
6343         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6344 #endif
6345     }
6346
6347     if (cr->duty & DUTY_PP)
6348     {
6349         /* Copy or make a new PP communicator */
6350         make_pp_communicator(fplog, cr, CartReorder);
6351     }
6352     else
6353     {
6354         receive_ddindex2simnodeid(cr);
6355     }
6356
6357     if (!(cr->duty & DUTY_PME))
6358     {
6359         /* Set up the commnuication to our PME node */
6360         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6361         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6362         if (debug)
6363         {
6364             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6365                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6366         }
6367     }
6368     else
6369     {
6370         dd->pme_nodeid = -1;
6371     }
6372
6373     if (DDMASTER(dd))
6374     {
6375         dd->ma = init_gmx_domdec_master_t(dd,
6376                                           comm->cgs_gl.nr,
6377                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6378     }
6379 }
6380
6381 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6382 {
6383     real  *slb_frac, tot;
6384     int    i, n;
6385     double dbl;
6386
6387     slb_frac = NULL;
6388     if (nc > 1 && size_string != NULL)
6389     {
6390         if (fplog)
6391         {
6392             fprintf(fplog, "Using static load balancing for the %s direction\n",
6393                     dir);
6394         }
6395         snew(slb_frac, nc);
6396         tot = 0;
6397         for (i = 0; i < nc; i++)
6398         {
6399             dbl = 0;
6400             sscanf(size_string, "%lf%n", &dbl, &n);
6401             if (dbl == 0)
6402             {
6403                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6404             }
6405             slb_frac[i]  = dbl;
6406             size_string += n;
6407             tot         += slb_frac[i];
6408         }
6409         /* Normalize */
6410         if (fplog)
6411         {
6412             fprintf(fplog, "Relative cell sizes:");
6413         }
6414         for (i = 0; i < nc; i++)
6415         {
6416             slb_frac[i] /= tot;
6417             if (fplog)
6418             {
6419                 fprintf(fplog, " %5.3f", slb_frac[i]);
6420             }
6421         }
6422         if (fplog)
6423         {
6424             fprintf(fplog, "\n");
6425         }
6426     }
6427
6428     return slb_frac;
6429 }
6430
6431 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6432 {
6433     int                  n, nmol, ftype;
6434     gmx_mtop_ilistloop_t iloop;
6435     t_ilist             *il;
6436
6437     n     = 0;
6438     iloop = gmx_mtop_ilistloop_init(mtop);
6439     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6440     {
6441         for (ftype = 0; ftype < F_NRE; ftype++)
6442         {
6443             if ((interaction_function[ftype].flags & IF_BOND) &&
6444                 NRAL(ftype) >  2)
6445             {
6446                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6447             }
6448         }
6449     }
6450
6451     return n;
6452 }
6453
6454 static int dd_nst_env(FILE *fplog, const char *env_var, int def)
6455 {
6456     char *val;
6457     int   nst;
6458
6459     nst = def;
6460     val = getenv(env_var);
6461     if (val)
6462     {
6463         if (sscanf(val, "%d", &nst) <= 0)
6464         {
6465             nst = 1;
6466         }
6467         if (fplog)
6468         {
6469             fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6470                     env_var, val, nst);
6471         }
6472     }
6473
6474     return nst;
6475 }
6476
6477 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6478 {
6479     if (MASTER(cr))
6480     {
6481         fprintf(stderr, "\n%s\n", warn_string);
6482     }
6483     if (fplog)
6484     {
6485         fprintf(fplog, "\n%s\n", warn_string);
6486     }
6487 }
6488
6489 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6490                                   t_inputrec *ir, FILE *fplog)
6491 {
6492     if (ir->ePBC == epbcSCREW &&
6493         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6494     {
6495         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6496     }
6497
6498     if (ir->ns_type == ensSIMPLE)
6499     {
6500         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6501     }
6502
6503     if (ir->nstlist == 0)
6504     {
6505         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6506     }
6507
6508     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6509     {
6510         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6511     }
6512 }
6513
6514 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6515 {
6516     int  di, d;
6517     real r;
6518
6519     r = ddbox->box_size[XX];
6520     for (di = 0; di < dd->ndim; di++)
6521     {
6522         d = dd->dim[di];
6523         /* Check using the initial average cell size */
6524         r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6525     }
6526
6527     return r;
6528 }
6529
6530 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6531                              const char *dlb_opt, gmx_bool bRecordLoad,
6532                              unsigned long Flags, t_inputrec *ir)
6533 {
6534     gmx_domdec_t *dd;
6535     int           eDLB = -1;
6536     char          buf[STRLEN];
6537
6538     switch (dlb_opt[0])
6539     {
6540         case 'a': eDLB = edlbAUTO; break;
6541         case 'n': eDLB = edlbNO;   break;
6542         case 'y': eDLB = edlbYES;  break;
6543         default: gmx_incons("Unknown dlb_opt");
6544     }
6545
6546     if (Flags & MD_RERUN)
6547     {
6548         return edlbNO;
6549     }
6550
6551     if (!EI_DYNAMICS(ir->eI))
6552     {
6553         if (eDLB == edlbYES)
6554         {
6555             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6556             dd_warning(cr, fplog, buf);
6557         }
6558
6559         return edlbNO;
6560     }
6561
6562     if (!bRecordLoad)
6563     {
6564         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6565
6566         return edlbNO;
6567     }
6568
6569     if (Flags & MD_REPRODUCIBLE)
6570     {
6571         switch (eDLB)
6572         {
6573             case edlbNO:
6574                 break;
6575             case edlbAUTO:
6576                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6577                 eDLB = edlbNO;
6578                 break;
6579             case edlbYES:
6580                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6581                 break;
6582             default:
6583                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
6584                 break;
6585         }
6586     }
6587
6588     return eDLB;
6589 }
6590
6591 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6592 {
6593     int dim;
6594
6595     dd->ndim = 0;
6596     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6597     {
6598         /* Decomposition order z,y,x */
6599         if (fplog)
6600         {
6601             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6602         }
6603         for (dim = DIM-1; dim >= 0; dim--)
6604         {
6605             if (dd->nc[dim] > 1)
6606             {
6607                 dd->dim[dd->ndim++] = dim;
6608             }
6609         }
6610     }
6611     else
6612     {
6613         /* Decomposition order x,y,z */
6614         for (dim = 0; dim < DIM; dim++)
6615         {
6616             if (dd->nc[dim] > 1)
6617             {
6618                 dd->dim[dd->ndim++] = dim;
6619             }
6620         }
6621     }
6622 }
6623
6624 static gmx_domdec_comm_t *init_dd_comm()
6625 {
6626     gmx_domdec_comm_t *comm;
6627     int                i;
6628
6629     snew(comm, 1);
6630     snew(comm->cggl_flag, DIM*2);
6631     snew(comm->cgcm_state, DIM*2);
6632     for (i = 0; i < DIM*2; i++)
6633     {
6634         comm->cggl_flag_nalloc[i]  = 0;
6635         comm->cgcm_state_nalloc[i] = 0;
6636     }
6637
6638     comm->nalloc_int = 0;
6639     comm->buf_int    = NULL;
6640
6641     vec_rvec_init(&comm->vbuf);
6642
6643     comm->n_load_have    = 0;
6644     comm->n_load_collect = 0;
6645
6646     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6647     {
6648         comm->sum_nat[i] = 0;
6649     }
6650     comm->ndecomp   = 0;
6651     comm->nload     = 0;
6652     comm->load_step = 0;
6653     comm->load_sum  = 0;
6654     comm->load_max  = 0;
6655     clear_ivec(comm->load_lim);
6656     comm->load_mdf  = 0;
6657     comm->load_pme  = 0;
6658
6659     return comm;
6660 }
6661
6662 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6663                                         unsigned long Flags,
6664                                         ivec nc,
6665                                         real comm_distance_min, real rconstr,
6666                                         const char *dlb_opt, real dlb_scale,
6667                                         const char *sizex, const char *sizey, const char *sizez,
6668                                         gmx_mtop_t *mtop, t_inputrec *ir,
6669                                         matrix box, rvec *x,
6670                                         gmx_ddbox_t *ddbox,
6671                                         int *npme_x, int *npme_y)
6672 {
6673     gmx_domdec_t      *dd;
6674     gmx_domdec_comm_t *comm;
6675     int                recload;
6676     int                d, i, j;
6677     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6678     gmx_bool           bC;
6679     char               buf[STRLEN];
6680
6681     if (fplog)
6682     {
6683         fprintf(fplog,
6684                 "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
6685     }
6686
6687     snew(dd, 1);
6688
6689     dd->comm = init_dd_comm();
6690     comm     = dd->comm;
6691     snew(comm->cggl_flag, DIM*2);
6692     snew(comm->cgcm_state, DIM*2);
6693
6694     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6695     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6696
6697     dd->bSendRecv2      = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0);
6698     comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10);
6699     comm->eFlop         = dd_nst_env(fplog, "GMX_DLB_FLOP", 0);
6700     recload             = dd_nst_env(fplog, "GMX_DD_LOAD", 1);
6701     comm->nstSortCG     = dd_nst_env(fplog, "GMX_DD_SORT", 1);
6702     comm->nstDDDump     = dd_nst_env(fplog, "GMX_DD_DUMP", 0);
6703     comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0);
6704     comm->DD_debug      = dd_nst_env(fplog, "GMX_DD_DEBUG", 0);
6705
6706     dd->pme_recv_f_alloc = 0;
6707     dd->pme_recv_f_buf   = NULL;
6708
6709     if (dd->bSendRecv2 && fplog)
6710     {
6711         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6712     }
6713     if (comm->eFlop)
6714     {
6715         if (fplog)
6716         {
6717             fprintf(fplog, "Will load balance based on FLOP count\n");
6718         }
6719         if (comm->eFlop > 1)
6720         {
6721             srand(1+cr->nodeid);
6722         }
6723         comm->bRecordLoad = TRUE;
6724     }
6725     else
6726     {
6727         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6728
6729     }
6730
6731     /* Initialize to GPU share count to 0, might change later */
6732     comm->nrank_gpu_shared = 0;
6733
6734     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6735
6736     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6737     if (fplog)
6738     {
6739         fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
6740     }
6741     dd->bGridJump              = comm->bDynLoadBal;
6742     comm->bPMELoadBalDLBLimits = FALSE;
6743
6744     if (comm->nstSortCG)
6745     {
6746         if (fplog)
6747         {
6748             if (comm->nstSortCG == 1)
6749             {
6750                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6751             }
6752             else
6753             {
6754                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6755                         comm->nstSortCG);
6756             }
6757         }
6758         snew(comm->sort, 1);
6759     }
6760     else
6761     {
6762         if (fplog)
6763         {
6764             fprintf(fplog, "Will not sort the charge groups\n");
6765         }
6766     }
6767
6768     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6769
6770     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6771     if (comm->bInterCGBondeds)
6772     {
6773         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6774     }
6775     else
6776     {
6777         comm->bInterCGMultiBody = FALSE;
6778     }
6779
6780     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6781     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6782
6783     if (ir->rlistlong == 0)
6784     {
6785         /* Set the cut-off to some very large value,
6786          * so we don't need if statements everywhere in the code.
6787          * We use sqrt, since the cut-off is squared in some places.
6788          */
6789         comm->cutoff   = GMX_CUTOFF_INF;
6790     }
6791     else
6792     {
6793         comm->cutoff   = ir->rlistlong;
6794     }
6795     comm->cutoff_mbody = 0;
6796
6797     comm->cellsize_limit = 0;
6798     comm->bBondComm      = FALSE;
6799
6800     if (comm->bInterCGBondeds)
6801     {
6802         if (comm_distance_min > 0)
6803         {
6804             comm->cutoff_mbody = comm_distance_min;
6805             if (Flags & MD_DDBONDCOMM)
6806             {
6807                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6808             }
6809             else
6810             {
6811                 comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
6812             }
6813             r_bonded_limit = comm->cutoff_mbody;
6814         }
6815         else if (ir->bPeriodicMols)
6816         {
6817             /* Can not easily determine the required cut-off */
6818             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6819             comm->cutoff_mbody = comm->cutoff/2;
6820             r_bonded_limit     = comm->cutoff_mbody;
6821         }
6822         else
6823         {
6824             if (MASTER(cr))
6825             {
6826                 dd_bonded_cg_distance(fplog, dd, mtop, ir, x, box,
6827                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6828             }
6829             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6830             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6831
6832             /* We use an initial margin of 10% for the minimum cell size,
6833              * except when we are just below the non-bonded cut-off.
6834              */
6835             if (Flags & MD_DDBONDCOMM)
6836             {
6837                 if (max(r_2b, r_mb) > comm->cutoff)
6838                 {
6839                     r_bonded        = max(r_2b, r_mb);
6840                     r_bonded_limit  = 1.1*r_bonded;
6841                     comm->bBondComm = TRUE;
6842                 }
6843                 else
6844                 {
6845                     r_bonded       = r_mb;
6846                     r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
6847                 }
6848                 /* We determine cutoff_mbody later */
6849             }
6850             else
6851             {
6852                 /* No special bonded communication,
6853                  * simply increase the DD cut-off.
6854                  */
6855                 r_bonded_limit     = 1.1*max(r_2b, r_mb);
6856                 comm->cutoff_mbody = r_bonded_limit;
6857                 comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
6858             }
6859         }
6860         comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
6861         if (fplog)
6862         {
6863             fprintf(fplog,
6864                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6865                     comm->cellsize_limit);
6866         }
6867     }
6868
6869     if (dd->bInterCGcons && rconstr <= 0)
6870     {
6871         /* There is a cell size limit due to the constraints (P-LINCS) */
6872         rconstr = constr_r_max(fplog, mtop, ir);
6873         if (fplog)
6874         {
6875             fprintf(fplog,
6876                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6877                     rconstr);
6878             if (rconstr > comm->cellsize_limit)
6879             {
6880                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6881             }
6882         }
6883     }
6884     else if (rconstr > 0 && fplog)
6885     {
6886         /* Here we do not check for dd->bInterCGcons,
6887          * because one can also set a cell size limit for virtual sites only
6888          * and at this point we don't know yet if there are intercg v-sites.
6889          */
6890         fprintf(fplog,
6891                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6892                 rconstr);
6893     }
6894     comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
6895
6896     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6897
6898     if (nc[XX] > 0)
6899     {
6900         copy_ivec(nc, dd->nc);
6901         set_dd_dim(fplog, dd);
6902         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6903
6904         if (cr->npmenodes == -1)
6905         {
6906             cr->npmenodes = 0;
6907         }
6908         acs = average_cellsize_min(dd, ddbox);
6909         if (acs < comm->cellsize_limit)
6910         {
6911             if (fplog)
6912             {
6913                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6914             }
6915             gmx_fatal_collective(FARGS, cr, NULL,
6916                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6917                                  acs, comm->cellsize_limit);
6918         }
6919     }
6920     else
6921     {
6922         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6923
6924         /* We need to choose the optimal DD grid and possibly PME nodes */
6925         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6926                                comm->eDLB != edlbNO, dlb_scale,
6927                                comm->cellsize_limit, comm->cutoff,
6928                                comm->bInterCGBondeds, comm->bInterCGMultiBody);
6929
6930         if (dd->nc[XX] == 0)
6931         {
6932             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6933             sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
6934                     !bC ? "-rdd" : "-rcon",
6935                     comm->eDLB != edlbNO ? " or -dds" : "",
6936                     bC ? " or your LINCS settings" : "");
6937
6938             gmx_fatal_collective(FARGS, cr, NULL,
6939                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6940                                  "%s\n"
6941                                  "Look in the log file for details on the domain decomposition",
6942                                  cr->nnodes-cr->npmenodes, limit, buf);
6943         }
6944         set_dd_dim(fplog, dd);
6945     }
6946
6947     if (fplog)
6948     {
6949         fprintf(fplog,
6950                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6951                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6952     }
6953
6954     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6955     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6956     {
6957         gmx_fatal_collective(FARGS, cr, NULL,
6958                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6959                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6960     }
6961     if (cr->npmenodes > dd->nnodes)
6962     {
6963         gmx_fatal_collective(FARGS, cr, NULL,
6964                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6965     }
6966     if (cr->npmenodes > 0)
6967     {
6968         comm->npmenodes = cr->npmenodes;
6969     }
6970     else
6971     {
6972         comm->npmenodes = dd->nnodes;
6973     }
6974
6975     if (EEL_PME(ir->coulombtype))
6976     {
6977         /* The following choices should match those
6978          * in comm_cost_est in domdec_setup.c.
6979          * Note that here the checks have to take into account
6980          * that the decomposition might occur in a different order than xyz
6981          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6982          * in which case they will not match those in comm_cost_est,
6983          * but since that is mainly for testing purposes that's fine.
6984          */
6985         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6986             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6987             getenv("GMX_PMEONEDD") == NULL)
6988         {
6989             comm->npmedecompdim = 2;
6990             comm->npmenodes_x   = dd->nc[XX];
6991             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6992         }
6993         else
6994         {
6995             /* In case nc is 1 in both x and y we could still choose to
6996              * decompose pme in y instead of x, but we use x for simplicity.
6997              */
6998             comm->npmedecompdim = 1;
6999             if (dd->dim[0] == YY)
7000             {
7001                 comm->npmenodes_x = 1;
7002                 comm->npmenodes_y = comm->npmenodes;
7003             }
7004             else
7005             {
7006                 comm->npmenodes_x = comm->npmenodes;
7007                 comm->npmenodes_y = 1;
7008             }
7009         }
7010         if (fplog)
7011         {
7012             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
7013                     comm->npmenodes_x, comm->npmenodes_y, 1);
7014         }
7015     }
7016     else
7017     {
7018         comm->npmedecompdim = 0;
7019         comm->npmenodes_x   = 0;
7020         comm->npmenodes_y   = 0;
7021     }
7022
7023     /* Technically we don't need both of these,
7024      * but it simplifies code not having to recalculate it.
7025      */
7026     *npme_x = comm->npmenodes_x;
7027     *npme_y = comm->npmenodes_y;
7028
7029     snew(comm->slb_frac, DIM);
7030     if (comm->eDLB == edlbNO)
7031     {
7032         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
7033         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
7034         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
7035     }
7036
7037     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
7038     {
7039         if (comm->bBondComm || comm->eDLB != edlbNO)
7040         {
7041             /* Set the bonded communication distance to halfway
7042              * the minimum and the maximum,
7043              * since the extra communication cost is nearly zero.
7044              */
7045             acs                = average_cellsize_min(dd, ddbox);
7046             comm->cutoff_mbody = 0.5*(r_bonded + acs);
7047             if (comm->eDLB != edlbNO)
7048             {
7049                 /* Check if this does not limit the scaling */
7050                 comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
7051             }
7052             if (!comm->bBondComm)
7053             {
7054                 /* Without bBondComm do not go beyond the n.b. cut-off */
7055                 comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
7056                 if (comm->cellsize_limit >= comm->cutoff)
7057                 {
7058                     /* We don't loose a lot of efficieny
7059                      * when increasing it to the n.b. cut-off.
7060                      * It can even be slightly faster, because we need
7061                      * less checks for the communication setup.
7062                      */
7063                     comm->cutoff_mbody = comm->cutoff;
7064                 }
7065             }
7066             /* Check if we did not end up below our original limit */
7067             comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
7068
7069             if (comm->cutoff_mbody > comm->cellsize_limit)
7070             {
7071                 comm->cellsize_limit = comm->cutoff_mbody;
7072             }
7073         }
7074         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
7075     }
7076
7077     if (debug)
7078     {
7079         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
7080                 "cellsize limit %f\n",
7081                 comm->bBondComm, comm->cellsize_limit);
7082     }
7083
7084     if (MASTER(cr))
7085     {
7086         check_dd_restrictions(cr, dd, ir, fplog);
7087     }
7088
7089     comm->partition_step = INT_MIN;
7090     dd->ddp_count        = 0;
7091
7092     clear_dd_cycle_counts(dd);
7093
7094     return dd;
7095 }
7096
7097 static void set_dlb_limits(gmx_domdec_t *dd)
7098
7099 {
7100     int d;
7101
7102     for (d = 0; d < dd->ndim; d++)
7103     {
7104         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
7105         dd->comm->cellsize_min[dd->dim[d]] =
7106             dd->comm->cellsize_min_dlb[dd->dim[d]];
7107     }
7108 }
7109
7110
7111 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
7112 {
7113     gmx_domdec_t      *dd;
7114     gmx_domdec_comm_t *comm;
7115     real               cellsize_min;
7116     int                d, nc, i;
7117     char               buf[STRLEN];
7118
7119     dd   = cr->dd;
7120     comm = dd->comm;
7121
7122     if (fplog)
7123     {
7124         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
7125     }
7126
7127     cellsize_min = comm->cellsize_min[dd->dim[0]];
7128     for (d = 1; d < dd->ndim; d++)
7129     {
7130         cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
7131     }
7132
7133     if (cellsize_min < comm->cellsize_limit*1.05)
7134     {
7135         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
7136
7137         /* Change DLB from "auto" to "no". */
7138         comm->eDLB = edlbNO;
7139
7140         return;
7141     }
7142
7143     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
7144     comm->bDynLoadBal = TRUE;
7145     dd->bGridJump     = TRUE;
7146
7147     set_dlb_limits(dd);
7148
7149     /* We can set the required cell size info here,
7150      * so we do not need to communicate this.
7151      * The grid is completely uniform.
7152      */
7153     for (d = 0; d < dd->ndim; d++)
7154     {
7155         if (comm->root[d])
7156         {
7157             comm->load[d].sum_m = comm->load[d].sum;
7158
7159             nc = dd->nc[dd->dim[d]];
7160             for (i = 0; i < nc; i++)
7161             {
7162                 comm->root[d]->cell_f[i]    = i/(real)nc;
7163                 if (d > 0)
7164                 {
7165                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
7166                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
7167                 }
7168             }
7169             comm->root[d]->cell_f[nc] = 1.0;
7170         }
7171     }
7172 }
7173
7174 static char *init_bLocalCG(gmx_mtop_t *mtop)
7175 {
7176     int   ncg, cg;
7177     char *bLocalCG;
7178
7179     ncg = ncg_mtop(mtop);
7180     snew(bLocalCG, ncg);
7181     for (cg = 0; cg < ncg; cg++)
7182     {
7183         bLocalCG[cg] = FALSE;
7184     }
7185
7186     return bLocalCG;
7187 }
7188
7189 void dd_init_bondeds(FILE *fplog,
7190                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
7191                      gmx_vsite_t *vsite, gmx_constr_t constr,
7192                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
7193 {
7194     gmx_domdec_comm_t *comm;
7195     gmx_bool           bBondComm;
7196     int                d;
7197
7198     dd_make_reverse_top(fplog, dd, mtop, vsite, constr, ir, bBCheck);
7199
7200     comm = dd->comm;
7201
7202     if (comm->bBondComm)
7203     {
7204         /* Communicate atoms beyond the cut-off for bonded interactions */
7205         comm = dd->comm;
7206
7207         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
7208
7209         comm->bLocalCG = init_bLocalCG(mtop);
7210     }
7211     else
7212     {
7213         /* Only communicate atoms based on cut-off */
7214         comm->cglink   = NULL;
7215         comm->bLocalCG = NULL;
7216     }
7217 }
7218
7219 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
7220                               t_inputrec *ir,
7221                               gmx_bool bDynLoadBal, real dlb_scale,
7222                               gmx_ddbox_t *ddbox)
7223 {
7224     gmx_domdec_comm_t *comm;
7225     int                d;
7226     ivec               np;
7227     real               limit, shrink;
7228     char               buf[64];
7229
7230     if (fplog == NULL)
7231     {
7232         return;
7233     }
7234
7235     comm = dd->comm;
7236
7237     if (bDynLoadBal)
7238     {
7239         fprintf(fplog, "The maximum number of communication pulses is:");
7240         for (d = 0; d < dd->ndim; d++)
7241         {
7242             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
7243         }
7244         fprintf(fplog, "\n");
7245         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
7246         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
7247         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
7248         for (d = 0; d < DIM; d++)
7249         {
7250             if (dd->nc[d] > 1)
7251             {
7252                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7253                 {
7254                     shrink = 0;
7255                 }
7256                 else
7257                 {
7258                     shrink =
7259                         comm->cellsize_min_dlb[d]/
7260                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7261                 }
7262                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
7263             }
7264         }
7265         fprintf(fplog, "\n");
7266     }
7267     else
7268     {
7269         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
7270         fprintf(fplog, "The initial number of communication pulses is:");
7271         for (d = 0; d < dd->ndim; d++)
7272         {
7273             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
7274         }
7275         fprintf(fplog, "\n");
7276         fprintf(fplog, "The initial domain decomposition cell size is:");
7277         for (d = 0; d < DIM; d++)
7278         {
7279             if (dd->nc[d] > 1)
7280             {
7281                 fprintf(fplog, " %c %.2f nm",
7282                         dim2char(d), dd->comm->cellsize_min[d]);
7283             }
7284         }
7285         fprintf(fplog, "\n\n");
7286     }
7287
7288     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7289     {
7290         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7291         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7292                 "non-bonded interactions", "", comm->cutoff);
7293
7294         if (bDynLoadBal)
7295         {
7296             limit = dd->comm->cellsize_limit;
7297         }
7298         else
7299         {
7300             if (dynamic_dd_box(ddbox, ir))
7301             {
7302                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7303             }
7304             limit = dd->comm->cellsize_min[XX];
7305             for (d = 1; d < DIM; d++)
7306             {
7307                 limit = min(limit, dd->comm->cellsize_min[d]);
7308             }
7309         }
7310
7311         if (comm->bInterCGBondeds)
7312         {
7313             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7314                     "two-body bonded interactions", "(-rdd)",
7315                     max(comm->cutoff, comm->cutoff_mbody));
7316             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7317                     "multi-body bonded interactions", "(-rdd)",
7318                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
7319         }
7320         if (dd->vsite_comm)
7321         {
7322             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7323                     "virtual site constructions", "(-rcon)", limit);
7324         }
7325         if (dd->constraint_comm)
7326         {
7327             sprintf(buf, "atoms separated by up to %d constraints",
7328                     1+ir->nProjOrder);
7329             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7330                     buf, "(-rcon)", limit);
7331         }
7332         fprintf(fplog, "\n");
7333     }
7334
7335     fflush(fplog);
7336 }
7337
7338 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7339                                 real               dlb_scale,
7340                                 const t_inputrec  *ir,
7341                                 const gmx_ddbox_t *ddbox)
7342 {
7343     gmx_domdec_comm_t *comm;
7344     int                d, dim, npulse, npulse_d_max, npulse_d;
7345     gmx_bool           bNoCutOff;
7346
7347     comm = dd->comm;
7348
7349     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7350
7351     /* Determine the maximum number of comm. pulses in one dimension */
7352
7353     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7354
7355     /* Determine the maximum required number of grid pulses */
7356     if (comm->cellsize_limit >= comm->cutoff)
7357     {
7358         /* Only a single pulse is required */
7359         npulse = 1;
7360     }
7361     else if (!bNoCutOff && comm->cellsize_limit > 0)
7362     {
7363         /* We round down slightly here to avoid overhead due to the latency
7364          * of extra communication calls when the cut-off
7365          * would be only slightly longer than the cell size.
7366          * Later cellsize_limit is redetermined,
7367          * so we can not miss interactions due to this rounding.
7368          */
7369         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7370     }
7371     else
7372     {
7373         /* There is no cell size limit */
7374         npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7375     }
7376
7377     if (!bNoCutOff && npulse > 1)
7378     {
7379         /* See if we can do with less pulses, based on dlb_scale */
7380         npulse_d_max = 0;
7381         for (d = 0; d < dd->ndim; d++)
7382         {
7383             dim      = dd->dim[d];
7384             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7385                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7386             npulse_d_max = max(npulse_d_max, npulse_d);
7387         }
7388         npulse = min(npulse, npulse_d_max);
7389     }
7390
7391     /* This env var can override npulse */
7392     d = dd_nst_env(debug, "GMX_DD_NPULSE", 0);
7393     if (d > 0)
7394     {
7395         npulse = d;
7396     }
7397
7398     comm->maxpulse       = 1;
7399     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7400     for (d = 0; d < dd->ndim; d++)
7401     {
7402         comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
7403         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7404         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7405         comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
7406         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7407         {
7408             comm->bVacDLBNoLimit = FALSE;
7409         }
7410     }
7411
7412     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7413     if (!comm->bVacDLBNoLimit)
7414     {
7415         comm->cellsize_limit = max(comm->cellsize_limit,
7416                                    comm->cutoff/comm->maxpulse);
7417     }
7418     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7419     /* Set the minimum cell size for each DD dimension */
7420     for (d = 0; d < dd->ndim; d++)
7421     {
7422         if (comm->bVacDLBNoLimit ||
7423             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7424         {
7425             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7426         }
7427         else
7428         {
7429             comm->cellsize_min_dlb[dd->dim[d]] =
7430                 comm->cutoff/comm->cd[d].np_dlb;
7431         }
7432     }
7433     if (comm->cutoff_mbody <= 0)
7434     {
7435         comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
7436     }
7437     if (comm->bDynLoadBal)
7438     {
7439         set_dlb_limits(dd);
7440     }
7441 }
7442
7443 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7444 {
7445     /* If each molecule is a single charge group
7446      * or we use domain decomposition for each periodic dimension,
7447      * we do not need to take pbc into account for the bonded interactions.
7448      */
7449     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7450             !(dd->nc[XX] > 1 &&
7451               dd->nc[YY] > 1 &&
7452               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7453 }
7454
7455 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7456                        t_inputrec *ir, t_forcerec *fr,
7457                        gmx_ddbox_t *ddbox)
7458 {
7459     gmx_domdec_comm_t *comm;
7460     int                natoms_tot;
7461     real               vol_frac;
7462
7463     comm = dd->comm;
7464
7465     /* Initialize the thread data.
7466      * This can not be done in init_domain_decomposition,
7467      * as the numbers of threads is determined later.
7468      */
7469     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7470     if (comm->nth > 1)
7471     {
7472         snew(comm->dth, comm->nth);
7473     }
7474
7475     if (EEL_PME(ir->coulombtype))
7476     {
7477         init_ddpme(dd, &comm->ddpme[0], 0);
7478         if (comm->npmedecompdim >= 2)
7479         {
7480             init_ddpme(dd, &comm->ddpme[1], 1);
7481         }
7482     }
7483     else
7484     {
7485         comm->npmenodes = 0;
7486         if (dd->pme_nodeid >= 0)
7487         {
7488             gmx_fatal_collective(FARGS, NULL, dd,
7489                                  "Can not have separate PME nodes without PME electrostatics");
7490         }
7491     }
7492
7493     if (debug)
7494     {
7495         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7496     }
7497     if (comm->eDLB != edlbNO)
7498     {
7499         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7500     }
7501
7502     print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
7503     if (comm->eDLB == edlbAUTO)
7504     {
7505         if (fplog)
7506         {
7507             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7508         }
7509         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7510     }
7511
7512     if (ir->ePBC == epbcNONE)
7513     {
7514         vol_frac = 1 - 1/(double)dd->nnodes;
7515     }
7516     else
7517     {
7518         vol_frac =
7519             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7520     }
7521     if (debug)
7522     {
7523         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7524     }
7525     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7526
7527     dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
7528 }
7529
7530 static gmx_bool test_dd_cutoff(t_commrec *cr,
7531                                t_state *state, t_inputrec *ir,
7532                                real cutoff_req)
7533 {
7534     gmx_domdec_t *dd;
7535     gmx_ddbox_t   ddbox;
7536     int           d, dim, np;
7537     real          inv_cell_size;
7538     int           LocallyLimited;
7539
7540     dd = cr->dd;
7541
7542     set_ddbox(dd, FALSE, cr, ir, state->box,
7543               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7544
7545     LocallyLimited = 0;
7546
7547     for (d = 0; d < dd->ndim; d++)
7548     {
7549         dim = dd->dim[d];
7550
7551         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7552         if (dynamic_dd_box(&ddbox, ir))
7553         {
7554             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7555         }
7556
7557         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7558
7559         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7560             dd->comm->cd[d].np_dlb > 0)
7561         {
7562             if (np > dd->comm->cd[d].np_dlb)
7563             {
7564                 return FALSE;
7565             }
7566
7567             /* If a current local cell size is smaller than the requested
7568              * cut-off, we could still fix it, but this gets very complicated.
7569              * Without fixing here, we might actually need more checks.
7570              */
7571             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7572             {
7573                 LocallyLimited = 1;
7574             }
7575         }
7576     }
7577
7578     if (dd->comm->eDLB != edlbNO)
7579     {
7580         /* If DLB is not active yet, we don't need to check the grid jumps.
7581          * Actually we shouldn't, because then the grid jump data is not set.
7582          */
7583         if (dd->comm->bDynLoadBal &&
7584             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7585         {
7586             LocallyLimited = 1;
7587         }
7588
7589         gmx_sumi(1, &LocallyLimited, cr);
7590
7591         if (LocallyLimited > 0)
7592         {
7593             return FALSE;
7594         }
7595     }
7596
7597     return TRUE;
7598 }
7599
7600 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7601                           real cutoff_req)
7602 {
7603     gmx_bool bCutoffAllowed;
7604
7605     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7606
7607     if (bCutoffAllowed)
7608     {
7609         cr->dd->comm->cutoff = cutoff_req;
7610     }
7611
7612     return bCutoffAllowed;
7613 }
7614
7615 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7616 {
7617     gmx_domdec_comm_t *comm;
7618
7619     comm = cr->dd->comm;
7620
7621     /* Turn on the DLB limiting (might have been on already) */
7622     comm->bPMELoadBalDLBLimits = TRUE;
7623
7624     /* Change the cut-off limit */
7625     comm->PMELoadBal_max_cutoff = comm->cutoff;
7626 }
7627
7628 static void merge_cg_buffers(int ncell,
7629                              gmx_domdec_comm_dim_t *cd, int pulse,
7630                              int  *ncg_cell,
7631                              int  *index_gl, int  *recv_i,
7632                              rvec *cg_cm,    rvec *recv_vr,
7633                              int *cgindex,
7634                              cginfo_mb_t *cginfo_mb, int *cginfo)
7635 {
7636     gmx_domdec_ind_t *ind, *ind_p;
7637     int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
7638     int               shift, shift_at;
7639
7640     ind = &cd->ind[pulse];
7641
7642     /* First correct the already stored data */
7643     shift = ind->nrecv[ncell];
7644     for (cell = ncell-1; cell >= 0; cell--)
7645     {
7646         shift -= ind->nrecv[cell];
7647         if (shift > 0)
7648         {
7649             /* Move the cg's present from previous grid pulses */
7650             cg0                = ncg_cell[ncell+cell];
7651             cg1                = ncg_cell[ncell+cell+1];
7652             cgindex[cg1+shift] = cgindex[cg1];
7653             for (cg = cg1-1; cg >= cg0; cg--)
7654             {
7655                 index_gl[cg+shift] = index_gl[cg];
7656                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7657                 cgindex[cg+shift] = cgindex[cg];
7658                 cginfo[cg+shift]  = cginfo[cg];
7659             }
7660             /* Correct the already stored send indices for the shift */
7661             for (p = 1; p <= pulse; p++)
7662             {
7663                 ind_p = &cd->ind[p];
7664                 cg0   = 0;
7665                 for (c = 0; c < cell; c++)
7666                 {
7667                     cg0 += ind_p->nsend[c];
7668                 }
7669                 cg1 = cg0 + ind_p->nsend[cell];
7670                 for (cg = cg0; cg < cg1; cg++)
7671                 {
7672                     ind_p->index[cg] += shift;
7673                 }
7674             }
7675         }
7676     }
7677
7678     /* Merge in the communicated buffers */
7679     shift    = 0;
7680     shift_at = 0;
7681     cg0      = 0;
7682     for (cell = 0; cell < ncell; cell++)
7683     {
7684         cg1 = ncg_cell[ncell+cell+1] + shift;
7685         if (shift_at > 0)
7686         {
7687             /* Correct the old cg indices */
7688             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7689             {
7690                 cgindex[cg+1] += shift_at;
7691             }
7692         }
7693         for (cg = 0; cg < ind->nrecv[cell]; cg++)
7694         {
7695             /* Copy this charge group from the buffer */
7696             index_gl[cg1] = recv_i[cg0];
7697             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7698             /* Add it to the cgindex */
7699             cg_gl          = index_gl[cg1];
7700             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
7701             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
7702             cgindex[cg1+1] = cgindex[cg1] + nat;
7703             cg0++;
7704             cg1++;
7705             shift_at += nat;
7706         }
7707         shift                 += ind->nrecv[cell];
7708         ncg_cell[ncell+cell+1] = cg1;
7709     }
7710 }
7711
7712 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7713                                int nzone, int cg0, const int *cgindex)
7714 {
7715     int cg, zone, p;
7716
7717     /* Store the atom block boundaries for easy copying of communication buffers
7718      */
7719     cg = cg0;
7720     for (zone = 0; zone < nzone; zone++)
7721     {
7722         for (p = 0; p < cd->np; p++)
7723         {
7724             cd->ind[p].cell2at0[zone] = cgindex[cg];
7725             cg += cd->ind[p].nrecv[zone];
7726             cd->ind[p].cell2at1[zone] = cgindex[cg];
7727         }
7728     }
7729 }
7730
7731 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7732 {
7733     int      i;
7734     gmx_bool bMiss;
7735
7736     bMiss = FALSE;
7737     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7738     {
7739         if (!bLocalCG[link->a[i]])
7740         {
7741             bMiss = TRUE;
7742         }
7743     }
7744
7745     return bMiss;
7746 }
7747
7748 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7749 typedef struct {
7750     real c[DIM][4]; /* the corners for the non-bonded communication */
7751     real cr0;       /* corner for rounding */
7752     real cr1[4];    /* corners for rounding */
7753     real bc[DIM];   /* corners for bounded communication */
7754     real bcr1;      /* corner for rounding for bonded communication */
7755 } dd_corners_t;
7756
7757 /* Determine the corners of the domain(s) we are communicating with */
7758 static void
7759 set_dd_corners(const gmx_domdec_t *dd,
7760                int dim0, int dim1, int dim2,
7761                gmx_bool bDistMB,
7762                dd_corners_t *c)
7763 {
7764     const gmx_domdec_comm_t  *comm;
7765     const gmx_domdec_zones_t *zones;
7766     int i, j;
7767
7768     comm = dd->comm;
7769
7770     zones = &comm->zones;
7771
7772     /* Keep the compiler happy */
7773     c->cr0  = 0;
7774     c->bcr1 = 0;
7775
7776     /* The first dimension is equal for all cells */
7777     c->c[0][0] = comm->cell_x0[dim0];
7778     if (bDistMB)
7779     {
7780         c->bc[0] = c->c[0][0];
7781     }
7782     if (dd->ndim >= 2)
7783     {
7784         dim1 = dd->dim[1];
7785         /* This cell row is only seen from the first row */
7786         c->c[1][0] = comm->cell_x0[dim1];
7787         /* All rows can see this row */
7788         c->c[1][1] = comm->cell_x0[dim1];
7789         if (dd->bGridJump)
7790         {
7791             c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7792             if (bDistMB)
7793             {
7794                 /* For the multi-body distance we need the maximum */
7795                 c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7796             }
7797         }
7798         /* Set the upper-right corner for rounding */
7799         c->cr0 = comm->cell_x1[dim0];
7800
7801         if (dd->ndim >= 3)
7802         {
7803             dim2 = dd->dim[2];
7804             for (j = 0; j < 4; j++)
7805             {
7806                 c->c[2][j] = comm->cell_x0[dim2];
7807             }
7808             if (dd->bGridJump)
7809             {
7810                 /* Use the maximum of the i-cells that see a j-cell */
7811                 for (i = 0; i < zones->nizone; i++)
7812                 {
7813                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7814                     {
7815                         if (j >= 4)
7816                         {
7817                             c->c[2][j-4] =
7818                                 max(c->c[2][j-4],
7819                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7820                         }
7821                     }
7822                 }
7823                 if (bDistMB)
7824                 {
7825                     /* For the multi-body distance we need the maximum */
7826                     c->bc[2] = comm->cell_x0[dim2];
7827                     for (i = 0; i < 2; i++)
7828                     {
7829                         for (j = 0; j < 2; j++)
7830                         {
7831                             c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
7832                         }
7833                     }
7834                 }
7835             }
7836
7837             /* Set the upper-right corner for rounding */
7838             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7839              * Only cell (0,0,0) can see cell 7 (1,1,1)
7840              */
7841             c->cr1[0] = comm->cell_x1[dim1];
7842             c->cr1[3] = comm->cell_x1[dim1];
7843             if (dd->bGridJump)
7844             {
7845                 c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7846                 if (bDistMB)
7847                 {
7848                     /* For the multi-body distance we need the maximum */
7849                     c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7850                 }
7851             }
7852         }
7853     }
7854 }
7855
7856 /* Determine which cg's we need to send in this pulse from this zone */
7857 static void
7858 get_zone_pulse_cgs(gmx_domdec_t *dd,
7859                    int zonei, int zone,
7860                    int cg0, int cg1,
7861                    const int *index_gl,
7862                    const int *cgindex,
7863                    int dim, int dim_ind,
7864                    int dim0, int dim1, int dim2,
7865                    real r_comm2, real r_bcomm2,
7866                    matrix box,
7867                    ivec tric_dist,
7868                    rvec *normal,
7869                    real skew_fac2_d, real skew_fac_01,
7870                    rvec *v_d, rvec *v_0, rvec *v_1,
7871                    const dd_corners_t *c,
7872                    rvec sf2_round,
7873                    gmx_bool bDistBonded,
7874                    gmx_bool bBondComm,
7875                    gmx_bool bDist2B,
7876                    gmx_bool bDistMB,
7877                    rvec *cg_cm,
7878                    int *cginfo,
7879                    gmx_domdec_ind_t *ind,
7880                    int **ibuf, int *ibuf_nalloc,
7881                    vec_rvec_t *vbuf,
7882                    int *nsend_ptr,
7883                    int *nat_ptr,
7884                    int *nsend_z_ptr)
7885 {
7886     gmx_domdec_comm_t *comm;
7887     gmx_bool           bScrew;
7888     gmx_bool           bDistMB_pulse;
7889     int                cg, i;
7890     real               r2, rb2, r, tric_sh;
7891     rvec               rn, rb;
7892     int                dimd;
7893     int                nsend_z, nsend, nat;
7894
7895     comm = dd->comm;
7896
7897     bScrew = (dd->bScrewPBC && dim == XX);
7898
7899     bDistMB_pulse = (bDistMB && bDistBonded);
7900
7901     nsend_z = 0;
7902     nsend   = *nsend_ptr;
7903     nat     = *nat_ptr;
7904
7905     for (cg = cg0; cg < cg1; cg++)
7906     {
7907         r2  = 0;
7908         rb2 = 0;
7909         if (tric_dist[dim_ind] == 0)
7910         {
7911             /* Rectangular direction, easy */
7912             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7913             if (r > 0)
7914             {
7915                 r2 += r*r;
7916             }
7917             if (bDistMB_pulse)
7918             {
7919                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7920                 if (r > 0)
7921                 {
7922                     rb2 += r*r;
7923                 }
7924             }
7925             /* Rounding gives at most a 16% reduction
7926              * in communicated atoms
7927              */
7928             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7929             {
7930                 r = cg_cm[cg][dim0] - c->cr0;
7931                 /* This is the first dimension, so always r >= 0 */
7932                 r2 += r*r;
7933                 if (bDistMB_pulse)
7934                 {
7935                     rb2 += r*r;
7936                 }
7937             }
7938             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7939             {
7940                 r = cg_cm[cg][dim1] - c->cr1[zone];
7941                 if (r > 0)
7942                 {
7943                     r2 += r*r;
7944                 }
7945                 if (bDistMB_pulse)
7946                 {
7947                     r = cg_cm[cg][dim1] - c->bcr1;
7948                     if (r > 0)
7949                     {
7950                         rb2 += r*r;
7951                     }
7952                 }
7953             }
7954         }
7955         else
7956         {
7957             /* Triclinic direction, more complicated */
7958             clear_rvec(rn);
7959             clear_rvec(rb);
7960             /* Rounding, conservative as the skew_fac multiplication
7961              * will slightly underestimate the distance.
7962              */
7963             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7964             {
7965                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7966                 for (i = dim0+1; i < DIM; i++)
7967                 {
7968                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7969                 }
7970                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7971                 if (bDistMB_pulse)
7972                 {
7973                     rb[dim0] = rn[dim0];
7974                     rb2      = r2;
7975                 }
7976                 /* Take care that the cell planes along dim0 might not
7977                  * be orthogonal to those along dim1 and dim2.
7978                  */
7979                 for (i = 1; i <= dim_ind; i++)
7980                 {
7981                     dimd = dd->dim[i];
7982                     if (normal[dim0][dimd] > 0)
7983                     {
7984                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7985                         if (bDistMB_pulse)
7986                         {
7987                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7988                         }
7989                     }
7990                 }
7991             }
7992             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7993             {
7994                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7995                 tric_sh   = 0;
7996                 for (i = dim1+1; i < DIM; i++)
7997                 {
7998                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7999                 }
8000                 rn[dim1] += tric_sh;
8001                 if (rn[dim1] > 0)
8002                 {
8003                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
8004                     /* Take care of coupling of the distances
8005                      * to the planes along dim0 and dim1 through dim2.
8006                      */
8007                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
8008                     /* Take care that the cell planes along dim1
8009                      * might not be orthogonal to that along dim2.
8010                      */
8011                     if (normal[dim1][dim2] > 0)
8012                     {
8013                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
8014                     }
8015                 }
8016                 if (bDistMB_pulse)
8017                 {
8018                     rb[dim1] +=
8019                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
8020                     if (rb[dim1] > 0)
8021                     {
8022                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
8023                         /* Take care of coupling of the distances
8024                          * to the planes along dim0 and dim1 through dim2.
8025                          */
8026                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
8027                         /* Take care that the cell planes along dim1
8028                          * might not be orthogonal to that along dim2.
8029                          */
8030                         if (normal[dim1][dim2] > 0)
8031                         {
8032                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
8033                         }
8034                     }
8035                 }
8036             }
8037             /* The distance along the communication direction */
8038             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
8039             tric_sh  = 0;
8040             for (i = dim+1; i < DIM; i++)
8041             {
8042                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
8043             }
8044             rn[dim] += tric_sh;
8045             if (rn[dim] > 0)
8046             {
8047                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
8048                 /* Take care of coupling of the distances
8049                  * to the planes along dim0 and dim1 through dim2.
8050                  */
8051                 if (dim_ind == 1 && zonei == 1)
8052                 {
8053                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
8054                 }
8055             }
8056             if (bDistMB_pulse)
8057             {
8058                 clear_rvec(rb);
8059                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
8060                 if (rb[dim] > 0)
8061                 {
8062                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
8063                     /* Take care of coupling of the distances
8064                      * to the planes along dim0 and dim1 through dim2.
8065                      */
8066                     if (dim_ind == 1 && zonei == 1)
8067                     {
8068                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
8069                     }
8070                 }
8071             }
8072         }
8073
8074         if (r2 < r_comm2 ||
8075             (bDistBonded &&
8076              ((bDistMB && rb2 < r_bcomm2) ||
8077               (bDist2B && r2  < r_bcomm2)) &&
8078              (!bBondComm ||
8079               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
8080                missing_link(comm->cglink, index_gl[cg],
8081                             comm->bLocalCG)))))
8082         {
8083             /* Make an index to the local charge groups */
8084             if (nsend+1 > ind->nalloc)
8085             {
8086                 ind->nalloc = over_alloc_large(nsend+1);
8087                 srenew(ind->index, ind->nalloc);
8088             }
8089             if (nsend+1 > *ibuf_nalloc)
8090             {
8091                 *ibuf_nalloc = over_alloc_large(nsend+1);
8092                 srenew(*ibuf, *ibuf_nalloc);
8093             }
8094             ind->index[nsend] = cg;
8095             (*ibuf)[nsend]    = index_gl[cg];
8096             nsend_z++;
8097             vec_rvec_check_alloc(vbuf, nsend+1);
8098
8099             if (dd->ci[dim] == 0)
8100             {
8101                 /* Correct cg_cm for pbc */
8102                 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
8103                 if (bScrew)
8104                 {
8105                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
8106                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
8107                 }
8108             }
8109             else
8110             {
8111                 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
8112             }
8113             nsend++;
8114             nat += cgindex[cg+1] - cgindex[cg];
8115         }
8116     }
8117
8118     *nsend_ptr   = nsend;
8119     *nat_ptr     = nat;
8120     *nsend_z_ptr = nsend_z;
8121 }
8122
8123 static void setup_dd_communication(gmx_domdec_t *dd,
8124                                    matrix box, gmx_ddbox_t *ddbox,
8125                                    t_forcerec *fr, t_state *state, rvec **f)
8126 {
8127     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
8128     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
8129     int                    c, i, j, cg, cg_gl, nrcg;
8130     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
8131     gmx_domdec_comm_t     *comm;
8132     gmx_domdec_zones_t    *zones;
8133     gmx_domdec_comm_dim_t *cd;
8134     gmx_domdec_ind_t      *ind;
8135     cginfo_mb_t           *cginfo_mb;
8136     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
8137     real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
8138     dd_corners_t           corners;
8139     ivec                   tric_dist;
8140     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
8141     real                   skew_fac2_d, skew_fac_01;
8142     rvec                   sf2_round;
8143     int                    nsend, nat;
8144     int                    th;
8145
8146     if (debug)
8147     {
8148         fprintf(debug, "Setting up DD communication\n");
8149     }
8150
8151     comm  = dd->comm;
8152
8153     switch (fr->cutoff_scheme)
8154     {
8155         case ecutsGROUP:
8156             cg_cm = fr->cg_cm;
8157             break;
8158         case ecutsVERLET:
8159             cg_cm = state->x;
8160             break;
8161         default:
8162             gmx_incons("unimplemented");
8163             cg_cm = NULL;
8164     }
8165
8166     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8167     {
8168         dim = dd->dim[dim_ind];
8169
8170         /* Check if we need to use triclinic distances */
8171         tric_dist[dim_ind] = 0;
8172         for (i = 0; i <= dim_ind; i++)
8173         {
8174             if (ddbox->tric_dir[dd->dim[i]])
8175             {
8176                 tric_dist[dim_ind] = 1;
8177             }
8178         }
8179     }
8180
8181     bBondComm = comm->bBondComm;
8182
8183     /* Do we need to determine extra distances for multi-body bondeds? */
8184     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8185
8186     /* Do we need to determine extra distances for only two-body bondeds? */
8187     bDist2B = (bBondComm && !bDistMB);
8188
8189     r_comm2  = sqr(comm->cutoff);
8190     r_bcomm2 = sqr(comm->cutoff_mbody);
8191
8192     if (debug)
8193     {
8194         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
8195     }
8196
8197     zones = &comm->zones;
8198
8199     dim0 = dd->dim[0];
8200     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8201     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8202
8203     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8204
8205     /* Triclinic stuff */
8206     normal      = ddbox->normal;
8207     skew_fac_01 = 0;
8208     if (dd->ndim >= 2)
8209     {
8210         v_0 = ddbox->v[dim0];
8211         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8212         {
8213             /* Determine the coupling coefficient for the distances
8214              * to the cell planes along dim0 and dim1 through dim2.
8215              * This is required for correct rounding.
8216              */
8217             skew_fac_01 =
8218                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8219             if (debug)
8220             {
8221                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8222             }
8223         }
8224     }
8225     if (dd->ndim >= 3)
8226     {
8227         v_1 = ddbox->v[dim1];
8228     }
8229
8230     zone_cg_range = zones->cg_range;
8231     index_gl      = dd->index_gl;
8232     cgindex       = dd->cgindex;
8233     cginfo_mb     = fr->cginfo_mb;
8234
8235     zone_cg_range[0]   = 0;
8236     zone_cg_range[1]   = dd->ncg_home;
8237     comm->zone_ncg1[0] = dd->ncg_home;
8238     pos_cg             = dd->ncg_home;
8239
8240     nat_tot = dd->nat_home;
8241     nzone   = 1;
8242     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8243     {
8244         dim = dd->dim[dim_ind];
8245         cd  = &comm->cd[dim_ind];
8246
8247         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8248         {
8249             /* No pbc in this dimension, the first node should not comm. */
8250             nzone_send = 0;
8251         }
8252         else
8253         {
8254             nzone_send = nzone;
8255         }
8256
8257         v_d         = ddbox->v[dim];
8258         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8259
8260         cd->bInPlace = TRUE;
8261         for (p = 0; p < cd->np; p++)
8262         {
8263             /* Only atoms communicated in the first pulse are used
8264              * for multi-body bonded interactions or for bBondComm.
8265              */
8266             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8267
8268             ind   = &cd->ind[p];
8269             nsend = 0;
8270             nat   = 0;
8271             for (zone = 0; zone < nzone_send; zone++)
8272             {
8273                 if (tric_dist[dim_ind] && dim_ind > 0)
8274                 {
8275                     /* Determine slightly more optimized skew_fac's
8276                      * for rounding.
8277                      * This reduces the number of communicated atoms
8278                      * by about 10% for 3D DD of rhombic dodecahedra.
8279                      */
8280                     for (dimd = 0; dimd < dim; dimd++)
8281                     {
8282                         sf2_round[dimd] = 1;
8283                         if (ddbox->tric_dir[dimd])
8284                         {
8285                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8286                             {
8287                                 /* If we are shifted in dimension i
8288                                  * and the cell plane is tilted forward
8289                                  * in dimension i, skip this coupling.
8290                                  */
8291                                 if (!(zones->shift[nzone+zone][i] &&
8292                                       ddbox->v[dimd][i][dimd] >= 0))
8293                                 {
8294                                     sf2_round[dimd] +=
8295                                         sqr(ddbox->v[dimd][i][dimd]);
8296                                 }
8297                             }
8298                             sf2_round[dimd] = 1/sf2_round[dimd];
8299                         }
8300                     }
8301                 }
8302
8303                 zonei = zone_perm[dim_ind][zone];
8304                 if (p == 0)
8305                 {
8306                     /* Here we permutate the zones to obtain a convenient order
8307                      * for neighbor searching
8308                      */
8309                     cg0 = zone_cg_range[zonei];
8310                     cg1 = zone_cg_range[zonei+1];
8311                 }
8312                 else
8313                 {
8314                     /* Look only at the cg's received in the previous grid pulse
8315                      */
8316                     cg1 = zone_cg_range[nzone+zone+1];
8317                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8318                 }
8319
8320 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8321                 for (th = 0; th < comm->nth; th++)
8322                 {
8323                     gmx_domdec_ind_t *ind_p;
8324                     int             **ibuf_p, *ibuf_nalloc_p;
8325                     vec_rvec_t       *vbuf_p;
8326                     int              *nsend_p, *nat_p;
8327                     int              *nsend_zone_p;
8328                     int               cg0_th, cg1_th;
8329
8330                     if (th == 0)
8331                     {
8332                         /* Thread 0 writes in the comm buffers */
8333                         ind_p         = ind;
8334                         ibuf_p        = &comm->buf_int;
8335                         ibuf_nalloc_p = &comm->nalloc_int;
8336                         vbuf_p        = &comm->vbuf;
8337                         nsend_p       = &nsend;
8338                         nat_p         = &nat;
8339                         nsend_zone_p  = &ind->nsend[zone];
8340                     }
8341                     else
8342                     {
8343                         /* Other threads write into temp buffers */
8344                         ind_p         = &comm->dth[th].ind;
8345                         ibuf_p        = &comm->dth[th].ibuf;
8346                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8347                         vbuf_p        = &comm->dth[th].vbuf;
8348                         nsend_p       = &comm->dth[th].nsend;
8349                         nat_p         = &comm->dth[th].nat;
8350                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8351
8352                         comm->dth[th].nsend      = 0;
8353                         comm->dth[th].nat        = 0;
8354                         comm->dth[th].nsend_zone = 0;
8355                     }
8356
8357                     if (comm->nth == 1)
8358                     {
8359                         cg0_th = cg0;
8360                         cg1_th = cg1;
8361                     }
8362                     else
8363                     {
8364                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8365                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8366                     }
8367
8368                     /* Get the cg's for this pulse in this zone */
8369                     get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8370                                        index_gl, cgindex,
8371                                        dim, dim_ind, dim0, dim1, dim2,
8372                                        r_comm2, r_bcomm2,
8373                                        box, tric_dist,
8374                                        normal, skew_fac2_d, skew_fac_01,
8375                                        v_d, v_0, v_1, &corners, sf2_round,
8376                                        bDistBonded, bBondComm,
8377                                        bDist2B, bDistMB,
8378                                        cg_cm, fr->cginfo,
8379                                        ind_p,
8380                                        ibuf_p, ibuf_nalloc_p,
8381                                        vbuf_p,
8382                                        nsend_p, nat_p,
8383                                        nsend_zone_p);
8384                 }
8385
8386                 /* Append data of threads>=1 to the communication buffers */
8387                 for (th = 1; th < comm->nth; th++)
8388                 {
8389                     dd_comm_setup_work_t *dth;
8390                     int                   i, ns1;
8391
8392                     dth = &comm->dth[th];
8393
8394                     ns1 = nsend + dth->nsend_zone;
8395                     if (ns1 > ind->nalloc)
8396                     {
8397                         ind->nalloc = over_alloc_dd(ns1);
8398                         srenew(ind->index, ind->nalloc);
8399                     }
8400                     if (ns1 > comm->nalloc_int)
8401                     {
8402                         comm->nalloc_int = over_alloc_dd(ns1);
8403                         srenew(comm->buf_int, comm->nalloc_int);
8404                     }
8405                     if (ns1 > comm->vbuf.nalloc)
8406                     {
8407                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8408                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8409                     }
8410
8411                     for (i = 0; i < dth->nsend_zone; i++)
8412                     {
8413                         ind->index[nsend]    = dth->ind.index[i];
8414                         comm->buf_int[nsend] = dth->ibuf[i];
8415                         copy_rvec(dth->vbuf.v[i],
8416                                   comm->vbuf.v[nsend]);
8417                         nsend++;
8418                     }
8419                     nat              += dth->nat;
8420                     ind->nsend[zone] += dth->nsend_zone;
8421                 }
8422             }
8423             /* Clear the counts in case we do not have pbc */
8424             for (zone = nzone_send; zone < nzone; zone++)
8425             {
8426                 ind->nsend[zone] = 0;
8427             }
8428             ind->nsend[nzone]   = nsend;
8429             ind->nsend[nzone+1] = nat;
8430             /* Communicate the number of cg's and atoms to receive */
8431             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8432                             ind->nsend, nzone+2,
8433                             ind->nrecv, nzone+2);
8434
8435             /* The rvec buffer is also required for atom buffers of size nsend
8436              * in dd_move_x and dd_move_f.
8437              */
8438             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8439
8440             if (p > 0)
8441             {
8442                 /* We can receive in place if only the last zone is not empty */
8443                 for (zone = 0; zone < nzone-1; zone++)
8444                 {
8445                     if (ind->nrecv[zone] > 0)
8446                     {
8447                         cd->bInPlace = FALSE;
8448                     }
8449                 }
8450                 if (!cd->bInPlace)
8451                 {
8452                     /* The int buffer is only required here for the cg indices */
8453                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8454                     {
8455                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8456                         srenew(comm->buf_int2, comm->nalloc_int2);
8457                     }
8458                     /* The rvec buffer is also required for atom buffers
8459                      * of size nrecv in dd_move_x and dd_move_f.
8460                      */
8461                     i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8462                     vec_rvec_check_alloc(&comm->vbuf2, i);
8463                 }
8464             }
8465
8466             /* Make space for the global cg indices */
8467             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8468                 || dd->cg_nalloc == 0)
8469             {
8470                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8471                 srenew(index_gl, dd->cg_nalloc);
8472                 srenew(cgindex, dd->cg_nalloc+1);
8473             }
8474             /* Communicate the global cg indices */
8475             if (cd->bInPlace)
8476             {
8477                 recv_i = index_gl + pos_cg;
8478             }
8479             else
8480             {
8481                 recv_i = comm->buf_int2;
8482             }
8483             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8484                             comm->buf_int, nsend,
8485                             recv_i,        ind->nrecv[nzone]);
8486
8487             /* Make space for cg_cm */
8488             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8489             if (fr->cutoff_scheme == ecutsGROUP)
8490             {
8491                 cg_cm = fr->cg_cm;
8492             }
8493             else
8494             {
8495                 cg_cm = state->x;
8496             }
8497             /* Communicate cg_cm */
8498             if (cd->bInPlace)
8499             {
8500                 recv_vr = cg_cm + pos_cg;
8501             }
8502             else
8503             {
8504                 recv_vr = comm->vbuf2.v;
8505             }
8506             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8507                              comm->vbuf.v, nsend,
8508                              recv_vr,      ind->nrecv[nzone]);
8509
8510             /* Make the charge group index */
8511             if (cd->bInPlace)
8512             {
8513                 zone = (p == 0 ? 0 : nzone - 1);
8514                 while (zone < nzone)
8515                 {
8516                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8517                     {
8518                         cg_gl              = index_gl[pos_cg];
8519                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8520                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8521                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8522                         if (bBondComm)
8523                         {
8524                             /* Update the charge group presence,
8525                              * so we can use it in the next pass of the loop.
8526                              */
8527                             comm->bLocalCG[cg_gl] = TRUE;
8528                         }
8529                         pos_cg++;
8530                     }
8531                     if (p == 0)
8532                     {
8533                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8534                     }
8535                     zone++;
8536                     zone_cg_range[nzone+zone] = pos_cg;
8537                 }
8538             }
8539             else
8540             {
8541                 /* This part of the code is never executed with bBondComm. */
8542                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8543                                  index_gl, recv_i, cg_cm, recv_vr,
8544                                  cgindex, fr->cginfo_mb, fr->cginfo);
8545                 pos_cg += ind->nrecv[nzone];
8546             }
8547             nat_tot += ind->nrecv[nzone+1];
8548         }
8549         if (!cd->bInPlace)
8550         {
8551             /* Store the atom block for easy copying of communication buffers */
8552             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8553         }
8554         nzone += nzone;
8555     }
8556     dd->index_gl = index_gl;
8557     dd->cgindex  = cgindex;
8558
8559     dd->ncg_tot          = zone_cg_range[zones->n];
8560     dd->nat_tot          = nat_tot;
8561     comm->nat[ddnatHOME] = dd->nat_home;
8562     for (i = ddnatZONE; i < ddnatNR; i++)
8563     {
8564         comm->nat[i] = dd->nat_tot;
8565     }
8566
8567     if (!bBondComm)
8568     {
8569         /* We don't need to update cginfo, since that was alrady done above.
8570          * So we pass NULL for the forcerec.
8571          */
8572         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8573                       NULL, comm->bLocalCG);
8574     }
8575
8576     if (debug)
8577     {
8578         fprintf(debug, "Finished setting up DD communication, zones:");
8579         for (c = 0; c < zones->n; c++)
8580         {
8581             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8582         }
8583         fprintf(debug, "\n");
8584     }
8585 }
8586
8587 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8588 {
8589     int c;
8590
8591     for (c = 0; c < zones->nizone; c++)
8592     {
8593         zones->izone[c].cg1  = zones->cg_range[c+1];
8594         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8595         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8596     }
8597 }
8598
8599 static void set_zones_size(gmx_domdec_t *dd,
8600                            matrix box, const gmx_ddbox_t *ddbox,
8601                            int zone_start, int zone_end)
8602 {
8603     gmx_domdec_comm_t  *comm;
8604     gmx_domdec_zones_t *zones;
8605     gmx_bool            bDistMB;
8606     int                 z, zi, zj0, zj1, d, dim;
8607     real                rcs, rcmbs;
8608     int                 i, j;
8609     real                size_j, add_tric;
8610     real                vol;
8611
8612     comm = dd->comm;
8613
8614     zones = &comm->zones;
8615
8616     /* Do we need to determine extra distances for multi-body bondeds? */
8617     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8618
8619     for (z = zone_start; z < zone_end; z++)
8620     {
8621         /* Copy cell limits to zone limits.
8622          * Valid for non-DD dims and non-shifted dims.
8623          */
8624         copy_rvec(comm->cell_x0, zones->size[z].x0);
8625         copy_rvec(comm->cell_x1, zones->size[z].x1);
8626     }
8627
8628     for (d = 0; d < dd->ndim; d++)
8629     {
8630         dim = dd->dim[d];
8631
8632         for (z = 0; z < zones->n; z++)
8633         {
8634             /* With a staggered grid we have different sizes
8635              * for non-shifted dimensions.
8636              */
8637             if (dd->bGridJump && zones->shift[z][dim] == 0)
8638             {
8639                 if (d == 1)
8640                 {
8641                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8642                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8643                 }
8644                 else if (d == 2)
8645                 {
8646                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8647                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8648                 }
8649             }
8650         }
8651
8652         rcs   = comm->cutoff;
8653         rcmbs = comm->cutoff_mbody;
8654         if (ddbox->tric_dir[dim])
8655         {
8656             rcs   /= ddbox->skew_fac[dim];
8657             rcmbs /= ddbox->skew_fac[dim];
8658         }
8659
8660         /* Set the lower limit for the shifted zone dimensions */
8661         for (z = zone_start; z < zone_end; z++)
8662         {
8663             if (zones->shift[z][dim] > 0)
8664             {
8665                 dim = dd->dim[d];
8666                 if (!dd->bGridJump || d == 0)
8667                 {
8668                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8669                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8670                 }
8671                 else
8672                 {
8673                     /* Here we take the lower limit of the zone from
8674                      * the lowest domain of the zone below.
8675                      */
8676                     if (z < 4)
8677                     {
8678                         zones->size[z].x0[dim] =
8679                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8680                     }
8681                     else
8682                     {
8683                         if (d == 1)
8684                         {
8685                             zones->size[z].x0[dim] =
8686                                 zones->size[zone_perm[2][z-4]].x0[dim];
8687                         }
8688                         else
8689                         {
8690                             zones->size[z].x0[dim] =
8691                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8692                         }
8693                     }
8694                     /* A temporary limit, is updated below */
8695                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8696
8697                     if (bDistMB)
8698                     {
8699                         for (zi = 0; zi < zones->nizone; zi++)
8700                         {
8701                             if (zones->shift[zi][dim] == 0)
8702                             {
8703                                 /* This takes the whole zone into account.
8704                                  * With multiple pulses this will lead
8705                                  * to a larger zone then strictly necessary.
8706                                  */
8707                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8708                                                              zones->size[zi].x1[dim]+rcmbs);
8709                             }
8710                         }
8711                     }
8712                 }
8713             }
8714         }
8715
8716         /* Loop over the i-zones to set the upper limit of each
8717          * j-zone they see.
8718          */
8719         for (zi = 0; zi < zones->nizone; zi++)
8720         {
8721             if (zones->shift[zi][dim] == 0)
8722             {
8723                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8724                 {
8725                     if (zones->shift[z][dim] > 0)
8726                     {
8727                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8728                                                      zones->size[zi].x1[dim]+rcs);
8729                     }
8730                 }
8731             }
8732         }
8733     }
8734
8735     for (z = zone_start; z < zone_end; z++)
8736     {
8737         /* Initialization only required to keep the compiler happy */
8738         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8739         int  nc, c;
8740
8741         /* To determine the bounding box for a zone we need to find
8742          * the extreme corners of 4, 2 or 1 corners.
8743          */
8744         nc = 1 << (ddbox->npbcdim - 1);
8745
8746         for (c = 0; c < nc; c++)
8747         {
8748             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8749             corner[XX] = 0;
8750             if ((c & 1) == 0)
8751             {
8752                 corner[YY] = zones->size[z].x0[YY];
8753             }
8754             else
8755             {
8756                 corner[YY] = zones->size[z].x1[YY];
8757             }
8758             if ((c & 2) == 0)
8759             {
8760                 corner[ZZ] = zones->size[z].x0[ZZ];
8761             }
8762             else
8763             {
8764                 corner[ZZ] = zones->size[z].x1[ZZ];
8765             }
8766             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8767             {
8768                 /* With 1D domain decomposition the cg's are not in
8769                  * the triclinic box, but triclinic x-y and rectangular y-z.
8770                  * Shift y back, so it will later end up at 0.
8771                  */
8772                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8773             }
8774             /* Apply the triclinic couplings */
8775             for (i = YY; i < ddbox->npbcdim; i++)
8776             {
8777                 for (j = XX; j < i; j++)
8778                 {
8779                     corner[j] += corner[i]*box[i][j]/box[i][i];
8780                 }
8781             }
8782             if (c == 0)
8783             {
8784                 copy_rvec(corner, corner_min);
8785                 copy_rvec(corner, corner_max);
8786             }
8787             else
8788             {
8789                 for (i = 0; i < DIM; i++)
8790                 {
8791                     corner_min[i] = min(corner_min[i], corner[i]);
8792                     corner_max[i] = max(corner_max[i], corner[i]);
8793                 }
8794             }
8795         }
8796         /* Copy the extreme cornes without offset along x */
8797         for (i = 0; i < DIM; i++)
8798         {
8799             zones->size[z].bb_x0[i] = corner_min[i];
8800             zones->size[z].bb_x1[i] = corner_max[i];
8801         }
8802         /* Add the offset along x */
8803         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8804         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8805     }
8806
8807     if (zone_start == 0)
8808     {
8809         vol = 1;
8810         for (dim = 0; dim < DIM; dim++)
8811         {
8812             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8813         }
8814         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8815     }
8816
8817     if (debug)
8818     {
8819         for (z = zone_start; z < zone_end; z++)
8820         {
8821             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8822                     z,
8823                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8824                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8825                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8826             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8827                     z,
8828                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8829                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8830                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8831         }
8832     }
8833 }
8834
8835 static int comp_cgsort(const void *a, const void *b)
8836 {
8837     int           comp;
8838
8839     gmx_cgsort_t *cga, *cgb;
8840     cga = (gmx_cgsort_t *)a;
8841     cgb = (gmx_cgsort_t *)b;
8842
8843     comp = cga->nsc - cgb->nsc;
8844     if (comp == 0)
8845     {
8846         comp = cga->ind_gl - cgb->ind_gl;
8847     }
8848
8849     return comp;
8850 }
8851
8852 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8853                          int *a, int *buf)
8854 {
8855     int i;
8856
8857     /* Order the data */
8858     for (i = 0; i < n; i++)
8859     {
8860         buf[i] = a[sort[i].ind];
8861     }
8862
8863     /* Copy back to the original array */
8864     for (i = 0; i < n; i++)
8865     {
8866         a[i] = buf[i];
8867     }
8868 }
8869
8870 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8871                          rvec *v, rvec *buf)
8872 {
8873     int i;
8874
8875     /* Order the data */
8876     for (i = 0; i < n; i++)
8877     {
8878         copy_rvec(v[sort[i].ind], buf[i]);
8879     }
8880
8881     /* Copy back to the original array */
8882     for (i = 0; i < n; i++)
8883     {
8884         copy_rvec(buf[i], v[i]);
8885     }
8886 }
8887
8888 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8889                            rvec *v, rvec *buf)
8890 {
8891     int a, atot, cg, cg0, cg1, i;
8892
8893     if (cgindex == NULL)
8894     {
8895         /* Avoid the useless loop of the atoms within a cg */
8896         order_vec_cg(ncg, sort, v, buf);
8897
8898         return;
8899     }
8900
8901     /* Order the data */
8902     a = 0;
8903     for (cg = 0; cg < ncg; cg++)
8904     {
8905         cg0 = cgindex[sort[cg].ind];
8906         cg1 = cgindex[sort[cg].ind+1];
8907         for (i = cg0; i < cg1; i++)
8908         {
8909             copy_rvec(v[i], buf[a]);
8910             a++;
8911         }
8912     }
8913     atot = a;
8914
8915     /* Copy back to the original array */
8916     for (a = 0; a < atot; a++)
8917     {
8918         copy_rvec(buf[a], v[a]);
8919     }
8920 }
8921
8922 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8923                          int nsort_new, gmx_cgsort_t *sort_new,
8924                          gmx_cgsort_t *sort1)
8925 {
8926     int i1, i2, i_new;
8927
8928     /* The new indices are not very ordered, so we qsort them */
8929     qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8930
8931     /* sort2 is already ordered, so now we can merge the two arrays */
8932     i1    = 0;
8933     i2    = 0;
8934     i_new = 0;
8935     while (i2 < nsort2 || i_new < nsort_new)
8936     {
8937         if (i2 == nsort2)
8938         {
8939             sort1[i1++] = sort_new[i_new++];
8940         }
8941         else if (i_new == nsort_new)
8942         {
8943             sort1[i1++] = sort2[i2++];
8944         }
8945         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8946                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8947                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8948         {
8949             sort1[i1++] = sort2[i2++];
8950         }
8951         else
8952         {
8953             sort1[i1++] = sort_new[i_new++];
8954         }
8955     }
8956 }
8957
8958 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8959 {
8960     gmx_domdec_sort_t *sort;
8961     gmx_cgsort_t      *cgsort, *sort_i;
8962     int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
8963     int                sort_last, sort_skip;
8964
8965     sort = dd->comm->sort;
8966
8967     a = fr->ns.grid->cell_index;
8968
8969     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8970
8971     if (ncg_home_old >= 0)
8972     {
8973         /* The charge groups that remained in the same ns grid cell
8974          * are completely ordered. So we can sort efficiently by sorting
8975          * the charge groups that did move into the stationary list.
8976          */
8977         ncg_new   = 0;
8978         nsort2    = 0;
8979         nsort_new = 0;
8980         for (i = 0; i < dd->ncg_home; i++)
8981         {
8982             /* Check if this cg did not move to another node */
8983             if (a[i] < moved)
8984             {
8985                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8986                 {
8987                     /* This cg is new on this node or moved ns grid cell */
8988                     if (nsort_new >= sort->sort_new_nalloc)
8989                     {
8990                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8991                         srenew(sort->sort_new, sort->sort_new_nalloc);
8992                     }
8993                     sort_i = &(sort->sort_new[nsort_new++]);
8994                 }
8995                 else
8996                 {
8997                     /* This cg did not move */
8998                     sort_i = &(sort->sort2[nsort2++]);
8999                 }
9000                 /* Sort on the ns grid cell indices
9001                  * and the global topology index.
9002                  * index_gl is irrelevant with cell ns,
9003                  * but we set it here anyhow to avoid a conditional.
9004                  */
9005                 sort_i->nsc    = a[i];
9006                 sort_i->ind_gl = dd->index_gl[i];
9007                 sort_i->ind    = i;
9008                 ncg_new++;
9009             }
9010         }
9011         if (debug)
9012         {
9013             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
9014                     nsort2, nsort_new);
9015         }
9016         /* Sort efficiently */
9017         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
9018                      sort->sort);
9019     }
9020     else
9021     {
9022         cgsort  = sort->sort;
9023         ncg_new = 0;
9024         for (i = 0; i < dd->ncg_home; i++)
9025         {
9026             /* Sort on the ns grid cell indices
9027              * and the global topology index
9028              */
9029             cgsort[i].nsc    = a[i];
9030             cgsort[i].ind_gl = dd->index_gl[i];
9031             cgsort[i].ind    = i;
9032             if (cgsort[i].nsc < moved)
9033             {
9034                 ncg_new++;
9035             }
9036         }
9037         if (debug)
9038         {
9039             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
9040         }
9041         /* Determine the order of the charge groups using qsort */
9042         qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
9043     }
9044
9045     return ncg_new;
9046 }
9047
9048 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
9049 {
9050     gmx_cgsort_t *sort;
9051     int           ncg_new, i, *a, na;
9052
9053     sort = dd->comm->sort->sort;
9054
9055     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
9056
9057     ncg_new = 0;
9058     for (i = 0; i < na; i++)
9059     {
9060         if (a[i] >= 0)
9061         {
9062             sort[ncg_new].ind = a[i];
9063             ncg_new++;
9064         }
9065     }
9066
9067     return ncg_new;
9068 }
9069
9070 static void dd_sort_state(gmx_domdec_t *dd, int ePBC,
9071                           rvec *cgcm, t_forcerec *fr, t_state *state,
9072                           int ncg_home_old)
9073 {
9074     gmx_domdec_sort_t *sort;
9075     gmx_cgsort_t      *cgsort, *sort_i;
9076     int               *cgindex;
9077     int                ncg_new, i, *ibuf, cgsize;
9078     rvec              *vbuf;
9079
9080     sort = dd->comm->sort;
9081
9082     if (dd->ncg_home > sort->sort_nalloc)
9083     {
9084         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
9085         srenew(sort->sort, sort->sort_nalloc);
9086         srenew(sort->sort2, sort->sort_nalloc);
9087     }
9088     cgsort = sort->sort;
9089
9090     switch (fr->cutoff_scheme)
9091     {
9092         case ecutsGROUP:
9093             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
9094             break;
9095         case ecutsVERLET:
9096             ncg_new = dd_sort_order_nbnxn(dd, fr);
9097             break;
9098         default:
9099             gmx_incons("unimplemented");
9100             ncg_new = 0;
9101     }
9102
9103     /* We alloc with the old size, since cgindex is still old */
9104     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
9105     vbuf = dd->comm->vbuf.v;
9106
9107     if (dd->comm->bCGs)
9108     {
9109         cgindex = dd->cgindex;
9110     }
9111     else
9112     {
9113         cgindex = NULL;
9114     }
9115
9116     /* Remove the charge groups which are no longer at home here */
9117     dd->ncg_home = ncg_new;
9118     if (debug)
9119     {
9120         fprintf(debug, "Set the new home charge group count to %d\n",
9121                 dd->ncg_home);
9122     }
9123
9124     /* Reorder the state */
9125     for (i = 0; i < estNR; i++)
9126     {
9127         if (EST_DISTR(i) && (state->flags & (1<<i)))
9128         {
9129             switch (i)
9130             {
9131                 case estX:
9132                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
9133                     break;
9134                 case estV:
9135                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
9136                     break;
9137                 case estSDX:
9138                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
9139                     break;
9140                 case estCGP:
9141                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
9142                     break;
9143                 case estLD_RNG:
9144                 case estLD_RNGI:
9145                 case estDISRE_INITF:
9146                 case estDISRE_RM3TAV:
9147                 case estORIRE_INITF:
9148                 case estORIRE_DTAV:
9149                     /* No ordering required */
9150                     break;
9151                 default:
9152                     gmx_incons("Unknown state entry encountered in dd_sort_state");
9153                     break;
9154             }
9155         }
9156     }
9157     if (fr->cutoff_scheme == ecutsGROUP)
9158     {
9159         /* Reorder cgcm */
9160         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9161     }
9162
9163     if (dd->ncg_home+1 > sort->ibuf_nalloc)
9164     {
9165         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9166         srenew(sort->ibuf, sort->ibuf_nalloc);
9167     }
9168     ibuf = sort->ibuf;
9169     /* Reorder the global cg index */
9170     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9171     /* Reorder the cginfo */
9172     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9173     /* Rebuild the local cg index */
9174     if (dd->comm->bCGs)
9175     {
9176         ibuf[0] = 0;
9177         for (i = 0; i < dd->ncg_home; i++)
9178         {
9179             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9180             ibuf[i+1] = ibuf[i] + cgsize;
9181         }
9182         for (i = 0; i < dd->ncg_home+1; i++)
9183         {
9184             dd->cgindex[i] = ibuf[i];
9185         }
9186     }
9187     else
9188     {
9189         for (i = 0; i < dd->ncg_home+1; i++)
9190         {
9191             dd->cgindex[i] = i;
9192         }
9193     }
9194     /* Set the home atom number */
9195     dd->nat_home = dd->cgindex[dd->ncg_home];
9196
9197     if (fr->cutoff_scheme == ecutsVERLET)
9198     {
9199         /* The atoms are now exactly in grid order, update the grid order */
9200         nbnxn_set_atomorder(fr->nbv->nbs);
9201     }
9202     else
9203     {
9204         /* Copy the sorted ns cell indices back to the ns grid struct */
9205         for (i = 0; i < dd->ncg_home; i++)
9206         {
9207             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9208         }
9209         fr->ns.grid->nr = dd->ncg_home;
9210     }
9211 }
9212
9213 static void add_dd_statistics(gmx_domdec_t *dd)
9214 {
9215     gmx_domdec_comm_t *comm;
9216     int                ddnat;
9217
9218     comm = dd->comm;
9219
9220     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9221     {
9222         comm->sum_nat[ddnat-ddnatZONE] +=
9223             comm->nat[ddnat] - comm->nat[ddnat-1];
9224     }
9225     comm->ndecomp++;
9226 }
9227
9228 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9229 {
9230     gmx_domdec_comm_t *comm;
9231     int                ddnat;
9232
9233     comm = dd->comm;
9234
9235     /* Reset all the statistics and counters for total run counting */
9236     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9237     {
9238         comm->sum_nat[ddnat-ddnatZONE] = 0;
9239     }
9240     comm->ndecomp   = 0;
9241     comm->nload     = 0;
9242     comm->load_step = 0;
9243     comm->load_sum  = 0;
9244     comm->load_max  = 0;
9245     clear_ivec(comm->load_lim);
9246     comm->load_mdf = 0;
9247     comm->load_pme = 0;
9248 }
9249
9250 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9251 {
9252     gmx_domdec_comm_t *comm;
9253     int                ddnat;
9254     double             av;
9255
9256     comm = cr->dd->comm;
9257
9258     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9259
9260     if (fplog == NULL)
9261     {
9262         return;
9263     }
9264
9265     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9266
9267     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9268     {
9269         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9270         switch (ddnat)
9271         {
9272             case ddnatZONE:
9273                 fprintf(fplog,
9274                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9275                         2, av);
9276                 break;
9277             case ddnatVSITE:
9278                 if (cr->dd->vsite_comm)
9279                 {
9280                     fprintf(fplog,
9281                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9282                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9283                             av);
9284                 }
9285                 break;
9286             case ddnatCON:
9287                 if (cr->dd->constraint_comm)
9288                 {
9289                     fprintf(fplog,
9290                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9291                             1 + ir->nLincsIter, av);
9292                 }
9293                 break;
9294             default:
9295                 gmx_incons(" Unknown type for DD statistics");
9296         }
9297     }
9298     fprintf(fplog, "\n");
9299
9300     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9301     {
9302         print_dd_load_av(fplog, cr->dd);
9303     }
9304 }
9305
9306 void dd_partition_system(FILE                *fplog,
9307                          gmx_large_int_t      step,
9308                          t_commrec           *cr,
9309                          gmx_bool             bMasterState,
9310                          int                  nstglobalcomm,
9311                          t_state             *state_global,
9312                          gmx_mtop_t          *top_global,
9313                          t_inputrec          *ir,
9314                          t_state             *state_local,
9315                          rvec               **f,
9316                          t_mdatoms           *mdatoms,
9317                          gmx_localtop_t      *top_local,
9318                          t_forcerec          *fr,
9319                          gmx_vsite_t         *vsite,
9320                          gmx_shellfc_t        shellfc,
9321                          gmx_constr_t         constr,
9322                          t_nrnb              *nrnb,
9323                          gmx_wallcycle_t      wcycle,
9324                          gmx_bool             bVerbose)
9325 {
9326     gmx_domdec_t      *dd;
9327     gmx_domdec_comm_t *comm;
9328     gmx_ddbox_t        ddbox = {0};
9329     t_block           *cgs_gl;
9330     gmx_large_int_t    step_pcoupl;
9331     rvec               cell_ns_x0, cell_ns_x1;
9332     int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9333     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
9334     gmx_bool           bRedist, bSortCG, bResortAll;
9335     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9336     real               grid_density;
9337     char               sbuf[22];
9338
9339     dd   = cr->dd;
9340     comm = dd->comm;
9341
9342     bBoxChanged = (bMasterState || DEFORM(*ir));
9343     if (ir->epc != epcNO)
9344     {
9345         /* With nstpcouple > 1 pressure coupling happens.
9346          * one step after calculating the pressure.
9347          * Box scaling happens at the end of the MD step,
9348          * after the DD partitioning.
9349          * We therefore have to do DLB in the first partitioning
9350          * after an MD step where P-coupling occured.
9351          * We need to determine the last step in which p-coupling occurred.
9352          * MRS -- need to validate this for vv?
9353          */
9354         n = ir->nstpcouple;
9355         if (n == 1)
9356         {
9357             step_pcoupl = step - 1;
9358         }
9359         else
9360         {
9361             step_pcoupl = ((step - 1)/n)*n + 1;
9362         }
9363         if (step_pcoupl >= comm->partition_step)
9364         {
9365             bBoxChanged = TRUE;
9366         }
9367     }
9368
9369     bNStGlobalComm = (step % nstglobalcomm == 0);
9370
9371     if (!comm->bDynLoadBal)
9372     {
9373         bDoDLB = FALSE;
9374     }
9375     else
9376     {
9377         /* Should we do dynamic load balacing this step?
9378          * Since it requires (possibly expensive) global communication,
9379          * we might want to do DLB less frequently.
9380          */
9381         if (bBoxChanged || ir->epc != epcNO)
9382         {
9383             bDoDLB = bBoxChanged;
9384         }
9385         else
9386         {
9387             bDoDLB = bNStGlobalComm;
9388         }
9389     }
9390
9391     /* Check if we have recorded loads on the nodes */
9392     if (comm->bRecordLoad && dd_load_count(comm))
9393     {
9394         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9395         {
9396             /* Check if we should use DLB at the second partitioning
9397              * and every 100 partitionings,
9398              * so the extra communication cost is negligible.
9399              */
9400             n         = max(100, nstglobalcomm);
9401             bCheckDLB = (comm->n_load_collect == 0 ||
9402                          comm->n_load_have % n == n-1);
9403         }
9404         else
9405         {
9406             bCheckDLB = FALSE;
9407         }
9408
9409         /* Print load every nstlog, first and last step to the log file */
9410         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9411                     comm->n_load_collect == 0 ||
9412                     (ir->nsteps >= 0 &&
9413                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9414
9415         /* Avoid extra communication due to verbose screen output
9416          * when nstglobalcomm is set.
9417          */
9418         if (bDoDLB || bLogLoad || bCheckDLB ||
9419             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9420         {
9421             get_load_distribution(dd, wcycle);
9422             if (DDMASTER(dd))
9423             {
9424                 if (bLogLoad)
9425                 {
9426                     dd_print_load(fplog, dd, step-1);
9427                 }
9428                 if (bVerbose)
9429                 {
9430                     dd_print_load_verbose(dd);
9431                 }
9432             }
9433             comm->n_load_collect++;
9434
9435             if (bCheckDLB)
9436             {
9437                 /* Since the timings are node dependent, the master decides */
9438                 if (DDMASTER(dd))
9439                 {
9440                     bTurnOnDLB =
9441                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
9442                     if (debug)
9443                     {
9444                         fprintf(debug, "step %s, imb loss %f\n",
9445                                 gmx_step_str(step, sbuf),
9446                                 dd_force_imb_perf_loss(dd));
9447                     }
9448                 }
9449                 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
9450                 if (bTurnOnDLB)
9451                 {
9452                     turn_on_dlb(fplog, cr, step);
9453                     bDoDLB = TRUE;
9454                 }
9455             }
9456         }
9457         comm->n_load_have++;
9458     }
9459
9460     cgs_gl = &comm->cgs_gl;
9461
9462     bRedist = FALSE;
9463     if (bMasterState)
9464     {
9465         /* Clear the old state */
9466         clear_dd_indices(dd, 0, 0);
9467         ncgindex_set = 0;
9468
9469         set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9470                   TRUE, cgs_gl, state_global->x, &ddbox);
9471
9472         get_cg_distribution(fplog, step, dd, cgs_gl,
9473                             state_global->box, &ddbox, state_global->x);
9474
9475         dd_distribute_state(dd, cgs_gl,
9476                             state_global, state_local, f);
9477
9478         dd_make_local_cgs(dd, &top_local->cgs);
9479
9480         /* Ensure that we have space for the new distribution */
9481         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9482
9483         if (fr->cutoff_scheme == ecutsGROUP)
9484         {
9485             calc_cgcm(fplog, 0, dd->ncg_home,
9486                       &top_local->cgs, state_local->x, fr->cg_cm);
9487         }
9488
9489         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9490
9491         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9492     }
9493     else if (state_local->ddp_count != dd->ddp_count)
9494     {
9495         if (state_local->ddp_count > dd->ddp_count)
9496         {
9497             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9498         }
9499
9500         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9501         {
9502             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9503         }
9504
9505         /* Clear the old state */
9506         clear_dd_indices(dd, 0, 0);
9507
9508         /* Build the new indices */
9509         rebuild_cgindex(dd, cgs_gl->index, state_local);
9510         make_dd_indices(dd, cgs_gl->index, 0);
9511         ncgindex_set = dd->ncg_home;
9512
9513         if (fr->cutoff_scheme == ecutsGROUP)
9514         {
9515             /* Redetermine the cg COMs */
9516             calc_cgcm(fplog, 0, dd->ncg_home,
9517                       &top_local->cgs, state_local->x, fr->cg_cm);
9518         }
9519
9520         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9521
9522         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9523
9524         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9525                   TRUE, &top_local->cgs, state_local->x, &ddbox);
9526
9527         bRedist = comm->bDynLoadBal;
9528     }
9529     else
9530     {
9531         /* We have the full state, only redistribute the cgs */
9532
9533         /* Clear the non-home indices */
9534         clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9535         ncgindex_set = 0;
9536
9537         /* Avoid global communication for dim's without pbc and -gcom */
9538         if (!bNStGlobalComm)
9539         {
9540             copy_rvec(comm->box0, ddbox.box0    );
9541             copy_rvec(comm->box_size, ddbox.box_size);
9542         }
9543         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9544                   bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
9545
9546         bBoxChanged = TRUE;
9547         bRedist     = TRUE;
9548     }
9549     /* For dim's without pbc and -gcom */
9550     copy_rvec(ddbox.box0, comm->box0    );
9551     copy_rvec(ddbox.box_size, comm->box_size);
9552
9553     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9554                       step, wcycle);
9555
9556     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9557     {
9558         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9559     }
9560
9561     /* Check if we should sort the charge groups */
9562     if (comm->nstSortCG > 0)
9563     {
9564         bSortCG = (bMasterState ||
9565                    (bRedist && (step % comm->nstSortCG == 0)));
9566     }
9567     else
9568     {
9569         bSortCG = FALSE;
9570     }
9571
9572     ncg_home_old = dd->ncg_home;
9573
9574     ncg_moved = 0;
9575     if (bRedist)
9576     {
9577         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9578
9579         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9580                            state_local, f, fr, mdatoms,
9581                            !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9582
9583         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9584     }
9585
9586     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9587                           dd, &ddbox,
9588                           &comm->cell_x0, &comm->cell_x1,
9589                           dd->ncg_home, fr->cg_cm,
9590                           cell_ns_x0, cell_ns_x1, &grid_density);
9591
9592     if (bBoxChanged)
9593     {
9594         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9595     }
9596
9597     switch (fr->cutoff_scheme)
9598     {
9599         case ecutsGROUP:
9600             copy_ivec(fr->ns.grid->n, ncells_old);
9601             grid_first(fplog, fr->ns.grid, dd, &ddbox, fr->ePBC,
9602                        state_local->box, cell_ns_x0, cell_ns_x1,
9603                        fr->rlistlong, grid_density);
9604             break;
9605         case ecutsVERLET:
9606             nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9607             break;
9608         default:
9609             gmx_incons("unimplemented");
9610     }
9611     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9612     copy_ivec(ddbox.tric_dir, comm->tric_dir);
9613
9614     if (bSortCG)
9615     {
9616         wallcycle_sub_start(wcycle, ewcsDD_GRID);
9617
9618         /* Sort the state on charge group position.
9619          * This enables exact restarts from this step.
9620          * It also improves performance by about 15% with larger numbers
9621          * of atoms per node.
9622          */
9623
9624         /* Fill the ns grid with the home cell,
9625          * so we can sort with the indices.
9626          */
9627         set_zones_ncg_home(dd);
9628
9629         switch (fr->cutoff_scheme)
9630         {
9631             case ecutsVERLET:
9632                 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9633
9634                 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9635                                   0,
9636                                   comm->zones.size[0].bb_x0,
9637                                   comm->zones.size[0].bb_x1,
9638                                   0, dd->ncg_home,
9639                                   comm->zones.dens_zone0,
9640                                   fr->cginfo,
9641                                   state_local->x,
9642                                   ncg_moved, bRedist ? comm->moved : NULL,
9643                                   fr->nbv->grp[eintLocal].kernel_type,
9644                                   fr->nbv->grp[eintLocal].nbat);
9645
9646                 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9647                 break;
9648             case ecutsGROUP:
9649                 fill_grid(fplog, &comm->zones, fr->ns.grid, dd->ncg_home,
9650                           0, dd->ncg_home, fr->cg_cm);
9651
9652                 copy_ivec(fr->ns.grid->n, ncells_new);
9653                 break;
9654             default:
9655                 gmx_incons("unimplemented");
9656         }
9657
9658         bResortAll = bMasterState;
9659
9660         /* Check if we can user the old order and ns grid cell indices
9661          * of the charge groups to sort the charge groups efficiently.
9662          */
9663         if (ncells_new[XX] != ncells_old[XX] ||
9664             ncells_new[YY] != ncells_old[YY] ||
9665             ncells_new[ZZ] != ncells_old[ZZ])
9666         {
9667             bResortAll = TRUE;
9668         }
9669
9670         if (debug)
9671         {
9672             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9673                     gmx_step_str(step, sbuf), dd->ncg_home);
9674         }
9675         dd_sort_state(dd, ir->ePBC, fr->cg_cm, fr, state_local,
9676                       bResortAll ? -1 : ncg_home_old);
9677         /* Rebuild all the indices */
9678         ga2la_clear(dd->ga2la);
9679         ncgindex_set = 0;
9680
9681         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9682     }
9683
9684     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9685
9686     /* Setup up the communication and communicate the coordinates */
9687     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9688
9689     /* Set the indices */
9690     make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9691
9692     /* Set the charge group boundaries for neighbor searching */
9693     set_cg_boundaries(&comm->zones);
9694
9695     if (fr->cutoff_scheme == ecutsVERLET)
9696     {
9697         set_zones_size(dd, state_local->box, &ddbox,
9698                        bSortCG ? 1 : 0, comm->zones.n);
9699     }
9700
9701     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9702
9703     /*
9704        write_dd_pdb("dd_home",step,"dump",top_global,cr,
9705                  -1,state_local->x,state_local->box);
9706      */
9707
9708     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9709
9710     /* Extract a local topology from the global topology */
9711     for (i = 0; i < dd->ndim; i++)
9712     {
9713         np[dd->dim[i]] = comm->cd[i].np;
9714     }
9715     dd_make_local_top(fplog, dd, &comm->zones, dd->npbcdim, state_local->box,
9716                       comm->cellsize_min, np,
9717                       fr,
9718                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
9719                       vsite, top_global, top_local);
9720
9721     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9722
9723     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9724
9725     /* Set up the special atom communication */
9726     n = comm->nat[ddnatZONE];
9727     for (i = ddnatZONE+1; i < ddnatNR; i++)
9728     {
9729         switch (i)
9730         {
9731             case ddnatVSITE:
9732                 if (vsite && vsite->n_intercg_vsite)
9733                 {
9734                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
9735                 }
9736                 break;
9737             case ddnatCON:
9738                 if (dd->bInterCGcons || dd->bInterCGsettles)
9739                 {
9740                     /* Only for inter-cg constraints we need special code */
9741                     n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9742                                                   constr, ir->nProjOrder,
9743                                                   top_local->idef.il);
9744                 }
9745                 break;
9746             default:
9747                 gmx_incons("Unknown special atom type setup");
9748         }
9749         comm->nat[i] = n;
9750     }
9751
9752     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9753
9754     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9755
9756     /* Make space for the extra coordinates for virtual site
9757      * or constraint communication.
9758      */
9759     state_local->natoms = comm->nat[ddnatNR-1];
9760     if (state_local->natoms > state_local->nalloc)
9761     {
9762         dd_realloc_state(state_local, f, state_local->natoms);
9763     }
9764
9765     if (fr->bF_NoVirSum)
9766     {
9767         if (vsite && vsite->n_intercg_vsite)
9768         {
9769             nat_f_novirsum = comm->nat[ddnatVSITE];
9770         }
9771         else
9772         {
9773             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9774             {
9775                 nat_f_novirsum = dd->nat_tot;
9776             }
9777             else
9778             {
9779                 nat_f_novirsum = dd->nat_home;
9780             }
9781         }
9782     }
9783     else
9784     {
9785         nat_f_novirsum = 0;
9786     }
9787
9788     /* Set the number of atoms required for the force calculation.
9789      * Forces need to be constrained when using a twin-range setup
9790      * or with energy minimization. For simple simulations we could
9791      * avoid some allocation, zeroing and copying, but this is
9792      * probably not worth the complications ande checking.
9793      */
9794     forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9795                         dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9796
9797     /* We make the all mdatoms up to nat_tot_con.
9798      * We could save some work by only setting invmass
9799      * between nat_tot and nat_tot_con.
9800      */
9801     /* This call also sets the new number of home particles to dd->nat_home */
9802     atoms2md(top_global, ir,
9803              comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms);
9804
9805     /* Now we have the charges we can sort the FE interactions */
9806     dd_sort_local_top(dd, mdatoms, top_local);
9807
9808     if (vsite != NULL)
9809     {
9810         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9811         split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
9812                                   mdatoms, FALSE, vsite);
9813     }
9814
9815     if (shellfc)
9816     {
9817         /* Make the local shell stuff, currently no communication is done */
9818         make_local_shells(cr, mdatoms, shellfc);
9819     }
9820
9821     if (ir->implicit_solvent)
9822     {
9823         make_local_gb(cr, fr->born, ir->gb_algorithm);
9824     }
9825
9826     setup_bonded_threading(fr, &top_local->idef);
9827
9828     if (!(cr->duty & DUTY_PME))
9829     {
9830         /* Send the charges to our PME only node */
9831         gmx_pme_send_q(cr, mdatoms->nChargePerturbed,
9832                        mdatoms->chargeA, mdatoms->chargeB,
9833                        dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9834     }
9835
9836     if (constr)
9837     {
9838         set_constraints(constr, top_local, ir, mdatoms, cr);
9839     }
9840
9841     if (ir->ePull != epullNO)
9842     {
9843         /* Update the local pull groups */
9844         dd_make_local_pull_groups(dd, ir->pull, mdatoms);
9845     }
9846
9847     if (ir->bRot)
9848     {
9849         /* Update the local rotation groups */
9850         dd_make_local_rotation_groups(dd, ir->rot);
9851     }
9852
9853
9854     add_dd_statistics(dd);
9855
9856     /* Make sure we only count the cycles for this DD partitioning */
9857     clear_dd_cycle_counts(dd);
9858
9859     /* Because the order of the atoms might have changed since
9860      * the last vsite construction, we need to communicate the constructing
9861      * atom coordinates again (for spreading the forces this MD step).
9862      */
9863     dd_move_x_vsites(dd, state_local->box, state_local->x);
9864
9865     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9866
9867     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9868     {
9869         dd_move_x(dd, state_local->box, state_local->x);
9870         write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9871                      -1, state_local->x, state_local->box);
9872     }
9873
9874     /* Store the partitioning step */
9875     comm->partition_step = step;
9876
9877     /* Increase the DD partitioning counter */
9878     dd->ddp_count++;
9879     /* The state currently matches this DD partitioning count, store it */
9880     state_local->ddp_count = dd->ddp_count;
9881     if (bMasterState)
9882     {
9883         /* The DD master node knows the complete cg distribution,
9884          * store the count so we can possibly skip the cg info communication.
9885          */
9886         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9887     }
9888
9889     if (comm->DD_debug > 0)
9890     {
9891         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9892         check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9893                                 "after partitioning");
9894     }
9895 }