src/mdlib/domdec.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2008
   5  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   6  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   7  * others, as listed in the AUTHORS file in the top-level source
   8  * directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 #ifdef HAVE_CONFIG_H
  38 #include <config.h>
  39 #endif
  40
  41 #include <stdio.h>
  42 #include <time.h>
  43 #include <math.h>
  44 #include <string.h>
  45 #include <stdlib.h>
  46 #include "typedefs.h"
  47 #include "smalloc.h"
  48 #include "gmx_fatal.h"
  49 #include "gmx_fatal_collective.h"
  50 #include "vec.h"
  51 #include "domdec.h"
  52 #include "domdec_network.h"
  53 #include "nrnb.h"
  54 #include "pbc.h"
  55 #include "chargegroup.h"
  56 #include "constr.h"
  57 #include "mdatoms.h"
  58 #include "names.h"
  59 #include "pdbio.h"
  60 #include "futil.h"
  61 #include "force.h"
  62 #include "pme.h"
  63 #include "pull.h"
  64 #include "pull_rotation.h"
  65 #include "gmx_wallcycle.h"
  66 #include "mdrun.h"
  67 #include "nsgrid.h"
  68 #include "shellfc.h"
  69 #include "mtop_util.h"
  70 #include "gmxfio.h"
  71 #include "gmx_ga2la.h"
  72 #include "gmx_sort.h"
  73 #include "nbnxn_search.h"
  74 #include "bondf.h"
  75 #include "gmx_omp_nthreads.h"
  76
  77 #ifdef GMX_LIB_MPI
  78 #include <mpi.h>
  79 #endif
  80 #ifdef GMX_THREAD_MPI
  81 #include "tmpi.h"
  82 #endif
  83
  84 #define DDRANK(dd, rank)    (rank)
  85 #define DDMASTERRANK(dd)   (dd->masterrank)
  86
  87 typedef struct gmx_domdec_master
  88 {
  89     /* The cell boundaries */
  90     real **cell_x;
  91     /* The global charge group division */
  92     int   *ncg;    /* Number of home charge groups for each node */
  93     int   *index;  /* Index of nnodes+1 into cg */
  94     int   *cg;     /* Global charge group index */
  95     int   *nat;    /* Number of home atoms for each node. */
  96     int   *ibuf;   /* Buffer for communication */
  97     rvec  *vbuf;   /* Buffer for state scattering and gathering */
  98 } gmx_domdec_master_t;
  99
 100 typedef struct
 101 {
 102     /* The numbers of charge groups to send and receive for each cell
 103      * that requires communication, the last entry contains the total
 104      * number of atoms that needs to be communicated.
 105      */
 106     int  nsend[DD_MAXIZONE+2];
 107     int  nrecv[DD_MAXIZONE+2];
 108     /* The charge groups to send */
 109     int *index;
 110     int  nalloc;
 111     /* The atom range for non-in-place communication */
 112     int  cell2at0[DD_MAXIZONE];
 113     int  cell2at1[DD_MAXIZONE];
 114 } gmx_domdec_ind_t;
 115
 116 typedef struct
 117 {
 118     int               np;       /* Number of grid pulses in this dimension */
 119     int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 120     gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 121     int               np_nalloc;
 122     gmx_bool          bInPlace; /* Can we communicate in place?            */
 123 } gmx_domdec_comm_dim_t;
 124
 125 typedef struct
 126 {
 127     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 128     real     *cell_f;      /* State var.: cell boundaries, box relative      */
 129     real     *old_cell_f;  /* Temp. var.: old cell size                      */
 130     real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 131     real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 132     real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 133     real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 134     gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 135     real     *buf_ncd;     /* Temp. var.                                     */
 136 } gmx_domdec_root_t;
 137
 138 #define DD_NLOAD_MAX 9
 139
 140 /* Here floats are accurate enough, since these variables
 141  * only influence the load balancing, not the actual MD results.
 142  */
 143 typedef struct
 144 {
 145     int    nload;
 146     float *load;
 147     float  sum;
 148     float  max;
 149     float  sum_m;
 150     float  cvol_min;
 151     float  mdf;
 152     float  pme;
 153     int    flags;
 154 } gmx_domdec_load_t;
 155
 156 typedef struct
 157 {
 158     int  nsc;
 159     int  ind_gl;
 160     int  ind;
 161 } gmx_cgsort_t;
 162
 163 typedef struct
 164 {
 165     gmx_cgsort_t *sort;
 166     gmx_cgsort_t *sort2;
 167     int           sort_nalloc;
 168     gmx_cgsort_t *sort_new;
 169     int           sort_new_nalloc;
 170     int          *ibuf;
 171     int           ibuf_nalloc;
 172 } gmx_domdec_sort_t;
 173
 174 typedef struct
 175 {
 176     rvec *v;
 177     int   nalloc;
 178 } vec_rvec_t;
 179
 180 /* This enum determines the order of the coordinates.
 181  * ddnatHOME and ddnatZONE should be first and second,
 182  * the others can be ordered as wanted.
 183  */
 184 enum {
 185     ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 186 };
 187
 188 enum {
 189     edlbAUTO, edlbNO, edlbYES, edlbNR
 190 };
 191 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 192
 193 typedef struct
 194 {
 195     int      dim;       /* The dimension                                          */
 196     gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 197     int      nslab;     /* The number of PME slabs in this dimension              */
 198     real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 199     int     *pp_min;    /* The minimum pp node location, size nslab               */
 200     int     *pp_max;    /* The maximum pp node location,size nslab                */
 201     int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 202 } gmx_ddpme_t;
 203
 204 typedef struct
 205 {
 206     real min0;    /* The minimum bottom of this zone                        */
 207     real max1;    /* The maximum top of this zone                           */
 208     real min1;    /* The minimum top of this zone                           */
 209     real mch0;    /* The maximum bottom communicaton height for this zone   */
 210     real mch1;    /* The maximum top communicaton height for this zone      */
 211     real p1_0;    /* The bottom value of the first cell in this zone        */
 212     real p1_1;    /* The top value of the first cell in this zone           */
 213 } gmx_ddzone_t;
 214
 215 typedef struct
 216 {
 217     gmx_domdec_ind_t ind;
 218     int             *ibuf;
 219     int              ibuf_nalloc;
 220     vec_rvec_t       vbuf;
 221     int              nsend;
 222     int              nat;
 223     int              nsend_zone;
 224 } dd_comm_setup_work_t;
 225
 226 typedef struct gmx_domdec_comm
 227 {
 228     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 229      * unless stated otherwise.
 230      */
 231
 232     /* The number of decomposition dimensions for PME, 0: no PME */
 233     int         npmedecompdim;
 234     /* The number of nodes doing PME (PP/PME or only PME) */
 235     int         npmenodes;
 236     int         npmenodes_x;
 237     int         npmenodes_y;
 238     /* The communication setup including the PME only nodes */
 239     gmx_bool    bCartesianPP_PME;
 240     ivec        ntot;
 241     int         cartpmedim;
 242     int        *pmenodes;          /* size npmenodes                         */
 243     int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 244                                     * but with bCartesianPP_PME              */
 245     gmx_ddpme_t ddpme[2];
 246
 247     /* The DD particle-particle nodes only */
 248     gmx_bool bCartesianPP;
 249     int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 250
 251     /* The global charge groups */
 252     t_block cgs_gl;
 253
 254     /* Should we sort the cgs */
 255     int                nstSortCG;
 256     gmx_domdec_sort_t *sort;
 257
 258     /* Are there charge groups? */
 259     gmx_bool bCGs;
 260
 261     /* Are there bonded and multi-body interactions between charge groups? */
 262     gmx_bool bInterCGBondeds;
 263     gmx_bool bInterCGMultiBody;
 264
 265     /* Data for the optional bonded interaction atom communication range */
 266     gmx_bool  bBondComm;
 267     t_blocka *cglink;
 268     char     *bLocalCG;
 269
 270     /* The DLB option */
 271     int      eDLB;
 272     /* Are we actually using DLB? */
 273     gmx_bool bDynLoadBal;
 274
 275     /* Cell sizes for static load balancing, first index cartesian */
 276     real **slb_frac;
 277
 278     /* The width of the communicated boundaries */
 279     real     cutoff_mbody;
 280     real     cutoff;
 281     /* The minimum cell size (including triclinic correction) */
 282     rvec     cellsize_min;
 283     /* For dlb, for use with edlbAUTO */
 284     rvec     cellsize_min_dlb;
 285     /* The lower limit for the DD cell size with DLB */
 286     real     cellsize_limit;
 287     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 288     gmx_bool bVacDLBNoLimit;
 289
 290     /* With PME load balancing we set limits on DLB */
 291     gmx_bool bPMELoadBalDLBLimits;
 292     /* DLB needs to take into account that we want to allow this maximum
 293      * cut-off (for PME load balancing), this could limit cell boundaries.
 294      */
 295     real PMELoadBal_max_cutoff;
 296
 297     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 298     ivec tric_dir;
 299     /* box0 and box_size are required with dim's without pbc and -gcom */
 300     rvec box0;
 301     rvec box_size;
 302
 303     /* The cell boundaries */
 304     rvec cell_x0;
 305     rvec cell_x1;
 306
 307     /* The old location of the cell boundaries, to check cg displacements */
 308     rvec old_cell_x0;
 309     rvec old_cell_x1;
 310
 311     /* The communication setup and charge group boundaries for the zones */
 312     gmx_domdec_zones_t zones;
 313
 314     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 315      * cell boundaries of neighboring cells for dynamic load balancing.
 316      */
 317     gmx_ddzone_t zone_d1[2];
 318     gmx_ddzone_t zone_d2[2][2];
 319
 320     /* The coordinate/force communication setup and indices */
 321     gmx_domdec_comm_dim_t cd[DIM];
 322     /* The maximum number of cells to communicate with in one dimension */
 323     int                   maxpulse;
 324
 325     /* Which cg distribution is stored on the master node */
 326     int master_cg_ddp_count;
 327
 328     /* The number of cg's received from the direct neighbors */
 329     int  zone_ncg1[DD_MAXZONE];
 330
 331     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 332     int  nat[ddnatNR];
 333
 334     /* Array for signalling if atoms have moved to another domain */
 335     int  *moved;
 336     int   moved_nalloc;
 337
 338     /* Communication buffer for general use */
 339     int  *buf_int;
 340     int   nalloc_int;
 341
 342     /* Communication buffer for general use */
 343     vec_rvec_t vbuf;
 344
 345     /* Temporary storage for thread parallel communication setup */
 346     int                   nth;
 347     dd_comm_setup_work_t *dth;
 348
 349     /* Communication buffers only used with multiple grid pulses */
 350     int       *buf_int2;
 351     int        nalloc_int2;
 352     vec_rvec_t vbuf2;
 353
 354     /* Communication buffers for local redistribution */
 355     int  **cggl_flag;
 356     int    cggl_flag_nalloc[DIM*2];
 357     rvec **cgcm_state;
 358     int    cgcm_state_nalloc[DIM*2];
 359
 360     /* Cell sizes for dynamic load balancing */
 361     gmx_domdec_root_t **root;
 362     real               *cell_f_row;
 363     real                cell_f0[DIM];
 364     real                cell_f1[DIM];
 365     real                cell_f_max0[DIM];
 366     real                cell_f_min1[DIM];
 367
 368     /* Stuff for load communication */
 369     gmx_bool           bRecordLoad;
 370     gmx_domdec_load_t *load;
 371 #ifdef GMX_MPI
 372     MPI_Comm          *mpi_comm_load;
 373 #endif
 374
 375     /* Maximum DLB scaling per load balancing step in percent */
 376     int dlb_scale_lim;
 377
 378     /* Cycle counters */
 379     float  cycl[ddCyclNr];
 380     int    cycl_n[ddCyclNr];
 381     float  cycl_max[ddCyclNr];
 382     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 383     int    eFlop;
 384     double flop;
 385     int    flop_n;
 386     /* Have often have did we have load measurements */
 387     int    n_load_have;
 388     /* Have often have we collected the load measurements */
 389     int    n_load_collect;
 390
 391     /* Statistics */
 392     double sum_nat[ddnatNR-ddnatZONE];
 393     int    ndecomp;
 394     int    nload;
 395     double load_step;
 396     double load_sum;
 397     double load_max;
 398     ivec   load_lim;
 399     double load_mdf;
 400     double load_pme;
 401
 402     /* The last partition step */
 403     gmx_large_int_t partition_step;
 404
 405     /* Debugging */
 406     int  nstDDDump;
 407     int  nstDDDumpGrid;
 408     int  DD_debug;
 409 } gmx_domdec_comm_t;
 410
 411 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 412 #define DD_CGIBS 2
 413
 414 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 415 #define DD_FLAG_NRCG  65535
 416 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 417 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 418
 419 /* Zone permutation required to obtain consecutive charge groups
 420  * for neighbor searching.
 421  */
 422 static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 423
 424 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 425  * components see only j zones with that component 0.
 426  */
 427
 428 /* The DD zone order */
 429 static const ivec dd_zo[DD_MAXZONE] =
 430 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 431
 432 /* The 3D setup */
 433 #define dd_z3n  8
 434 #define dd_zp3n 4
 435 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 436
 437 /* The 2D setup */
 438 #define dd_z2n  4
 439 #define dd_zp2n 2
 440 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 441
 442 /* The 1D setup */
 443 #define dd_z1n  2
 444 #define dd_zp1n 1
 445 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 446
 447 /* Factors used to avoid problems due to rounding issues */
 448 #define DD_CELL_MARGIN       1.0001
 449 #define DD_CELL_MARGIN2      1.00005
 450 /* Factor to account for pressure scaling during nstlist steps */
 451 #define DD_PRES_SCALE_MARGIN 1.02
 452
 453 /* Allowed performance loss before we DLB or warn */
 454 #define DD_PERF_LOSS 0.05
 455
 456 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 457
 458 /* Use separate MPI send and receive commands
 459  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 460  * This saves memory (and some copying for small nnodes).
 461  * For high parallelization scatter and gather calls are used.
 462  */
 463 #define GMX_DD_NNODES_SENDRECV 4
 464
 465
 466 /*
 467    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 468
 469    static void index2xyz(ivec nc,int ind,ivec xyz)
 470    {
 471    xyz[XX] = ind % nc[XX];
 472    xyz[YY] = (ind / nc[XX]) % nc[YY];
 473    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 474    }
 475  */
 476
 477 /* This order is required to minimize the coordinate communication in PME
 478  * which uses decomposition in the x direction.
 479  */
 480 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 481
 482 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 483 {
 484     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 485     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 486     xyz[ZZ] = ind % nc[ZZ];
 487 }
 488
 489 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 490 {
 491     int ddindex;
 492     int ddnodeid = -1;
 493
 494     ddindex = dd_index(dd->nc, c);
 495     if (dd->comm->bCartesianPP_PME)
 496     {
 497         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 498     }
 499     else if (dd->comm->bCartesianPP)
 500     {
 501 #ifdef GMX_MPI
 502         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 503 #endif
 504     }
 505     else
 506     {
 507         ddnodeid = ddindex;
 508     }
 509
 510     return ddnodeid;
 511 }
 512
 513 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 514 {
 515     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 516 }
 517
 518 int ddglatnr(gmx_domdec_t *dd, int i)
 519 {
 520     int atnr;
 521
 522     if (dd == NULL)
 523     {
 524         atnr = i + 1;
 525     }
 526     else
 527     {
 528         if (i >= dd->comm->nat[ddnatNR-1])
 529         {
 530             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 531         }
 532         atnr = dd->gatindex[i] + 1;
 533     }
 534
 535     return atnr;
 536 }
 537
 538 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 539 {
 540     return &dd->comm->cgs_gl;
 541 }
 542
 543 static void vec_rvec_init(vec_rvec_t *v)
 544 {
 545     v->nalloc = 0;
 546     v->v      = NULL;
 547 }
 548
 549 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 550 {
 551     if (n > v->nalloc)
 552     {
 553         v->nalloc = over_alloc_dd(n);
 554         srenew(v->v, v->nalloc);
 555     }
 556 }
 557
 558 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 559 {
 560     int i;
 561
 562     if (state->ddp_count != dd->ddp_count)
 563     {
 564         gmx_incons("The state does not the domain decomposition state");
 565     }
 566
 567     state->ncg_gl = dd->ncg_home;
 568     if (state->ncg_gl > state->cg_gl_nalloc)
 569     {
 570         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 571         srenew(state->cg_gl, state->cg_gl_nalloc);
 572     }
 573     for (i = 0; i < state->ncg_gl; i++)
 574     {
 575         state->cg_gl[i] = dd->index_gl[i];
 576     }
 577
 578     state->ddp_count_cg_gl = dd->ddp_count;
 579 }
 580
 581 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 582 {
 583     return &dd->comm->zones;
 584 }
 585
 586 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 587                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 588 {
 589     gmx_domdec_zones_t *zones;
 590     int                 izone, d, dim;
 591
 592     zones = &dd->comm->zones;
 593
 594     izone = 0;
 595     while (icg >= zones->izone[izone].cg1)
 596     {
 597         izone++;
 598     }
 599
 600     if (izone == 0)
 601     {
 602         *jcg0 = icg;
 603     }
 604     else if (izone < zones->nizone)
 605     {
 606         *jcg0 = zones->izone[izone].jcg0;
 607     }
 608     else
 609     {
 610         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 611                   icg, izone, zones->nizone);
 612     }
 613
 614     *jcg1 = zones->izone[izone].jcg1;
 615
 616     for (d = 0; d < dd->ndim; d++)
 617     {
 618         dim         = dd->dim[d];
 619         shift0[dim] = zones->izone[izone].shift0[dim];
 620         shift1[dim] = zones->izone[izone].shift1[dim];
 621         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 622         {
 623             /* A conservative approach, this can be optimized */
 624             shift0[dim] -= 1;
 625             shift1[dim] += 1;
 626         }
 627     }
 628 }
 629
 630 int dd_natoms_vsite(gmx_domdec_t *dd)
 631 {
 632     return dd->comm->nat[ddnatVSITE];
 633 }
 634
 635 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 636 {
 637     *at_start = dd->comm->nat[ddnatCON-1];
 638     *at_end   = dd->comm->nat[ddnatCON];
 639 }
 640
 641 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 642 {
 643     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 644     int                   *index, *cgindex;
 645     gmx_domdec_comm_t     *comm;
 646     gmx_domdec_comm_dim_t *cd;
 647     gmx_domdec_ind_t      *ind;
 648     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 649     gmx_bool               bPBC, bScrew;
 650
 651     comm = dd->comm;
 652
 653     cgindex = dd->cgindex;
 654
 655     buf = comm->vbuf.v;
 656
 657     nzone   = 1;
 658     nat_tot = dd->nat_home;
 659     for (d = 0; d < dd->ndim; d++)
 660     {
 661         bPBC   = (dd->ci[dd->dim[d]] == 0);
 662         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 663         if (bPBC)
 664         {
 665             copy_rvec(box[dd->dim[d]], shift);
 666         }
 667         cd = &comm->cd[d];
 668         for (p = 0; p < cd->np; p++)
 669         {
 670             ind   = &cd->ind[p];
 671             index = ind->index;
 672             n     = 0;
 673             if (!bPBC)
 674             {
 675                 for (i = 0; i < ind->nsend[nzone]; i++)
 676                 {
 677                     at0 = cgindex[index[i]];
 678                     at1 = cgindex[index[i]+1];
 679                     for (j = at0; j < at1; j++)
 680                     {
 681                         copy_rvec(x[j], buf[n]);
 682                         n++;
 683                     }
 684                 }
 685             }
 686             else if (!bScrew)
 687             {
 688                 for (i = 0; i < ind->nsend[nzone]; i++)
 689                 {
 690                     at0 = cgindex[index[i]];
 691                     at1 = cgindex[index[i]+1];
 692                     for (j = at0; j < at1; j++)
 693                     {
 694                         /* We need to shift the coordinates */
 695                         rvec_add(x[j], shift, buf[n]);
 696                         n++;
 697                     }
 698                 }
 699             }
 700             else
 701             {
 702                 for (i = 0; i < ind->nsend[nzone]; i++)
 703                 {
 704                     at0 = cgindex[index[i]];
 705                     at1 = cgindex[index[i]+1];
 706                     for (j = at0; j < at1; j++)
 707                     {
 708                         /* Shift x */
 709                         buf[n][XX] = x[j][XX] + shift[XX];
 710                         /* Rotate y and z.
 711                          * This operation requires a special shift force
 712                          * treatment, which is performed in calc_vir.
 713                          */
 714                         buf[n][YY] = box[YY][YY] - x[j][YY];
 715                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 716                         n++;
 717                     }
 718                 }
 719             }
 720
 721             if (cd->bInPlace)
 722             {
 723                 rbuf = x + nat_tot;
 724             }
 725             else
 726             {
 727                 rbuf = comm->vbuf2.v;
 728             }
 729             /* Send and receive the coordinates */
 730             dd_sendrecv_rvec(dd, d, dddirBackward,
 731                              buf,  ind->nsend[nzone+1],
 732                              rbuf, ind->nrecv[nzone+1]);
 733             if (!cd->bInPlace)
 734             {
 735                 j = 0;
 736                 for (zone = 0; zone < nzone; zone++)
 737                 {
 738                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 739                     {
 740                         copy_rvec(rbuf[j], x[i]);
 741                         j++;
 742                     }
 743                 }
 744             }
 745             nat_tot += ind->nrecv[nzone+1];
 746         }
 747         nzone += nzone;
 748     }
 749 }
 750
 751 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 752 {
 753     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 754     int                   *index, *cgindex;
 755     gmx_domdec_comm_t     *comm;
 756     gmx_domdec_comm_dim_t *cd;
 757     gmx_domdec_ind_t      *ind;
 758     rvec                  *buf, *sbuf;
 759     ivec                   vis;
 760     int                    is;
 761     gmx_bool               bPBC, bScrew;
 762
 763     comm = dd->comm;
 764
 765     cgindex = dd->cgindex;
 766
 767     buf = comm->vbuf.v;
 768
 769     n       = 0;
 770     nzone   = comm->zones.n/2;
 771     nat_tot = dd->nat_tot;
 772     for (d = dd->ndim-1; d >= 0; d--)
 773     {
 774         bPBC   = (dd->ci[dd->dim[d]] == 0);
 775         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 776         if (fshift == NULL && !bScrew)
 777         {
 778             bPBC = FALSE;
 779         }
 780         /* Determine which shift vector we need */
 781         clear_ivec(vis);
 782         vis[dd->dim[d]] = 1;
 783         is              = IVEC2IS(vis);
 784
 785         cd = &comm->cd[d];
 786         for (p = cd->np-1; p >= 0; p--)
 787         {
 788             ind      = &cd->ind[p];
 789             nat_tot -= ind->nrecv[nzone+1];
 790             if (cd->bInPlace)
 791             {
 792                 sbuf = f + nat_tot;
 793             }
 794             else
 795             {
 796                 sbuf = comm->vbuf2.v;
 797                 j    = 0;
 798                 for (zone = 0; zone < nzone; zone++)
 799                 {
 800                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 801                     {
 802                         copy_rvec(f[i], sbuf[j]);
 803                         j++;
 804                     }
 805                 }
 806             }
 807             /* Communicate the forces */
 808             dd_sendrecv_rvec(dd, d, dddirForward,
 809                              sbuf, ind->nrecv[nzone+1],
 810                              buf,  ind->nsend[nzone+1]);
 811             index = ind->index;
 812             /* Add the received forces */
 813             n = 0;
 814             if (!bPBC)
 815             {
 816                 for (i = 0; i < ind->nsend[nzone]; i++)
 817                 {
 818                     at0 = cgindex[index[i]];
 819                     at1 = cgindex[index[i]+1];
 820                     for (j = at0; j < at1; j++)
 821                     {
 822                         rvec_inc(f[j], buf[n]);
 823                         n++;
 824                     }
 825                 }
 826             }
 827             else if (!bScrew)
 828             {
 829                 for (i = 0; i < ind->nsend[nzone]; i++)
 830                 {
 831                     at0 = cgindex[index[i]];
 832                     at1 = cgindex[index[i]+1];
 833                     for (j = at0; j < at1; j++)
 834                     {
 835                         rvec_inc(f[j], buf[n]);
 836                         /* Add this force to the shift force */
 837                         rvec_inc(fshift[is], buf[n]);
 838                         n++;
 839                     }
 840                 }
 841             }
 842             else
 843             {
 844                 for (i = 0; i < ind->nsend[nzone]; i++)
 845                 {
 846                     at0 = cgindex[index[i]];
 847                     at1 = cgindex[index[i]+1];
 848                     for (j = at0; j < at1; j++)
 849                     {
 850                         /* Rotate the force */
 851                         f[j][XX] += buf[n][XX];
 852                         f[j][YY] -= buf[n][YY];
 853                         f[j][ZZ] -= buf[n][ZZ];
 854                         if (fshift)
 855                         {
 856                             /* Add this force to the shift force */
 857                             rvec_inc(fshift[is], buf[n]);
 858                         }
 859                         n++;
 860                     }
 861                 }
 862             }
 863         }
 864         nzone /= 2;
 865     }
 866 }
 867
 868 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 869 {
 870     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 871     int                   *index, *cgindex;
 872     gmx_domdec_comm_t     *comm;
 873     gmx_domdec_comm_dim_t *cd;
 874     gmx_domdec_ind_t      *ind;
 875     real                  *buf, *rbuf;
 876
 877     comm = dd->comm;
 878
 879     cgindex = dd->cgindex;
 880
 881     buf = &comm->vbuf.v[0][0];
 882
 883     nzone   = 1;
 884     nat_tot = dd->nat_home;
 885     for (d = 0; d < dd->ndim; d++)
 886     {
 887         cd = &comm->cd[d];
 888         for (p = 0; p < cd->np; p++)
 889         {
 890             ind   = &cd->ind[p];
 891             index = ind->index;
 892             n     = 0;
 893             for (i = 0; i < ind->nsend[nzone]; i++)
 894             {
 895                 at0 = cgindex[index[i]];
 896                 at1 = cgindex[index[i]+1];
 897                 for (j = at0; j < at1; j++)
 898                 {
 899                     buf[n] = v[j];
 900                     n++;
 901                 }
 902             }
 903
 904             if (cd->bInPlace)
 905             {
 906                 rbuf = v + nat_tot;
 907             }
 908             else
 909             {
 910                 rbuf = &comm->vbuf2.v[0][0];
 911             }
 912             /* Send and receive the coordinates */
 913             dd_sendrecv_real(dd, d, dddirBackward,
 914                              buf,  ind->nsend[nzone+1],
 915                              rbuf, ind->nrecv[nzone+1]);
 916             if (!cd->bInPlace)
 917             {
 918                 j = 0;
 919                 for (zone = 0; zone < nzone; zone++)
 920                 {
 921                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 922                     {
 923                         v[i] = rbuf[j];
 924                         j++;
 925                     }
 926                 }
 927             }
 928             nat_tot += ind->nrecv[nzone+1];
 929         }
 930         nzone += nzone;
 931     }
 932 }
 933
 934 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 935 {
 936     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 937     int                   *index, *cgindex;
 938     gmx_domdec_comm_t     *comm;
 939     gmx_domdec_comm_dim_t *cd;
 940     gmx_domdec_ind_t      *ind;
 941     real                  *buf, *sbuf;
 942
 943     comm = dd->comm;
 944
 945     cgindex = dd->cgindex;
 946
 947     buf = &comm->vbuf.v[0][0];
 948
 949     n       = 0;
 950     nzone   = comm->zones.n/2;
 951     nat_tot = dd->nat_tot;
 952     for (d = dd->ndim-1; d >= 0; d--)
 953     {
 954         cd = &comm->cd[d];
 955         for (p = cd->np-1; p >= 0; p--)
 956         {
 957             ind      = &cd->ind[p];
 958             nat_tot -= ind->nrecv[nzone+1];
 959             if (cd->bInPlace)
 960             {
 961                 sbuf = v + nat_tot;
 962             }
 963             else
 964             {
 965                 sbuf = &comm->vbuf2.v[0][0];
 966                 j    = 0;
 967                 for (zone = 0; zone < nzone; zone++)
 968                 {
 969                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 970                     {
 971                         sbuf[j] = v[i];
 972                         j++;
 973                     }
 974                 }
 975             }
 976             /* Communicate the forces */
 977             dd_sendrecv_real(dd, d, dddirForward,
 978                              sbuf, ind->nrecv[nzone+1],
 979                              buf,  ind->nsend[nzone+1]);
 980             index = ind->index;
 981             /* Add the received forces */
 982             n = 0;
 983             for (i = 0; i < ind->nsend[nzone]; i++)
 984             {
 985                 at0 = cgindex[index[i]];
 986                 at1 = cgindex[index[i]+1];
 987                 for (j = at0; j < at1; j++)
 988                 {
 989                     v[j] += buf[n];
 990                     n++;
 991                 }
 992             }
 993         }
 994         nzone /= 2;
 995     }
 996 }
 997
 998 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 999 {
1000     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
1001             d, i, j,
1002             zone->min0, zone->max1,
1003             zone->mch0, zone->mch0,
1004             zone->p1_0, zone->p1_1);
1005 }
1006
1007
1008 #define DDZONECOMM_MAXZONE  5
1009 #define DDZONECOMM_BUFSIZE  3
1010
1011 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
1012                                int ddimind, int direction,
1013                                gmx_ddzone_t *buf_s, int n_s,
1014                                gmx_ddzone_t *buf_r, int n_r)
1015 {
1016 #define ZBS  DDZONECOMM_BUFSIZE
1017     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1018     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1019     int  i;
1020
1021     for (i = 0; i < n_s; i++)
1022     {
1023         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1024         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1025         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1026         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1027         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1028         vbuf_s[i*ZBS+1][2] = 0;
1029         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1030         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1031         vbuf_s[i*ZBS+2][2] = 0;
1032     }
1033
1034     dd_sendrecv_rvec(dd, ddimind, direction,
1035                      vbuf_s, n_s*ZBS,
1036                      vbuf_r, n_r*ZBS);
1037
1038     for (i = 0; i < n_r; i++)
1039     {
1040         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1041         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1042         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1043         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1044         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1045         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1046         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1047     }
1048
1049 #undef ZBS
1050 }
1051
1052 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
1053                           rvec cell_ns_x0, rvec cell_ns_x1)
1054 {
1055     int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
1056     gmx_ddzone_t      *zp;
1057     gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
1058     gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
1059     gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
1060     rvec               extr_s[2], extr_r[2];
1061     rvec               dh;
1062     real               dist_d, c = 0, det;
1063     gmx_domdec_comm_t *comm;
1064     gmx_bool           bPBC, bUse;
1065
1066     comm = dd->comm;
1067
1068     for (d = 1; d < dd->ndim; d++)
1069     {
1070         dim      = dd->dim[d];
1071         zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1072         zp->min0 = cell_ns_x0[dim];
1073         zp->max1 = cell_ns_x1[dim];
1074         zp->min1 = cell_ns_x1[dim];
1075         zp->mch0 = cell_ns_x0[dim];
1076         zp->mch1 = cell_ns_x1[dim];
1077         zp->p1_0 = cell_ns_x0[dim];
1078         zp->p1_1 = cell_ns_x1[dim];
1079     }
1080
1081     for (d = dd->ndim-2; d >= 0; d--)
1082     {
1083         dim  = dd->dim[d];
1084         bPBC = (dim < ddbox->npbcdim);
1085
1086         /* Use an rvec to store two reals */
1087         extr_s[d][0] = comm->cell_f0[d+1];
1088         extr_s[d][1] = comm->cell_f1[d+1];
1089         extr_s[d][2] = comm->cell_f1[d+1];
1090
1091         pos = 0;
1092         /* Store the extremes in the backward sending buffer,
1093          * so the get updated separately from the forward communication.
1094          */
1095         for (d1 = d; d1 < dd->ndim-1; d1++)
1096         {
1097             /* We invert the order to be able to use the same loop for buf_e */
1098             buf_s[pos].min0 = extr_s[d1][1];
1099             buf_s[pos].max1 = extr_s[d1][0];
1100             buf_s[pos].min1 = extr_s[d1][2];
1101             buf_s[pos].mch0 = 0;
1102             buf_s[pos].mch1 = 0;
1103             /* Store the cell corner of the dimension we communicate along */
1104             buf_s[pos].p1_0 = comm->cell_x0[dim];
1105             buf_s[pos].p1_1 = 0;
1106             pos++;
1107         }
1108
1109         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1110         pos++;
1111
1112         if (dd->ndim == 3 && d == 0)
1113         {
1114             buf_s[pos] = comm->zone_d2[0][1];
1115             pos++;
1116             buf_s[pos] = comm->zone_d1[0];
1117             pos++;
1118         }
1119
1120         /* We only need to communicate the extremes
1121          * in the forward direction
1122          */
1123         npulse = comm->cd[d].np;
1124         if (bPBC)
1125         {
1126             /* Take the minimum to avoid double communication */
1127             npulse_min = min(npulse, dd->nc[dim]-1-npulse);
1128         }
1129         else
1130         {
1131             /* Without PBC we should really not communicate over
1132              * the boundaries, but implementing that complicates
1133              * the communication setup and therefore we simply
1134              * do all communication, but ignore some data.
1135              */
1136             npulse_min = npulse;
1137         }
1138         for (p = 0; p < npulse_min; p++)
1139         {
1140             /* Communicate the extremes forward */
1141             bUse = (bPBC || dd->ci[dim] > 0);
1142
1143             dd_sendrecv_rvec(dd, d, dddirForward,
1144                              extr_s+d, dd->ndim-d-1,
1145                              extr_r+d, dd->ndim-d-1);
1146
1147             if (bUse)
1148             {
1149                 for (d1 = d; d1 < dd->ndim-1; d1++)
1150                 {
1151                     extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
1152                     extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
1153                     extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
1154                 }
1155             }
1156         }
1157
1158         buf_size = pos;
1159         for (p = 0; p < npulse; p++)
1160         {
1161             /* Communicate all the zone information backward */
1162             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1163
1164             dd_sendrecv_ddzone(dd, d, dddirBackward,
1165                                buf_s, buf_size,
1166                                buf_r, buf_size);
1167
1168             clear_rvec(dh);
1169             if (p > 0)
1170             {
1171                 for (d1 = d+1; d1 < dd->ndim; d1++)
1172                 {
1173                     /* Determine the decrease of maximum required
1174                      * communication height along d1 due to the distance along d,
1175                      * this avoids a lot of useless atom communication.
1176                      */
1177                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1178
1179                     if (ddbox->tric_dir[dim])
1180                     {
1181                         /* c is the off-diagonal coupling between the cell planes
1182                          * along directions d and d1.
1183                          */
1184                         c = ddbox->v[dim][dd->dim[d1]][dim];
1185                     }
1186                     else
1187                     {
1188                         c = 0;
1189                     }
1190                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1191                     if (det > 0)
1192                     {
1193                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1194                     }
1195                     else
1196                     {
1197                         /* A negative value signals out of range */
1198                         dh[d1] = -1;
1199                     }
1200                 }
1201             }
1202
1203             /* Accumulate the extremes over all pulses */
1204             for (i = 0; i < buf_size; i++)
1205             {
1206                 if (p == 0)
1207                 {
1208                     buf_e[i] = buf_r[i];
1209                 }
1210                 else
1211                 {
1212                     if (bUse)
1213                     {
1214                         buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
1215                         buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
1216                         buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
1217                     }
1218
1219                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1220                     {
1221                         d1 = 1;
1222                     }
1223                     else
1224                     {
1225                         d1 = d + 1;
1226                     }
1227                     if (bUse && dh[d1] >= 0)
1228                     {
1229                         buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
1230                         buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
1231                     }
1232                 }
1233                 /* Copy the received buffer to the send buffer,
1234                  * to pass the data through with the next pulse.
1235                  */
1236                 buf_s[i] = buf_r[i];
1237             }
1238             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1239                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1240             {
1241                 /* Store the extremes */
1242                 pos = 0;
1243
1244                 for (d1 = d; d1 < dd->ndim-1; d1++)
1245                 {
1246                     extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
1247                     extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
1248                     extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
1249                     pos++;
1250                 }
1251
1252                 if (d == 1 || (d == 0 && dd->ndim == 3))
1253                 {
1254                     for (i = d; i < 2; i++)
1255                     {
1256                         comm->zone_d2[1-d][i] = buf_e[pos];
1257                         pos++;
1258                     }
1259                 }
1260                 if (d == 0)
1261                 {
1262                     comm->zone_d1[1] = buf_e[pos];
1263                     pos++;
1264                 }
1265             }
1266         }
1267     }
1268
1269     if (dd->ndim >= 2)
1270     {
1271         dim = dd->dim[1];
1272         for (i = 0; i < 2; i++)
1273         {
1274             if (debug)
1275             {
1276                 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1277             }
1278             cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1279             cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1280         }
1281     }
1282     if (dd->ndim >= 3)
1283     {
1284         dim = dd->dim[2];
1285         for (i = 0; i < 2; i++)
1286         {
1287             for (j = 0; j < 2; j++)
1288             {
1289                 if (debug)
1290                 {
1291                     print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1292                 }
1293                 cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1294                 cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1295             }
1296         }
1297     }
1298     for (d = 1; d < dd->ndim; d++)
1299     {
1300         comm->cell_f_max0[d] = extr_s[d-1][0];
1301         comm->cell_f_min1[d] = extr_s[d-1][1];
1302         if (debug)
1303         {
1304             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1305                     d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1306         }
1307     }
1308 }
1309
1310 static void dd_collect_cg(gmx_domdec_t *dd,
1311                           t_state      *state_local)
1312 {
1313     gmx_domdec_master_t *ma = NULL;
1314     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1315     t_block             *cgs_gl;
1316
1317     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1318     {
1319         /* The master has the correct distribution */
1320         return;
1321     }
1322
1323     if (state_local->ddp_count == dd->ddp_count)
1324     {
1325         ncg_home = dd->ncg_home;
1326         cg       = dd->index_gl;
1327         nat_home = dd->nat_home;
1328     }
1329     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1330     {
1331         cgs_gl = &dd->comm->cgs_gl;
1332
1333         ncg_home = state_local->ncg_gl;
1334         cg       = state_local->cg_gl;
1335         nat_home = 0;
1336         for (i = 0; i < ncg_home; i++)
1337         {
1338             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1339         }
1340     }
1341     else
1342     {
1343         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1344     }
1345
1346     buf2[0] = dd->ncg_home;
1347     buf2[1] = dd->nat_home;
1348     if (DDMASTER(dd))
1349     {
1350         ma   = dd->ma;
1351         ibuf = ma->ibuf;
1352     }
1353     else
1354     {
1355         ibuf = NULL;
1356     }
1357     /* Collect the charge group and atom counts on the master */
1358     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1359
1360     if (DDMASTER(dd))
1361     {
1362         ma->index[0] = 0;
1363         for (i = 0; i < dd->nnodes; i++)
1364         {
1365             ma->ncg[i]     = ma->ibuf[2*i];
1366             ma->nat[i]     = ma->ibuf[2*i+1];
1367             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1368
1369         }
1370         /* Make byte counts and indices */
1371         for (i = 0; i < dd->nnodes; i++)
1372         {
1373             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1374             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1375         }
1376         if (debug)
1377         {
1378             fprintf(debug, "Initial charge group distribution: ");
1379             for (i = 0; i < dd->nnodes; i++)
1380             {
1381                 fprintf(debug, " %d", ma->ncg[i]);
1382             }
1383             fprintf(debug, "\n");
1384         }
1385     }
1386
1387     /* Collect the charge group indices on the master */
1388     dd_gatherv(dd,
1389                dd->ncg_home*sizeof(int), dd->index_gl,
1390                DDMASTER(dd) ? ma->ibuf : NULL,
1391                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1392                DDMASTER(dd) ? ma->cg : NULL);
1393
1394     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1395 }
1396
1397 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1398                                     rvec *lv, rvec *v)
1399 {
1400     gmx_domdec_master_t *ma;
1401     int                  n, i, c, a, nalloc = 0;
1402     rvec                *buf = NULL;
1403     t_block             *cgs_gl;
1404
1405     ma = dd->ma;
1406
1407     if (!DDMASTER(dd))
1408     {
1409 #ifdef GMX_MPI
1410         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1411                  dd->rank, dd->mpi_comm_all);
1412 #endif
1413     }
1414     else
1415     {
1416         /* Copy the master coordinates to the global array */
1417         cgs_gl = &dd->comm->cgs_gl;
1418
1419         n = DDMASTERRANK(dd);
1420         a = 0;
1421         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1422         {
1423             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1424             {
1425                 copy_rvec(lv[a++], v[c]);
1426             }
1427         }
1428
1429         for (n = 0; n < dd->nnodes; n++)
1430         {
1431             if (n != dd->rank)
1432             {
1433                 if (ma->nat[n] > nalloc)
1434                 {
1435                     nalloc = over_alloc_dd(ma->nat[n]);
1436                     srenew(buf, nalloc);
1437                 }
1438 #ifdef GMX_MPI
1439                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1440                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1441 #endif
1442                 a = 0;
1443                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1444                 {
1445                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1446                     {
1447                         copy_rvec(buf[a++], v[c]);
1448                     }
1449                 }
1450             }
1451         }
1452         sfree(buf);
1453     }
1454 }
1455
1456 static void get_commbuffer_counts(gmx_domdec_t *dd,
1457                                   int **counts, int **disps)
1458 {
1459     gmx_domdec_master_t *ma;
1460     int                  n;
1461
1462     ma = dd->ma;
1463
1464     /* Make the rvec count and displacment arrays */
1465     *counts  = ma->ibuf;
1466     *disps   = ma->ibuf + dd->nnodes;
1467     for (n = 0; n < dd->nnodes; n++)
1468     {
1469         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1470         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1471     }
1472 }
1473
1474 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1475                                    rvec *lv, rvec *v)
1476 {
1477     gmx_domdec_master_t *ma;
1478     int                 *rcounts = NULL, *disps = NULL;
1479     int                  n, i, c, a;
1480     rvec                *buf = NULL;
1481     t_block             *cgs_gl;
1482
1483     ma = dd->ma;
1484
1485     if (DDMASTER(dd))
1486     {
1487         get_commbuffer_counts(dd, &rcounts, &disps);
1488
1489         buf = ma->vbuf;
1490     }
1491
1492     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1493
1494     if (DDMASTER(dd))
1495     {
1496         cgs_gl = &dd->comm->cgs_gl;
1497
1498         a = 0;
1499         for (n = 0; n < dd->nnodes; n++)
1500         {
1501             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1502             {
1503                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1504                 {
1505                     copy_rvec(buf[a++], v[c]);
1506                 }
1507             }
1508         }
1509     }
1510 }
1511
1512 void dd_collect_vec(gmx_domdec_t *dd,
1513                     t_state *state_local, rvec *lv, rvec *v)
1514 {
1515     gmx_domdec_master_t *ma;
1516     int                  n, i, c, a, nalloc = 0;
1517     rvec                *buf = NULL;
1518
1519     dd_collect_cg(dd, state_local);
1520
1521     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1522     {
1523         dd_collect_vec_sendrecv(dd, lv, v);
1524     }
1525     else
1526     {
1527         dd_collect_vec_gatherv(dd, lv, v);
1528     }
1529 }
1530
1531
1532 void dd_collect_state(gmx_domdec_t *dd,
1533                       t_state *state_local, t_state *state)
1534 {
1535     int est, i, j, nh;
1536
1537     nh = state->nhchainlength;
1538
1539     if (DDMASTER(dd))
1540     {
1541         for (i = 0; i < efptNR; i++)
1542         {
1543             state->lambda[i] = state_local->lambda[i];
1544         }
1545         state->fep_state = state_local->fep_state;
1546         state->veta      = state_local->veta;
1547         state->vol0      = state_local->vol0;
1548         copy_mat(state_local->box, state->box);
1549         copy_mat(state_local->boxv, state->boxv);
1550         copy_mat(state_local->svir_prev, state->svir_prev);
1551         copy_mat(state_local->fvir_prev, state->fvir_prev);
1552         copy_mat(state_local->pres_prev, state->pres_prev);
1553
1554
1555         for (i = 0; i < state_local->ngtc; i++)
1556         {
1557             for (j = 0; j < nh; j++)
1558             {
1559                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1560                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1561             }
1562             state->therm_integral[i] = state_local->therm_integral[i];
1563         }
1564         for (i = 0; i < state_local->nnhpres; i++)
1565         {
1566             for (j = 0; j < nh; j++)
1567             {
1568                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1569                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1570             }
1571         }
1572     }
1573     for (est = 0; est < estNR; est++)
1574     {
1575         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1576         {
1577             switch (est)
1578             {
1579                 case estX:
1580                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1581                     break;
1582                 case estV:
1583                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1584                     break;
1585                 case estSDX:
1586                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1587                     break;
1588                 case estCGP:
1589                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1590                     break;
1591                 case estLD_RNG:
1592                     if (state->nrngi == 1)
1593                     {
1594                         if (DDMASTER(dd))
1595                         {
1596                             for (i = 0; i < state_local->nrng; i++)
1597                             {
1598                                 state->ld_rng[i] = state_local->ld_rng[i];
1599                             }
1600                         }
1601                     }
1602                     else
1603                     {
1604                         dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
1605                                   state_local->ld_rng, state->ld_rng);
1606                     }
1607                     break;
1608                 case estLD_RNGI:
1609                     if (state->nrngi == 1)
1610                     {
1611                         if (DDMASTER(dd))
1612                         {
1613                             state->ld_rngi[0] = state_local->ld_rngi[0];
1614                         }
1615                     }
1616                     else
1617                     {
1618                         dd_gather(dd, sizeof(state->ld_rngi[0]),
1619                                   state_local->ld_rngi, state->ld_rngi);
1620                     }
1621                     break;
1622                 case estDISRE_INITF:
1623                 case estDISRE_RM3TAV:
1624                 case estORIRE_INITF:
1625                 case estORIRE_DTAV:
1626                     break;
1627                 default:
1628                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1629             }
1630         }
1631     }
1632 }
1633
1634 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1635 {
1636     int est;
1637
1638     if (debug)
1639     {
1640         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1641     }
1642
1643     state->nalloc = over_alloc_dd(nalloc);
1644
1645     for (est = 0; est < estNR; est++)
1646     {
1647         if (EST_DISTR(est) && (state->flags & (1<<est)))
1648         {
1649             switch (est)
1650             {
1651                 case estX:
1652                     srenew(state->x, state->nalloc);
1653                     break;
1654                 case estV:
1655                     srenew(state->v, state->nalloc);
1656                     break;
1657                 case estSDX:
1658                     srenew(state->sd_X, state->nalloc);
1659                     break;
1660                 case estCGP:
1661                     srenew(state->cg_p, state->nalloc);
1662                     break;
1663                 case estLD_RNG:
1664                 case estLD_RNGI:
1665                 case estDISRE_INITF:
1666                 case estDISRE_RM3TAV:
1667                 case estORIRE_INITF:
1668                 case estORIRE_DTAV:
1669                     /* No reallocation required */
1670                     break;
1671                 default:
1672                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1673             }
1674         }
1675     }
1676
1677     if (f != NULL)
1678     {
1679         srenew(*f, state->nalloc);
1680     }
1681 }
1682
1683 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1684                                int nalloc)
1685 {
1686     if (nalloc > fr->cg_nalloc)
1687     {
1688         if (debug)
1689         {
1690             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1691         }
1692         fr->cg_nalloc = over_alloc_dd(nalloc);
1693         srenew(fr->cginfo, fr->cg_nalloc);
1694         if (fr->cutoff_scheme == ecutsGROUP)
1695         {
1696             srenew(fr->cg_cm, fr->cg_nalloc);
1697         }
1698     }
1699     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1700     {
1701         /* We don't use charge groups, we use x in state to set up
1702          * the atom communication.
1703          */
1704         dd_realloc_state(state, f, nalloc);
1705     }
1706 }
1707
1708 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1709                                        rvec *v, rvec *lv)
1710 {
1711     gmx_domdec_master_t *ma;
1712     int                  n, i, c, a, nalloc = 0;
1713     rvec                *buf = NULL;
1714
1715     if (DDMASTER(dd))
1716     {
1717         ma  = dd->ma;
1718
1719         for (n = 0; n < dd->nnodes; n++)
1720         {
1721             if (n != dd->rank)
1722             {
1723                 if (ma->nat[n] > nalloc)
1724                 {
1725                     nalloc = over_alloc_dd(ma->nat[n]);
1726                     srenew(buf, nalloc);
1727                 }
1728                 /* Use lv as a temporary buffer */
1729                 a = 0;
1730                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1731                 {
1732                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1733                     {
1734                         copy_rvec(v[c], buf[a++]);
1735                     }
1736                 }
1737                 if (a != ma->nat[n])
1738                 {
1739                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1740                               a, ma->nat[n]);
1741                 }
1742
1743 #ifdef GMX_MPI
1744                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1745                          DDRANK(dd, n), n, dd->mpi_comm_all);
1746 #endif
1747             }
1748         }
1749         sfree(buf);
1750         n = DDMASTERRANK(dd);
1751         a = 0;
1752         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1753         {
1754             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1755             {
1756                 copy_rvec(v[c], lv[a++]);
1757             }
1758         }
1759     }
1760     else
1761     {
1762 #ifdef GMX_MPI
1763         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1764                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1765 #endif
1766     }
1767 }
1768
1769 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1770                                        rvec *v, rvec *lv)
1771 {
1772     gmx_domdec_master_t *ma;
1773     int                 *scounts = NULL, *disps = NULL;
1774     int                  n, i, c, a, nalloc = 0;
1775     rvec                *buf = NULL;
1776
1777     if (DDMASTER(dd))
1778     {
1779         ma  = dd->ma;
1780
1781         get_commbuffer_counts(dd, &scounts, &disps);
1782
1783         buf = ma->vbuf;
1784         a   = 0;
1785         for (n = 0; n < dd->nnodes; n++)
1786         {
1787             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1788             {
1789                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1790                 {
1791                     copy_rvec(v[c], buf[a++]);
1792                 }
1793             }
1794         }
1795     }
1796
1797     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1798 }
1799
1800 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1801 {
1802     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1803     {
1804         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1805     }
1806     else
1807     {
1808         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1809     }
1810 }
1811
1812 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1813                                 t_state *state, t_state *state_local,
1814                                 rvec **f)
1815 {
1816     int  i, j, nh;
1817
1818     nh = state->nhchainlength;
1819
1820     if (DDMASTER(dd))
1821     {
1822         for (i = 0; i < efptNR; i++)
1823         {
1824             state_local->lambda[i] = state->lambda[i];
1825         }
1826         state_local->fep_state = state->fep_state;
1827         state_local->veta      = state->veta;
1828         state_local->vol0      = state->vol0;
1829         copy_mat(state->box, state_local->box);
1830         copy_mat(state->box_rel, state_local->box_rel);
1831         copy_mat(state->boxv, state_local->boxv);
1832         copy_mat(state->svir_prev, state_local->svir_prev);
1833         copy_mat(state->fvir_prev, state_local->fvir_prev);
1834         for (i = 0; i < state_local->ngtc; i++)
1835         {
1836             for (j = 0; j < nh; j++)
1837             {
1838                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1839                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1840             }
1841             state_local->therm_integral[i] = state->therm_integral[i];
1842         }
1843         for (i = 0; i < state_local->nnhpres; i++)
1844         {
1845             for (j = 0; j < nh; j++)
1846             {
1847                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1848                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1849             }
1850         }
1851     }
1852     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1853     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1854     dd_bcast(dd, sizeof(real), &state_local->veta);
1855     dd_bcast(dd, sizeof(real), &state_local->vol0);
1856     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1857     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1858     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1859     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1860     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1861     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1862     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1863     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1864     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1865     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1866
1867     if (dd->nat_home > state_local->nalloc)
1868     {
1869         dd_realloc_state(state_local, f, dd->nat_home);
1870     }
1871     for (i = 0; i < estNR; i++)
1872     {
1873         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1874         {
1875             switch (i)
1876             {
1877                 case estX:
1878                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1879                     break;
1880                 case estV:
1881                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1882                     break;
1883                 case estSDX:
1884                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1885                     break;
1886                 case estCGP:
1887                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1888                     break;
1889                 case estLD_RNG:
1890                     if (state->nrngi == 1)
1891                     {
1892                         dd_bcastc(dd,
1893                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
1894                                   state->ld_rng, state_local->ld_rng);
1895                     }
1896                     else
1897                     {
1898                         dd_scatter(dd,
1899                                    state_local->nrng*sizeof(state_local->ld_rng[0]),
1900                                    state->ld_rng, state_local->ld_rng);
1901                     }
1902                     break;
1903                 case estLD_RNGI:
1904                     if (state->nrngi == 1)
1905                     {
1906                         dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
1907                                   state->ld_rngi, state_local->ld_rngi);
1908                     }
1909                     else
1910                     {
1911                         dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
1912                                    state->ld_rngi, state_local->ld_rngi);
1913                     }
1914                     break;
1915                 case estDISRE_INITF:
1916                 case estDISRE_RM3TAV:
1917                 case estORIRE_INITF:
1918                 case estORIRE_DTAV:
1919                     /* Not implemented yet */
1920                     break;
1921                 default:
1922                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1923             }
1924         }
1925     }
1926 }
1927
1928 static char dim2char(int dim)
1929 {
1930     char c = '?';
1931
1932     switch (dim)
1933     {
1934         case XX: c = 'X'; break;
1935         case YY: c = 'Y'; break;
1936         case ZZ: c = 'Z'; break;
1937         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1938     }
1939
1940     return c;
1941 }
1942
1943 static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step,
1944                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1945 {
1946     rvec   grid_s[2], *grid_r = NULL, cx, r;
1947     char   fname[STRLEN], format[STRLEN], buf[22];
1948     FILE  *out;
1949     int    a, i, d, z, y, x;
1950     matrix tric;
1951     real   vol;
1952
1953     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1954     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1955
1956     if (DDMASTER(dd))
1957     {
1958         snew(grid_r, 2*dd->nnodes);
1959     }
1960
1961     dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
1962
1963     if (DDMASTER(dd))
1964     {
1965         for (d = 0; d < DIM; d++)
1966         {
1967             for (i = 0; i < DIM; i++)
1968             {
1969                 if (d == i)
1970                 {
1971                     tric[d][i] = 1;
1972                 }
1973                 else
1974                 {
1975                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1976                     {
1977                         tric[d][i] = box[i][d]/box[i][i];
1978                     }
1979                     else
1980                     {
1981                         tric[d][i] = 0;
1982                     }
1983                 }
1984             }
1985         }
1986         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1987         sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
1988         out = gmx_fio_fopen(fname, "w");
1989         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1990         a = 1;
1991         for (i = 0; i < dd->nnodes; i++)
1992         {
1993             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1994             for (d = 0; d < DIM; d++)
1995             {
1996                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1997             }
1998             for (z = 0; z < 2; z++)
1999             {
2000                 for (y = 0; y < 2; y++)
2001                 {
2002                     for (x = 0; x < 2; x++)
2003                     {
2004                         cx[XX] = grid_r[i*2+x][XX];
2005                         cx[YY] = grid_r[i*2+y][YY];
2006                         cx[ZZ] = grid_r[i*2+z][ZZ];
2007                         mvmul(tric, cx, r);
2008                         fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
2009                                 ' ', 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
2010                     }
2011                 }
2012             }
2013             for (d = 0; d < DIM; d++)
2014             {
2015                 for (x = 0; x < 4; x++)
2016                 {
2017                     switch (d)
2018                     {
2019                         case 0: y = 1 + i*8 + 2*x; break;
2020                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
2021                         case 2: y = 1 + i*8 + x; break;
2022                     }
2023                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
2024                 }
2025             }
2026         }
2027         gmx_fio_fclose(out);
2028         sfree(grid_r);
2029     }
2030 }
2031
2032 void write_dd_pdb(const char *fn, gmx_large_int_t step, const char *title,
2033                   gmx_mtop_t *mtop, t_commrec *cr,
2034                   int natoms, rvec x[], matrix box)
2035 {
2036     char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
2037     FILE         *out;
2038     int           i, ii, resnr, c;
2039     char         *atomname, *resname;
2040     real          b;
2041     gmx_domdec_t *dd;
2042
2043     dd = cr->dd;
2044     if (natoms == -1)
2045     {
2046         natoms = dd->comm->nat[ddnatVSITE];
2047     }
2048
2049     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
2050
2051     sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
2052     sprintf(format4, "%s%s\n", pdbformat4, "%6.2f%6.2f");
2053
2054     out = gmx_fio_fopen(fname, "w");
2055
2056     fprintf(out, "TITLE     %s\n", title);
2057     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2058     for (i = 0; i < natoms; i++)
2059     {
2060         ii = dd->gatindex[i];
2061         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
2062         if (i < dd->comm->nat[ddnatZONE])
2063         {
2064             c = 0;
2065             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2066             {
2067                 c++;
2068             }
2069             b = c;
2070         }
2071         else if (i < dd->comm->nat[ddnatVSITE])
2072         {
2073             b = dd->comm->zones.n;
2074         }
2075         else
2076         {
2077             b = dd->comm->zones.n + 1;
2078         }
2079         fprintf(out, strlen(atomname) < 4 ? format : format4,
2080                 "ATOM", (ii+1)%100000,
2081                 atomname, resname, ' ', resnr%10000, ' ',
2082                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
2083     }
2084     fprintf(out, "TER\n");
2085
2086     gmx_fio_fclose(out);
2087 }
2088
2089 real dd_cutoff_mbody(gmx_domdec_t *dd)
2090 {
2091     gmx_domdec_comm_t *comm;
2092     int                di;
2093     real               r;
2094
2095     comm = dd->comm;
2096
2097     r = -1;
2098     if (comm->bInterCGBondeds)
2099     {
2100         if (comm->cutoff_mbody > 0)
2101         {
2102             r = comm->cutoff_mbody;
2103         }
2104         else
2105         {
2106             /* cutoff_mbody=0 means we do not have DLB */
2107             r = comm->cellsize_min[dd->dim[0]];
2108             for (di = 1; di < dd->ndim; di++)
2109             {
2110                 r = min(r, comm->cellsize_min[dd->dim[di]]);
2111             }
2112             if (comm->bBondComm)
2113             {
2114                 r = max(r, comm->cutoff_mbody);
2115             }
2116             else
2117             {
2118                 r = min(r, comm->cutoff);
2119             }
2120         }
2121     }
2122
2123     return r;
2124 }
2125
2126 real dd_cutoff_twobody(gmx_domdec_t *dd)
2127 {
2128     real r_mb;
2129
2130     r_mb = dd_cutoff_mbody(dd);
2131
2132     return max(dd->comm->cutoff, r_mb);
2133 }
2134
2135
2136 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
2137 {
2138     int nc, ntot;
2139
2140     nc   = dd->nc[dd->comm->cartpmedim];
2141     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2142     copy_ivec(coord, coord_pme);
2143     coord_pme[dd->comm->cartpmedim] =
2144         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2145 }
2146
/* Maps DD index ddindex (out of ndd) to a PME index (out of npme).
 *
 * The result is (ddindex*npme + npme/2)/ndd in integer arithmetic.
 * The mapping assumes that x is the major index for both the DD and
 * the PME nodes; the npme/2 term is there to spread the DD nodes
 * evenly over the PME nodes.
 */
static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
{
    int half_npme = npme/2;
    int scaled    = ddindex*npme + half_npme;

    return scaled/ndd;
}
2155
2156 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
2157 {
2158     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
2159 }
2160
2161 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
2162 {
2163     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
2164 }
2165
2166 static int *dd_pmenodes(t_commrec *cr)
2167 {
2168     int *pmenodes;
2169     int  n, i, p0, p1;
2170
2171     snew(pmenodes, cr->npmenodes);
2172     n = 0;
2173     for (i = 0; i < cr->dd->nnodes; i++)
2174     {
2175         p0 = cr_ddindex2pmeindex(cr, i);
2176         p1 = cr_ddindex2pmeindex(cr, i+1);
2177         if (i+1 == cr->dd->nnodes || p1 > p0)
2178         {
2179             if (debug)
2180             {
2181                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
2182             }
2183             pmenodes[n] = i + 1 + n;
2184             n++;
2185         }
2186     }
2187
2188     return pmenodes;
2189 }
2190
2191 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
2192 {
2193     gmx_domdec_t *dd;
2194     ivec          coords, coords_pme, nc;
2195     int           slab;
2196
2197     dd = cr->dd;
2198     /*
2199        if (dd->comm->bCartesian) {
2200        gmx_ddindex2xyz(dd->nc,ddindex,coords);
2201        dd_coords2pmecoords(dd,coords,coords_pme);
2202        copy_ivec(dd->ntot,nc);
2203        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2204        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2205
2206        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2207        } else {
2208        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2209        }
2210      */
2211     coords[XX] = x;
2212     coords[YY] = y;
2213     coords[ZZ] = z;
2214     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
2215
2216     return slab;
2217 }
2218
2219 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
2220 {
2221     gmx_domdec_comm_t *comm;
2222     ivec               coords;
2223     int                ddindex, nodeid = -1;
2224
2225     comm = cr->dd->comm;
2226
2227     coords[XX] = x;
2228     coords[YY] = y;
2229     coords[ZZ] = z;
2230     if (comm->bCartesianPP_PME)
2231     {
2232 #ifdef GMX_MPI
2233         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
2234 #endif
2235     }
2236     else
2237     {
2238         ddindex = dd_index(cr->dd->nc, coords);
2239         if (comm->bCartesianPP)
2240         {
2241             nodeid = comm->ddindex2simnodeid[ddindex];
2242         }
2243         else
2244         {
2245             if (comm->pmenodes)
2246             {
2247                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
2248             }
2249             else
2250             {
2251                 nodeid = ddindex;
2252             }
2253         }
2254     }
2255
2256     return nodeid;
2257 }
2258
2259 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
2260 {
2261     gmx_domdec_t      *dd;
2262     gmx_domdec_comm_t *comm;
2263     ivec               coord, coord_pme;
2264     int                i;
2265     int                pmenode = -1;
2266
2267     dd   = cr->dd;
2268     comm = dd->comm;
2269
2270     /* This assumes a uniform x domain decomposition grid cell size */
2271     if (comm->bCartesianPP_PME)
2272     {
2273 #ifdef GMX_MPI
2274         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2275         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2276         {
2277             /* This is a PP node */
2278             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2279             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2280         }
2281 #endif
2282     }
2283     else if (comm->bCartesianPP)
2284     {
2285         if (sim_nodeid < dd->nnodes)
2286         {
2287             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2288         }
2289     }
2290     else
2291     {
2292         /* This assumes DD cells with identical x coordinates
2293          * are numbered sequentially.
2294          */
2295         if (dd->comm->pmenodes == NULL)
2296         {
2297             if (sim_nodeid < dd->nnodes)
2298             {
2299                 /* The DD index equals the nodeid */
2300                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2301             }
2302         }
2303         else
2304         {
2305             i = 0;
2306             while (sim_nodeid > dd->comm->pmenodes[i])
2307             {
2308                 i++;
2309             }
2310             if (sim_nodeid < dd->comm->pmenodes[i])
2311             {
2312                 pmenode = dd->comm->pmenodes[i];
2313             }
2314         }
2315     }
2316
2317     return pmenode;
2318 }
2319
2320 void get_pme_nnodes(const gmx_domdec_t *dd,
2321                     int *npmenodes_x, int *npmenodes_y)
2322 {
2323     if (dd != NULL)
2324     {
2325         *npmenodes_x = dd->comm->npmenodes_x;
2326         *npmenodes_y = dd->comm->npmenodes_y;
2327     }
2328     else
2329     {
2330         *npmenodes_x = 1;
2331         *npmenodes_y = 1;
2332     }
2333 }
2334
2335 gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
2336 {
2337     gmx_bool bPMEOnlyNode;
2338
2339     if (DOMAINDECOMP(cr))
2340     {
2341         bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
2342     }
2343     else
2344     {
2345         bPMEOnlyNode = FALSE;
2346     }
2347
2348     return bPMEOnlyNode;
2349 }
2350
2351 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2352                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2353 {
2354     gmx_domdec_t *dd;
2355     int           x, y, z;
2356     ivec          coord, coord_pme;
2357
2358     dd = cr->dd;
2359
2360     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2361
2362     *nmy_ddnodes = 0;
2363     for (x = 0; x < dd->nc[XX]; x++)
2364     {
2365         for (y = 0; y < dd->nc[YY]; y++)
2366         {
2367             for (z = 0; z < dd->nc[ZZ]; z++)
2368             {
2369                 if (dd->comm->bCartesianPP_PME)
2370                 {
2371                     coord[XX] = x;
2372                     coord[YY] = y;
2373                     coord[ZZ] = z;
2374                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2375                     if (dd->ci[XX] == coord_pme[XX] &&
2376                         dd->ci[YY] == coord_pme[YY] &&
2377                         dd->ci[ZZ] == coord_pme[ZZ])
2378                     {
2379                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2380                     }
2381                 }
2382                 else
2383                 {
2384                     /* The slab corresponds to the nodeid in the PME group */
2385                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2386                     {
2387                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2388                     }
2389                 }
2390             }
2391         }
2392     }
2393
2394     /* The last PP-only node is the peer node */
2395     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2396
2397     if (debug)
2398     {
2399         fprintf(debug, "Receive coordinates from PP nodes:");
2400         for (x = 0; x < *nmy_ddnodes; x++)
2401         {
2402             fprintf(debug, " %d", (*my_ddnodes)[x]);
2403         }
2404         fprintf(debug, "\n");
2405     }
2406 }
2407
2408 static gmx_bool receive_vir_ener(t_commrec *cr)
2409 {
2410     gmx_domdec_comm_t *comm;
2411     int                pmenode, coords[DIM], rank;
2412     gmx_bool           bReceive;
2413
2414     bReceive = TRUE;
2415     if (cr->npmenodes < cr->dd->nnodes)
2416     {
2417         comm = cr->dd->comm;
2418         if (comm->bCartesianPP_PME)
2419         {
2420             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2421 #ifdef GMX_MPI
2422             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2423             coords[comm->cartpmedim]++;
2424             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2425             {
2426                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2427                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2428                 {
2429                     /* This is not the last PP node for pmenode */
2430                     bReceive = FALSE;
2431                 }
2432             }
2433 #endif
2434         }
2435         else
2436         {
2437             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2438             if (cr->sim_nodeid+1 < cr->nnodes &&
2439                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2440             {
2441                 /* This is not the last PP node for pmenode */
2442                 bReceive = FALSE;
2443             }
2444         }
2445     }
2446
2447     return bReceive;
2448 }
2449
2450 static void set_zones_ncg_home(gmx_domdec_t *dd)
2451 {
2452     gmx_domdec_zones_t *zones;
2453     int                 i;
2454
2455     zones = &dd->comm->zones;
2456
2457     zones->cg_range[0] = 0;
2458     for (i = 1; i < zones->n+1; i++)
2459     {
2460         zones->cg_range[i] = dd->ncg_home;
2461     }
2462     /* zone_ncg1[0] should always be equal to ncg_home */
2463     dd->comm->zone_ncg1[0] = dd->ncg_home;
2464 }
2465
2466 static void rebuild_cgindex(gmx_domdec_t *dd,
2467                             const int *gcgs_index, t_state *state)
2468 {
2469     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2470
2471     ind        = state->cg_gl;
2472     dd_cg_gl   = dd->index_gl;
2473     cgindex    = dd->cgindex;
2474     nat        = 0;
2475     cgindex[0] = nat;
2476     for (i = 0; i < state->ncg_gl; i++)
2477     {
2478         cgindex[i]  = nat;
2479         cg_gl       = ind[i];
2480         dd_cg_gl[i] = cg_gl;
2481         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2482     }
2483     cgindex[i] = nat;
2484
2485     dd->ncg_home = state->ncg_gl;
2486     dd->nat_home = nat;
2487
2488     set_zones_ncg_home(dd);
2489 }
2490
2491 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2492 {
2493     while (cg >= cginfo_mb->cg_end)
2494     {
2495         cginfo_mb++;
2496     }
2497
2498     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2499 }
2500
2501 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2502                           t_forcerec *fr, char *bLocalCG)
2503 {
2504     cginfo_mb_t *cginfo_mb;
2505     int         *cginfo;
2506     int          cg;
2507
2508     if (fr != NULL)
2509     {
2510         cginfo_mb = fr->cginfo_mb;
2511         cginfo    = fr->cginfo;
2512
2513         for (cg = cg0; cg < cg1; cg++)
2514         {
2515             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2516         }
2517     }
2518
2519     if (bLocalCG != NULL)
2520     {
2521         for (cg = cg0; cg < cg1; cg++)
2522         {
2523             bLocalCG[index_gl[cg]] = TRUE;
2524         }
2525     }
2526 }
2527
2528 static void make_dd_indices(gmx_domdec_t *dd,
2529                             const int *gcgs_index, int cg_start)
2530 {
2531     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2532     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2533     gmx_ga2la_t *ga2la;
2534     char        *bLocalCG;
2535     gmx_bool     bCGs;
2536
2537     bLocalCG = dd->comm->bLocalCG;
2538
2539     if (dd->nat_tot > dd->gatindex_nalloc)
2540     {
2541         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2542         srenew(dd->gatindex, dd->gatindex_nalloc);
2543     }
2544
2545     nzone      = dd->comm->zones.n;
2546     zone2cg    = dd->comm->zones.cg_range;
2547     zone_ncg1  = dd->comm->zone_ncg1;
2548     index_gl   = dd->index_gl;
2549     gatindex   = dd->gatindex;
2550     bCGs       = dd->comm->bCGs;
2551
2552     if (zone2cg[1] != dd->ncg_home)
2553     {
2554         gmx_incons("dd->ncg_zone is not up to date");
2555     }
2556
2557     /* Make the local to global and global to local atom index */
2558     a = dd->cgindex[cg_start];
2559     for (zone = 0; zone < nzone; zone++)
2560     {
2561         if (zone == 0)
2562         {
2563             cg0 = cg_start;
2564         }
2565         else
2566         {
2567             cg0 = zone2cg[zone];
2568         }
2569         cg1    = zone2cg[zone+1];
2570         cg1_p1 = cg0 + zone_ncg1[zone];
2571
2572         for (cg = cg0; cg < cg1; cg++)
2573         {
2574             zone1 = zone;
2575             if (cg >= cg1_p1)
2576             {
2577                 /* Signal that this cg is from more than one pulse away */
2578                 zone1 += nzone;
2579             }
2580             cg_gl = index_gl[cg];
2581             if (bCGs)
2582             {
2583                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2584                 {
2585                     gatindex[a] = a_gl;
2586                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2587                     a++;
2588                 }
2589             }
2590             else
2591             {
2592                 gatindex[a] = cg_gl;
2593                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2594                 a++;
2595             }
2596         }
2597     }
2598 }
2599
2600 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2601                           const char *where)
2602 {
2603     int ncg, i, ngl, nerr;
2604
2605     nerr = 0;
2606     if (bLocalCG == NULL)
2607     {
2608         return nerr;
2609     }
2610     for (i = 0; i < dd->ncg_tot; i++)
2611     {
2612         if (!bLocalCG[dd->index_gl[i]])
2613         {
2614             fprintf(stderr,
2615                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2616             nerr++;
2617         }
2618     }
2619     ngl = 0;
2620     for (i = 0; i < ncg_sys; i++)
2621     {
2622         if (bLocalCG[i])
2623         {
2624             ngl++;
2625         }
2626     }
2627     if (ngl != dd->ncg_tot)
2628     {
2629         fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2630         nerr++;
2631     }
2632
2633     return nerr;
2634 }
2635
2636 static void check_index_consistency(gmx_domdec_t *dd,
2637                                     int natoms_sys, int ncg_sys,
2638                                     const char *where)
2639 {
2640     int   nerr, ngl, i, a, cell;
2641     int  *have;
2642
2643     nerr = 0;
2644
2645     if (dd->comm->DD_debug > 1)
2646     {
2647         snew(have, natoms_sys);
2648         for (a = 0; a < dd->nat_tot; a++)
2649         {
2650             if (have[dd->gatindex[a]] > 0)
2651             {
2652                 fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2653             }
2654             else
2655             {
2656                 have[dd->gatindex[a]] = a + 1;
2657             }
2658         }
2659         sfree(have);
2660     }
2661
2662     snew(have, dd->nat_tot);
2663
2664     ngl  = 0;
2665     for (i = 0; i < natoms_sys; i++)
2666     {
2667         if (ga2la_get(dd->ga2la, i, &a, &cell))
2668         {
2669             if (a >= dd->nat_tot)
2670             {
2671                 fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2672                 nerr++;
2673             }
2674             else
2675             {
2676                 have[a] = 1;
2677                 if (dd->gatindex[a] != i)
2678                 {
2679                     fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2680                     nerr++;
2681                 }
2682             }
2683             ngl++;
2684         }
2685     }
2686     if (ngl != dd->nat_tot)
2687     {
2688         fprintf(stderr,
2689                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2690                 dd->rank, where, ngl, dd->nat_tot);
2691     }
2692     for (a = 0; a < dd->nat_tot; a++)
2693     {
2694         if (have[a] == 0)
2695         {
2696             fprintf(stderr,
2697                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2698                     dd->rank, where, a+1, dd->gatindex[a]+1);
2699         }
2700     }
2701     sfree(have);
2702
2703     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2704
2705     if (nerr > 0)
2706     {
2707         gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies",
2708                   dd->rank, where, nerr);
2709     }
2710 }
2711
2712 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2713 {
2714     int   i;
2715     char *bLocalCG;
2716
2717     if (a_start == 0)
2718     {
2719         /* Clear the whole list without searching */
2720         ga2la_clear(dd->ga2la);
2721     }
2722     else
2723     {
2724         for (i = a_start; i < dd->nat_tot; i++)
2725         {
2726             ga2la_del(dd->ga2la, dd->gatindex[i]);
2727         }
2728     }
2729
2730     bLocalCG = dd->comm->bLocalCG;
2731     if (bLocalCG)
2732     {
2733         for (i = cg_start; i < dd->ncg_tot; i++)
2734         {
2735             bLocalCG[dd->index_gl[i]] = FALSE;
2736         }
2737     }
2738
2739     dd_clear_local_vsite_indices(dd);
2740
2741     if (dd->constraints)
2742     {
2743         dd_clear_local_constraint_indices(dd);
2744     }
2745 }
2746
2747 /* This function should be used for moving the domain boudaries during DLB,
2748  * for obtaining the minimum cell size. It checks the initially set limit
2749  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2750  * and, possibly, a longer cut-off limit set for PME load balancing.
2751  */
2752 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2753 {
2754     real cellsize_min;
2755
2756     cellsize_min = comm->cellsize_min[dim];
2757
2758     if (!comm->bVacDLBNoLimit)
2759     {
2760         /* The cut-off might have changed, e.g. by PME load balacning,
2761          * from the value used to set comm->cellsize_min, so check it.
2762          */
2763         cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2764
2765         if (comm->bPMELoadBalDLBLimits)
2766         {
2767             /* Check for the cut-off limit set by the PME load balancing */
2768             cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2769         }
2770     }
2771
2772     return cellsize_min;
2773 }
2774
2775 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2776                             int dim_ind)
2777 {
2778     real grid_jump_limit;
2779
2780     /* The distance between the boundaries of cells at distance
2781      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2782      * and by the fact that cells should not be shifted by more than
2783      * half their size, such that cg's only shift by one cell
2784      * at redecomposition.
2785      */
2786     grid_jump_limit = comm->cellsize_limit;
2787     if (!comm->bVacDLBNoLimit)
2788     {
2789         if (comm->bPMELoadBalDLBLimits)
2790         {
2791             cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
2792         }
2793         grid_jump_limit = max(grid_jump_limit,
2794                               cutoff/comm->cd[dim_ind].np);
2795     }
2796
2797     return grid_jump_limit;
2798 }
2799
2800 static gmx_bool check_grid_jump(gmx_large_int_t step,
2801                                 gmx_domdec_t   *dd,
2802                                 real            cutoff,
2803                                 gmx_ddbox_t    *ddbox,
2804                                 gmx_bool        bFatal)
2805 {
2806     gmx_domdec_comm_t *comm;
2807     int                d, dim;
2808     real               limit, bfac;
2809     gmx_bool           bInvalid;
2810
2811     bInvalid = FALSE;
2812
2813     comm = dd->comm;
2814
2815     for (d = 1; d < dd->ndim; d++)
2816     {
2817         dim   = dd->dim[d];
2818         limit = grid_jump_limit(comm, cutoff, d);
2819         bfac  = ddbox->box_size[dim];
2820         if (ddbox->tric_dir[dim])
2821         {
2822             bfac *= ddbox->skew_fac[dim];
2823         }
2824         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2825                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2826         {
2827             bInvalid = TRUE;
2828
2829             if (bFatal)
2830             {
2831                 char buf[22];
2832
2833                 /* This error should never be triggered under normal
2834                  * circumstances, but you never know ...
2835                  */
2836                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2837                           gmx_step_str(step, buf),
2838                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2839             }
2840         }
2841     }
2842
2843     return bInvalid;
2844 }
2845
2846 static int dd_load_count(gmx_domdec_comm_t *comm)
2847 {
2848     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2849 }
2850
2851 static float dd_force_load(gmx_domdec_comm_t *comm)
2852 {
2853     float load;
2854
2855     if (comm->eFlop)
2856     {
2857         load = comm->flop;
2858         if (comm->eFlop > 1)
2859         {
2860             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2861         }
2862     }
2863     else
2864     {
2865         load = comm->cycl[ddCyclF];
2866         if (comm->cycl_n[ddCyclF] > 1)
2867         {
2868             /* Subtract the maximum of the last n cycle counts
2869              * to get rid of possible high counts due to other soures,
2870              * for instance system activity, that would otherwise
2871              * affect the dynamic load balancing.
2872              */
2873             load -= comm->cycl_max[ddCyclF];
2874         }
2875     }
2876
2877     return load;
2878 }
2879
2880 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2881 {
2882     gmx_domdec_comm_t *comm;
2883     int                i;
2884
2885     comm = dd->comm;
2886
2887     snew(*dim_f, dd->nc[dim]+1);
2888     (*dim_f)[0] = 0;
2889     for (i = 1; i < dd->nc[dim]; i++)
2890     {
2891         if (comm->slb_frac[dim])
2892         {
2893             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2894         }
2895         else
2896         {
2897             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2898         }
2899     }
2900     (*dim_f)[dd->nc[dim]] = 1;
2901 }
2902
2903 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2904 {
2905     int  pmeindex, slab, nso, i;
2906     ivec xyz;
2907
2908     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2909     {
2910         ddpme->dim = YY;
2911     }
2912     else
2913     {
2914         ddpme->dim = dimind;
2915     }
2916     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2917
2918     ddpme->nslab = (ddpme->dim == 0 ?
2919                     dd->comm->npmenodes_x :
2920                     dd->comm->npmenodes_y);
2921
2922     if (ddpme->nslab <= 1)
2923     {
2924         return;
2925     }
2926
2927     nso = dd->comm->npmenodes/ddpme->nslab;
2928     /* Determine for each PME slab the PP location range for dimension dim */
2929     snew(ddpme->pp_min, ddpme->nslab);
2930     snew(ddpme->pp_max, ddpme->nslab);
2931     for (slab = 0; slab < ddpme->nslab; slab++)
2932     {
2933         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2934         ddpme->pp_max[slab] = 0;
2935     }
2936     for (i = 0; i < dd->nnodes; i++)
2937     {
2938         ddindex2xyz(dd->nc, i, xyz);
2939         /* For y only use our y/z slab.
2940          * This assumes that the PME x grid size matches the DD grid size.
2941          */
2942         if (dimind == 0 || xyz[XX] == dd->ci[XX])
2943         {
2944             pmeindex = ddindex2pmeindex(dd, i);
2945             if (dimind == 0)
2946             {
2947                 slab = pmeindex/nso;
2948             }
2949             else
2950             {
2951                 slab = pmeindex % ddpme->nslab;
2952             }
2953             ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
2954             ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
2955         }
2956     }
2957
2958     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2959 }
2960
2961 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2962 {
2963     if (dd->comm->ddpme[0].dim == XX)
2964     {
2965         return dd->comm->ddpme[0].maxshift;
2966     }
2967     else
2968     {
2969         return 0;
2970     }
2971 }
2972
2973 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2974 {
2975     if (dd->comm->ddpme[0].dim == YY)
2976     {
2977         return dd->comm->ddpme[0].maxshift;
2978     }
2979     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2980     {
2981         return dd->comm->ddpme[1].maxshift;
2982     }
2983     else
2984     {
2985         return 0;
2986     }
2987 }
2988
2989 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
2990                              gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
2991 {
2992     gmx_domdec_comm_t *comm;
2993     int                nc, ns, s;
2994     int               *xmin, *xmax;
2995     real               range, pme_boundary;
2996     int                sh;
2997
2998     comm = dd->comm;
2999     nc   = dd->nc[ddpme->dim];
3000     ns   = ddpme->nslab;
3001
3002     if (!ddpme->dim_match)
3003     {
3004         /* PP decomposition is not along dim: the worst situation */
3005         sh = ns/2;
3006     }
3007     else if (ns <= 3 || (bUniform && ns == nc))
3008     {
3009         /* The optimal situation */
3010         sh = 1;
3011     }
3012     else
3013     {
3014         /* We need to check for all pme nodes which nodes they
3015          * could possibly need to communicate with.
3016          */
3017         xmin = ddpme->pp_min;
3018         xmax = ddpme->pp_max;
3019         /* Allow for atoms to be maximally 2/3 times the cut-off
3020          * out of their DD cell. This is a reasonable balance between
3021          * between performance and support for most charge-group/cut-off
3022          * combinations.
3023          */
3024         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
3025         /* Avoid extra communication when we are exactly at a boundary */
3026         range *= 0.999;
3027
3028         sh = 1;
3029         for (s = 0; s < ns; s++)
3030         {
3031             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
3032             pme_boundary = (real)s/ns;
3033             while (sh+1 < ns &&
3034                    ((s-(sh+1) >= 0 &&
3035                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
3036                     (s-(sh+1) <  0 &&
3037                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
3038             {
3039                 sh++;
3040             }
3041             pme_boundary = (real)(s+1)/ns;
3042             while (sh+1 < ns &&
3043                    ((s+(sh+1) <  ns &&
3044                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
3045                     (s+(sh+1) >= ns &&
3046                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
3047             {
3048                 sh++;
3049             }
3050         }
3051     }
3052
3053     ddpme->maxshift = sh;
3054
3055     if (debug)
3056     {
3057         fprintf(debug, "PME slab communication range for dim %d is %d\n",
3058                 ddpme->dim, ddpme->maxshift);
3059     }
3060 }
3061
3062 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3063 {
3064     int d, dim;
3065
3066     for (d = 0; d < dd->ndim; d++)
3067     {
3068         dim = dd->dim[d];
3069         if (dim < ddbox->nboundeddim &&
3070             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3071             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3072         {
3073             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3074                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3075                       dd->nc[dim], dd->comm->cellsize_limit);
3076         }
3077     }
3078 }
3079
3080 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
3081                                   gmx_bool bMaster, ivec npulse)
3082 {
3083     gmx_domdec_comm_t *comm;
3084     int                d, j;
3085     rvec               cellsize_min;
3086     real              *cell_x, cell_dx, cellsize;
3087
3088     comm = dd->comm;
3089
3090     for (d = 0; d < DIM; d++)
3091     {
3092         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3093         npulse[d]       = 1;
3094         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3095         {
3096             /* Uniform grid */
3097             cell_dx = ddbox->box_size[d]/dd->nc[d];
3098             if (bMaster)
3099             {
3100                 for (j = 0; j < dd->nc[d]+1; j++)
3101                 {
3102                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3103                 }
3104             }
3105             else
3106             {
3107                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3108                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3109             }
3110             cellsize = cell_dx*ddbox->skew_fac[d];
3111             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3112             {
3113                 npulse[d]++;
3114             }
3115             cellsize_min[d] = cellsize;
3116         }
3117         else
3118         {
3119             /* Statically load balanced grid */
3120             /* Also when we are not doing a master distribution we determine
3121              * all cell borders in a loop to obtain identical values
3122              * to the master distribution case and to determine npulse.
3123              */
3124             if (bMaster)
3125             {
3126                 cell_x = dd->ma->cell_x[d];
3127             }
3128             else
3129             {
3130                 snew(cell_x, dd->nc[d]+1);
3131             }
3132             cell_x[0] = ddbox->box0[d];
3133             for (j = 0; j < dd->nc[d]; j++)
3134             {
3135                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
3136                 cell_x[j+1] = cell_x[j] + cell_dx;
3137                 cellsize    = cell_dx*ddbox->skew_fac[d];
3138                 while (cellsize*npulse[d] < comm->cutoff &&
3139                        npulse[d] < dd->nc[d]-1)
3140                 {
3141                     npulse[d]++;
3142                 }
3143                 cellsize_min[d] = min(cellsize_min[d], cellsize);
3144             }
3145             if (!bMaster)
3146             {
3147                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3148                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3149                 sfree(cell_x);
3150             }
3151         }
3152         /* The following limitation is to avoid that a cell would receive
3153          * some of its own home charge groups back over the periodic boundary.
3154          * Double charge groups cause trouble with the global indices.
3155          */
3156         if (d < ddbox->npbcdim &&
3157             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3158         {
3159             gmx_fatal_collective(FARGS, NULL, dd,
3160                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3161                                  dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
3162                                  comm->cutoff,
3163                                  dd->nc[d], dd->nc[d],
3164                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
3165         }
3166     }
3167
3168     if (!comm->bDynLoadBal)
3169     {
3170         copy_rvec(cellsize_min, comm->cellsize_min);
3171     }
3172
3173     for (d = 0; d < comm->npmedecompdim; d++)
3174     {
3175         set_pme_maxshift(dd, &comm->ddpme[d],
3176                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
3177                          comm->ddpme[d].slb_dim_f);
3178     }
3179 }
3180
3181
/* Turn the proposed cell sizes in root->buf_ncd into relative cell
 * boundaries root->cell_f for the cells range[0] <= i < range[1] of
 * dimension dim (decomposition index d), while enforcing limits.
 *
 * The boundaries root->cell_f[range[0]] and root->cell_f[range[1]] are
 * taken as fixed; the cells in between are rescaled to fill that region.
 *
 * Steps:
 *  1. Rescale the sizes iteratively such that no cell is smaller than
 *     cellsize_limit_f (a relative size); cells pinned at the limit are
 *     flagged in root->bCellMin.
 *  2. Fatal error when, with pbc along dim, the last cell in the range
 *     ends up too small.
 *  3. Unless bUniform, prevent each boundary from moving further than
 *     halfway into either of the two old cells it bounds.
 *  4. For d > 0: with bUniform store the staggering limits cell_f_max0 and
 *     cell_f_min1; otherwise clamp boundaries that violate
 *     root->bound_min/bound_max and call this function recursively on the
 *     sub-ranges on either side of a clamped boundary.
 *
 * Sets root->bLimited when a cell was pinned at the minimum size or when
 * the range does not cover all ncd cells.
 */
static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
                                                  int d, int dim, gmx_domdec_root_t *root,
                                                  gmx_ddbox_t *ddbox,
                                                  gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[])
{
    gmx_domdec_comm_t *comm;
    int                ncd, i, j, nmin, nmin_old;
    gmx_bool           bLimLo, bLimHi;
    real              *cell_size;
    real               fac, halfway, cellsize_limit_f_i, region_size;
    gmx_bool           bPBC, bLastHi = FALSE;
    int                nrange[] = {range[0], range[1]};

    /* The relative size of the region that the cells in the range share */
    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];

    comm = dd->comm;

    ncd = dd->nc[dim];

    bPBC = (dim < ddbox->npbcdim);

    cell_size = root->buf_ncd;

    if (debug)
    {
        fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
    }

    /* First we need to check if the scaling does not make cells
     * smaller than the smallest allowed size.
     * We need to do this iteratively, since if a cell is too small,
     * it needs to be enlarged, which makes all the other cells smaller,
     * which could in turn make another cell smaller than allowed.
     */
    for (i = range[0]; i < range[1]; i++)
    {
        root->bCellMin[i] = FALSE;
    }
    nmin = 0;
    do
    {
        nmin_old = nmin;
        /* We need the total for normalization */
        fac = 0;
        for (i = range[0]; i < range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                fac += cell_size[i];
            }
        }
        fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
        /* Determine the cell boundaries */
        for (i = range[0]; i < range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                cell_size[i] *= fac;
                /* Without pbc the outermost cells have no minimum size */
                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
                {
                    cellsize_limit_f_i = 0;
                }
                else
                {
                    cellsize_limit_f_i = cellsize_limit_f;
                }
                if (cell_size[i] < cellsize_limit_f_i)
                {
                    root->bCellMin[i] = TRUE;
                    cell_size[i]      = cellsize_limit_f_i;
                    nmin++;
                }
            }
            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
        }
    }
    while (nmin > nmin_old);

    /* Recompute the size of the last cell in the range from the boundaries */
    i            = range[1]-1;
    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
    /* For this check we should not use DD_CELL_MARGIN,
     * but a slightly smaller factor,
     * since rounding could get us below the limit.
     */
    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
    {
        char buf[22];
        gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
                  gmx_step_str(step, buf),
                  dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
                  ncd, comm->cellsize_min[dim]);
    }

    root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);

    if (!bUniform)
    {
        /* Check if the boundary did not displace more than halfway
         * each of the cells it bounds, as this could cause problems,
         * especially when the differences between cell sizes are large.
         * If changes are applied, they will not make cells smaller
         * than the cut-off, as we check all the boundaries which
         * might be affected by a change and if the old state was ok,
         * the cells will at most be shrunk back to their old size.
         */
        for (i = range[0]+1; i < range[1]; i++)
        {
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
            if (root->cell_f[i] < halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the next boundaries */
                for (j = i+1; j < range[1]; j++)
                {
                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
                    {
                        root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
                    }
                }
            }
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
            if (root->cell_f[i] > halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the previous boundaries */
                for (j = i-1; j >= range[0]+1; j--)
                {
                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
                    {
                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
                    }
                }
            }
        }
    }

    /* nrange is defined as the [lower, upper) range for a new call to enforce_limits */
    /* Find the highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b),
     * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
     * For a and b nrange is used. */
    if (d > 0)
    {
        /* Take care of the staggering of the cell boundaries */
        if (bUniform)
        {
            for (i = range[0]; i < range[1]; i++)
            {
                root->cell_f_max0[i] = root->cell_f[i];
                root->cell_f_min1[i] = root->cell_f[i+1];
            }
        }
        else
        {
            for (i = range[0]+1; i < range[1]; i++)
            {
                bLimLo = (root->cell_f[i] < root->bound_min[i]);
                bLimHi = (root->cell_f[i] > root->bound_max[i]);
                if (bLimLo && bLimHi)
                {
                    /* Both limits violated, try the best we can */
                    /* For this case we split the original range (range) in two parts and care about the other limitations in the next iteration. */
                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
                    nrange[0]       = range[0];
                    nrange[1]       = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    nrange[0] = i;
                    nrange[1] = range[1];
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    /* The two recursive calls have handled the whole range */
                    return;
                }
                else if (bLimLo)
                {
                    /* root->cell_f[i] = root->bound_min[i]; */
                    nrange[1] = i;  /* only store the violation location. There could be a LimLo violation following with a higher index */
                    bLastHi   = FALSE;
                }
                else if (bLimHi && !bLastHi)
                {
                    bLastHi = TRUE;
                    if (nrange[1] < range[1])   /* found a LimLo before */
                    {
                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                        nrange[0] = nrange[1];
                    }
                    root->cell_f[i] = root->bound_max[i];
                    nrange[1]       = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                    nrange[0] = i;
                    nrange[1] = range[1];
                }
            }
            if (nrange[1] < range[1])   /* the last violation found was a LimLo */
            {
                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                nrange[0] = nrange[1];
                nrange[1] = range[1];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
            else if (nrange[0] > range[0]) /* found at least one LimHi */
            {
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
        }
    }
}
3391
3392
3393 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3394                                        int d, int dim, gmx_domdec_root_t *root,
3395                                        gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3396                                        gmx_bool bUniform, gmx_large_int_t step)
3397 {
3398     gmx_domdec_comm_t *comm;
3399     int                ncd, d1, i, j, pos;
3400     real              *cell_size;
3401     real               load_aver, load_i, imbalance, change, change_max, sc;
3402     real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3403     real               change_limit;
3404     real               relax = 0.5;
3405     gmx_bool           bPBC;
3406     int                range[] = { 0, 0 };
3407
3408     comm = dd->comm;
3409
3410     /* Convert the maximum change from the input percentage to a fraction */
3411     change_limit = comm->dlb_scale_lim*0.01;
3412
3413     ncd = dd->nc[dim];
3414
3415     bPBC = (dim < ddbox->npbcdim);
3416
3417     cell_size = root->buf_ncd;
3418
3419     /* Store the original boundaries */
3420     for (i = 0; i < ncd+1; i++)
3421     {
3422         root->old_cell_f[i] = root->cell_f[i];
3423     }
3424     if (bUniform)
3425     {
3426         for (i = 0; i < ncd; i++)
3427         {
3428             cell_size[i] = 1.0/ncd;
3429         }
3430     }
3431     else if (dd_load_count(comm))
3432     {
3433         load_aver  = comm->load[d].sum_m/ncd;
3434         change_max = 0;
3435         for (i = 0; i < ncd; i++)
3436         {
3437             /* Determine the relative imbalance of cell i */
3438             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3439             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3440             /* Determine the change of the cell size using underrelaxation */
3441             change     = -relax*imbalance;
3442             change_max = max(change_max, max(change, -change));
3443         }
3444         /* Limit the amount of scaling.
3445          * We need to use the same rescaling for all cells in one row,
3446          * otherwise the load balancing might not converge.
3447          */
3448         sc = relax;
3449         if (change_max > change_limit)
3450         {
3451             sc *= change_limit/change_max;
3452         }
3453         for (i = 0; i < ncd; i++)
3454         {
3455             /* Determine the relative imbalance of cell i */
3456             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3457             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3458             /* Determine the change of the cell size using underrelaxation */
3459             change       = -sc*imbalance;
3460             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3461         }
3462     }
3463
3464     cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3465     cellsize_limit_f *= DD_CELL_MARGIN;
3466     dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3467     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3468     if (ddbox->tric_dir[dim])
3469     {
3470         cellsize_limit_f /= ddbox->skew_fac[dim];
3471         dist_min_f       /= ddbox->skew_fac[dim];
3472     }
3473     if (bDynamicBox && d > 0)
3474     {
3475         dist_min_f *= DD_PRES_SCALE_MARGIN;
3476     }
3477     if (d > 0 && !bUniform)
3478     {
3479         /* Make sure that the grid is not shifted too much */
3480         for (i = 1; i < ncd; i++)
3481         {
3482             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3483             {
3484                 gmx_incons("Inconsistent DD boundary staggering limits!");
3485             }
3486             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3487             space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3488             if (space > 0)
3489             {
3490                 root->bound_min[i] += 0.5*space;
3491             }
3492             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3493             space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3494             if (space < 0)
3495             {
3496                 root->bound_max[i] += 0.5*space;
3497             }
3498             if (debug)
3499             {
3500                 fprintf(debug,
3501                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3502                         d, i,
3503                         root->cell_f_max0[i-1] + dist_min_f,
3504                         root->bound_min[i], root->cell_f[i], root->bound_max[i],
3505                         root->cell_f_min1[i] - dist_min_f);
3506             }
3507         }
3508     }
3509     range[1]          = ncd;
3510     root->cell_f[0]   = 0;
3511     root->cell_f[ncd] = 1;
3512     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3513
3514
3515     /* After the checks above, the cells should obey the cut-off
3516      * restrictions, but it does not hurt to check.
3517      */
3518     for (i = 0; i < ncd; i++)
3519     {
3520         if (debug)
3521         {
3522             fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
3523                     dim, i, root->cell_f[i], root->cell_f[i+1]);
3524         }
3525
3526         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3527             root->cell_f[i+1] - root->cell_f[i] <
3528             cellsize_limit_f/DD_CELL_MARGIN)
3529         {
3530             char buf[22];
3531             fprintf(stderr,
3532                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3533                     gmx_step_str(step, buf), dim2char(dim), i,
3534                     (root->cell_f[i+1] - root->cell_f[i])
3535                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3536         }
3537     }
3538
3539     pos = ncd + 1;
3540     /* Store the cell boundaries of the lower dimensions at the end */
3541     for (d1 = 0; d1 < d; d1++)
3542     {
3543         root->cell_f[pos++] = comm->cell_f0[d1];
3544         root->cell_f[pos++] = comm->cell_f1[d1];
3545     }
3546
3547     if (d < comm->npmedecompdim)
3548     {
3549         /* The master determines the maximum shift for
3550          * the coordinate communication between separate PME nodes.
3551          */
3552         set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3553     }
3554     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3555     if (d >= 1)
3556     {
3557         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3558     }
3559 }
3560
3561 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3562                                              gmx_ddbox_t *ddbox, int dimind)
3563 {
3564     gmx_domdec_comm_t *comm;
3565     int                dim;
3566
3567     comm = dd->comm;
3568
3569     /* Set the cell dimensions */
3570     dim                = dd->dim[dimind];
3571     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3572     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3573     if (dim >= ddbox->nboundeddim)
3574     {
3575         comm->cell_x0[dim] += ddbox->box0[dim];
3576         comm->cell_x1[dim] += ddbox->box0[dim];
3577     }
3578 }
3579
3580 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3581                                          int d, int dim, real *cell_f_row,
3582                                          gmx_ddbox_t *ddbox)
3583 {
3584     gmx_domdec_comm_t *comm;
3585     int                d1, dim1, pos;
3586
3587     comm = dd->comm;
3588
3589 #ifdef GMX_MPI
3590     /* Each node would only need to know two fractions,
3591      * but it is probably cheaper to broadcast the whole array.
3592      */
3593     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3594               0, comm->mpi_comm_load[d]);
3595 #endif
3596     /* Copy the fractions for this dimension from the buffer */
3597     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3598     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3599     /* The whole array was communicated, so set the buffer position */
3600     pos = dd->nc[dim] + 1;
3601     for (d1 = 0; d1 <= d; d1++)
3602     {
3603         if (d1 < d)
3604         {
3605             /* Copy the cell fractions of the lower dimensions */
3606             comm->cell_f0[d1] = cell_f_row[pos++];
3607             comm->cell_f1[d1] = cell_f_row[pos++];
3608         }
3609         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3610     }
3611     /* Convert the communicated shift from float to int */
3612     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3613     if (d >= 1)
3614     {
3615         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3616     }
3617 }
3618
3619 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3620                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3621                                          gmx_bool bUniform, gmx_large_int_t step)
3622 {
3623     gmx_domdec_comm_t *comm;
3624     int                d, dim, d1;
3625     gmx_bool           bRowMember, bRowRoot;
3626     real              *cell_f_row;
3627
3628     comm = dd->comm;
3629
3630     for (d = 0; d < dd->ndim; d++)
3631     {
3632         dim        = dd->dim[d];
3633         bRowMember = TRUE;
3634         bRowRoot   = TRUE;
3635         for (d1 = d; d1 < dd->ndim; d1++)
3636         {
3637             if (dd->ci[dd->dim[d1]] > 0)
3638             {
3639                 if (d1 > d)
3640                 {
3641                     bRowMember = FALSE;
3642                 }
3643                 bRowRoot = FALSE;
3644             }
3645         }
3646         if (bRowMember)
3647         {
3648             if (bRowRoot)
3649             {
3650                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3651                                            ddbox, bDynamicBox, bUniform, step);
3652                 cell_f_row = comm->root[d]->cell_f;
3653             }
3654             else
3655             {
3656                 cell_f_row = comm->cell_f_row;
3657             }
3658             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3659         }
3660     }
3661 }
3662
3663 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3664 {
3665     int d;
3666
3667     /* This function assumes the box is static and should therefore
3668      * not be called when the box has changed since the last
3669      * call to dd_partition_system.
3670      */
3671     for (d = 0; d < dd->ndim; d++)
3672     {
3673         relative_to_absolute_cell_bounds(dd, ddbox, d);
3674     }
3675 }
3676
3677
3678
3679 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3680                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3681                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3682                                   gmx_wallcycle_t wcycle)
3683 {
3684     gmx_domdec_comm_t *comm;
3685     int                dim;
3686
3687     comm = dd->comm;
3688
3689     if (bDoDLB)
3690     {
3691         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3692         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3693         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3694     }
3695     else if (bDynamicBox)
3696     {
3697         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3698     }
3699
3700     /* Set the dimensions for which no DD is used */
3701     for (dim = 0; dim < DIM; dim++)
3702     {
3703         if (dd->nc[dim] == 1)
3704         {
3705             comm->cell_x0[dim] = 0;
3706             comm->cell_x1[dim] = ddbox->box_size[dim];
3707             if (dim >= ddbox->nboundeddim)
3708             {
3709                 comm->cell_x0[dim] += ddbox->box0[dim];
3710                 comm->cell_x1[dim] += ddbox->box0[dim];
3711             }
3712         }
3713     }
3714 }
3715
3716 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3717 {
3718     int                    d, np, i;
3719     gmx_domdec_comm_dim_t *cd;
3720
3721     for (d = 0; d < dd->ndim; d++)
3722     {
3723         cd = &dd->comm->cd[d];
3724         np = npulse[dd->dim[d]];
3725         if (np > cd->np_nalloc)
3726         {
3727             if (debug)
3728             {
3729                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3730                         dim2char(dd->dim[d]), np);
3731             }
3732             if (DDMASTER(dd) && cd->np_nalloc > 0)
3733             {
3734                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3735             }
3736             srenew(cd->ind, np);
3737             for (i = cd->np_nalloc; i < np; i++)
3738             {
3739                 cd->ind[i].index  = NULL;
3740                 cd->ind[i].nalloc = 0;
3741             }
3742             cd->np_nalloc = np;
3743         }
3744         cd->np = np;
3745     }
3746 }
3747
3748
3749 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3750                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3751                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3752                               gmx_wallcycle_t wcycle)
3753 {
3754     gmx_domdec_comm_t *comm;
3755     int                d;
3756     ivec               npulse;
3757
3758     comm = dd->comm;
3759
3760     /* Copy the old cell boundaries for the cg displacement check */
3761     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3762     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3763
3764     if (comm->bDynLoadBal)
3765     {
3766         if (DDMASTER(dd))
3767         {
3768             check_box_size(dd, ddbox);
3769         }
3770         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3771     }
3772     else
3773     {
3774         set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse);
3775         realloc_comm_ind(dd, npulse);
3776     }
3777
3778     if (debug)
3779     {
3780         for (d = 0; d < DIM; d++)
3781         {
3782             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3783                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3784         }
3785     }
3786 }
3787
3788 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3789                                   gmx_ddbox_t *ddbox,
3790                                   rvec cell_ns_x0, rvec cell_ns_x1,
3791                                   gmx_large_int_t step)
3792 {
3793     gmx_domdec_comm_t *comm;
3794     int                dim_ind, dim;
3795
3796     comm = dd->comm;
3797
3798     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3799     {
3800         dim = dd->dim[dim_ind];
3801
3802         /* Without PBC we don't have restrictions on the outer cells */
3803         if (!(dim >= ddbox->npbcdim &&
3804               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3805             comm->bDynLoadBal &&
3806             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3807             comm->cellsize_min[dim])
3808         {
3809             char buf[22];
3810             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3811                       gmx_step_str(step, buf), dim2char(dim),
3812                       comm->cell_x1[dim] - comm->cell_x0[dim],
3813                       ddbox->skew_fac[dim],
3814                       dd->comm->cellsize_min[dim],
3815                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3816         }
3817     }
3818
3819     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3820     {
3821         /* Communicate the boundaries and update cell_ns_x0/1 */
3822         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3823         if (dd->bGridJump && dd->ndim > 1)
3824         {
3825             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3826         }
3827     }
3828 }
3829
3830 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3831 {
3832     if (YY < npbcdim)
3833     {
3834         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3835     }
3836     else
3837     {
3838         tcm[YY][XX] = 0;
3839     }
3840     if (ZZ < npbcdim)
3841     {
3842         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3843         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3844     }
3845     else
3846     {
3847         tcm[ZZ][XX] = 0;
3848         tcm[ZZ][YY] = 0;
3849     }
3850 }
3851
3852 static void check_screw_box(matrix box)
3853 {
3854     /* Mathematical limitation */
3855     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3856     {
3857         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3858     }
3859
3860     /* Limitation due to the asymmetry of the eighth shell method */
3861     if (box[ZZ][YY] != 0)
3862     {
3863         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3864     }
3865 }
3866
3867 static void distribute_cg(FILE *fplog, gmx_large_int_t step,
3868                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3869                           gmx_domdec_t *dd)
3870 {
3871     gmx_domdec_master_t *ma;
3872     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3873     int                  i, icg, j, k, k0, k1, d, npbcdim;
3874     matrix               tcm;
3875     rvec                 box_size, cg_cm;
3876     ivec                 ind;
3877     real                 nrcg, inv_ncg, pos_d;
3878     atom_id             *cgindex;
3879     gmx_bool             bUnbounded, bScrew;
3880
3881     ma = dd->ma;
3882
3883     if (tmp_ind == NULL)
3884     {
3885         snew(tmp_nalloc, dd->nnodes);
3886         snew(tmp_ind, dd->nnodes);
3887         for (i = 0; i < dd->nnodes; i++)
3888         {
3889             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3890             snew(tmp_ind[i], tmp_nalloc[i]);
3891         }
3892     }
3893
3894     /* Clear the count */
3895     for (i = 0; i < dd->nnodes; i++)
3896     {
3897         ma->ncg[i] = 0;
3898         ma->nat[i] = 0;
3899     }
3900
3901     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3902
3903     cgindex = cgs->index;
3904
3905     /* Compute the center of geometry for all charge groups */
3906     for (icg = 0; icg < cgs->nr; icg++)
3907     {
3908         k0      = cgindex[icg];
3909         k1      = cgindex[icg+1];
3910         nrcg    = k1 - k0;
3911         if (nrcg == 1)
3912         {
3913             copy_rvec(pos[k0], cg_cm);
3914         }
3915         else
3916         {
3917             inv_ncg = 1.0/nrcg;
3918
3919             clear_rvec(cg_cm);
3920             for (k = k0; (k < k1); k++)
3921             {
3922                 rvec_inc(cg_cm, pos[k]);
3923             }
3924             for (d = 0; (d < DIM); d++)
3925             {
3926                 cg_cm[d] *= inv_ncg;
3927             }
3928         }
3929         /* Put the charge group in the box and determine the cell index */
3930         for (d = DIM-1; d >= 0; d--)
3931         {
3932             pos_d = cg_cm[d];
3933             if (d < dd->npbcdim)
3934             {
3935                 bScrew = (dd->bScrewPBC && d == XX);
3936                 if (tric_dir[d] && dd->nc[d] > 1)
3937                 {
3938                     /* Use triclinic coordintates for this dimension */
3939                     for (j = d+1; j < DIM; j++)
3940                     {
3941                         pos_d += cg_cm[j]*tcm[j][d];
3942                     }
3943                 }
3944                 while (pos_d >= box[d][d])
3945                 {
3946                     pos_d -= box[d][d];
3947                     rvec_dec(cg_cm, box[d]);
3948                     if (bScrew)
3949                     {
3950                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3951                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3952                     }
3953                     for (k = k0; (k < k1); k++)
3954                     {
3955                         rvec_dec(pos[k], box[d]);
3956                         if (bScrew)
3957                         {
3958                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3959                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3960                         }
3961                     }
3962                 }
3963                 while (pos_d < 0)
3964                 {
3965                     pos_d += box[d][d];
3966                     rvec_inc(cg_cm, box[d]);
3967                     if (bScrew)
3968                     {
3969                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3970                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3971                     }
3972                     for (k = k0; (k < k1); k++)
3973                     {
3974                         rvec_inc(pos[k], box[d]);
3975                         if (bScrew)
3976                         {
3977                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3978                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3979                         }
3980                     }
3981                 }
3982             }
3983             /* This could be done more efficiently */
3984             ind[d] = 0;
3985             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3986             {
3987                 ind[d]++;
3988             }
3989         }
3990         i = dd_index(dd->nc, ind);
3991         if (ma->ncg[i] == tmp_nalloc[i])
3992         {
3993             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3994             srenew(tmp_ind[i], tmp_nalloc[i]);
3995         }
3996         tmp_ind[i][ma->ncg[i]] = icg;
3997         ma->ncg[i]++;
3998         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3999     }
4000
4001     k1 = 0;
4002     for (i = 0; i < dd->nnodes; i++)
4003     {
4004         ma->index[i] = k1;
4005         for (k = 0; k < ma->ncg[i]; k++)
4006         {
4007             ma->cg[k1++] = tmp_ind[i][k];
4008         }
4009     }
4010     ma->index[dd->nnodes] = k1;
4011
4012     for (i = 0; i < dd->nnodes; i++)
4013     {
4014         sfree(tmp_ind[i]);
4015     }
4016     sfree(tmp_ind);
4017     sfree(tmp_nalloc);
4018
4019     if (fplog)
4020     {
4021         char buf[22];
4022         fprintf(fplog, "Charge group distribution at step %s:",
4023                 gmx_step_str(step, buf));
4024         for (i = 0; i < dd->nnodes; i++)
4025         {
4026             fprintf(fplog, " %d", ma->ncg[i]);
4027         }
4028         fprintf(fplog, "\n");
4029     }
4030 }
4031
4032 static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd,
4033                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
4034                                 rvec pos[])
4035 {
4036     gmx_domdec_master_t *ma = NULL;
4037     ivec                 npulse;
4038     int                  i, cg_gl;
4039     int                 *ibuf, buf2[2] = { 0, 0 };
4040     gmx_bool             bMaster = DDMASTER(dd);
4041     if (bMaster)
4042     {
4043         ma = dd->ma;
4044
4045         if (dd->bScrewPBC)
4046         {
4047             check_screw_box(box);
4048         }
4049
4050         set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse);
4051
4052         distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
4053         for (i = 0; i < dd->nnodes; i++)
4054         {
4055             ma->ibuf[2*i]   = ma->ncg[i];
4056             ma->ibuf[2*i+1] = ma->nat[i];
4057         }
4058         ibuf = ma->ibuf;
4059     }
4060     else
4061     {
4062         ibuf = NULL;
4063     }
4064     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
4065
4066     dd->ncg_home = buf2[0];
4067     dd->nat_home = buf2[1];
4068     dd->ncg_tot  = dd->ncg_home;
4069     dd->nat_tot  = dd->nat_home;
4070     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4071     {
4072         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4073         srenew(dd->index_gl, dd->cg_nalloc);
4074         srenew(dd->cgindex, dd->cg_nalloc+1);
4075     }
4076     if (bMaster)
4077     {
4078         for (i = 0; i < dd->nnodes; i++)
4079         {
4080             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
4081             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4082         }
4083     }
4084
4085     dd_scatterv(dd,
4086                 DDMASTER(dd) ? ma->ibuf : NULL,
4087                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
4088                 DDMASTER(dd) ? ma->cg : NULL,
4089                 dd->ncg_home*sizeof(int), dd->index_gl);
4090
4091     /* Determine the home charge group sizes */
4092     dd->cgindex[0] = 0;
4093     for (i = 0; i < dd->ncg_home; i++)
4094     {
4095         cg_gl            = dd->index_gl[i];
4096         dd->cgindex[i+1] =
4097             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4098     }
4099
4100     if (debug)
4101     {
4102         fprintf(debug, "Home charge groups:\n");
4103         for (i = 0; i < dd->ncg_home; i++)
4104         {
4105             fprintf(debug, " %d", dd->index_gl[i]);
4106             if (i % 10 == 9)
4107             {
4108                 fprintf(debug, "\n");
4109             }
4110         }
4111         fprintf(debug, "\n");
4112     }
4113 }
4114
4115 static int compact_and_copy_vec_at(int ncg, int *move,
4116                                    int *cgindex,
4117                                    int nvec, int vec,
4118                                    rvec *src, gmx_domdec_comm_t *comm,
4119                                    gmx_bool bCompact)
4120 {
4121     int m, icg, i, i0, i1, nrcg;
4122     int home_pos;
4123     int pos_vec[DIM*2];
4124
4125     home_pos = 0;
4126
4127     for (m = 0; m < DIM*2; m++)
4128     {
4129         pos_vec[m] = 0;
4130     }
4131
4132     i0 = 0;
4133     for (icg = 0; icg < ncg; icg++)
4134     {
4135         i1 = cgindex[icg+1];
4136         m  = move[icg];
4137         if (m == -1)
4138         {
4139             if (bCompact)
4140             {
4141                 /* Compact the home array in place */
4142                 for (i = i0; i < i1; i++)
4143                 {
4144                     copy_rvec(src[i], src[home_pos++]);
4145                 }
4146             }
4147         }
4148         else
4149         {
4150             /* Copy to the communication buffer */
4151             nrcg        = i1 - i0;
4152             pos_vec[m] += 1 + vec*nrcg;
4153             for (i = i0; i < i1; i++)
4154             {
4155                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
4156             }
4157             pos_vec[m] += (nvec - vec - 1)*nrcg;
4158         }
4159         if (!bCompact)
4160         {
4161             home_pos += i1 - i0;
4162         }
4163         i0 = i1;
4164     }
4165
4166     return home_pos;
4167 }
4168
4169 static int compact_and_copy_vec_cg(int ncg, int *move,
4170                                    int *cgindex,
4171                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
4172                                    gmx_bool bCompact)
4173 {
4174     int m, icg, i0, i1, nrcg;
4175     int home_pos;
4176     int pos_vec[DIM*2];
4177
4178     home_pos = 0;
4179
4180     for (m = 0; m < DIM*2; m++)
4181     {
4182         pos_vec[m] = 0;
4183     }
4184
4185     i0 = 0;
4186     for (icg = 0; icg < ncg; icg++)
4187     {
4188         i1 = cgindex[icg+1];
4189         m  = move[icg];
4190         if (m == -1)
4191         {
4192             if (bCompact)
4193             {
4194                 /* Compact the home array in place */
4195                 copy_rvec(src[icg], src[home_pos++]);
4196             }
4197         }
4198         else
4199         {
4200             nrcg = i1 - i0;
4201             /* Copy to the communication buffer */
4202             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
4203             pos_vec[m] += 1 + nrcg*nvec;
4204         }
4205         i0 = i1;
4206     }
4207     if (!bCompact)
4208     {
4209         home_pos = ncg;
4210     }
4211
4212     return home_pos;
4213 }
4214
4215 static int compact_ind(int ncg, int *move,
4216                        int *index_gl, int *cgindex,
4217                        int *gatindex,
4218                        gmx_ga2la_t ga2la, char *bLocalCG,
4219                        int *cginfo)
4220 {
4221     int cg, nat, a0, a1, a, a_gl;
4222     int home_pos;
4223
4224     home_pos = 0;
4225     nat      = 0;
4226     for (cg = 0; cg < ncg; cg++)
4227     {
4228         a0 = cgindex[cg];
4229         a1 = cgindex[cg+1];
4230         if (move[cg] == -1)
4231         {
4232             /* Compact the home arrays in place.
4233              * Anything that can be done here avoids access to global arrays.
4234              */
4235             cgindex[home_pos] = nat;
4236             for (a = a0; a < a1; a++)
4237             {
4238                 a_gl          = gatindex[a];
4239                 gatindex[nat] = a_gl;
4240                 /* The cell number stays 0, so we don't need to set it */
4241                 ga2la_change_la(ga2la, a_gl, nat);
4242                 nat++;
4243             }
4244             index_gl[home_pos] = index_gl[cg];
4245             cginfo[home_pos]   = cginfo[cg];
4246             /* The charge group remains local, so bLocalCG does not change */
4247             home_pos++;
4248         }
4249         else
4250         {
4251             /* Clear the global indices */
4252             for (a = a0; a < a1; a++)
4253             {
4254                 ga2la_del(ga2la, gatindex[a]);
4255             }
4256             if (bLocalCG)
4257             {
4258                 bLocalCG[index_gl[cg]] = FALSE;
4259             }
4260         }
4261     }
4262     cgindex[home_pos] = nat;
4263
4264     return home_pos;
4265 }
4266
4267 static void clear_and_mark_ind(int ncg, int *move,
4268                                int *index_gl, int *cgindex, int *gatindex,
4269                                gmx_ga2la_t ga2la, char *bLocalCG,
4270                                int *cell_index)
4271 {
4272     int cg, a0, a1, a;
4273
4274     for (cg = 0; cg < ncg; cg++)
4275     {
4276         if (move[cg] >= 0)
4277         {
4278             a0 = cgindex[cg];
4279             a1 = cgindex[cg+1];
4280             /* Clear the global indices */
4281             for (a = a0; a < a1; a++)
4282             {
4283                 ga2la_del(ga2la, gatindex[a]);
4284             }
4285             if (bLocalCG)
4286             {
4287                 bLocalCG[index_gl[cg]] = FALSE;
4288             }
4289             /* Signal that this cg has moved using the ns cell index.
4290              * Here we set it to -1. fill_grid will change it
4291              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4292              */
4293             cell_index[cg] = -1;
4294         }
4295     }
4296 }
4297
4298 static void print_cg_move(FILE *fplog,
4299                           gmx_domdec_t *dd,
4300                           gmx_large_int_t step, int cg, int dim, int dir,
4301                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4302                           rvec cm_old, rvec cm_new, real pos_d)
4303 {
4304     gmx_domdec_comm_t *comm;
4305     char               buf[22];
4306
4307     comm = dd->comm;
4308
4309     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4310     if (bHaveLimitdAndCMOld)
4311     {
4312         fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4313                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4314     }
4315     else
4316     {
4317         fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4318                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4319     }
4320     fprintf(fplog, "distance out of cell %f\n",
4321             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4322     if (bHaveLimitdAndCMOld)
4323     {
4324         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4325                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4326     }
4327     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4328             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4329     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4330             dim2char(dim),
4331             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4332     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4333             dim2char(dim),
4334             comm->cell_x0[dim], comm->cell_x1[dim]);
4335 }
4336
4337 static void cg_move_error(FILE *fplog,
4338                           gmx_domdec_t *dd,
4339                           gmx_large_int_t step, int cg, int dim, int dir,
4340                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4341                           rvec cm_old, rvec cm_new, real pos_d)
4342 {
4343     if (fplog)
4344     {
4345         print_cg_move(fplog, dd, step, cg, dim, dir,
4346                       bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4347     }
4348     print_cg_move(stderr, dd, step, cg, dim, dir,
4349                   bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4350     gmx_fatal(FARGS,
4351               "A charge group moved too far between two domain decomposition steps\n"
4352               "This usually means that your system is not well equilibrated");
4353 }
4354
4355 static void rotate_state_atom(t_state *state, int a)
4356 {
4357     int est;
4358
4359     for (est = 0; est < estNR; est++)
4360     {
4361         if (EST_DISTR(est) && (state->flags & (1<<est)))
4362         {
4363             switch (est)
4364             {
4365                 case estX:
4366                     /* Rotate the complete state; for a rectangular box only */
4367                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4368                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4369                     break;
4370                 case estV:
4371                     state->v[a][YY] = -state->v[a][YY];
4372                     state->v[a][ZZ] = -state->v[a][ZZ];
4373                     break;
4374                 case estSDX:
4375                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4376                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4377                     break;
4378                 case estCGP:
4379                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4380                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4381                     break;
4382                 case estDISRE_INITF:
4383                 case estDISRE_RM3TAV:
4384                 case estORIRE_INITF:
4385                 case estORIRE_DTAV:
4386                     /* These are distances, so not affected by rotation */
4387                     break;
4388                 default:
4389                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4390             }
4391         }
4392     }
4393 }
4394
4395 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4396 {
4397     if (natoms > comm->moved_nalloc)
4398     {
4399         /* Contents should be preserved here */
4400         comm->moved_nalloc = over_alloc_dd(natoms);
4401         srenew(comm->moved, comm->moved_nalloc);
4402     }
4403
4404     return comm->moved;
4405 }
4406
4407 static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
4408                          gmx_domdec_t *dd,
4409                          t_state *state,
4410                          ivec tric_dir, matrix tcm,
4411                          rvec cell_x0, rvec cell_x1,
4412                          rvec limitd, rvec limit0, rvec limit1,
4413                          const int *cgindex,
4414                          int cg_start, int cg_end,
4415                          rvec *cg_cm,
4416                          int *move)
4417 {
4418     int      npbcdim;
4419     int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4420     int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4421     int      flag;
4422     gmx_bool bScrew;
4423     ivec     dev;
4424     real     inv_ncg, pos_d;
4425     rvec     cm_new;
4426
4427     npbcdim = dd->npbcdim;
4428
4429     for (cg = cg_start; cg < cg_end; cg++)
4430     {
4431         k0   = cgindex[cg];
4432         k1   = cgindex[cg+1];
4433         nrcg = k1 - k0;
4434         if (nrcg == 1)
4435         {
4436             copy_rvec(state->x[k0], cm_new);
4437         }
4438         else
4439         {
4440             inv_ncg = 1.0/nrcg;
4441
4442             clear_rvec(cm_new);
4443             for (k = k0; (k < k1); k++)
4444             {
4445                 rvec_inc(cm_new, state->x[k]);
4446             }
4447             for (d = 0; (d < DIM); d++)
4448             {
4449                 cm_new[d] = inv_ncg*cm_new[d];
4450             }
4451         }
4452
4453         clear_ivec(dev);
4454         /* Do pbc and check DD cell boundary crossings */
4455         for (d = DIM-1; d >= 0; d--)
4456         {
4457             if (dd->nc[d] > 1)
4458             {
4459                 bScrew = (dd->bScrewPBC && d == XX);
4460                 /* Determine the location of this cg in lattice coordinates */
4461                 pos_d = cm_new[d];
4462                 if (tric_dir[d])
4463                 {
4464                     for (d2 = d+1; d2 < DIM; d2++)
4465                     {
4466                         pos_d += cm_new[d2]*tcm[d2][d];
4467                     }
4468                 }
4469                 /* Put the charge group in the triclinic unit-cell */
4470                 if (pos_d >= cell_x1[d])
4471                 {
4472                     if (pos_d >= limit1[d])
4473                     {
4474                         cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
4475                                       cg_cm[cg], cm_new, pos_d);
4476                     }
4477                     dev[d] = 1;
4478                     if (dd->ci[d] == dd->nc[d] - 1)
4479                     {
4480                         rvec_dec(cm_new, state->box[d]);
4481                         if (bScrew)
4482                         {
4483                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4484                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4485                         }
4486                         for (k = k0; (k < k1); k++)
4487                         {
4488                             rvec_dec(state->x[k], state->box[d]);
4489                             if (bScrew)
4490                             {
4491                                 rotate_state_atom(state, k);
4492                             }
4493                         }
4494                     }
4495                 }
4496                 else if (pos_d < cell_x0[d])
4497                 {
4498                     if (pos_d < limit0[d])
4499                     {
4500                         cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
4501                                       cg_cm[cg], cm_new, pos_d);
4502                     }
4503                     dev[d] = -1;
4504                     if (dd->ci[d] == 0)
4505                     {
4506                         rvec_inc(cm_new, state->box[d]);
4507                         if (bScrew)
4508                         {
4509                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4510                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4511                         }
4512                         for (k = k0; (k < k1); k++)
4513                         {
4514                             rvec_inc(state->x[k], state->box[d]);
4515                             if (bScrew)
4516                             {
4517                                 rotate_state_atom(state, k);
4518                             }
4519                         }
4520                     }
4521                 }
4522             }
4523             else if (d < npbcdim)
4524             {
4525                 /* Put the charge group in the rectangular unit-cell */
4526                 while (cm_new[d] >= state->box[d][d])
4527                 {
4528                     rvec_dec(cm_new, state->box[d]);
4529                     for (k = k0; (k < k1); k++)
4530                     {
4531                         rvec_dec(state->x[k], state->box[d]);
4532                     }
4533                 }
4534                 while (cm_new[d] < 0)
4535                 {
4536                     rvec_inc(cm_new, state->box[d]);
4537                     for (k = k0; (k < k1); k++)
4538                     {
4539                         rvec_inc(state->x[k], state->box[d]);
4540                     }
4541                 }
4542             }
4543         }
4544
4545         copy_rvec(cm_new, cg_cm[cg]);
4546
4547         /* Determine where this cg should go */
4548         flag = 0;
4549         mc   = -1;
4550         for (d = 0; d < dd->ndim; d++)
4551         {
4552             dim = dd->dim[d];
4553             if (dev[dim] == 1)
4554             {
4555                 flag |= DD_FLAG_FW(d);
4556                 if (mc == -1)
4557                 {
4558                     mc = d*2;
4559                 }
4560             }
4561             else if (dev[dim] == -1)
4562             {
4563                 flag |= DD_FLAG_BW(d);
4564                 if (mc == -1)
4565                 {
4566                     if (dd->nc[dim] > 2)
4567                     {
4568                         mc = d*2 + 1;
4569                     }
4570                     else
4571                     {
4572                         mc = d*2;
4573                     }
4574                 }
4575             }
4576         }
4577         /* Temporarily store the flag in move */
4578         move[cg] = mc + flag;
4579     }
4580 }
4581
/* Redistribute the home charge groups over the domain decomposition cells.
 *
 * Outline of what the body does:
 *  1. calc_cg_move() is run over all home charge groups (split over the
 *     domdec OpenMP threads) and stores in move[] a value that is >= 0
 *     for every charge group that has to leave this cell.
 *  2. For the leaving charge groups the global index, size and direction
 *     flags are stored in comm->cggl_flag[] and the per-cg/per-atom rvec
 *     data is copied into comm->cgcm_state[] by the compact_and_copy_*
 *     helpers.
 *  3. The local index arrays are updated by compact_ind() (bCompact) or
 *     clear_and_mark_ind() (!bCompact).
 *  4. Per decomposition dimension the buffers are exchanged with the
 *     neighbor(s); each received charge group is either appended to the
 *     home arrays or copied into the send buffer of a later dimension.
 *
 * fplog, step     passed on to calc_cg_move() and cg_move_error()
 * dd              the domain decomposition data; ncg_home, nat_home,
 *                 index_gl and cgindex are updated
 * tric_dir        per dimension, whether the tcm correction is applied
 *                 when re-determining the cell of a received charge group
 * state           x and, when flagged in state->flags, v, sd_X and cg_p
 *                 are redistributed
 * f               passed to dd_check_alloc_ncg() and dd_realloc_state()
 * fr              supplies cutoff_scheme, cg_cm, cginfo, cginfo_mb and
 *                 (group scheme, !bCompact) ns.grid->cell_index
 * md              not referenced in this function
 * bCompact        selects compact_ind() instead of clear_and_mark_ind()
 * nrnb            eNR_CGCM and eNR_RESETX counters are incremented
 * ncg_stay_home   out: value of home_pos_cg before any charge group is
 *                 received
 * ncg_moved       out: number of home charge groups put in the send
 *                 buffers before communication
 *
 * NOTE(review): the locals k0, k1, d4, cell_d, bScrew, dev, inv_ncg and
 * cm_new do not appear to be used in this function - confirm and clean up.
 */
static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step,
                               gmx_domdec_t *dd, ivec tric_dir,
                               t_state *state, rvec **f,
                               t_forcerec *fr, t_mdatoms *md,
                               gmx_bool bCompact,
                               t_nrnb *nrnb,
                               int *ncg_stay_home,
                               int *ncg_moved)
{
    int               *move;
    int                npbcdim;
    int                ncg[DIM*2], nat[DIM*2];
    int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
    int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
    int                sbuf[2], rbuf[2];
    int                home_pos_cg, home_pos_at, buf_pos;
    int                flag;
    gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
    gmx_bool           bScrew;
    ivec               dev;
    real               inv_ncg, pos_d;
    matrix             tcm;
    rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
    atom_id           *cgindex;
    cginfo_mb_t       *cginfo_mb;
    gmx_domdec_comm_t *comm;
    int               *moved;
    int                nthread, thread;

    if (dd->bScrewPBC)
    {
        check_screw_box(state->box);
    }

    comm  = dd->comm;
    /* cg_cm stays NULL unless the group cut-off scheme is used */
    if (fr->cutoff_scheme == ecutsGROUP)
    {
        cg_cm = fr->cg_cm;
    }

    /* Determine which of the optional per-atom state vectors are present
     * in state->flags; those are communicated along with x.
     */
    for (i = 0; i < estNR; i++)
    {
        if (EST_DISTR(i))
        {
            switch (i)
            {
                case estX: /* Always present */ break;
                case estV:   bV   = (state->flags & (1<<i)); break;
                case estSDX: bSDX = (state->flags & (1<<i)); break;
                case estCGP: bCGP = (state->flags & (1<<i)); break;
                case estLD_RNG:
                case estLD_RNGI:
                case estDISRE_INITF:
                case estDISRE_RM3TAV:
                case estORIRE_INITF:
                case estORIRE_DTAV:
                    /* No processing required */
                    break;
                default:
                    gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
            }
        }
    }

    /* move[] shares its storage with comm->buf_int, which is reused
     * further down as the integer receive buffer.
     */
    if (dd->ncg_tot > comm->nalloc_int)
    {
        comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
        srenew(comm->buf_int, comm->nalloc_int);
    }
    move = comm->buf_int;

    /* Clear the count */
    for (c = 0; c < dd->ndim*2; c++)
    {
        ncg[c] = 0;
        nat[c] = 0;
    }

    npbcdim = dd->npbcdim;

    /* Set up the cell boundaries and the move limits per dimension.
     * For dimensions without pbc the outer side of the first and last
     * cell is set to -/+GMX_FLOAT_MAX.
     */
    for (d = 0; (d < DIM); d++)
    {
        limitd[d] = dd->comm->cellsize_min[d];
        if (d >= npbcdim && dd->ci[d] == 0)
        {
            cell_x0[d] = -GMX_FLOAT_MAX;
        }
        else
        {
            cell_x0[d] = comm->cell_x0[d];
        }
        if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
        {
            cell_x1[d] = GMX_FLOAT_MAX;
        }
        else
        {
            cell_x1[d] = comm->cell_x1[d];
        }
        if (d < npbcdim)
        {
            limit0[d] = comm->old_cell_x0[d] - limitd[d];
            limit1[d] = comm->old_cell_x1[d] + limitd[d];
        }
        else
        {
            /* We check after communication if a charge group moved
             * more than one cell. Set the pre-comm check limit to float_max.
             */
            limit0[d] = -GMX_FLOAT_MAX;
            limit1[d] =  GMX_FLOAT_MAX;
        }
    }

    make_tric_corr_matrix(npbcdim, state->box, tcm);

    cgindex = dd->cgindex;

    nthread = gmx_omp_nthreads_get(emntDomdec);

    /* Compute the center of geometry for all home charge groups
     * and put them in the box and determine where they should go.
     */
    /* Each thread handles a contiguous range of the home charge groups */
#pragma omp parallel for num_threads(nthread) schedule(static)
    for (thread = 0; thread < nthread; thread++)
    {
        calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
                     cell_x0, cell_x1, limitd, limit0, limit1,
                     cgindex,
                     ( thread   *dd->ncg_home)/nthread,
                     ((thread+1)*dd->ncg_home)/nthread,
                     fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
                     move);
    }

    /* Split the value stored in move[] into the send buffer index and
     * the direction flags and register every leaving charge group
     * in the integer send buffer of its destination.
     */
    for (cg = 0; cg < dd->ncg_home; cg++)
    {
        if (move[cg] >= 0)
        {
            mc       = move[cg];
            flag     = mc & ~DD_FLAG_NRCG;
            mc       = mc & DD_FLAG_NRCG;
            move[cg] = mc;

            if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
            {
                comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
                srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
            }
            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
            /* We store the cg size in the lower 16 bits
             * and the place where the charge group should go
             * in the next 6 bits. This saves some communication volume.
             */
            nrcg = cgindex[cg+1] - cgindex[cg];
            comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
            ncg[mc] += 1;
            nat[mc] += nrcg;
        }
    }

    inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
    inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);

    /* Total number of home charge groups that leave this cell */
    *ncg_moved = 0;
    for (i = 0; i < dd->ndim*2; i++)
    {
        *ncg_moved += ncg[i];
    }

    /* Number of rvec entries communicated per atom: x plus the optional
     * v, sd_X and cg_p.
     */
    nvec = 1;
    if (bV)
    {
        nvec++;
    }
    if (bSDX)
    {
        nvec++;
    }
    if (bCGP)
    {
        nvec++;
    }

    /* Make sure the communication buffers are large enough */
    for (mc = 0; mc < dd->ndim*2; mc++)
    {
        /* One rvec per charge group plus nvec rvecs per atom */
        nvr = ncg[mc] + nat[mc]*nvec;
        if (nvr > comm->cgcm_state_nalloc[mc])
        {
            comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
            srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
        }
    }

    switch (fr->cutoff_scheme)
    {
        case ecutsGROUP:
            /* Recalculating cg_cm might be cheaper than communicating,
             * but that could give rise to rounding issues.
             */
            home_pos_cg =
                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
                                        nvec, cg_cm, comm, bCompact);
            break;
        case ecutsVERLET:
            /* Without charge groups we send the moved atom coordinates
             * over twice. This is so the code below can be used without
             * many conditionals for both for with and without charge groups.
             */
            home_pos_cg =
                compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
                                        nvec, state->x, comm, FALSE);
            /* The call above was made without compaction; with bCompact
             * the moved charge groups are subtracted from the count here.
             */
            if (bCompact)
            {
                home_pos_cg -= *ncg_moved;
            }
            break;
        default:
            gmx_incons("unimplemented");
            home_pos_cg = 0;
    }

    /* Copy the per-atom vectors; vec is the index of the vector within
     * the nvec vectors stored per charge group in the send buffer.
     */
    vec         = 0;
    home_pos_at =
        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
                                nvec, vec++, state->x, comm, bCompact);
    if (bV)
    {
        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
                                nvec, vec++, state->v, comm, bCompact);
    }
    if (bSDX)
    {
        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
                                nvec, vec++, state->sd_X, comm, bCompact);
    }
    if (bCGP)
    {
        compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
                                nvec, vec++, state->cg_p, comm, bCompact);
    }

    if (bCompact)
    {
        compact_ind(dd->ncg_home, move,
                    dd->index_gl, dd->cgindex, dd->gatindex,
                    dd->ga2la, comm->bLocalCG,
                    fr->cginfo);
    }
    else
    {
        /* Select the array passed as 'moved' to clear_and_mark_ind():
         * a zeroed array from get_moved() with the Verlet scheme,
         * the ns grid cell index with the group scheme.
         */
        if (fr->cutoff_scheme == ecutsVERLET)
        {
            moved = get_moved(comm, dd->ncg_home);

            for (k = 0; k < dd->ncg_home; k++)
            {
                moved[k] = 0;
            }
        }
        else
        {
            moved = fr->ns.grid->cell_index;
        }

        clear_and_mark_ind(dd->ncg_home, move,
                           dd->index_gl, dd->cgindex, dd->gatindex,
                           dd->ga2la, comm->bLocalCG,
                           moved);
    }

    cginfo_mb = fr->cginfo_mb;

    /* From here on move[] is no longer read; comm->buf_int is used
     * as the receive buffer for the charge group indices and flags.
     */
    *ncg_stay_home = home_pos_cg;
    for (d = 0; d < dd->ndim; d++)
    {
        dim      = dd->dim[d];
        ncg_recv = 0;
        nat_recv = 0;
        nvr      = 0;
        /* With exactly two cells along dim only dir 0 is communicated */
        for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
        {
            cdd = d*2 + dir;
            /* Communicate the cg and atom counts */
            sbuf[0] = ncg[cdd];
            sbuf[1] = nat[cdd];
            if (debug)
            {
                fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
                        d, dir, sbuf[0], sbuf[1]);
            }
            dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);

            if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
            {
                comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
                srenew(comm->buf_int, comm->nalloc_int);
            }

            /* Communicate the charge group indices, sizes and flags */
            dd_sendrecv_int(dd, d, dir,
                            comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
                            comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);

            /* nvs: number of rvecs to send, i: number of rvecs to receive */
            nvs = ncg[cdd] + nat[cdd]*nvec;
            i   = rbuf[0]  + rbuf[1] *nvec;
            vec_rvec_check_alloc(&comm->vbuf, nvr+i);

            /* Communicate cgcm and state */
            dd_sendrecv_rvec(dd, d, dir,
                             comm->cgcm_state[cdd], nvs,
                             comm->vbuf.v+nvr, i);
            ncg_recv += rbuf[0];
            nat_recv += rbuf[1];
            nvr      += i;
        }

        /* Process the received charge groups */
        /* buf_pos is the position in comm->vbuf.v of the current charge
         * group; each charge group occupies 1 + nrcg*nvec rvecs.
         */
        buf_pos = 0;
        for (cg = 0; cg < ncg_recv; cg++)
        {
            flag = comm->buf_int[cg*DD_CGIBS+1];

            if (dim >= npbcdim && dd->nc[dim] > 2)
            {
                /* No pbc in this dim and more than one domain boundary.
                 * We do a separate check if a charge group didn't move too far.
                 */
                if (((flag & DD_FLAG_FW(d)) &&
                     comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
                    ((flag & DD_FLAG_BW(d)) &&
                     comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
                {
                    cg_move_error(fplog, dd, step, cg, dim,
                                  (flag & DD_FLAG_FW(d)) ? 1 : 0,
                                  FALSE, 0,
                                  comm->vbuf.v[buf_pos],
                                  comm->vbuf.v[buf_pos],
                                  comm->vbuf.v[buf_pos][dim]);
                }
            }

            /* mc stays -1 when this charge group does not need to be
             * forwarded along a later decomposition dimension.
             */
            mc = -1;
            if (d < dd->ndim-1)
            {
                /* Check which direction this cg should go */
                for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
                {
                    if (dd->bGridJump)
                    {
                        /* The cell boundaries for dimension d2 are not equal
                         * for each cell row of the lower dimension(s),
                         * therefore we might need to redetermine where
                         * this cg should go.
                         */
                        dim2 = dd->dim[d2];
                        /* If this cg crosses the box boundary in dimension d2
                         * we can use the communicated flag, so we do not
                         * have to worry about pbc.
                         */
                        if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
                               (flag & DD_FLAG_FW(d2))) ||
                              (dd->ci[dim2] == 0 &&
                               (flag & DD_FLAG_BW(d2)))))
                        {
                            /* Clear the two flags for this dimension */
                            flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
                            /* Determine the location of this cg
                             * in lattice coordinates
                             */
                            pos_d = comm->vbuf.v[buf_pos][dim2];
                            if (tric_dir[dim2])
                            {
                                for (d3 = dim2+1; d3 < DIM; d3++)
                                {
                                    pos_d +=
                                        comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
                                }
                            }
                            /* Check if we are not at the box edge.
                             * pbc is only handled in the first step above,
                             * but this check could move over pbc while
                             * the first step did not due to different rounding.
                             */
                            if (pos_d >= cell_x1[dim2] &&
                                dd->ci[dim2] != dd->nc[dim2]-1)
                            {
                                flag |= DD_FLAG_FW(d2);
                            }
                            else if (pos_d < cell_x0[dim2] &&
                                     dd->ci[dim2] != 0)
                            {
                                flag |= DD_FLAG_BW(d2);
                            }
                            /* Store the updated flag so it is forwarded */
                            comm->buf_int[cg*DD_CGIBS+1] = flag;
                        }
                    }
                    /* Set to which neighboring cell this cg should go */
                    if (flag & DD_FLAG_FW(d2))
                    {
                        mc = d2*2;
                    }
                    else if (flag & DD_FLAG_BW(d2))
                    {
                        /* With at most two cells along this dimension
                         * the forward buffer is used for backward moves too.
                         */
                        if (dd->nc[dd->dim[d2]] > 2)
                        {
                            mc = d2*2+1;
                        }
                        else
                        {
                            mc = d2*2;
                        }
                    }
                }
            }

            nrcg = flag & DD_FLAG_NRCG;
            if (mc == -1)
            {
                /* This charge group becomes a home charge group:
                 * append it to the home arrays.
                 */
                if (home_pos_cg+1 > dd->cg_nalloc)
                {
                    dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
                    srenew(dd->index_gl, dd->cg_nalloc);
                    srenew(dd->cgindex, dd->cg_nalloc+1);
                }
                /* Set the global charge group index and size */
                dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
                dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
                /* Copy the state from the buffer */
                dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
                if (fr->cutoff_scheme == ecutsGROUP)
                {
                    /* Re-read fr->cg_cm after the allocation check above */
                    cg_cm = fr->cg_cm;
                    copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
                }
                /* Skip the per-charge-group rvec, also when it was not copied */
                buf_pos++;

                /* Set the cginfo */
                fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
                                                   dd->index_gl[home_pos_cg]);
                if (comm->bLocalCG)
                {
                    comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
                }

                if (home_pos_at+nrcg > state->nalloc)
                {
                    dd_realloc_state(state, f, home_pos_at+nrcg);
                }
                /* The buffer holds, per charge group, all x first,
                 * followed by all v, all sd_X and all cg_p when present.
                 */
                for (i = 0; i < nrcg; i++)
                {
                    copy_rvec(comm->vbuf.v[buf_pos++],
                              state->x[home_pos_at+i]);
                }
                if (bV)
                {
                    for (i = 0; i < nrcg; i++)
                    {
                        copy_rvec(comm->vbuf.v[buf_pos++],
                                  state->v[home_pos_at+i]);
                    }
                }
                if (bSDX)
                {
                    for (i = 0; i < nrcg; i++)
                    {
                        copy_rvec(comm->vbuf.v[buf_pos++],
                                  state->sd_X[home_pos_at+i]);
                    }
                }
                if (bCGP)
                {
                    for (i = 0; i < nrcg; i++)
                    {
                        copy_rvec(comm->vbuf.v[buf_pos++],
                                  state->cg_p[home_pos_at+i]);
                    }
                }
                home_pos_cg += 1;
                home_pos_at += nrcg;
            }
            else
            {
                /* This charge group has to travel on: append it to the
                 * send buffers of send index mc.
                 */
                /* Reallocate the buffers if necessary  */
                if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
                {
                    comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
                    srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
                }
                /* nvr is reused here as the current fill level of the
                 * rvec send buffer mc.
                 */
                nvr = ncg[mc] + nat[mc]*nvec;
                if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
                {
                    comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
                    srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
                }
                /* Copy from the receive to the send buffers */
                memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
                       comm->buf_int + cg*DD_CGIBS,
                       DD_CGIBS*sizeof(int));
                memcpy(comm->cgcm_state[mc][nvr],
                       comm->vbuf.v[buf_pos],
                       (1+nrcg*nvec)*sizeof(rvec));
                buf_pos += 1 + nrcg*nvec;
                ncg[mc] += 1;
                nat[mc] += nrcg;
            }
        }
    }

    /* With sorting (!bCompact) the indices are now only partially up to date
     * and ncg_home and nat_home are not the real count, since there are
     * "holes" in the arrays for the charge groups that moved to neighbors.
     */
    if (fr->cutoff_scheme == ecutsVERLET)
    {
        moved = get_moved(comm, home_pos_cg);

        /* Clear the entries of the charge groups that were appended above */
        for (i = dd->ncg_home; i < home_pos_cg; i++)
        {
            moved[i] = 0;
        }
    }
    dd->ncg_home = home_pos_cg;
    dd->nat_home = home_pos_at;

    if (debug)
    {
        fprintf(debug,
                "Finished repartitioning: cgs moved out %d, new home %d\n",
                *ncg_moved, dd->ncg_home-*ncg_moved);

    }
}
5116
5117 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
5118 {
5119     dd->comm->cycl[ddCycl] += cycles;
5120     dd->comm->cycl_n[ddCycl]++;
5121     if (cycles > dd->comm->cycl_max[ddCycl])
5122     {
5123         dd->comm->cycl_max[ddCycl] = cycles;
5124     }
5125 }
5126
5127 static double force_flop_count(t_nrnb *nrnb)
5128 {
5129     int         i;
5130     double      sum;
5131     const char *name;
5132
5133     sum = 0;
5134     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
5135     {
5136         /* To get closer to the real timings, we half the count
5137          * for the normal loops and again half it for water loops.
5138          */
5139         name = nrnb_str(i);
5140         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5141         {
5142             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5143         }
5144         else
5145         {
5146             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5147         }
5148     }
5149     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
5150     {
5151         name = nrnb_str(i);
5152         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5153         {
5154             sum += nrnb->n[i]*cost_nrnb(i);
5155         }
5156     }
5157     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
5158     {
5159         sum += nrnb->n[i]*cost_nrnb(i);
5160     }
5161
5162     return sum;
5163 }
5164
5165 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
5166 {
5167     if (dd->comm->eFlop)
5168     {
5169         dd->comm->flop -= force_flop_count(nrnb);
5170     }
5171 }
5172 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
5173 {
5174     if (dd->comm->eFlop)
5175     {
5176         dd->comm->flop += force_flop_count(nrnb);
5177         dd->comm->flop_n++;
5178     }
5179 }
5180
5181 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5182 {
5183     int i;
5184
5185     for (i = 0; i < ddCyclNr; i++)
5186     {
5187         dd->comm->cycl[i]     = 0;
5188         dd->comm->cycl_n[i]   = 0;
5189         dd->comm->cycl_max[i] = 0;
5190     }
5191     dd->comm->flop   = 0;
5192     dd->comm->flop_n = 0;
5193 }
5194
/* Collect the load measurements of all cells, one decomposition
 * dimension at a time, starting at the last dimension.
 *
 * For each dimension d the participating ranks pack a small array of
 * floats (sbuf) that is gathered on the root of the row; the root then
 * reduces the rows into comm->load[d], which in turn is the input that
 * is packed for dimension d-1. Finally, on the DD master, the totals are
 * accumulated into the comm->load_* running sums.
 *
 * Layout of sbuf (the root unpacks in exactly the same order):
 *   sum, max,
 *   [bGridJump: sum_m, cvol_min, {flags, only for d < ndim-1},
 *               {cell_f_max0, cell_f_min1, only for d > 0}],
 *   [bSepPME:   mdf, pme]
 *
 * The whole function is timed under the ewcDDCOMMLOAD wallcycle counter.
 *
 * NOTE(review): root is only assigned when comm->bDynLoadBal is set, but
 * it is dereferenced whenever dd->bGridJump is set; presumably bGridJump
 * implies bDynLoadBal - confirm.
 */
static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
{
    gmx_domdec_comm_t *comm;
    gmx_domdec_load_t *load;
    gmx_domdec_root_t *root = NULL;
    int                d, dim, cid, i, pos;
    float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
    gmx_bool           bSepPME;

    if (debug)
    {
        fprintf(debug, "get_load_distribution start\n");
    }

    wallcycle_start(wcycle, ewcDDCOMMLOAD);

    comm = dd->comm;

    /* A non-negative pme_nodeid is taken to mean separate PME ranks */
    bSepPME = (dd->pme_nodeid >= 0);

    for (d = dd->ndim-1; d >= 0; d--)
    {
        dim = dd->dim[d];
        /* Check if we participate in the communication in this dimension */
        if (d == dd->ndim-1 ||
            (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
        {
            load = &comm->load[d];
            if (dd->bGridJump)
            {
                cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
            }
            pos = 0;
            if (d == dd->ndim-1)
            {
                /* Last dimension: pack the load of this cell itself;
                 * sum, max and sum_m all start out as the same value.
                 */
                sbuf[pos++] = dd_force_load(comm);
                sbuf[pos++] = sbuf[0];
                if (dd->bGridJump)
                {
                    sbuf[pos++] = sbuf[0];
                    sbuf[pos++] = cell_frac;
                    if (d > 0)
                    {
                        sbuf[pos++] = comm->cell_f_max0[d];
                        sbuf[pos++] = comm->cell_f_min1[d];
                    }
                }
                if (bSepPME)
                {
                    sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
                    sbuf[pos++] = comm->cycl[ddCyclPME];
                }
            }
            else
            {
                /* Other dimensions: pack the reduced result of dimension d+1 */
                sbuf[pos++] = comm->load[d+1].sum;
                sbuf[pos++] = comm->load[d+1].max;
                if (dd->bGridJump)
                {
                    sbuf[pos++] = comm->load[d+1].sum_m;
                    sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
                    /* The integer flags travel as a float in the buffer */
                    sbuf[pos++] = comm->load[d+1].flags;
                    if (d > 0)
                    {
                        sbuf[pos++] = comm->cell_f_max0[d];
                        sbuf[pos++] = comm->cell_f_min1[d];
                    }
                }
                if (bSepPME)
                {
                    sbuf[pos++] = comm->load[d+1].mdf;
                    sbuf[pos++] = comm->load[d+1].pme;
                }
            }
            /* Number of floats each rank contributes in this dimension */
            load->nload = pos;
            /* Communicate a row in DD direction d.
             * The communicators are setup such that the root always has rank 0.
             */
#ifdef GMX_MPI
            MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
                       load->load, load->nload*sizeof(float), MPI_BYTE,
                       0, comm->mpi_comm_load[d]);
#endif
            if (dd->ci[dim] == dd->master_ci[dim])
            {
                /* We are the root, process this row */
                if (comm->bDynLoadBal)
                {
                    root = comm->root[d];
                }
                load->sum      = 0;
                load->max      = 0;
                load->sum_m    = 0;
                load->cvol_min = 1;
                load->flags    = 0;
                load->mdf      = 0;
                load->pme      = 0;
                pos            = 0;
                /* Unpack the gathered entries of the dd->nc[dim] cells
                 * in the same order as they were packed above.
                 */
                for (i = 0; i < dd->nc[dim]; i++)
                {
                    load->sum += load->load[pos++];
                    load->max  = max(load->max, load->load[pos]);
                    pos++;
                    if (dd->bGridJump)
                    {
                        if (root->bLimited)
                        {
                            /* This direction could not be load balanced properly,
                             * therefore we need to use the maximum iso the average load.
                             */
                            load->sum_m = max(load->sum_m, load->load[pos]);
                        }
                        else
                        {
                            load->sum_m += load->load[pos];
                        }
                        pos++;
                        load->cvol_min = min(load->cvol_min, load->load[pos]);
                        pos++;
                        if (d < dd->ndim-1)
                        {
                            /* Round the float back to the integer flags;
                             * the value of the last cell in the row is kept.
                             */
                            load->flags = (int)(load->load[pos++] + 0.5);
                        }
                        if (d > 0)
                        {
                            root->cell_f_max0[i] = load->load[pos++];
                            root->cell_f_min1[i] = load->load[pos++];
                        }
                    }
                    if (bSepPME)
                    {
                        load->mdf = max(load->mdf, load->load[pos]);
                        pos++;
                        load->pme = max(load->pme, load->load[pos]);
                        pos++;
                    }
                }
                if (comm->bDynLoadBal && root->bLimited)
                {
                    /* sum_m holds a maximum here: scale it by the number of
                     * cells and flag this dimension as limited.
                     */
                    load->sum_m *= dd->nc[dim];
                    load->flags |= (1<<d);
                }
            }
        }
    }

    if (DDMASTER(dd))
    {
        /* Accumulate the totals of dimension 0 in the running sums */
        comm->nload      += dd_load_count(comm);
        comm->load_step  += comm->cycl[ddCyclStep];
        comm->load_sum   += comm->load[0].sum;
        comm->load_max   += comm->load[0].max;
        if (comm->bDynLoadBal)
        {
            /* Count, per dimension, how often its limited bit was set */
            for (d = 0; d < dd->ndim; d++)
            {
                if (comm->load[0].flags & (1<<d))
                {
                    comm->load_lim[d]++;
                }
            }
        }
        if (bSepPME)
        {
            comm->load_mdf += comm->load[0].mdf;
            comm->load_pme += comm->load[0].pme;
        }
    }

    wallcycle_stop(wcycle, ewcDDCOMMLOAD);

    if (debug)
    {
        fprintf(debug, "get_load_distribution finished\n");
    }
}
5371
5372 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5373 {
5374     /* Return the relative performance loss on the total run time
5375      * due to the force calculation load imbalance.
5376      */
5377     if (dd->comm->nload > 0)
5378     {
5379         return
5380             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5381             (dd->comm->load_step*dd->nnodes);
5382     }
5383     else
5384     {
5385         return 0;
5386     }
5387 }
5388
5389 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5390 {
5391     char               buf[STRLEN];
5392     int                npp, npme, nnodes, d, limp;
5393     float              imbal, pme_f_ratio, lossf, lossp = 0;
5394     gmx_bool           bLim;
5395     gmx_domdec_comm_t *comm;
5396
5397     comm = dd->comm;
5398     if (DDMASTER(dd) && comm->nload > 0)
5399     {
5400         npp    = dd->nnodes;
5401         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5402         nnodes = npp + npme;
5403         imbal  = comm->load_max*npp/comm->load_sum - 1;
5404         lossf  = dd_force_imb_perf_loss(dd);
5405         sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5406         fprintf(fplog, "%s", buf);
5407         fprintf(stderr, "\n");
5408         fprintf(stderr, "%s", buf);
5409         sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5410         fprintf(fplog, "%s", buf);
5411         fprintf(stderr, "%s", buf);
5412         bLim = FALSE;
5413         if (comm->bDynLoadBal)
5414         {
5415             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5416             for (d = 0; d < dd->ndim; d++)
5417             {
5418                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5419                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5420                 if (limp >= 50)
5421                 {
5422                     bLim = TRUE;
5423                 }
5424             }
5425             sprintf(buf+strlen(buf), "\n");
5426             fprintf(fplog, "%s", buf);
5427             fprintf(stderr, "%s", buf);
5428         }
5429         if (npme > 0)
5430         {
5431             pme_f_ratio = comm->load_pme/comm->load_mdf;
5432             lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
5433             if (lossp <= 0)
5434             {
5435                 lossp *= (float)npme/(float)nnodes;
5436             }
5437             else
5438             {
5439                 lossp *= (float)npp/(float)nnodes;
5440             }
5441             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5442             fprintf(fplog, "%s", buf);
5443             fprintf(stderr, "%s", buf);
5444             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5445             fprintf(fplog, "%s", buf);
5446             fprintf(stderr, "%s", buf);
5447         }
5448         fprintf(fplog, "\n");
5449         fprintf(stderr, "\n");
5450
5451         if (lossf >= DD_PERF_LOSS)
5452         {
5453             sprintf(buf,
5454                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5455                     "      in the domain decomposition.\n", lossf*100);
5456             if (!comm->bDynLoadBal)
5457             {
5458                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5459             }
5460             else if (bLim)
5461             {
5462                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5463             }
5464             fprintf(fplog, "%s\n", buf);
5465             fprintf(stderr, "%s\n", buf);
5466         }
5467         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5468         {
5469             sprintf(buf,
5470                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5471                     "      had %s work to do than the PP nodes.\n"
5472                     "      You might want to %s the number of PME nodes\n"
5473                     "      or %s the cut-off and the grid spacing.\n",
5474                     fabs(lossp*100),
5475                     (lossp < 0) ? "less"     : "more",
5476                     (lossp < 0) ? "decrease" : "increase",
5477                     (lossp < 0) ? "decrease" : "increase");
5478             fprintf(fplog, "%s\n", buf);
5479             fprintf(stderr, "%s\n", buf);
5480         }
5481     }
5482 }
5483
5484 static float dd_vol_min(gmx_domdec_t *dd)
5485 {
5486     return dd->comm->load[0].cvol_min*dd->nnodes;
5487 }
5488
5489 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5490 {
5491     return dd->comm->load[0].flags;
5492 }
5493
5494 static float dd_f_imbal(gmx_domdec_t *dd)
5495 {
5496     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5497 }
5498
5499 float dd_pme_f_ratio(gmx_domdec_t *dd)
5500 {
5501     if (dd->comm->cycl_n[ddCyclPME] > 0)
5502     {
5503         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5504     }
5505     else
5506     {
5507         return -1.0;
5508     }
5509 }
5510
5511 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step)
5512 {
5513     int  flags, d;
5514     char buf[22];
5515
5516     flags = dd_load_flags(dd);
5517     if (flags)
5518     {
5519         fprintf(fplog,
5520                 "DD  load balancing is limited by minimum cell size in dimension");
5521         for (d = 0; d < dd->ndim; d++)
5522         {
5523             if (flags & (1<<d))
5524             {
5525                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5526             }
5527         }
5528         fprintf(fplog, "\n");
5529     }
5530     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5531     if (dd->comm->bDynLoadBal)
5532     {
5533         fprintf(fplog, "  vol min/aver %5.3f%c",
5534                 dd_vol_min(dd), flags ? '!' : ' ');
5535     }
5536     fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5537     if (dd->comm->cycl_n[ddCyclPME])
5538     {
5539         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5540     }
5541     fprintf(fplog, "\n\n");
5542 }
5543
5544 static void dd_print_load_verbose(gmx_domdec_t *dd)
5545 {
5546     if (dd->comm->bDynLoadBal)
5547     {
5548         fprintf(stderr, "vol %4.2f%c ",
5549                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5550     }
5551     fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5552     if (dd->comm->cycl_n[ddCyclPME])
5553     {
5554         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5555     }
5556 }
5557
5558 #ifdef GMX_MPI
5559 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5560 {
5561     MPI_Comm           c_row;
5562     int                dim, i, rank;
5563     ivec               loc_c;
5564     gmx_domdec_root_t *root;
5565     gmx_bool           bPartOfGroup = FALSE;
5566
5567     dim = dd->dim[dim_ind];
5568     copy_ivec(loc, loc_c);
5569     for (i = 0; i < dd->nc[dim]; i++)
5570     {
5571         loc_c[dim] = i;
5572         rank       = dd_index(dd->nc, loc_c);
5573         if (rank == dd->rank)
5574         {
5575             /* This process is part of the group */
5576             bPartOfGroup = TRUE;
5577         }
5578     }
5579     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5580                    &c_row);
5581     if (bPartOfGroup)
5582     {
5583         dd->comm->mpi_comm_load[dim_ind] = c_row;
5584         if (dd->comm->eDLB != edlbNO)
5585         {
5586             if (dd->ci[dim] == dd->master_ci[dim])
5587             {
5588                 /* This is the root process of this row */
5589                 snew(dd->comm->root[dim_ind], 1);
5590                 root = dd->comm->root[dim_ind];
5591                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5592                 snew(root->old_cell_f, dd->nc[dim]+1);
5593                 snew(root->bCellMin, dd->nc[dim]);
5594                 if (dim_ind > 0)
5595                 {
5596                     snew(root->cell_f_max0, dd->nc[dim]);
5597                     snew(root->cell_f_min1, dd->nc[dim]);
5598                     snew(root->bound_min, dd->nc[dim]);
5599                     snew(root->bound_max, dd->nc[dim]);
5600                 }
5601                 snew(root->buf_ncd, dd->nc[dim]);
5602             }
5603             else
5604             {
5605                 /* This is not a root process, we only need to receive cell_f */
5606                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5607             }
5608         }
5609         if (dd->ci[dim] == dd->master_ci[dim])
5610         {
5611             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5612         }
5613     }
5614 }
5615 #endif
5616
5617 static void make_load_communicators(gmx_domdec_t *dd)
5618 {
5619 #ifdef GMX_MPI
5620     int  dim0, dim1, i, j;
5621     ivec loc;
5622
5623     if (debug)
5624     {
5625         fprintf(debug, "Making load communicators\n");
5626     }
5627
5628     snew(dd->comm->load, dd->ndim);
5629     snew(dd->comm->mpi_comm_load, dd->ndim);
5630
5631     clear_ivec(loc);
5632     make_load_communicator(dd, 0, loc);
5633     if (dd->ndim > 1)
5634     {
5635         dim0 = dd->dim[0];
5636         for (i = 0; i < dd->nc[dim0]; i++)
5637         {
5638             loc[dim0] = i;
5639             make_load_communicator(dd, 1, loc);
5640         }
5641     }
5642     if (dd->ndim > 2)
5643     {
5644         dim0 = dd->dim[0];
5645         for (i = 0; i < dd->nc[dim0]; i++)
5646         {
5647             loc[dim0] = i;
5648             dim1      = dd->dim[1];
5649             for (j = 0; j < dd->nc[dim1]; j++)
5650             {
5651                 loc[dim1] = j;
5652                 make_load_communicator(dd, 2, loc);
5653             }
5654         }
5655     }
5656
5657     if (debug)
5658     {
5659         fprintf(debug, "Finished making load communicators\n");
5660     }
5661 #endif
5662 }
5663
5664 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5665 {
5666     gmx_bool                bZYX;
5667     int                     d, dim, i, j, m;
5668     ivec                    tmp, s;
5669     int                     nzone, nzonep;
5670     ivec                    dd_zp[DD_MAXIZONE];
5671     gmx_domdec_zones_t     *zones;
5672     gmx_domdec_ns_ranges_t *izone;
5673
5674     for (d = 0; d < dd->ndim; d++)
5675     {
5676         dim = dd->dim[d];
5677         copy_ivec(dd->ci, tmp);
5678         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5679         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5680         copy_ivec(dd->ci, tmp);
5681         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5682         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5683         if (debug)
5684         {
5685             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5686                     dd->rank, dim,
5687                     dd->neighbor[d][0],
5688                     dd->neighbor[d][1]);
5689         }
5690     }
5691
5692     if (fplog)
5693     {
5694         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5695                 dd->ndim,
5696                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5697                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5698     }
5699     switch (dd->ndim)
5700     {
5701         case 3:
5702             nzone  = dd_z3n;
5703             nzonep = dd_zp3n;
5704             for (i = 0; i < nzonep; i++)
5705             {
5706                 copy_ivec(dd_zp3[i], dd_zp[i]);
5707             }
5708             break;
5709         case 2:
5710             nzone  = dd_z2n;
5711             nzonep = dd_zp2n;
5712             for (i = 0; i < nzonep; i++)
5713             {
5714                 copy_ivec(dd_zp2[i], dd_zp[i]);
5715             }
5716             break;
5717         case 1:
5718             nzone  = dd_z1n;
5719             nzonep = dd_zp1n;
5720             for (i = 0; i < nzonep; i++)
5721             {
5722                 copy_ivec(dd_zp1[i], dd_zp[i]);
5723             }
5724             break;
5725         default:
5726             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5727             nzone  = 0;
5728             nzonep = 0;
5729     }
5730
5731     zones = &dd->comm->zones;
5732
5733     for (i = 0; i < nzone; i++)
5734     {
5735         m = 0;
5736         clear_ivec(zones->shift[i]);
5737         for (d = 0; d < dd->ndim; d++)
5738         {
5739             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5740         }
5741     }
5742
5743     zones->n = nzone;
5744     for (i = 0; i < nzone; i++)
5745     {
5746         for (d = 0; d < DIM; d++)
5747         {
5748             s[d] = dd->ci[d] - zones->shift[i][d];
5749             if (s[d] < 0)
5750             {
5751                 s[d] += dd->nc[d];
5752             }
5753             else if (s[d] >= dd->nc[d])
5754             {
5755                 s[d] -= dd->nc[d];
5756             }
5757         }
5758     }
5759     zones->nizone = nzonep;
5760     for (i = 0; i < zones->nizone; i++)
5761     {
5762         if (dd_zp[i][0] != i)
5763         {
5764             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5765         }
5766         izone     = &zones->izone[i];
5767         izone->j0 = dd_zp[i][1];
5768         izone->j1 = dd_zp[i][2];
5769         for (dim = 0; dim < DIM; dim++)
5770         {
5771             if (dd->nc[dim] == 1)
5772             {
5773                 /* All shifts should be allowed */
5774                 izone->shift0[dim] = -1;
5775                 izone->shift1[dim] = 1;
5776             }
5777             else
5778             {
5779                 /*
5780                    izone->shift0[d] = 0;
5781                    izone->shift1[d] = 0;
5782                    for(j=izone->j0; j<izone->j1; j++) {
5783                    if (dd->shift[j][d] > dd->shift[i][d])
5784                    izone->shift0[d] = -1;
5785                    if (dd->shift[j][d] < dd->shift[i][d])
5786                    izone->shift1[d] = 1;
5787                    }
5788                  */
5789
5790                 int shift_diff;
5791
5792                 /* Assume the shift are not more than 1 cell */
5793                 izone->shift0[dim] = 1;
5794                 izone->shift1[dim] = -1;
5795                 for (j = izone->j0; j < izone->j1; j++)
5796                 {
5797                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5798                     if (shift_diff < izone->shift0[dim])
5799                     {
5800                         izone->shift0[dim] = shift_diff;
5801                     }
5802                     if (shift_diff > izone->shift1[dim])
5803                     {
5804                         izone->shift1[dim] = shift_diff;
5805                     }
5806                 }
5807             }
5808         }
5809     }
5810
5811     if (dd->comm->eDLB != edlbNO)
5812     {
5813         snew(dd->comm->root, dd->ndim);
5814     }
5815
5816     if (dd->comm->bRecordLoad)
5817     {
5818         make_load_communicators(dd);
5819     }
5820 }
5821
/* Set up the particle-particle (PP) communicator and the DD rank, cell
 * coordinates and master rank of this process.
 *
 * With MPI there are three cases:
 * - bCartesianPP_PME: an index from DD cell index to DD rank is built and
 *   the master rank is obtained by a sum reduction;
 * - bCartesianPP only: the cell coordinates come from the Cartesian
 *   communicator and an index from DD cell index to simulation node id
 *   is built, from which the master coordinates and rank follow;
 * - otherwise: the rank itself is used as DD index and the master is rank 0.
 * Without MPI only the rank and coordinates are printed.
 *
 * reorder is passed on to MPI_Cart_create. fplog may be NULL.
 * The MPI calls here are collective, so all PP ranks must call this.
 */
static void make_pp_communicator(FILE *fplog, t_commrec *cr, int reorder)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    int                i, rank, *buf;
    ivec               periods;
#ifdef GMX_MPI
    MPI_Comm           comm_cart;
#endif

    dd   = cr->dd;
    comm = dd->comm;

#ifdef GMX_MPI
    if (comm->bCartesianPP)
    {
        /* Set up cartesian communication for the particle-particle part */
        if (fplog)
        {
            fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
                    dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
        }

        /* The grid is periodic in all dimensions */
        for (i = 0; i < DIM; i++)
        {
            periods[i] = TRUE;
        }
        MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
                        &comm_cart);
        /* We overwrite the old communicator with the new cartesian one */
        cr->mpi_comm_mygroup = comm_cart;
    }

    dd->mpi_comm_all = cr->mpi_comm_mygroup;
    MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);

    if (comm->bCartesianPP_PME)
    {
        /* Since we want to use the original cartesian setup for sim,
         * and not the one after split, we need to make an index.
         */
        snew(comm->ddindex2ddnodeid, dd->nnodes);
        /* Each rank fills in only its own entry; the sum over the ranks
         * then assembles the complete table (assumes snew returns zeroed
         * memory - TODO confirm).
         */
        comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
        gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
        /* Get the rank of the DD master,
         * above we made sure that the master node is a PP node.
         */
        if (MASTER(cr))
        {
            rank = dd->rank;
        }
        else
        {
            rank = 0;
        }
        /* Only the master contributes a non-zero value, so the sum
         * delivers the DD rank of the master to all ranks.
         */
        MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
    }
    else if (comm->bCartesianPP)
    {
        if (cr->npmenodes == 0)
        {
            /* The PP communicator is also
             * the communicator for this simulation
             */
            cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
        }
        cr->nodeid = dd->rank;

        MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);

        /* We need to make an index to go from the coordinates
         * to the nodeid of this simulation.
         */
        snew(comm->ddindex2simnodeid, dd->nnodes);
        snew(buf, dd->nnodes);
        /* Only PP ranks own a DD cell and fill in an entry */
        if (cr->duty & DUTY_PP)
        {
            buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
        }
        /* Communicate the ddindex to simulation nodeid index.
         * This reduction runs over the whole simulation communicator;
         * receive_ddindex2simnodeid issues the same call with the same
         * condition for ranks that do not enter this function.
         */
        MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
                      cr->mpi_comm_mysim);
        sfree(buf);

        /* Determine the master coordinates and rank.
         * The DD master should be the same node as the master of this sim.
         */
        for (i = 0; i < dd->nnodes; i++)
        {
            /* Simulation node id 0 marks the cell of the master */
            if (comm->ddindex2simnodeid[i] == 0)
            {
                ddindex2xyz(dd->nc, i, dd->master_ci);
                MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
            }
        }
        if (debug)
        {
            fprintf(debug, "The master rank is %d\n", dd->masterrank);
        }
    }
    else
    {
        /* No Cartesian communicators */
        /* We use the rank in dd->comm->all as DD index */
        ddindex2xyz(dd->nc, dd->rank, dd->ci);
        /* The simulation master nodeid is 0, so the DD master rank is also 0 */
        dd->masterrank = 0;
        clear_ivec(dd->master_ci);
    }
#endif

    /* Report the resulting rank and cell coordinates in the log and debug files */
    if (fplog)
    {
        fprintf(fplog,
                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
    }
    if (debug)
    {
        fprintf(debug,
                "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
                dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
    }
}
5946
5947 static void receive_ddindex2simnodeid(t_commrec *cr)
5948 {
5949     gmx_domdec_t      *dd;
5950
5951     gmx_domdec_comm_t *comm;
5952     int               *buf;
5953
5954     dd   = cr->dd;
5955     comm = dd->comm;
5956
5957 #ifdef GMX_MPI
5958     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5959     {
5960         snew(comm->ddindex2simnodeid, dd->nnodes);
5961         snew(buf, dd->nnodes);
5962         if (cr->duty & DUTY_PP)
5963         {
5964             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5965         }
5966 #ifdef GMX_MPI
5967         /* Communicate the ddindex to simulation nodeid index */
5968         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5969                       cr->mpi_comm_mysim);
5970 #endif
5971         sfree(buf);
5972     }
5973 #endif
5974 }
5975
5976 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5977                                                      int ncg, int natoms)
5978 {
5979     gmx_domdec_master_t *ma;
5980     int                  i;
5981
5982     snew(ma, 1);
5983
5984     snew(ma->ncg, dd->nnodes);
5985     snew(ma->index, dd->nnodes+1);
5986     snew(ma->cg, ncg);
5987     snew(ma->nat, dd->nnodes);
5988     snew(ma->ibuf, dd->nnodes*2);
5989     snew(ma->cell_x, DIM);
5990     for (i = 0; i < DIM; i++)
5991     {
5992         snew(ma->cell_x[i], dd->nc[i]+1);
5993     }
5994
5995     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5996     {
5997         ma->vbuf = NULL;
5998     }
5999     else
6000     {
6001         snew(ma->vbuf, natoms);
6002     }
6003
6004     return ma;
6005 }
6006
/* Split the simulation communicator into a PP group and a PME-only group
 * and set cr->duty for this rank.
 *
 * With a Cartesian PP setup it is first checked whether the PME-only nodes
 * can be stacked onto the PP grid along y or z; if so a combined Cartesian
 * communicator of size comm->ntot is created and the duty follows from
 * the coordinate along comm->cartpmedim. Otherwise the duty follows from
 * dd_simnode2pmenode according to dd_node_order.
 *
 * reorder is passed on to MPI_Cart_create. fplog may be NULL.
 * The MPI calls are collective over cr->mpi_comm_mysim.
 */
static void split_communicator(FILE *fplog, t_commrec *cr, int dd_node_order,
                               int reorder)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    int                i, rank;
    gmx_bool           bDiv[DIM];
    ivec               periods;
#ifdef GMX_MPI
    MPI_Comm           comm_cart;
#endif

    dd   = cr->dd;
    comm = dd->comm;

    if (comm->bCartesianPP)
    {
        /* Only the YY and ZZ entries of bDiv are set and used.
         * bDiv[i] tells whether the PME-only nodes form a whole number
         * of cell layers when stacked along dimension i.
         */
        for (i = 1; i < DIM; i++)
        {
            bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
        }
        if (bDiv[YY] || bDiv[ZZ])
        {
            comm->bCartesianPP_PME = TRUE;
            /* If we have 2D PME decomposition, which is always in x+y,
             * we stack the PME only nodes in z.
             * Otherwise we choose the direction that provides the thinnest slab
             * of PME only nodes as this will have the least effect
             * on the PP communication.
             * But for the PME communication the opposite might be better.
             */
            if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
                             !bDiv[YY] ||
                             dd->nc[YY] > dd->nc[ZZ]))
            {
                comm->cartpmedim = ZZ;
            }
            else
            {
                comm->cartpmedim = YY;
            }
            /* Extend the total grid by the layers of PME-only nodes */
            comm->ntot[comm->cartpmedim]
                += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
        }
        else if (fplog)
        {
            fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
            fprintf(fplog,
                    "Will not use a Cartesian communicator for PP <-> PME\n\n");
        }
    }

#ifdef GMX_MPI
    if (comm->bCartesianPP_PME)
    {
        if (fplog)
        {
            fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
        }

        /* The combined grid is periodic in all dimensions */
        for (i = 0; i < DIM; i++)
        {
            periods[i] = TRUE;
        }
        MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
                        &comm_cart);

        MPI_Comm_rank(comm_cart, &rank);
        if (MASTERNODE(cr) && rank != 0)
        {
            gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
        }

        /* With this assignment we lose the link to the original communicator
         * which will usually be MPI_COMM_WORLD, unless we have multisim.
         */
        cr->mpi_comm_mysim = comm_cart;
        cr->sim_nodeid     = rank;

        MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);

        if (fplog)
        {
            fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
                    cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
        }

        /* Ranks inside the PP grid along cartpmedim do PP,
         * ranks in the added layers beyond it do PME only.
         */
        if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
        {
            cr->duty = DUTY_PP;
        }
        if (cr->npmenodes == 0 ||
            dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
        {
            cr->duty = DUTY_PME;
        }

        /* Split the sim communicator into PP and PME only nodes.
         * The duty is the color; the index in the total grid is the key
         * that orders the ranks within each group.
         */
        MPI_Comm_split(cr->mpi_comm_mysim,
                       cr->duty,
                       dd_index(comm->ntot, dd->ci),
                       &cr->mpi_comm_mygroup);
    }
    else
    {
        switch (dd_node_order)
        {
            case ddnoPP_PME:
                if (fplog)
                {
                    fprintf(fplog, "Order of the nodes: PP first, PME last\n");
                }
                break;
            case ddnoINTERLEAVE:
                /* Interleave the PP-only and PME-only nodes,
                 * as on clusters with dual-core machines this will double
                 * the communication bandwidth of the PME processes
                 * and thus speed up the PP <-> PME and inter PME communication.
                 */
                if (fplog)
                {
                    fprintf(fplog, "Interleaving PP and PME nodes\n");
                }
                comm->pmenodes = dd_pmenodes(cr);
                break;
            case ddnoCARTESIAN:
                /* Cartesian order was requested but could not be used for
                 * PP <-> PME; nothing extra to set up here.
                 */
                break;
            default:
                gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
        }

        /* A return value of -1 is treated as: this node is a PME-only node */
        if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
        {
            cr->duty = DUTY_PME;
        }
        else
        {
            cr->duty = DUTY_PP;
        }

        /* Split the sim communicator into PP and PME only nodes */
        MPI_Comm_split(cr->mpi_comm_mysim,
                       cr->duty,
                       cr->nodeid,
                       &cr->mpi_comm_mygroup);
        /* From here on cr->nodeid is the rank within the own group */
        MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
    }
#endif

    if (fplog)
    {
        fprintf(fplog, "This is a %s only node\n\n",
                (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
    }
}
6162
6163 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6164 {
6165     gmx_domdec_t      *dd;
6166     gmx_domdec_comm_t *comm;
6167     int                CartReorder;
6168
6169     dd   = cr->dd;
6170     comm = dd->comm;
6171
6172     copy_ivec(dd->nc, comm->ntot);
6173
6174     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6175     comm->bCartesianPP_PME = FALSE;
6176
6177     /* Reorder the nodes by default. This might change the MPI ranks.
6178      * Real reordering is only supported on very few architectures,
6179      * Blue Gene is one of them.
6180      */
6181     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6182
6183     if (cr->npmenodes > 0)
6184     {
6185         /* Split the communicator into a PP and PME part */
6186         split_communicator(fplog, cr, dd_node_order, CartReorder);
6187         if (comm->bCartesianPP_PME)
6188         {
6189             /* We (possibly) reordered the nodes in split_communicator,
6190              * so it is no longer required in make_pp_communicator.
6191              */
6192             CartReorder = FALSE;
6193         }
6194     }
6195     else
6196     {
6197         /* All nodes do PP and PME */
6198 #ifdef GMX_MPI
6199         /* We do not require separate communicators */
6200         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6201 #endif
6202     }
6203
6204     if (cr->duty & DUTY_PP)
6205     {
6206         /* Copy or make a new PP communicator */
6207         make_pp_communicator(fplog, cr, CartReorder);
6208     }
6209     else
6210     {
6211         receive_ddindex2simnodeid(cr);
6212     }
6213
6214     if (!(cr->duty & DUTY_PME))
6215     {
6216         /* Set up the commnuication to our PME node */
6217         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6218         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6219         if (debug)
6220         {
6221             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6222                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6223         }
6224     }
6225     else
6226     {
6227         dd->pme_nodeid = -1;
6228     }
6229
6230     if (DDMASTER(dd))
6231     {
6232         dd->ma = init_gmx_domdec_master_t(dd,
6233                                           comm->cgs_gl.nr,
6234                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6235     }
6236 }
6237
6238 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6239 {
6240     real  *slb_frac, tot;
6241     int    i, n;
6242     double dbl;
6243
6244     slb_frac = NULL;
6245     if (nc > 1 && size_string != NULL)
6246     {
6247         if (fplog)
6248         {
6249             fprintf(fplog, "Using static load balancing for the %s direction\n",
6250                     dir);
6251         }
6252         snew(slb_frac, nc);
6253         tot = 0;
6254         for (i = 0; i < nc; i++)
6255         {
6256             dbl = 0;
6257             sscanf(size_string, "%lf%n", &dbl, &n);
6258             if (dbl == 0)
6259             {
6260                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6261             }
6262             slb_frac[i]  = dbl;
6263             size_string += n;
6264             tot         += slb_frac[i];
6265         }
6266         /* Normalize */
6267         if (fplog)
6268         {
6269             fprintf(fplog, "Relative cell sizes:");
6270         }
6271         for (i = 0; i < nc; i++)
6272         {
6273             slb_frac[i] /= tot;
6274             if (fplog)
6275             {
6276                 fprintf(fplog, " %5.3f", slb_frac[i]);
6277             }
6278         }
6279         if (fplog)
6280         {
6281             fprintf(fplog, "\n");
6282         }
6283     }
6284
6285     return slb_frac;
6286 }
6287
6288 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6289 {
6290     int                  n, nmol, ftype;
6291     gmx_mtop_ilistloop_t iloop;
6292     t_ilist             *il;
6293
6294     n     = 0;
6295     iloop = gmx_mtop_ilistloop_init(mtop);
6296     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6297     {
6298         for (ftype = 0; ftype < F_NRE; ftype++)
6299         {
6300             if ((interaction_function[ftype].flags & IF_BOND) &&
6301                 NRAL(ftype) >  2)
6302             {
6303                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6304             }
6305         }
6306     }
6307
6308     return n;
6309 }
6310
/* Read an integer setting from the environment.
 *
 * Returns def when env_var is not set. When it is set, returns the integer
 * parsed from its value, or 1 when the value does not start with an
 * integer. When the variable is set and fplog is not NULL, the value found
 * and the value used are logged.
 */
static int dd_nst_env(FILE *fplog, const char *env_var, int def)
{
    const char *env_val = getenv(env_var);
    int         nst     = def;

    if (env_val == NULL)
    {
        return def;
    }

    /* A variable that is set without a usable number counts as 1 */
    if (sscanf(env_val, "%d", &nst) <= 0)
    {
        nst = 1;
    }
    if (fplog != NULL)
    {
        fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
                env_var, env_val, nst);
    }

    return nst;
}
6333
6334 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6335 {
6336     if (MASTER(cr))
6337     {
6338         fprintf(stderr, "\n%s\n", warn_string);
6339     }
6340     if (fplog)
6341     {
6342         fprintf(fplog, "\n%s\n", warn_string);
6343     }
6344 }
6345
6346 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6347                                   t_inputrec *ir, FILE *fplog)
6348 {
6349     if (ir->ePBC == epbcSCREW &&
6350         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6351     {
6352         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6353     }
6354
6355     if (ir->ns_type == ensSIMPLE)
6356     {
6357         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6358     }
6359
6360     if (ir->nstlist == 0)
6361     {
6362         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6363     }
6364
6365     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6366     {
6367         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6368     }
6369 }
6370
6371 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6372 {
6373     int  di, d;
6374     real r;
6375
6376     r = ddbox->box_size[XX];
6377     for (di = 0; di < dd->ndim; di++)
6378     {
6379         d = dd->dim[di];
6380         /* Check using the initial average cell size */
6381         r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6382     }
6383
6384     return r;
6385 }
6386
6387 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6388                              const char *dlb_opt, gmx_bool bRecordLoad,
6389                              unsigned long Flags, t_inputrec *ir)
6390 {
6391     gmx_domdec_t *dd;
6392     int           eDLB = -1;
6393     char          buf[STRLEN];
6394
6395     switch (dlb_opt[0])
6396     {
6397         case 'a': eDLB = edlbAUTO; break;
6398         case 'n': eDLB = edlbNO;   break;
6399         case 'y': eDLB = edlbYES;  break;
6400         default: gmx_incons("Unknown dlb_opt");
6401     }
6402
6403     if (Flags & MD_RERUN)
6404     {
6405         return edlbNO;
6406     }
6407
6408     if (!EI_DYNAMICS(ir->eI))
6409     {
6410         if (eDLB == edlbYES)
6411         {
6412             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6413             dd_warning(cr, fplog, buf);
6414         }
6415
6416         return edlbNO;
6417     }
6418
6419     if (!bRecordLoad)
6420     {
6421         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6422
6423         return edlbNO;
6424     }
6425
6426     if (Flags & MD_REPRODUCIBLE)
6427     {
6428         switch (eDLB)
6429         {
6430             case edlbNO:
6431                 break;
6432             case edlbAUTO:
6433                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6434                 eDLB = edlbNO;
6435                 break;
6436             case edlbYES:
6437                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6438                 break;
6439             default:
6440                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
6441                 break;
6442         }
6443     }
6444
6445     return eDLB;
6446 }
6447
6448 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6449 {
6450     int dim;
6451
6452     dd->ndim = 0;
6453     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6454     {
6455         /* Decomposition order z,y,x */
6456         if (fplog)
6457         {
6458             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6459         }
6460         for (dim = DIM-1; dim >= 0; dim--)
6461         {
6462             if (dd->nc[dim] > 1)
6463             {
6464                 dd->dim[dd->ndim++] = dim;
6465             }
6466         }
6467     }
6468     else
6469     {
6470         /* Decomposition order x,y,z */
6471         for (dim = 0; dim < DIM; dim++)
6472         {
6473             if (dd->nc[dim] > 1)
6474             {
6475                 dd->dim[dd->ndim++] = dim;
6476             }
6477         }
6478     }
6479 }
6480
6481 static gmx_domdec_comm_t *init_dd_comm()
6482 {
6483     gmx_domdec_comm_t *comm;
6484     int                i;
6485
6486     snew(comm, 1);
6487     snew(comm->cggl_flag, DIM*2);
6488     snew(comm->cgcm_state, DIM*2);
6489     for (i = 0; i < DIM*2; i++)
6490     {
6491         comm->cggl_flag_nalloc[i]  = 0;
6492         comm->cgcm_state_nalloc[i] = 0;
6493     }
6494
6495     comm->nalloc_int = 0;
6496     comm->buf_int    = NULL;
6497
6498     vec_rvec_init(&comm->vbuf);
6499
6500     comm->n_load_have    = 0;
6501     comm->n_load_collect = 0;
6502
6503     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6504     {
6505         comm->sum_nat[i] = 0;
6506     }
6507     comm->ndecomp   = 0;
6508     comm->nload     = 0;
6509     comm->load_step = 0;
6510     comm->load_sum  = 0;
6511     comm->load_max  = 0;
6512     clear_ivec(comm->load_lim);
6513     comm->load_mdf  = 0;
6514     comm->load_pme  = 0;
6515
6516     return comm;
6517 }
6518
6519 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6520                                         unsigned long Flags,
6521                                         ivec nc,
6522                                         real comm_distance_min, real rconstr,
6523                                         const char *dlb_opt, real dlb_scale,
6524                                         const char *sizex, const char *sizey, const char *sizez,
6525                                         gmx_mtop_t *mtop, t_inputrec *ir,
6526                                         matrix box, rvec *x,
6527                                         gmx_ddbox_t *ddbox,
6528                                         int *npme_x, int *npme_y)
6529 {
6530     gmx_domdec_t      *dd;
6531     gmx_domdec_comm_t *comm;
6532     int                recload;
6533     int                d, i, j;
6534     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6535     gmx_bool           bC;
6536     char               buf[STRLEN];
6537
6538     if (fplog)
6539     {
6540         fprintf(fplog,
6541                 "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
6542     }
6543
6544     snew(dd, 1);
6545
6546     dd->comm = init_dd_comm();
6547     comm     = dd->comm;
6548     snew(comm->cggl_flag, DIM*2);
6549     snew(comm->cgcm_state, DIM*2);
6550
6551     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6552     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6553
6554     dd->bSendRecv2      = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0);
6555     comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10);
6556     comm->eFlop         = dd_nst_env(fplog, "GMX_DLB_FLOP", 0);
6557     recload             = dd_nst_env(fplog, "GMX_DD_LOAD", 1);
6558     comm->nstSortCG     = dd_nst_env(fplog, "GMX_DD_SORT", 1);
6559     comm->nstDDDump     = dd_nst_env(fplog, "GMX_DD_DUMP", 0);
6560     comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0);
6561     comm->DD_debug      = dd_nst_env(fplog, "GMX_DD_DEBUG", 0);
6562
6563     dd->pme_recv_f_alloc = 0;
6564     dd->pme_recv_f_buf   = NULL;
6565
6566     if (dd->bSendRecv2 && fplog)
6567     {
6568         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6569     }
6570     if (comm->eFlop)
6571     {
6572         if (fplog)
6573         {
6574             fprintf(fplog, "Will load balance based on FLOP count\n");
6575         }
6576         if (comm->eFlop > 1)
6577         {
6578             srand(1+cr->nodeid);
6579         }
6580         comm->bRecordLoad = TRUE;
6581     }
6582     else
6583     {
6584         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6585
6586     }
6587
6588     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6589
6590     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6591     if (fplog)
6592     {
6593         fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
6594     }
6595     dd->bGridJump              = comm->bDynLoadBal;
6596     comm->bPMELoadBalDLBLimits = FALSE;
6597
6598     if (comm->nstSortCG)
6599     {
6600         if (fplog)
6601         {
6602             if (comm->nstSortCG == 1)
6603             {
6604                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6605             }
6606             else
6607             {
6608                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6609                         comm->nstSortCG);
6610             }
6611         }
6612         snew(comm->sort, 1);
6613     }
6614     else
6615     {
6616         if (fplog)
6617         {
6618             fprintf(fplog, "Will not sort the charge groups\n");
6619         }
6620     }
6621
6622     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6623
6624     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6625     if (comm->bInterCGBondeds)
6626     {
6627         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6628     }
6629     else
6630     {
6631         comm->bInterCGMultiBody = FALSE;
6632     }
6633
6634     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6635     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6636
6637     if (ir->rlistlong == 0)
6638     {
6639         /* Set the cut-off to some very large value,
6640          * so we don't need if statements everywhere in the code.
6641          * We use sqrt, since the cut-off is squared in some places.
6642          */
6643         comm->cutoff   = GMX_CUTOFF_INF;
6644     }
6645     else
6646     {
6647         comm->cutoff   = ir->rlistlong;
6648     }
6649     comm->cutoff_mbody = 0;
6650
6651     comm->cellsize_limit = 0;
6652     comm->bBondComm      = FALSE;
6653
6654     if (comm->bInterCGBondeds)
6655     {
6656         if (comm_distance_min > 0)
6657         {
6658             comm->cutoff_mbody = comm_distance_min;
6659             if (Flags & MD_DDBONDCOMM)
6660             {
6661                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6662             }
6663             else
6664             {
6665                 comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
6666             }
6667             r_bonded_limit = comm->cutoff_mbody;
6668         }
6669         else if (ir->bPeriodicMols)
6670         {
6671             /* Can not easily determine the required cut-off */
6672             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6673             comm->cutoff_mbody = comm->cutoff/2;
6674             r_bonded_limit     = comm->cutoff_mbody;
6675         }
6676         else
6677         {
6678             if (MASTER(cr))
6679             {
6680                 dd_bonded_cg_distance(fplog, dd, mtop, ir, x, box,
6681                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6682             }
6683             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6684             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6685
6686             /* We use an initial margin of 10% for the minimum cell size,
6687              * except when we are just below the non-bonded cut-off.
6688              */
6689             if (Flags & MD_DDBONDCOMM)
6690             {
6691                 if (max(r_2b, r_mb) > comm->cutoff)
6692                 {
6693                     r_bonded        = max(r_2b, r_mb);
6694                     r_bonded_limit  = 1.1*r_bonded;
6695                     comm->bBondComm = TRUE;
6696                 }
6697                 else
6698                 {
6699                     r_bonded       = r_mb;
6700                     r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
6701                 }
6702                 /* We determine cutoff_mbody later */
6703             }
6704             else
6705             {
6706                 /* No special bonded communication,
6707                  * simply increase the DD cut-off.
6708                  */
6709                 r_bonded_limit     = 1.1*max(r_2b, r_mb);
6710                 comm->cutoff_mbody = r_bonded_limit;
6711                 comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
6712             }
6713         }
6714         comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
6715         if (fplog)
6716         {
6717             fprintf(fplog,
6718                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6719                     comm->cellsize_limit);
6720         }
6721     }
6722
6723     if (dd->bInterCGcons && rconstr <= 0)
6724     {
6725         /* There is a cell size limit due to the constraints (P-LINCS) */
6726         rconstr = constr_r_max(fplog, mtop, ir);
6727         if (fplog)
6728         {
6729             fprintf(fplog,
6730                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6731                     rconstr);
6732             if (rconstr > comm->cellsize_limit)
6733             {
6734                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6735             }
6736         }
6737     }
6738     else if (rconstr > 0 && fplog)
6739     {
6740         /* Here we do not check for dd->bInterCGcons,
6741          * because one can also set a cell size limit for virtual sites only
6742          * and at this point we don't know yet if there are intercg v-sites.
6743          */
6744         fprintf(fplog,
6745                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6746                 rconstr);
6747     }
6748     comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
6749
6750     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6751
6752     if (nc[XX] > 0)
6753     {
6754         copy_ivec(nc, dd->nc);
6755         set_dd_dim(fplog, dd);
6756         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6757
6758         if (cr->npmenodes == -1)
6759         {
6760             cr->npmenodes = 0;
6761         }
6762         acs = average_cellsize_min(dd, ddbox);
6763         if (acs < comm->cellsize_limit)
6764         {
6765             if (fplog)
6766             {
6767                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6768             }
6769             gmx_fatal_collective(FARGS, cr, NULL,
6770                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6771                                  acs, comm->cellsize_limit);
6772         }
6773     }
6774     else
6775     {
6776         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6777
6778         /* We need to choose the optimal DD grid and possibly PME nodes */
6779         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6780                                comm->eDLB != edlbNO, dlb_scale,
6781                                comm->cellsize_limit, comm->cutoff,
6782                                comm->bInterCGBondeds, comm->bInterCGMultiBody);
6783
6784         if (dd->nc[XX] == 0)
6785         {
6786             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6787             sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
6788                     !bC ? "-rdd" : "-rcon",
6789                     comm->eDLB != edlbNO ? " or -dds" : "",
6790                     bC ? " or your LINCS settings" : "");
6791
6792             gmx_fatal_collective(FARGS, cr, NULL,
6793                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6794                                  "%s\n"
6795                                  "Look in the log file for details on the domain decomposition",
6796                                  cr->nnodes-cr->npmenodes, limit, buf);
6797         }
6798         set_dd_dim(fplog, dd);
6799     }
6800
6801     if (fplog)
6802     {
6803         fprintf(fplog,
6804                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6805                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6806     }
6807
6808     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6809     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6810     {
6811         gmx_fatal_collective(FARGS, cr, NULL,
6812                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6813                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6814     }
6815     if (cr->npmenodes > dd->nnodes)
6816     {
6817         gmx_fatal_collective(FARGS, cr, NULL,
6818                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6819     }
6820     if (cr->npmenodes > 0)
6821     {
6822         comm->npmenodes = cr->npmenodes;
6823     }
6824     else
6825     {
6826         comm->npmenodes = dd->nnodes;
6827     }
6828
6829     if (EEL_PME(ir->coulombtype))
6830     {
6831         /* The following choices should match those
6832          * in comm_cost_est in domdec_setup.c.
6833          * Note that here the checks have to take into account
6834          * that the decomposition might occur in a different order than xyz
6835          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6836          * in which case they will not match those in comm_cost_est,
6837          * but since that is mainly for testing purposes that's fine.
6838          */
6839         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6840             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6841             getenv("GMX_PMEONEDD") == NULL)
6842         {
6843             comm->npmedecompdim = 2;
6844             comm->npmenodes_x   = dd->nc[XX];
6845             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6846         }
6847         else
6848         {
6849             /* In case nc is 1 in both x and y we could still choose to
6850              * decompose pme in y instead of x, but we use x for simplicity.
6851              */
6852             comm->npmedecompdim = 1;
6853             if (dd->dim[0] == YY)
6854             {
6855                 comm->npmenodes_x = 1;
6856                 comm->npmenodes_y = comm->npmenodes;
6857             }
6858             else
6859             {
6860                 comm->npmenodes_x = comm->npmenodes;
6861                 comm->npmenodes_y = 1;
6862             }
6863         }
6864         if (fplog)
6865         {
6866             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6867                     comm->npmenodes_x, comm->npmenodes_y, 1);
6868         }
6869     }
6870     else
6871     {
6872         comm->npmedecompdim = 0;
6873         comm->npmenodes_x   = 0;
6874         comm->npmenodes_y   = 0;
6875     }
6876
6877     /* Technically we don't need both of these,
6878      * but it simplifies code not having to recalculate it.
6879      */
6880     *npme_x = comm->npmenodes_x;
6881     *npme_y = comm->npmenodes_y;
6882
6883     snew(comm->slb_frac, DIM);
6884     if (comm->eDLB == edlbNO)
6885     {
6886         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6887         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6888         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6889     }
6890
6891     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6892     {
6893         if (comm->bBondComm || comm->eDLB != edlbNO)
6894         {
6895             /* Set the bonded communication distance to halfway
6896              * the minimum and the maximum,
6897              * since the extra communication cost is nearly zero.
6898              */
6899             acs                = average_cellsize_min(dd, ddbox);
6900             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6901             if (comm->eDLB != edlbNO)
6902             {
6903                 /* Check if this does not limit the scaling */
6904                 comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
6905             }
6906             if (!comm->bBondComm)
6907             {
6908                 /* Without bBondComm do not go beyond the n.b. cut-off */
6909                 comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
6910                 if (comm->cellsize_limit >= comm->cutoff)
6911                 {
6912                     /* We don't loose a lot of efficieny
6913                      * when increasing it to the n.b. cut-off.
6914                      * It can even be slightly faster, because we need
6915                      * less checks for the communication setup.
6916                      */
6917                     comm->cutoff_mbody = comm->cutoff;
6918                 }
6919             }
6920             /* Check if we did not end up below our original limit */
6921             comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
6922
6923             if (comm->cutoff_mbody > comm->cellsize_limit)
6924             {
6925                 comm->cellsize_limit = comm->cutoff_mbody;
6926             }
6927         }
6928         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6929     }
6930
6931     if (debug)
6932     {
6933         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6934                 "cellsize limit %f\n",
6935                 comm->bBondComm, comm->cellsize_limit);
6936     }
6937
6938     if (MASTER(cr))
6939     {
6940         check_dd_restrictions(cr, dd, ir, fplog);
6941     }
6942
6943     comm->partition_step = INT_MIN;
6944     dd->ddp_count        = 0;
6945
6946     clear_dd_cycle_counts(dd);
6947
6948     return dd;
6949 }
6950
6951 static void set_dlb_limits(gmx_domdec_t *dd)
6952
6953 {
6954     int d;
6955
6956     for (d = 0; d < dd->ndim; d++)
6957     {
6958         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
6959         dd->comm->cellsize_min[dd->dim[d]] =
6960             dd->comm->cellsize_min_dlb[dd->dim[d]];
6961     }
6962 }
6963
6964
6965 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
6966 {
6967     gmx_domdec_t      *dd;
6968     gmx_domdec_comm_t *comm;
6969     real               cellsize_min;
6970     int                d, nc, i;
6971     char               buf[STRLEN];
6972
6973     dd   = cr->dd;
6974     comm = dd->comm;
6975
6976     if (fplog)
6977     {
6978         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
6979     }
6980
6981     cellsize_min = comm->cellsize_min[dd->dim[0]];
6982     for (d = 1; d < dd->ndim; d++)
6983     {
6984         cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6985     }
6986
6987     if (cellsize_min < comm->cellsize_limit*1.05)
6988     {
6989         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6990
6991         /* Change DLB from "auto" to "no". */
6992         comm->eDLB = edlbNO;
6993
6994         return;
6995     }
6996
6997     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
6998     comm->bDynLoadBal = TRUE;
6999     dd->bGridJump     = TRUE;
7000
7001     set_dlb_limits(dd);
7002
7003     /* We can set the required cell size info here,
7004      * so we do not need to communicate this.
7005      * The grid is completely uniform.
7006      */
7007     for (d = 0; d < dd->ndim; d++)
7008     {
7009         if (comm->root[d])
7010         {
7011             comm->load[d].sum_m = comm->load[d].sum;
7012
7013             nc = dd->nc[dd->dim[d]];
7014             for (i = 0; i < nc; i++)
7015             {
7016                 comm->root[d]->cell_f[i]    = i/(real)nc;
7017                 if (d > 0)
7018                 {
7019                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
7020                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
7021                 }
7022             }
7023             comm->root[d]->cell_f[nc] = 1.0;
7024         }
7025     }
7026 }
7027
7028 static char *init_bLocalCG(gmx_mtop_t *mtop)
7029 {
7030     int   ncg, cg;
7031     char *bLocalCG;
7032
7033     ncg = ncg_mtop(mtop);
7034     snew(bLocalCG, ncg);
7035     for (cg = 0; cg < ncg; cg++)
7036     {
7037         bLocalCG[cg] = FALSE;
7038     }
7039
7040     return bLocalCG;
7041 }
7042
7043 void dd_init_bondeds(FILE *fplog,
7044                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
7045                      gmx_vsite_t *vsite, gmx_constr_t constr,
7046                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
7047 {
7048     gmx_domdec_comm_t *comm;
7049     gmx_bool           bBondComm;
7050     int                d;
7051
7052     dd_make_reverse_top(fplog, dd, mtop, vsite, constr, ir, bBCheck);
7053
7054     comm = dd->comm;
7055
7056     if (comm->bBondComm)
7057     {
7058         /* Communicate atoms beyond the cut-off for bonded interactions */
7059         comm = dd->comm;
7060
7061         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
7062
7063         comm->bLocalCG = init_bLocalCG(mtop);
7064     }
7065     else
7066     {
7067         /* Only communicate atoms based on cut-off */
7068         comm->cglink   = NULL;
7069         comm->bLocalCG = NULL;
7070     }
7071 }
7072
7073 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
7074                               t_inputrec *ir,
7075                               gmx_bool bDynLoadBal, real dlb_scale,
7076                               gmx_ddbox_t *ddbox)
7077 {
7078     gmx_domdec_comm_t *comm;
7079     int                d;
7080     ivec               np;
7081     real               limit, shrink;
7082     char               buf[64];
7083
7084     if (fplog == NULL)
7085     {
7086         return;
7087     }
7088
7089     comm = dd->comm;
7090
7091     if (bDynLoadBal)
7092     {
7093         fprintf(fplog, "The maximum number of communication pulses is:");
7094         for (d = 0; d < dd->ndim; d++)
7095         {
7096             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
7097         }
7098         fprintf(fplog, "\n");
7099         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
7100         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
7101         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
7102         for (d = 0; d < DIM; d++)
7103         {
7104             if (dd->nc[d] > 1)
7105             {
7106                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7107                 {
7108                     shrink = 0;
7109                 }
7110                 else
7111                 {
7112                     shrink =
7113                         comm->cellsize_min_dlb[d]/
7114                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7115                 }
7116                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
7117             }
7118         }
7119         fprintf(fplog, "\n");
7120     }
7121     else
7122     {
7123         set_dd_cell_sizes_slb(dd, ddbox, FALSE, np);
7124         fprintf(fplog, "The initial number of communication pulses is:");
7125         for (d = 0; d < dd->ndim; d++)
7126         {
7127             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
7128         }
7129         fprintf(fplog, "\n");
7130         fprintf(fplog, "The initial domain decomposition cell size is:");
7131         for (d = 0; d < DIM; d++)
7132         {
7133             if (dd->nc[d] > 1)
7134             {
7135                 fprintf(fplog, " %c %.2f nm",
7136                         dim2char(d), dd->comm->cellsize_min[d]);
7137             }
7138         }
7139         fprintf(fplog, "\n\n");
7140     }
7141
7142     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7143     {
7144         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7145         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7146                 "non-bonded interactions", "", comm->cutoff);
7147
7148         if (bDynLoadBal)
7149         {
7150             limit = dd->comm->cellsize_limit;
7151         }
7152         else
7153         {
7154             if (dynamic_dd_box(ddbox, ir))
7155             {
7156                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7157             }
7158             limit = dd->comm->cellsize_min[XX];
7159             for (d = 1; d < DIM; d++)
7160             {
7161                 limit = min(limit, dd->comm->cellsize_min[d]);
7162             }
7163         }
7164
7165         if (comm->bInterCGBondeds)
7166         {
7167             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7168                     "two-body bonded interactions", "(-rdd)",
7169                     max(comm->cutoff, comm->cutoff_mbody));
7170             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7171                     "multi-body bonded interactions", "(-rdd)",
7172                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
7173         }
7174         if (dd->vsite_comm)
7175         {
7176             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7177                     "virtual site constructions", "(-rcon)", limit);
7178         }
7179         if (dd->constraint_comm)
7180         {
7181             sprintf(buf, "atoms separated by up to %d constraints",
7182                     1+ir->nProjOrder);
7183             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7184                     buf, "(-rcon)", limit);
7185         }
7186         fprintf(fplog, "\n");
7187     }
7188
7189     fflush(fplog);
7190 }
7191
7192 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7193                                 real               dlb_scale,
7194                                 const t_inputrec  *ir,
7195                                 const gmx_ddbox_t *ddbox)
7196 {
7197     gmx_domdec_comm_t *comm;
7198     int                d, dim, npulse, npulse_d_max, npulse_d;
7199     gmx_bool           bNoCutOff;
7200
7201     comm = dd->comm;
7202
7203     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7204
7205     /* Determine the maximum number of comm. pulses in one dimension */
7206
7207     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7208
7209     /* Determine the maximum required number of grid pulses */
7210     if (comm->cellsize_limit >= comm->cutoff)
7211     {
7212         /* Only a single pulse is required */
7213         npulse = 1;
7214     }
7215     else if (!bNoCutOff && comm->cellsize_limit > 0)
7216     {
7217         /* We round down slightly here to avoid overhead due to the latency
7218          * of extra communication calls when the cut-off
7219          * would be only slightly longer than the cell size.
7220          * Later cellsize_limit is redetermined,
7221          * so we can not miss interactions due to this rounding.
7222          */
7223         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7224     }
7225     else
7226     {
7227         /* There is no cell size limit */
7228         npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7229     }
7230
7231     if (!bNoCutOff && npulse > 1)
7232     {
7233         /* See if we can do with less pulses, based on dlb_scale */
7234         npulse_d_max = 0;
7235         for (d = 0; d < dd->ndim; d++)
7236         {
7237             dim      = dd->dim[d];
7238             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7239                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7240             npulse_d_max = max(npulse_d_max, npulse_d);
7241         }
7242         npulse = min(npulse, npulse_d_max);
7243     }
7244
7245     /* This env var can override npulse */
7246     d = dd_nst_env(debug, "GMX_DD_NPULSE", 0);
7247     if (d > 0)
7248     {
7249         npulse = d;
7250     }
7251
7252     comm->maxpulse       = 1;
7253     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7254     for (d = 0; d < dd->ndim; d++)
7255     {
7256         comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
7257         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7258         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7259         comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
7260         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7261         {
7262             comm->bVacDLBNoLimit = FALSE;
7263         }
7264     }
7265
7266     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7267     if (!comm->bVacDLBNoLimit)
7268     {
7269         comm->cellsize_limit = max(comm->cellsize_limit,
7270                                    comm->cutoff/comm->maxpulse);
7271     }
7272     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7273     /* Set the minimum cell size for each DD dimension */
7274     for (d = 0; d < dd->ndim; d++)
7275     {
7276         if (comm->bVacDLBNoLimit ||
7277             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7278         {
7279             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7280         }
7281         else
7282         {
7283             comm->cellsize_min_dlb[dd->dim[d]] =
7284                 comm->cutoff/comm->cd[d].np_dlb;
7285         }
7286     }
7287     if (comm->cutoff_mbody <= 0)
7288     {
7289         comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
7290     }
7291     if (comm->bDynLoadBal)
7292     {
7293         set_dlb_limits(dd);
7294     }
7295 }
7296
7297 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7298 {
7299     /* If each molecule is a single charge group
7300      * or we use domain decomposition for each periodic dimension,
7301      * we do not need to take pbc into account for the bonded interactions.
7302      */
7303     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7304             !(dd->nc[XX] > 1 &&
7305               dd->nc[YY] > 1 &&
7306               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7307 }
7308
7309 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7310                        t_inputrec *ir, t_forcerec *fr,
7311                        gmx_ddbox_t *ddbox)
7312 {
7313     gmx_domdec_comm_t *comm;
7314     int                natoms_tot;
7315     real               vol_frac;
7316
7317     comm = dd->comm;
7318
7319     /* Initialize the thread data.
7320      * This can not be done in init_domain_decomposition,
7321      * as the numbers of threads is determined later.
7322      */
7323     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7324     if (comm->nth > 1)
7325     {
7326         snew(comm->dth, comm->nth);
7327     }
7328
7329     if (EEL_PME(ir->coulombtype))
7330     {
7331         init_ddpme(dd, &comm->ddpme[0], 0);
7332         if (comm->npmedecompdim >= 2)
7333         {
7334             init_ddpme(dd, &comm->ddpme[1], 1);
7335         }
7336     }
7337     else
7338     {
7339         comm->npmenodes = 0;
7340         if (dd->pme_nodeid >= 0)
7341         {
7342             gmx_fatal_collective(FARGS, NULL, dd,
7343                                  "Can not have separate PME nodes without PME electrostatics");
7344         }
7345     }
7346
7347     if (debug)
7348     {
7349         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7350     }
7351     if (comm->eDLB != edlbNO)
7352     {
7353         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7354     }
7355
7356     print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
7357     if (comm->eDLB == edlbAUTO)
7358     {
7359         if (fplog)
7360         {
7361             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7362         }
7363         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7364     }
7365
7366     if (ir->ePBC == epbcNONE)
7367     {
7368         vol_frac = 1 - 1/(double)dd->nnodes;
7369     }
7370     else
7371     {
7372         vol_frac =
7373             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7374     }
7375     if (debug)
7376     {
7377         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7378     }
7379     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7380
7381     dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
7382 }
7383
7384 static gmx_bool test_dd_cutoff(t_commrec *cr,
7385                                t_state *state, t_inputrec *ir,
7386                                real cutoff_req)
7387 {
7388     gmx_domdec_t *dd;
7389     gmx_ddbox_t   ddbox;
7390     int           d, dim, np;
7391     real          inv_cell_size;
7392     int           LocallyLimited;
7393
7394     dd = cr->dd;
7395
7396     set_ddbox(dd, FALSE, cr, ir, state->box,
7397               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7398
7399     LocallyLimited = 0;
7400
7401     for (d = 0; d < dd->ndim; d++)
7402     {
7403         dim = dd->dim[d];
7404
7405         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7406         if (dynamic_dd_box(&ddbox, ir))
7407         {
7408             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7409         }
7410
7411         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7412
7413         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7414             dd->comm->cd[d].np_dlb > 0)
7415         {
7416             if (np > dd->comm->cd[d].np_dlb)
7417             {
7418                 return FALSE;
7419             }
7420
7421             /* If a current local cell size is smaller than the requested
7422              * cut-off, we could still fix it, but this gets very complicated.
7423              * Without fixing here, we might actually need more checks.
7424              */
7425             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7426             {
7427                 LocallyLimited = 1;
7428             }
7429         }
7430     }
7431
7432     if (dd->comm->eDLB != edlbNO)
7433     {
7434         /* If DLB is not active yet, we don't need to check the grid jumps.
7435          * Actually we shouldn't, because then the grid jump data is not set.
7436          */
7437         if (dd->comm->bDynLoadBal &&
7438             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7439         {
7440             LocallyLimited = 1;
7441         }
7442
7443         gmx_sumi(1, &LocallyLimited, cr);
7444
7445         if (LocallyLimited > 0)
7446         {
7447             return FALSE;
7448         }
7449     }
7450
7451     return TRUE;
7452 }
7453
7454 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7455                           real cutoff_req)
7456 {
7457     gmx_bool bCutoffAllowed;
7458
7459     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7460
7461     if (bCutoffAllowed)
7462     {
7463         cr->dd->comm->cutoff = cutoff_req;
7464     }
7465
7466     return bCutoffAllowed;
7467 }
7468
7469 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7470 {
7471     gmx_domdec_comm_t *comm;
7472
7473     comm = cr->dd->comm;
7474
7475     /* Turn on the DLB limiting (might have been on already) */
7476     comm->bPMELoadBalDLBLimits = TRUE;
7477
7478     /* Change the cut-off limit */
7479     comm->PMELoadBal_max_cutoff = comm->cutoff;
7480 }
7481
/* Merge the charge groups received in pulse 'pulse' of the DD dimension
 * described by cd into the local per-cell arrays.
 *
 * ncell     - number of cells for which data was received
 * cd, pulse - the receive counts are taken from cd->ind[pulse]; the stored
 *             send indices of pulses 1 up to and including 'pulse' are
 *             corrected for the data that gets moved
 * ncg_cell  - charge group range boundaries; the range of cell 'cell' is
 *             read from entries ncell+cell and ncell+cell+1, and entry
 *             ncell+cell+1 is updated to the new end of the range
 * index_gl  - global charge group index per local charge group (updated)
 * recv_i    - received global charge group indices
 * cg_cm     - per charge group positions (updated)
 * recv_vr   - received positions
 * cgindex   - atom index boundaries of the local charge groups (updated)
 * cginfo_mb - passed to ddcginfo() to look up the info of received cg's
 * cginfo    - per local charge group info (updated)
 *
 * All arrays are modified in place; the code does not check that they are
 * large enough to hold the merged data.
 */
static void merge_cg_buffers(int ncell,
                             gmx_domdec_comm_dim_t *cd, int pulse,
                             int  *ncg_cell,
                             int  *index_gl, int  *recv_i,
                             rvec *cg_cm,    rvec *recv_vr,
                             int *cgindex,
                             cginfo_mb_t *cginfo_mb, int *cginfo)
{
    gmx_domdec_ind_t *ind, *ind_p;
    int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
    int               shift, shift_at;

    ind = &cd->ind[pulse];

    /* First correct the already stored data */
    /* NOTE(review): nrecv[ncell] presumably holds the total receive count
     * over all cells, so that inside the loop 'shift' equals the number
     * of charge groups received for the cells before 'cell' - confirm.
     */
    shift = ind->nrecv[ncell];
    /* Run over the cells in reverse, so data is moved up before the space
     * it moves into gets overwritten by the data of a lower cell.
     */
    for (cell = ncell-1; cell >= 0; cell--)
    {
        shift -= ind->nrecv[cell];
        if (shift > 0)
        {
            /* Move the cg's present from previous grid pulses */
            cg0                = ncg_cell[ncell+cell];
            cg1                = ncg_cell[ncell+cell+1];
            cgindex[cg1+shift] = cgindex[cg1];
            /* Copy from the top down, as source and destination overlap */
            for (cg = cg1-1; cg >= cg0; cg--)
            {
                index_gl[cg+shift] = index_gl[cg];
                copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
                cgindex[cg+shift] = cgindex[cg];
                cginfo[cg+shift]  = cginfo[cg];
            }
            /* Correct the already stored send indices for the shift */
            for (p = 1; p <= pulse; p++)
            {
                ind_p = &cd->ind[p];
                /* Locate the send entries of this cell in the index list */
                cg0   = 0;
                for (c = 0; c < cell; c++)
                {
                    cg0 += ind_p->nsend[c];
                }
                cg1 = cg0 + ind_p->nsend[cell];
                for (cg = cg0; cg < cg1; cg++)
                {
                    ind_p->index[cg] += shift;
                }
            }
        }
    }

    /* Merge in the communicated buffers */
    /* shift counts the charge groups and shift_at the atoms merged in
     * so far; cg0 is the read position in the receive buffers.
     */
    shift    = 0;
    shift_at = 0;
    cg0      = 0;
    for (cell = 0; cell < ncell; cell++)
    {
        /* cg1 is the write position: the shifted end of this cell's range */
        cg1 = ncg_cell[ncell+cell+1] + shift;
        if (shift_at > 0)
        {
            /* Correct the old cg indices */
            for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
            {
                cgindex[cg+1] += shift_at;
            }
        }
        for (cg = 0; cg < ind->nrecv[cell]; cg++)
        {
            /* Copy this charge group from the buffer */
            index_gl[cg1] = recv_i[cg0];
            copy_rvec(recv_vr[cg0], cg_cm[cg1]);
            /* Add it to the cgindex */
            cg_gl          = index_gl[cg1];
            cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
            nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
            cgindex[cg1+1] = cgindex[cg1] + nat;
            cg0++;
            cg1++;
            shift_at += nat;
        }
        shift                 += ind->nrecv[cell];
        /* Store the new end of the charge group range of this cell */
        ncg_cell[ncell+cell+1] = cg1;
    }
}
7565
7566 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7567                                int nzone, int cg0, const int *cgindex)
7568 {
7569     int cg, zone, p;
7570
7571     /* Store the atom block boundaries for easy copying of communication buffers
7572      */
7573     cg = cg0;
7574     for (zone = 0; zone < nzone; zone++)
7575     {
7576         for (p = 0; p < cd->np; p++)
7577         {
7578             cd->ind[p].cell2at0[zone] = cgindex[cg];
7579             cg += cd->ind[p].nrecv[zone];
7580             cd->ind[p].cell2at1[zone] = cgindex[cg];
7581         }
7582     }
7583 }
7584
7585 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7586 {
7587     int      i;
7588     gmx_bool bMiss;
7589
7590     bMiss = FALSE;
7591     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7592     {
7593         if (!bLocalCG[link->a[i]])
7594         {
7595             bMiss = TRUE;
7596         }
7597     }
7598
7599     return bMiss;
7600 }
7601
/* Domain corners for communication, a maximum of 4 i-zones see a j domain.
 * The values are filled in by set_dd_corners(); not all entries are set
 * for every decomposition, see the comments there.
 */
typedef struct {
    real c[DIM][4]; /* the corners for the non-bonded communication, indexed by DD dimension index and zone */
    real cr0;       /* corner for rounding, along the first DD dimension */
    real cr1[4];    /* corners for rounding, along the second DD dimension; only elements 0 and 3 are set */
    real bc[DIM];   /* corners for bonded communication, indexed by DD dimension index */
    real bcr1;      /* corner for rounding for bonded communication */
} dd_corners_t;
7610
7611 /* Determine the corners of the domain(s) we are communicating with */
7612 static void
7613 set_dd_corners(const gmx_domdec_t *dd,
7614                int dim0, int dim1, int dim2,
7615                gmx_bool bDistMB,
7616                dd_corners_t *c)
7617 {
7618     const gmx_domdec_comm_t  *comm;
7619     const gmx_domdec_zones_t *zones;
7620     int i, j;
7621
7622     comm = dd->comm;
7623
7624     zones = &comm->zones;
7625
7626     /* Keep the compiler happy */
7627     c->cr0  = 0;
7628     c->bcr1 = 0;
7629
7630     /* The first dimension is equal for all cells */
7631     c->c[0][0] = comm->cell_x0[dim0];
7632     if (bDistMB)
7633     {
7634         c->bc[0] = c->c[0][0];
7635     }
7636     if (dd->ndim >= 2)
7637     {
7638         dim1 = dd->dim[1];
7639         /* This cell row is only seen from the first row */
7640         c->c[1][0] = comm->cell_x0[dim1];
7641         /* All rows can see this row */
7642         c->c[1][1] = comm->cell_x0[dim1];
7643         if (dd->bGridJump)
7644         {
7645             c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7646             if (bDistMB)
7647             {
7648                 /* For the multi-body distance we need the maximum */
7649                 c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7650             }
7651         }
7652         /* Set the upper-right corner for rounding */
7653         c->cr0 = comm->cell_x1[dim0];
7654
7655         if (dd->ndim >= 3)
7656         {
7657             dim2 = dd->dim[2];
7658             for (j = 0; j < 4; j++)
7659             {
7660                 c->c[2][j] = comm->cell_x0[dim2];
7661             }
7662             if (dd->bGridJump)
7663             {
7664                 /* Use the maximum of the i-cells that see a j-cell */
7665                 for (i = 0; i < zones->nizone; i++)
7666                 {
7667                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7668                     {
7669                         if (j >= 4)
7670                         {
7671                             c->c[2][j-4] =
7672                                 max(c->c[2][j-4],
7673                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7674                         }
7675                     }
7676                 }
7677                 if (bDistMB)
7678                 {
7679                     /* For the multi-body distance we need the maximum */
7680                     c->bc[2] = comm->cell_x0[dim2];
7681                     for (i = 0; i < 2; i++)
7682                     {
7683                         for (j = 0; j < 2; j++)
7684                         {
7685                             c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
7686                         }
7687                     }
7688                 }
7689             }
7690
7691             /* Set the upper-right corner for rounding */
7692             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7693              * Only cell (0,0,0) can see cell 7 (1,1,1)
7694              */
7695             c->cr1[0] = comm->cell_x1[dim1];
7696             c->cr1[3] = comm->cell_x1[dim1];
7697             if (dd->bGridJump)
7698             {
7699                 c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7700                 if (bDistMB)
7701                 {
7702                     /* For the multi-body distance we need the maximum */
7703                     c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7704                 }
7705             }
7706         }
7707     }
7708 }
7709
/* Determine which cg's we need to send in this pulse from this zone.
 *
 * For every charge group in the range cg0 <= cg < cg1 a squared distance
 * r2 to the corner(s) in c is computed and, when multi-body bonded
 * distances are requested for this pulse, also a squared distance rb2.
 * A charge group is selected when r2 < r_comm2, or when the bonded
 * criteria at the end of the loop are met. Selected charge groups are
 * appended to ind->index (local index), *ibuf (global index) and vbuf
 * (position, shifted by box[dim] when dd->ci[dim] == 0).
 *
 * dd            - read for comm, bScrewPBC, dim[] and ci[]
 * zonei         - selects which rounding corrections apply
 * zone          - index into c->c[dim_ind][] and c->cr1[]
 * cg0, cg1      - range of local charge groups to examine
 * index_gl      - global charge group index per local charge group
 * cgindex       - atom index boundaries of the local charge groups
 * dim, dim_ind  - the dimension we communicate along and its DD index
 * dim0..dim2    - the DD dimensions
 * r_comm2       - squared cut-off compared to r2
 * r_bcomm2      - squared cut-off compared to rb2 (or r2 with bDist2B)
 * box           - the box
 * tric_dist     - per DD dimension index, non-zero selects the triclinic
 *                 distance calculation
 * normal, v_d, v_0, v_1, skew_fac2_d, skew_fac_01, sf2_round
 *               - triclinic correction data, only used in the triclinic
 *                 branch
 * c             - the corners to measure distances to
 * bDistBonded, bBondComm, bDist2B, bDistMB
 *               - select the bonded communication criteria
 * cg_cm         - charge group positions
 * cginfo        - charge group info, read for GET_CGINFO_BOND_INTER
 * ind           - ind->index is filled and reallocated when needed
 * ibuf, ibuf_nalloc - global index buffer and its allocation size,
 *                 reallocated when needed
 * vbuf          - position buffer
 * nsend_ptr     - in/out: running number of charge groups to send
 * nat_ptr       - in/out: running number of atoms to send
 * nsend_z_ptr   - out: number of charge groups selected in this call
 */
static void
get_zone_pulse_cgs(gmx_domdec_t *dd,
                   int zonei, int zone,
                   int cg0, int cg1,
                   const int *index_gl,
                   const int *cgindex,
                   int dim, int dim_ind,
                   int dim0, int dim1, int dim2,
                   real r_comm2, real r_bcomm2,
                   matrix box,
                   ivec tric_dist,
                   rvec *normal,
                   real skew_fac2_d, real skew_fac_01,
                   rvec *v_d, rvec *v_0, rvec *v_1,
                   const dd_corners_t *c,
                   rvec sf2_round,
                   gmx_bool bDistBonded,
                   gmx_bool bBondComm,
                   gmx_bool bDist2B,
                   gmx_bool bDistMB,
                   rvec *cg_cm,
                   int *cginfo,
                   gmx_domdec_ind_t *ind,
                   int **ibuf, int *ibuf_nalloc,
                   vec_rvec_t *vbuf,
                   int *nsend_ptr,
                   int *nat_ptr,
                   int *nsend_z_ptr)
{
    gmx_domdec_comm_t *comm;
    gmx_bool           bScrew;
    gmx_bool           bDistMB_pulse;
    int                cg, i;
    real               r2, rb2, r, tric_sh;
    rvec               rn, rb;
    int                dimd;
    int                nsend_z, nsend, nat;

    comm = dd->comm;

    /* With screw pbc, positions sent along XX need an extra correction */
    bScrew = (dd->bScrewPBC && dim == XX);

    /* The multi-body distance rb2 is only computed when requested
     * for this pulse.
     */
    bDistMB_pulse = (bDistMB && bDistBonded);

    /* nsend and nat continue from the caller's totals,
     * nsend_z only counts this call.
     */
    nsend_z = 0;
    nsend   = *nsend_ptr;
    nat     = *nat_ptr;

    for (cg = cg0; cg < cg1; cg++)
    {
        /* r2: squared distance for the non-bonded criterion,
         * rb2: squared distance for the multi-body bonded criterion.
         */
        r2  = 0;
        rb2 = 0;
        if (tric_dist[dim_ind] == 0)
        {
            /* Rectangular direction, easy */
            r = cg_cm[cg][dim] - c->c[dim_ind][zone];
            if (r > 0)
            {
                r2 += r*r;
            }
            if (bDistMB_pulse)
            {
                r = cg_cm[cg][dim] - c->bc[dim_ind];
                if (r > 0)
                {
                    rb2 += r*r;
                }
            }
            /* Rounding gives at most a 16% reduction
             * in communicated atoms
             */
            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
            {
                r = cg_cm[cg][dim0] - c->cr0;
                /* This is the first dimension, so always r >= 0 */
                r2 += r*r;
                if (bDistMB_pulse)
                {
                    rb2 += r*r;
                }
            }
            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
            {
                r = cg_cm[cg][dim1] - c->cr1[zone];
                if (r > 0)
                {
                    r2 += r*r;
                }
                if (bDistMB_pulse)
                {
                    r = cg_cm[cg][dim1] - c->bcr1;
                    if (r > 0)
                    {
                        rb2 += r*r;
                    }
                }
            }
        }
        else
        {
            /* Triclinic direction, more complicated */
            /* rn and rb accumulate the per-dimension distance components
             * for r2 and rb2 respectively.
             */
            clear_rvec(rn);
            clear_rvec(rb);
            /* Rounding, conservative as the skew_fac multiplication
             * will slightly underestimate the distance.
             */
            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
            {
                rn[dim0] = cg_cm[cg][dim0] - c->cr0;
                for (i = dim0+1; i < DIM; i++)
                {
                    rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
                }
                r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
                if (bDistMB_pulse)
                {
                    rb[dim0] = rn[dim0];
                    rb2      = r2;
                }
                /* Take care that the cell planes along dim0 might not
                 * be orthogonal to those along dim1 and dim2.
                 */
                for (i = 1; i <= dim_ind; i++)
                {
                    dimd = dd->dim[i];
                    if (normal[dim0][dimd] > 0)
                    {
                        rn[dimd] -= rn[dim0]*normal[dim0][dimd];
                        if (bDistMB_pulse)
                        {
                            rb[dimd] -= rb[dim0]*normal[dim0][dimd];
                        }
                    }
                }
            }
            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
            {
                rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
                /* tric_sh is shared by the rn and rb distance along dim1 */
                tric_sh   = 0;
                for (i = dim1+1; i < DIM; i++)
                {
                    tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
                }
                rn[dim1] += tric_sh;
                if (rn[dim1] > 0)
                {
                    r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
                    /* Take care of coupling of the distances
                     * to the planes along dim0 and dim1 through dim2.
                     */
                    r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
                    /* Take care that the cell planes along dim1
                     * might not be orthogonal to that along dim2.
                     */
                    if (normal[dim1][dim2] > 0)
                    {
                        rn[dim2] -= rn[dim1]*normal[dim1][dim2];
                    }
                }
                if (bDistMB_pulse)
                {
                    rb[dim1] +=
                        cg_cm[cg][dim1] - c->bcr1 + tric_sh;
                    if (rb[dim1] > 0)
                    {
                        rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
                        /* Take care of coupling of the distances
                         * to the planes along dim0 and dim1 through dim2.
                         */
                        rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
                        /* Take care that the cell planes along dim1
                         * might not be orthogonal to that along dim2.
                         */
                        if (normal[dim1][dim2] > 0)
                        {
                            rb[dim2] -= rb[dim1]*normal[dim1][dim2];
                        }
                    }
                }
            }
            /* The distance along the communication direction */
            rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
            tric_sh  = 0;
            for (i = dim+1; i < DIM; i++)
            {
                tric_sh -= cg_cm[cg][i]*v_d[i][dim];
            }
            rn[dim] += tric_sh;
            if (rn[dim] > 0)
            {
                r2 += rn[dim]*rn[dim]*skew_fac2_d;
                /* Take care of coupling of the distances
                 * to the planes along dim0 and dim1 through dim2.
                 */
                if (dim_ind == 1 && zonei == 1)
                {
                    r2 -= rn[dim0]*rn[dim]*skew_fac_01;
                }
            }
            if (bDistMB_pulse)
            {
                /* NOTE(review): rb is cleared here, after the rounding
                 * code above stored components in it. Assuming clear_rvec
                 * zeroes the vector, the corrections accumulated in
                 * rb[dim] are dropped and rb[dim0] is zero in the
                 * coupling term below, unlike the rn code path.
                 * Confirm that this is intended.
                 */
                clear_rvec(rb);
                rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
                if (rb[dim] > 0)
                {
                    rb2 += rb[dim]*rb[dim]*skew_fac2_d;
                    /* Take care of coupling of the distances
                     * to the planes along dim0 and dim1 through dim2.
                     */
                    if (dim_ind == 1 && zonei == 1)
                    {
                        rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
                    }
                }
            }
        }

        /* Send the charge group when it is within the non-bonded cut-off,
         * or when bonded distances are checked in this pulse and it is
         * within the bonded cut-off (using rb2 with bDistMB, r2 with
         * bDist2B). With bBondComm the latter additionally requires
         * the charge group to have the BOND_INTER flag set and to have
         * a linked charge group that is missing locally.
         */
        if (r2 < r_comm2 ||
            (bDistBonded &&
             ((bDistMB && rb2 < r_bcomm2) ||
              (bDist2B && r2  < r_bcomm2)) &&
             (!bBondComm ||
              (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
               missing_link(comm->cglink, index_gl[cg],
                            comm->bLocalCG)))))
        {
            /* Make an index to the local charge groups */
            if (nsend+1 > ind->nalloc)
            {
                ind->nalloc = over_alloc_large(nsend+1);
                srenew(ind->index, ind->nalloc);
            }
            if (nsend+1 > *ibuf_nalloc)
            {
                *ibuf_nalloc = over_alloc_large(nsend+1);
                srenew(*ibuf, *ibuf_nalloc);
            }
            ind->index[nsend] = cg;
            (*ibuf)[nsend]    = index_gl[cg];
            nsend_z++;
            vec_rvec_check_alloc(vbuf, nsend+1);

            if (dd->ci[dim] == 0)
            {
                /* Correct cg_cm for pbc */
                rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
                if (bScrew)
                {
                    /* Mirror the YY and ZZ components within the box */
                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
                }
            }
            else
            {
                copy_rvec(cg_cm[cg], vbuf->v[nsend]);
            }
            nsend++;
            /* Add the number of atoms in this charge group */
            nat += cgindex[cg+1] - cgindex[cg];
        }
    }

    /* Return the updated totals and the count for this zone */
    *nsend_ptr   = nsend;
    *nat_ptr     = nat;
    *nsend_z_ptr = nsend_z;
}
7976
7977 static void setup_dd_communication(gmx_domdec_t *dd,
7978                                    matrix box, gmx_ddbox_t *ddbox,
7979                                    t_forcerec *fr, t_state *state, rvec **f)
7980 {
7981     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7982     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
7983     int                    c, i, j, cg, cg_gl, nrcg;
7984     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7985     gmx_domdec_comm_t     *comm;
7986     gmx_domdec_zones_t    *zones;
7987     gmx_domdec_comm_dim_t *cd;
7988     gmx_domdec_ind_t      *ind;
7989     cginfo_mb_t           *cginfo_mb;
7990     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
7991     real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
7992     dd_corners_t           corners;
7993     ivec                   tric_dist;
7994     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
7995     real                   skew_fac2_d, skew_fac_01;
7996     rvec                   sf2_round;
7997     int                    nsend, nat;
7998     int                    th;
7999
8000     if (debug)
8001     {
8002         fprintf(debug, "Setting up DD communication\n");
8003     }
8004
8005     comm  = dd->comm;
8006
8007     switch (fr->cutoff_scheme)
8008     {
8009         case ecutsGROUP:
8010             cg_cm = fr->cg_cm;
8011             break;
8012         case ecutsVERLET:
8013             cg_cm = state->x;
8014             break;
8015         default:
8016             gmx_incons("unimplemented");
8017             cg_cm = NULL;
8018     }
8019
8020     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8021     {
8022         dim = dd->dim[dim_ind];
8023
8024         /* Check if we need to use triclinic distances */
8025         tric_dist[dim_ind] = 0;
8026         for (i = 0; i <= dim_ind; i++)
8027         {
8028             if (ddbox->tric_dir[dd->dim[i]])
8029             {
8030                 tric_dist[dim_ind] = 1;
8031             }
8032         }
8033     }
8034
8035     bBondComm = comm->bBondComm;
8036
8037     /* Do we need to determine extra distances for multi-body bondeds? */
8038     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8039
8040     /* Do we need to determine extra distances for only two-body bondeds? */
8041     bDist2B = (bBondComm && !bDistMB);
8042
8043     r_comm2  = sqr(comm->cutoff);
8044     r_bcomm2 = sqr(comm->cutoff_mbody);
8045
8046     if (debug)
8047     {
8048         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
8049     }
8050
8051     zones = &comm->zones;
8052
8053     dim0 = dd->dim[0];
8054     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8055     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8056
8057     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8058
8059     /* Triclinic stuff */
8060     normal      = ddbox->normal;
8061     skew_fac_01 = 0;
8062     if (dd->ndim >= 2)
8063     {
8064         v_0 = ddbox->v[dim0];
8065         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8066         {
8067             /* Determine the coupling coefficient for the distances
8068              * to the cell planes along dim0 and dim1 through dim2.
8069              * This is required for correct rounding.
8070              */
8071             skew_fac_01 =
8072                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8073             if (debug)
8074             {
8075                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8076             }
8077         }
8078     }
8079     if (dd->ndim >= 3)
8080     {
8081         v_1 = ddbox->v[dim1];
8082     }
8083
8084     zone_cg_range = zones->cg_range;
8085     index_gl      = dd->index_gl;
8086     cgindex       = dd->cgindex;
8087     cginfo_mb     = fr->cginfo_mb;
8088
8089     zone_cg_range[0]   = 0;
8090     zone_cg_range[1]   = dd->ncg_home;
8091     comm->zone_ncg1[0] = dd->ncg_home;
8092     pos_cg             = dd->ncg_home;
8093
8094     nat_tot = dd->nat_home;
8095     nzone   = 1;
8096     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8097     {
8098         dim = dd->dim[dim_ind];
8099         cd  = &comm->cd[dim_ind];
8100
8101         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8102         {
8103             /* No pbc in this dimension, the first node should not comm. */
8104             nzone_send = 0;
8105         }
8106         else
8107         {
8108             nzone_send = nzone;
8109         }
8110
8111         v_d         = ddbox->v[dim];
8112         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8113
8114         cd->bInPlace = TRUE;
8115         for (p = 0; p < cd->np; p++)
8116         {
8117             /* Only atoms communicated in the first pulse are used
8118              * for multi-body bonded interactions or for bBondComm.
8119              */
8120             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8121
8122             ind   = &cd->ind[p];
8123             nsend = 0;
8124             nat   = 0;
8125             for (zone = 0; zone < nzone_send; zone++)
8126             {
8127                 if (tric_dist[dim_ind] && dim_ind > 0)
8128                 {
8129                     /* Determine slightly more optimized skew_fac's
8130                      * for rounding.
8131                      * This reduces the number of communicated atoms
8132                      * by about 10% for 3D DD of rhombic dodecahedra.
8133                      */
8134                     for (dimd = 0; dimd < dim; dimd++)
8135                     {
8136                         sf2_round[dimd] = 1;
8137                         if (ddbox->tric_dir[dimd])
8138                         {
8139                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8140                             {
8141                                 /* If we are shifted in dimension i
8142                                  * and the cell plane is tilted forward
8143                                  * in dimension i, skip this coupling.
8144                                  */
8145                                 if (!(zones->shift[nzone+zone][i] &&
8146                                       ddbox->v[dimd][i][dimd] >= 0))
8147                                 {
8148                                     sf2_round[dimd] +=
8149                                         sqr(ddbox->v[dimd][i][dimd]);
8150                                 }
8151                             }
8152                             sf2_round[dimd] = 1/sf2_round[dimd];
8153                         }
8154                     }
8155                 }
8156
8157                 zonei = zone_perm[dim_ind][zone];
8158                 if (p == 0)
8159                 {
8160                     /* Here we permutate the zones to obtain a convenient order
8161                      * for neighbor searching
8162                      */
8163                     cg0 = zone_cg_range[zonei];
8164                     cg1 = zone_cg_range[zonei+1];
8165                 }
8166                 else
8167                 {
8168                     /* Look only at the cg's received in the previous grid pulse
8169                      */
8170                     cg1 = zone_cg_range[nzone+zone+1];
8171                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8172                 }
8173
8174 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8175                 for (th = 0; th < comm->nth; th++)
8176                 {
8177                     gmx_domdec_ind_t *ind_p;
8178                     int             **ibuf_p, *ibuf_nalloc_p;
8179                     vec_rvec_t       *vbuf_p;
8180                     int              *nsend_p, *nat_p;
8181                     int              *nsend_zone_p;
8182                     int               cg0_th, cg1_th;
8183
8184                     if (th == 0)
8185                     {
8186                         /* Thread 0 writes in the comm buffers */
8187                         ind_p         = ind;
8188                         ibuf_p        = &comm->buf_int;
8189                         ibuf_nalloc_p = &comm->nalloc_int;
8190                         vbuf_p        = &comm->vbuf;
8191                         nsend_p       = &nsend;
8192                         nat_p         = &nat;
8193                         nsend_zone_p  = &ind->nsend[zone];
8194                     }
8195                     else
8196                     {
8197                         /* Other threads write into temp buffers */
8198                         ind_p         = &comm->dth[th].ind;
8199                         ibuf_p        = &comm->dth[th].ibuf;
8200                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8201                         vbuf_p        = &comm->dth[th].vbuf;
8202                         nsend_p       = &comm->dth[th].nsend;
8203                         nat_p         = &comm->dth[th].nat;
8204                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8205
8206                         comm->dth[th].nsend      = 0;
8207                         comm->dth[th].nat        = 0;
8208                         comm->dth[th].nsend_zone = 0;
8209                     }
8210
8211                     if (comm->nth == 1)
8212                     {
8213                         cg0_th = cg0;
8214                         cg1_th = cg1;
8215                     }
8216                     else
8217                     {
8218                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8219                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8220                     }
8221
8222                     /* Get the cg's for this pulse in this zone */
8223                     get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8224                                        index_gl, cgindex,
8225                                        dim, dim_ind, dim0, dim1, dim2,
8226                                        r_comm2, r_bcomm2,
8227                                        box, tric_dist,
8228                                        normal, skew_fac2_d, skew_fac_01,
8229                                        v_d, v_0, v_1, &corners, sf2_round,
8230                                        bDistBonded, bBondComm,
8231                                        bDist2B, bDistMB,
8232                                        cg_cm, fr->cginfo,
8233                                        ind_p,
8234                                        ibuf_p, ibuf_nalloc_p,
8235                                        vbuf_p,
8236                                        nsend_p, nat_p,
8237                                        nsend_zone_p);
8238                 }
8239
8240                 /* Append data of threads>=1 to the communication buffers */
8241                 for (th = 1; th < comm->nth; th++)
8242                 {
8243                     dd_comm_setup_work_t *dth;
8244                     int                   i, ns1;
8245
8246                     dth = &comm->dth[th];
8247
8248                     ns1 = nsend + dth->nsend_zone;
8249                     if (ns1 > ind->nalloc)
8250                     {
8251                         ind->nalloc = over_alloc_dd(ns1);
8252                         srenew(ind->index, ind->nalloc);
8253                     }
8254                     if (ns1 > comm->nalloc_int)
8255                     {
8256                         comm->nalloc_int = over_alloc_dd(ns1);
8257                         srenew(comm->buf_int, comm->nalloc_int);
8258                     }
8259                     if (ns1 > comm->vbuf.nalloc)
8260                     {
8261                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8262                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8263                     }
8264
8265                     for (i = 0; i < dth->nsend_zone; i++)
8266                     {
8267                         ind->index[nsend]    = dth->ind.index[i];
8268                         comm->buf_int[nsend] = dth->ibuf[i];
8269                         copy_rvec(dth->vbuf.v[i],
8270                                   comm->vbuf.v[nsend]);
8271                         nsend++;
8272                     }
8273                     nat              += dth->nat;
8274                     ind->nsend[zone] += dth->nsend_zone;
8275                 }
8276             }
8277             /* Clear the counts in case we do not have pbc */
8278             for (zone = nzone_send; zone < nzone; zone++)
8279             {
8280                 ind->nsend[zone] = 0;
8281             }
8282             ind->nsend[nzone]   = nsend;
8283             ind->nsend[nzone+1] = nat;
8284             /* Communicate the number of cg's and atoms to receive */
8285             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8286                             ind->nsend, nzone+2,
8287                             ind->nrecv, nzone+2);
8288
8289             /* The rvec buffer is also required for atom buffers of size nsend
8290              * in dd_move_x and dd_move_f.
8291              */
8292             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8293
8294             if (p > 0)
8295             {
8296                 /* We can receive in place if only the last zone is not empty */
8297                 for (zone = 0; zone < nzone-1; zone++)
8298                 {
8299                     if (ind->nrecv[zone] > 0)
8300                     {
8301                         cd->bInPlace = FALSE;
8302                     }
8303                 }
8304                 if (!cd->bInPlace)
8305                 {
8306                     /* The int buffer is only required here for the cg indices */
8307                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8308                     {
8309                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8310                         srenew(comm->buf_int2, comm->nalloc_int2);
8311                     }
8312                     /* The rvec buffer is also required for atom buffers
8313                      * of size nrecv in dd_move_x and dd_move_f.
8314                      */
8315                     i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8316                     vec_rvec_check_alloc(&comm->vbuf2, i);
8317                 }
8318             }
8319
8320             /* Make space for the global cg indices */
8321             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8322                 || dd->cg_nalloc == 0)
8323             {
8324                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8325                 srenew(index_gl, dd->cg_nalloc);
8326                 srenew(cgindex, dd->cg_nalloc+1);
8327             }
8328             /* Communicate the global cg indices */
8329             if (cd->bInPlace)
8330             {
8331                 recv_i = index_gl + pos_cg;
8332             }
8333             else
8334             {
8335                 recv_i = comm->buf_int2;
8336             }
8337             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8338                             comm->buf_int, nsend,
8339                             recv_i,        ind->nrecv[nzone]);
8340
8341             /* Make space for cg_cm */
8342             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8343             if (fr->cutoff_scheme == ecutsGROUP)
8344             {
8345                 cg_cm = fr->cg_cm;
8346             }
8347             else
8348             {
8349                 cg_cm = state->x;
8350             }
8351             /* Communicate cg_cm */
8352             if (cd->bInPlace)
8353             {
8354                 recv_vr = cg_cm + pos_cg;
8355             }
8356             else
8357             {
8358                 recv_vr = comm->vbuf2.v;
8359             }
8360             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8361                              comm->vbuf.v, nsend,
8362                              recv_vr,      ind->nrecv[nzone]);
8363
8364             /* Make the charge group index */
8365             if (cd->bInPlace)
8366             {
8367                 zone = (p == 0 ? 0 : nzone - 1);
8368                 while (zone < nzone)
8369                 {
8370                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8371                     {
8372                         cg_gl              = index_gl[pos_cg];
8373                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8374                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8375                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8376                         if (bBondComm)
8377                         {
8378                             /* Update the charge group presence,
8379                              * so we can use it in the next pass of the loop.
8380                              */
8381                             comm->bLocalCG[cg_gl] = TRUE;
8382                         }
8383                         pos_cg++;
8384                     }
8385                     if (p == 0)
8386                     {
8387                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8388                     }
8389                     zone++;
8390                     zone_cg_range[nzone+zone] = pos_cg;
8391                 }
8392             }
8393             else
8394             {
8395                 /* This part of the code is never executed with bBondComm. */
8396                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8397                                  index_gl, recv_i, cg_cm, recv_vr,
8398                                  cgindex, fr->cginfo_mb, fr->cginfo);
8399                 pos_cg += ind->nrecv[nzone];
8400             }
8401             nat_tot += ind->nrecv[nzone+1];
8402         }
8403         if (!cd->bInPlace)
8404         {
8405             /* Store the atom block for easy copying of communication buffers */
8406             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8407         }
8408         nzone += nzone;
8409     }
8410     dd->index_gl = index_gl;
8411     dd->cgindex  = cgindex;
8412
8413     dd->ncg_tot          = zone_cg_range[zones->n];
8414     dd->nat_tot          = nat_tot;
8415     comm->nat[ddnatHOME] = dd->nat_home;
8416     for (i = ddnatZONE; i < ddnatNR; i++)
8417     {
8418         comm->nat[i] = dd->nat_tot;
8419     }
8420
8421     if (!bBondComm)
8422     {
8423         /* We don't need to update cginfo, since that was already done above.
8424          * So we pass NULL for the forcerec.
8425          */
8426         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8427                       NULL, comm->bLocalCG);
8428     }
8429
8430     if (debug)
8431     {
8432         fprintf(debug, "Finished setting up DD communication, zones:");
8433         for (c = 0; c < zones->n; c++)
8434         {
8435             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8436         }
8437         fprintf(debug, "\n");
8438     }
8439 }
8440
/* Convert the zone-index bounds of every i-zone into charge-group indices.
 *
 * For each of the zones->nizone i-zones c this stores:
 *   izone[c].cg1  - zones->cg_range[c+1], the end of zone c in cg_range,
 *   izone[c].jcg0 - the cg_range entry at the zone index izone[c].j0,
 *   izone[c].jcg1 - the cg_range entry at the zone index izone[c].j1.
 *
 * zones  in/out: cg_range, nizone and izone[].j0/j1 are read;
 *                izone[].cg1, izone[].jcg0 and izone[].jcg1 are overwritten.
 */
8441 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8442 {
8443     int c;
8444
8445     for (c = 0; c < zones->nizone; c++)
8446     {
             /* i-zone c ends where zone c ends in the charge-group range table */
8447         zones->izone[c].cg1  = zones->cg_range[c+1];
             /* j0 and j1 are zone indices; look up the matching cg indices */
8448         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8449         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8450     }
8451 }
8452
/* Set the spatial extent of the domain decomposition zones.
 *
 * For the zones zone_start <= z < zone_end this fills zones->size[z].x0/x1
 * (starting from comm->cell_x0/cell_x1) and the bounding box
 * zones->size[z].bb_x0/bb_x1. Two steps are not restricted to that range:
 * the staggered-grid limits of non-shifted dimensions are set for all
 * zones->n zones, and the upper limits are raised for every j-zone that
 * is listed by an i-zone.
 * When zone_start is 0, zones->dens_zone0 is updated as well.
 * With debug set, the resulting limits of the requested zones are printed.
 *
 * dd          domain decomposition data; dd->comm->zones is modified
 * box         box matrix, used for the triclinic couplings of the
 *             bounding-box corners
 * ddbox       supplies tric_dir, skew_fac and npbcdim
 * zone_start  first zone to set up
 * zone_end    one past the last zone to set up
 *
 * NOTE(review): the locals zj0, zj1, size_j and add_tric are declared
 * but not referenced in this function.
 */
8453 static void set_zones_size(gmx_domdec_t *dd,
8454                            matrix box, const gmx_ddbox_t *ddbox,
8455                            int zone_start, int zone_end)
8456 {
8457     gmx_domdec_comm_t  *comm;
8458     gmx_domdec_zones_t *zones;
8459     gmx_bool            bDistMB;
8460     int                 z, zi, zj0, zj1, d, dim;
8461     real                rcs, rcmbs;
8462     int                 i, j;
8463     real                size_j, add_tric;
8464     real                vol;
8465
8466     comm = dd->comm;
8467
8468     zones = &comm->zones;
8469
8470     /* Do we need to determine extra distances for multi-body bondeds? */
8471     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8472
8473     for (z = zone_start; z < zone_end; z++)
8474     {
8475         /* Copy cell limits to zone limits.
8476          * Valid for non-DD dims and non-shifted dims.
8477          */
8478         copy_rvec(comm->cell_x0, zones->size[z].x0);
8479         copy_rvec(comm->cell_x1, zones->size[z].x1);
8480     }
8481
         /* Refine the limits along each decomposition dimension in turn */
8482     for (d = 0; d < dd->ndim; d++)
8483     {
8484         dim = dd->dim[d];
8485
8486         for (z = 0; z < zones->n; z++)
8487         {
8488             /* With a staggered grid we have different sizes
8489              * for non-shifted dimensions.
                  * For d == 1 the limits come from zone_d1, indexed by the
                  * zone shift along the first DD dimension; for d == 2 from
                  * zone_d2, indexed by the shifts along the first and second
                  * DD dimension. Nothing is changed here for d == 0.
8490              */
8491             if (dd->bGridJump && zones->shift[z][dim] == 0)
8492             {
8493                 if (d == 1)
8494                 {
8495                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8496                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8497                 }
8498                 else if (d == 2)
8499                 {
8500                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8501                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8502                 }
8503             }
8504         }
8505
             /* Distances by which zones are extended along dim: the normal
              * cut-off and the multi-body cut-off. Along a dimension flagged
              * in tric_dir both are divided by the skew factor of dim.
              */
8506         rcs   = comm->cutoff;
8507         rcmbs = comm->cutoff_mbody;
8508         if (ddbox->tric_dir[dim])
8509         {
8510             rcs   /= ddbox->skew_fac[dim];
8511             rcmbs /= ddbox->skew_fac[dim];
8512         }
8513
8514         /* Set the lower limit for the shifted zone dimensions */
8515         for (z = zone_start; z < zone_end; z++)
8516         {
8517             if (zones->shift[z][dim] > 0)
8518             {
                     /* Re-assigns the value dim already holds from the top
                      * of the d loop.
                      */
8519                 dim = dd->dim[d];
8520                 if (!dd->bGridJump || d == 0)
8521                 {
                         /* No staggering, or the first DD dimension: the zone
                          * starts at the upper home-cell boundary and extends
                          * over rcs.
                          */
8522                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8523                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8524                 }
8525                 else
8526                 {
8527                     /* Here we take the lower limit of the zone from
8528                      * the lowest domain of the zone below.
                          * Zones 0-3 use zone_d1; zones 4 and up either copy
                          * x0 of zone zone_perm[2][z-4] (d == 1) or use
                          * zone_d2 (otherwise).
8529                      */
8530                     if (z < 4)
8531                     {
8532                         zones->size[z].x0[dim] =
8533                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8534                     }
8535                     else
8536                     {
8537                         if (d == 1)
8538                         {
8539                             zones->size[z].x0[dim] =
8540                                 zones->size[zone_perm[2][z-4]].x0[dim];
8541                         }
8542                         else
8543                         {
8544                             zones->size[z].x0[dim] =
8545                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8546                         }
8547                     }
8548                     /* A temporary limit, is updated below */
8549                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8550
8551                     if (bDistMB)
8552                     {
                             /* Extend the upper limit to rcmbs beyond every
                              * i-zone that is not shifted along dim.
                              */
8553                         for (zi = 0; zi < zones->nizone; zi++)
8554                         {
8555                             if (zones->shift[zi][dim] == 0)
8556                             {
8557                                 /* This takes the whole zone into account.
8558                                  * With multiple pulses this will lead
8559                                  * to a larger zone than strictly necessary.
8560                                  */
8561                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8562                                                              zones->size[zi].x1[dim]+rcmbs);
8563                             }
8564                         }
8565                     }
8566                 }
8567             }
8568         }
8569
8570         /* Loop over the i-zones to set the upper limit of each
8571          * j-zone they see.
              * Only i-zones that are not shifted along dim contribute, and
              * only j-zones that are shifted along dim are extended. This
              * loop is not limited to the range zone_start..zone_end.
8572          */
8573         for (zi = 0; zi < zones->nizone; zi++)
8574         {
8575             if (zones->shift[zi][dim] == 0)
8576             {
8577                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8578                 {
8579                     if (zones->shift[z][dim] > 0)
8580                     {
8581                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8582                                                      zones->size[zi].x1[dim]+rcs);
8583                     }
8584                 }
8585             }
8586         }
8587     }
8588
         /* Determine the bounding box of each requested zone */
8589     for (z = zone_start; z < zone_end; z++)
8590     {
8591         /* Initialization only required to keep the compiler happy */
8592         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8593         int  nc, c;
8594
8595         /* To determine the bounding box for a zone we need to find
8596          * the extreme corners of 4, 2 or 1 corners.
              * NOTE(review): the shift count is npbcdim - 1, which would be
              * negative for npbcdim == 0 - confirm that cannot occur here.
8597          */
8598         nc = 1 << (ddbox->npbcdim - 1);
8599
8600         for (c = 0; c < nc; c++)
8601         {
8602             /* Set up a zone corner at x=0, ignoring triclinic couplings.
                  * Bit 0 of c selects the lower or upper zone limit along y,
                  * bit 1 of c does the same along z.
                  */
8603             corner[XX] = 0;
8604             if ((c & 1) == 0)
8605             {
8606                 corner[YY] = zones->size[z].x0[YY];
8607             }
8608             else
8609             {
8610                 corner[YY] = zones->size[z].x1[YY];
8611             }
8612             if ((c & 2) == 0)
8613             {
8614                 corner[ZZ] = zones->size[z].x0[ZZ];
8615             }
8616             else
8617             {
8618                 corner[ZZ] = zones->size[z].x1[ZZ];
8619             }
8620             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8621             {
8622                 /* With 1D domain decomposition the cg's are not in
8623                  * the triclinic box, but triclinic x-y and rectangular y-z.
8624                  * Shift y back, so it will later end up at 0.
8625                  */
8626                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8627             }
8628             /* Apply the triclinic couplings */
8629             for (i = YY; i < ddbox->npbcdim; i++)
8630             {
8631                 for (j = XX; j < i; j++)
8632                 {
8633                     corner[j] += corner[i]*box[i][j]/box[i][i];
8634                 }
8635             }
                 /* Track the component-wise minimum and maximum over all
                  * corners; the first corner initializes both.
                  */
8636             if (c == 0)
8637             {
8638                 copy_rvec(corner, corner_min);
8639                 copy_rvec(corner, corner_max);
8640             }
8641             else
8642             {
8643                 for (i = 0; i < DIM; i++)
8644                 {
8645                     corner_min[i] = min(corner_min[i], corner[i]);
8646                     corner_max[i] = max(corner_max[i], corner[i]);
8647                 }
8648             }
8649         }
8650         /* Copy the extreme corners without offset along x */
8651         for (i = 0; i < DIM; i++)
8652         {
8653             zones->size[z].bb_x0[i] = corner_min[i];
8654             zones->size[z].bb_x1[i] = corner_max[i];
8655         }
8656         /* Add the offset along x */
8657         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8658         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8659     }
8660
8661     if (zone_start == 0)
8662     {
             /* Density of zone 0: the charge-group count taken from cg_range
              * divided by the product of the zone extents over all DIM
              * dimensions.
              */
8663         vol = 1;
8664         for (dim = 0; dim < DIM; dim++)
8665         {
8666             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8667         }
8668         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8669     }
8670
8671     if (debug)
8672     {
             /* Print the limits and the bounding box of each requested zone */
8673         for (z = zone_start; z < zone_end; z++)
8674         {
8675             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8676                     z,
8677                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8678                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8679                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8680             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8681                     z,
8682                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8683                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8684                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8685         }
8686     }
8687 }
8688
/* Comparison function for sorting gmx_cgsort_t entries.
 *
 * Orders primarily on the nsc field and, for equal nsc, on the ind_gl field.
 *
 * a, b  pointers to the two gmx_cgsort_t entries to compare
 *
 * Returns a negative value when *a sorts before *b, a positive value when
 * it sorts after *b, and zero when both fields are equal.
 *
 * NOTE(review): the result is formed by int subtraction, which is only
 * correct while the difference cannot overflow - presumably both fields
 * hold non-negative indices; confirm.
 */
8689 static int comp_cgsort(const void *a, const void *b)
8690 {
8691     int           comp;
8692
         /* The casts drop the const qualifier; the entries are only read */
8693     gmx_cgsort_t *cga, *cgb;
8694     cga = (gmx_cgsort_t *)a;
8695     cgb = (gmx_cgsort_t *)b;
8696
8697     comp = cga->nsc - cgb->nsc;
8698     if (comp == 0)
8699     {
             /* Equal primary key: fall back to the secondary key */
8700         comp = cga->ind_gl - cgb->ind_gl;
8701     }
8702
8703     return comp;
8704 }
8705
/* Reorder the first n entries of an int array according to a sort order.
 *
 * After the call a[i] holds the value that was stored at a[sort[i].ind]
 * before the call, for 0 <= i < n. Entries of a at index n and beyond
 * are left unchanged.
 *
 * n     number of entries to order
 * sort  sort order; only the ind field of the first n entries is read
 * a     array that is reordered in place
 * buf   scratch space; elements 0..n-1 are overwritten
 */
8706 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8707                          int *a, int *buf)
8708 {
8709     int i;
8710
8711     /* Order the data */
8712     for (i = 0; i < n; i++)
8713     {
8714         buf[i] = a[sort[i].ind];
8715     }
8716
8717     /* Copy back to the original array */
8718     for (i = 0; i < n; i++)
8719     {
8720         a[i] = buf[i];
8721     }
8722 }
8723
/* Reorder the first n entries of an rvec array according to a sort order.
 *
 * After the call v[i] holds the vector that was stored at v[sort[i].ind]
 * before the call, for 0 <= i < n. Entries of v at index n and beyond
 * are left unchanged.
 *
 * n     number of entries to order
 * sort  sort order; only the ind field of the first n entries is read
 * v     vector array that is reordered in place
 * buf   scratch space; elements 0..n-1 are overwritten
 */
8724 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8725                          rvec *v, rvec *buf)
8726 {
8727     int i;
8728
8729     /* Order the data */
8730     for (i = 0; i < n; i++)
8731     {
8732         copy_rvec(v[sort[i].ind], buf[i]);
8733     }
8734
8735     /* Copy back to the original array */
8736     for (i = 0; i < n; i++)
8737     {
8738         copy_rvec(buf[i], v[i]);
8739     }
8740 }
8741
/* Reorder a per-atom rvec array so that it follows a charge-group order.
 *
 * The atoms of charge group sort[cg].ind, i.e. the index range
 * cgindex[sort[cg].ind] .. cgindex[sort[cg].ind+1]-1, are placed
 * consecutively in the order cg = 0 .. ncg-1. The result occupies the
 * start of v; entries of v beyond the number of gathered atoms are left
 * unchanged.
 *
 * ncg      number of charge groups to order
 * cgindex  atom index boundaries of the charge groups in the current order,
 *          or NULL, in which case v is reordered with one entry per
 *          charge group by order_vec_cg
 * sort     sort order; only the ind field of the first ncg entries is read
 * v        vector array that is reordered in place
 * buf      scratch space; one element per gathered atom is overwritten
 */
8742 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8743                            rvec *v, rvec *buf)
8744 {
8745     int a, atot, cg, cg0, cg1, i;
8746
8747     if (cgindex == NULL)
8748     {
8749         /* Avoid the useless loop of the atoms within a cg */
8750         order_vec_cg(ncg, sort, v, buf);
8751
8752         return;
8753     }
8754
8755     /* Order the data */
8756     a = 0;
8757     for (cg = 0; cg < ncg; cg++)
8758     {
             /* Atom range of the charge group that moves to position cg */
8759         cg0 = cgindex[sort[cg].ind];
8760         cg1 = cgindex[sort[cg].ind+1];
8761         for (i = cg0; i < cg1; i++)
8762         {
8763             copy_rvec(v[i], buf[a]);
8764             a++;
8765         }
8766     }
         /* Total number of atoms gathered into buf */
8767     atot = a;
8768
8769     /* Copy back to the original array */
8770     for (a = 0; a < atot; a++)
8771     {
8772         copy_rvec(buf[a], v[a]);
8773     }
8774 }
8775
/* Build a fully sorted list from an already ordered list and an unordered one.
 *
 * sort_new is sorted in place with comp_cgsort and then merged with sort2
 * into sort1. An entry of sort2 is taken first only when it is strictly
 * smaller on (nsc, ind_gl); when both keys are equal the entry of sort_new
 * goes first.
 *
 * nsort2     number of entries in sort2
 * sort2      entries that are already in sorted order (not checked here)
 * nsort_new  number of entries in sort_new
 * sort_new   entries in arbitrary order; sorted in place by this call
 * sort1      output; receives nsort2 + nsort_new entries
 */
8776 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8777                          int nsort_new, gmx_cgsort_t *sort_new,
8778                          gmx_cgsort_t *sort1)
8779 {
8780     int i1, i2, i_new;
8781
8782     /* The new indices are not very ordered, so we qsort them */
8783     qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8784
8785     /* sort2 is already ordered, so now we can merge the two arrays */
8786     i1    = 0;
8787     i2    = 0;
8788     i_new = 0;
8789     while (i2 < nsort2 || i_new < nsort_new)
8790     {
8791         if (i2 == nsort2)
8792         {
                 /* sort2 is exhausted, only new entries remain */
8793             sort1[i1++] = sort_new[i_new++];
8794         }
8795         else if (i_new == nsort_new)
8796         {
                 /* sort_new is exhausted, only stationary entries remain */
8797             sort1[i1++] = sort2[i2++];
8798         }
8799         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8800                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8801                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8802         {
8803             sort1[i1++] = sort2[i2++];
8804         }
8805         else
8806         {
8807             sort1[i1++] = sort_new[i_new++];
8808         }
8809     }
8810 }
8811
/* Determine the sort order of the home charge groups on the ns grid.
 *
 * The order is written to dd->comm->sort->sort, keyed on the ns grid cell
 * index (nsc) and then on the global charge group index (ind_gl); the ind
 * field holds the current local index of each entry.
 *
 * With ncg_home_old >= 0 the entries that kept their grid cell are collected
 * in sort->sort2 and the other retained entries in sort->sort_new, and the
 * two lists are combined by ordered_sort. Charge groups whose cell index is
 * at or above the "moved" threshold are left out of both lists.
 * With ncg_home_old < 0 all dd->ncg_home entries are sorted with qsort;
 * since nsc is the primary key, the entries at or above the threshold end up
 * after the retained ones.
 *
 * dd            domain decomposition data; ncg_home, index_gl and comm->sort
 *               are used
 * fr            force record; fr->ns.grid supplies cell_index and ncells
 * ncg_home_old  number of entries of the previous sort result in sort->sort
 *               that may be compared against, or negative to force a full
 *               sort
 *
 * Returns the number of home charge groups with a cell index below the
 * threshold.
 *
 * sort->sort and sort->sort2 are indexed without a size check here;
 * dd_sort_state sizes both for dd->ncg_home entries before calling this
 * function. Only sort->sort_new is grown on demand.
 *
 * NOTE(review): the locals ibuf, sort_last and sort_skip are declared but
 * not referenced, and cgsort is only used in the full-sort branch.
 */
8812 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8813 {
8814     gmx_domdec_sort_t *sort;
8815     gmx_cgsort_t      *cgsort, *sort_i;
8816     int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
8817     int                sort_last, sort_skip;
8818
8819     sort = dd->comm->sort;
8820
         /* The ns grid cell index of each home charge group */
8821     a = fr->ns.grid->cell_index;
8822
         /* Threshold: cell indices at or above this value are treated below
          * as belonging to charge groups that are no longer at home here.
          */
8823     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8824
8825     if (ncg_home_old >= 0)
8826     {
8827         /* The charge groups that remained in the same ns grid cell
8828          * are completely ordered. So we can sort efficiently by sorting
8829          * the charge groups that did move into the stationary list.
8830          */
8831         ncg_new   = 0;
8832         nsort2    = 0;
8833         nsort_new = 0;
8834         for (i = 0; i < dd->ncg_home; i++)
8835         {
8836             /* Check if this cg did not move to another node */
8837             if (a[i] < moved)
8838             {
                     /* Compare the current cell index with the one stored at
                      * position i of the previous sort result.
                      */
8839                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8840                 {
8841                     /* This cg is new on this node or moved ns grid cell */
8842                     if (nsort_new >= sort->sort_new_nalloc)
8843                     {
8844                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8845                         srenew(sort->sort_new, sort->sort_new_nalloc);
8846                     }
8847                     sort_i = &(sort->sort_new[nsort_new++]);
8848                 }
8849                 else
8850                 {
8851                     /* This cg did not move */
8852                     sort_i = &(sort->sort2[nsort2++]);
8853                 }
8854                 /* Sort on the ns grid cell indices
8855                  * and the global topology index.
8856                  * index_gl is irrelevant with cell ns,
8857                  * but we set it here anyhow to avoid a conditional.
8858                  */
8859                 sort_i->nsc    = a[i];
8860                 sort_i->ind_gl = dd->index_gl[i];
8861                 sort_i->ind    = i;
8862                 ncg_new++;
8863             }
8864         }
8865         if (debug)
8866         {
8867             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8868                     nsort2, nsort_new);
8869         }
8870         /* Sort efficiently */
8871         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8872                      sort->sort);
8873     }
8874     else
8875     {
8876         cgsort  = sort->sort;
8877         ncg_new = 0;
8878         for (i = 0; i < dd->ncg_home; i++)
8879         {
8880             /* Sort on the ns grid cell indices
8881              * and the global topology index
8882              */
8883             cgsort[i].nsc    = a[i];
8884             cgsort[i].ind_gl = dd->index_gl[i];
8885             cgsort[i].ind    = i;
                 /* Count only the entries below the threshold; the others
                  * stay in the array and are sorted along with the rest.
                  */
8886             if (cgsort[i].nsc < moved)
8887             {
8888                 ncg_new++;
8889             }
8890         }
8891         if (debug)
8892         {
8893             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8894         }
8895         /* Determine the order of the charge groups using qsort */
8896         qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8897     }
8898
8899     return ncg_new;
8900 }
8901
/* Take the sort order of the home entries from the nbnxn search data.
 *
 * The order array and its length are obtained with nbnxn_get_atomorder.
 * Every non-negative element is stored, in sequence, in the ind field of
 * dd->comm->sort->sort; negative elements are skipped (presumably these
 * are filler entries - confirm against nbnxn_get_atomorder). The nsc and
 * ind_gl fields of the sort entries are not written here.
 *
 * dd  domain decomposition data; dd->comm->sort->sort receives the order
 * fr  force record; fr->nbv->nbs is passed to nbnxn_get_atomorder
 *
 * Returns the number of entries stored.
 */
8902 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8903 {
8904     gmx_cgsort_t *sort;
8905     int           ncg_new, i, *a, na;
8906
8907     sort = dd->comm->sort->sort;
8908
8909     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8910
8911     ncg_new = 0;
8912     for (i = 0; i < na; i++)
8913     {
             /* Keep only the non-negative entries, preserving their order */
8914         if (a[i] >= 0)
8915         {
8916             sort[ncg_new].ind = a[i];
8917             ncg_new++;
8918         }
8919     }
8920
8921     return ncg_new;
8922 }
8923
8924 static void dd_sort_state(gmx_domdec_t *dd, int ePBC,
8925                           rvec *cgcm, t_forcerec *fr, t_state *state,
8926                           int ncg_home_old)
8927 {
8928     gmx_domdec_sort_t *sort;
8929     gmx_cgsort_t      *cgsort, *sort_i;
8930     int               *cgindex;
8931     int                ncg_new, i, *ibuf, cgsize;
8932     rvec              *vbuf;
8933
8934     sort = dd->comm->sort;
8935
8936     if (dd->ncg_home > sort->sort_nalloc)
8937     {
8938         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8939         srenew(sort->sort, sort->sort_nalloc);
8940         srenew(sort->sort2, sort->sort_nalloc);
8941     }
8942     cgsort = sort->sort;
8943
8944     switch (fr->cutoff_scheme)
8945     {
8946         case ecutsGROUP:
8947             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8948             break;
8949         case ecutsVERLET:
8950             ncg_new = dd_sort_order_nbnxn(dd, fr);
8951             break;
8952         default:
8953             gmx_incons("unimplemented");
8954             ncg_new = 0;
8955     }
8956
8957     /* We alloc with the old size, since cgindex is still old */
8958     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8959     vbuf = dd->comm->vbuf.v;
8960
8961     if (dd->comm->bCGs)
8962     {
8963         cgindex = dd->cgindex;
8964     }
8965     else
8966     {
8967         cgindex = NULL;
8968     }
8969
8970     /* Remove the charge groups which are no longer at home here */
8971     dd->ncg_home = ncg_new;
8972     if (debug)
8973     {
8974         fprintf(debug, "Set the new home charge group count to %d\n",
8975                 dd->ncg_home);
8976     }
8977
8978     /* Reorder the state */
8979     for (i = 0; i < estNR; i++)
8980     {
8981         if (EST_DISTR(i) && (state->flags & (1<<i)))
8982         {
8983             switch (i)
8984             {
8985                 case estX:
8986                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
8987                     break;
8988                 case estV:
8989                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
8990                     break;
8991                 case estSDX:
8992                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
8993                     break;
8994                 case estCGP:
8995                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
8996                     break;
8997                 case estLD_RNG:
8998                 case estLD_RNGI:
8999                 case estDISRE_INITF:
9000                 case estDISRE_RM3TAV:
9001                 case estORIRE_INITF:
9002                 case estORIRE_DTAV:
9003                     /* No ordering required */
9004                     break;
9005                 default:
9006                     gmx_incons("Unknown state entry encountered in dd_sort_state");
9007                     break;
9008             }
9009         }
9010     }
9011     if (fr->cutoff_scheme == ecutsGROUP)
9012     {
9013         /* Reorder cgcm */
9014         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9015     }
9016
9017     if (dd->ncg_home+1 > sort->ibuf_nalloc)
9018     {
9019         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9020         srenew(sort->ibuf, sort->ibuf_nalloc);
9021     }
9022     ibuf = sort->ibuf;
9023     /* Reorder the global cg index */
9024     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9025     /* Reorder the cginfo */
9026     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9027     /* Rebuild the local cg index */
9028     if (dd->comm->bCGs)
9029     {
9030         ibuf[0] = 0;
9031         for (i = 0; i < dd->ncg_home; i++)
9032         {
9033             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9034             ibuf[i+1] = ibuf[i] + cgsize;
9035         }
9036         for (i = 0; i < dd->ncg_home+1; i++)
9037         {
9038             dd->cgindex[i] = ibuf[i];
9039         }
9040     }
9041     else
9042     {
9043         for (i = 0; i < dd->ncg_home+1; i++)
9044         {
9045             dd->cgindex[i] = i;
9046         }
9047     }
9048     /* Set the home atom number */
9049     dd->nat_home = dd->cgindex[dd->ncg_home];
9050
9051     if (fr->cutoff_scheme == ecutsVERLET)
9052     {
9053         /* The atoms are now exactly in grid order, update the grid order */
9054         nbnxn_set_atomorder(fr->nbv->nbs);
9055     }
9056     else
9057     {
9058         /* Copy the sorted ns cell indices back to the ns grid struct */
9059         for (i = 0; i < dd->ncg_home; i++)
9060         {
9061             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9062         }
9063         fr->ns.grid->nr = dd->ncg_home;
9064     }
9065 }
9066
9067 static void add_dd_statistics(gmx_domdec_t *dd)
9068 {
9069     gmx_domdec_comm_t *comm;
9070     int                ddnat;
9071
9072     comm = dd->comm;
9073
9074     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9075     {
9076         comm->sum_nat[ddnat-ddnatZONE] +=
9077             comm->nat[ddnat] - comm->nat[ddnat-1];
9078     }
9079     comm->ndecomp++;
9080 }
9081
9082 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9083 {
9084     gmx_domdec_comm_t *comm;
9085     int                ddnat;
9086
9087     comm = dd->comm;
9088
9089     /* Reset all the statistics and counters for total run counting */
9090     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9091     {
9092         comm->sum_nat[ddnat-ddnatZONE] = 0;
9093     }
9094     comm->ndecomp   = 0;
9095     comm->nload     = 0;
9096     comm->load_step = 0;
9097     comm->load_sum  = 0;
9098     comm->load_max  = 0;
9099     clear_ivec(comm->load_lim);
9100     comm->load_mdf = 0;
9101     comm->load_pme = 0;
9102 }
9103
9104 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9105 {
9106     gmx_domdec_comm_t *comm;
9107     int                ddnat;
9108     double             av;
9109
9110     comm = cr->dd->comm;
9111
9112     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9113
9114     if (fplog == NULL)
9115     {
9116         return;
9117     }
9118
9119     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9120
9121     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9122     {
9123         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9124         switch (ddnat)
9125         {
9126             case ddnatZONE:
9127                 fprintf(fplog,
9128                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9129                         2, av);
9130                 break;
9131             case ddnatVSITE:
9132                 if (cr->dd->vsite_comm)
9133                 {
9134                     fprintf(fplog,
9135                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9136                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9137                             av);
9138                 }
9139                 break;
9140             case ddnatCON:
9141                 if (cr->dd->constraint_comm)
9142                 {
9143                     fprintf(fplog,
9144                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9145                             1 + ir->nLincsIter, av);
9146                 }
9147                 break;
9148             default:
9149                 gmx_incons(" Unknown type for DD statistics");
9150         }
9151     }
9152     fprintf(fplog, "\n");
9153
9154     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9155     {
9156         print_dd_load_av(fplog, cr->dd);
9157     }
9158 }
9159
/* Repartition the system over the domain decomposition cells.
 *
 * In order, this function:
 *  - decides whether the box counts as changed and whether dynamic load
 *    balancing (DLB) is done this time; when load has been recorded it may
 *    collect and print the load distribution and turn DLB on;
 *  - obtains the home charge groups in one of three ways:
 *    distributing state_global (bMasterState is TRUE), rebuilding the indices
 *    from state_local (its ddp_count differs from dd->ddp_count), or keeping
 *    the current home charge groups and redistributing them;
 *  - sets the cell sizes, optionally redistributes and sorts the home
 *    charge groups;
 *  - sets up the communication, the local indices, the local topology,
 *    the atom ranges for vsite and constraint communication and mdatoms;
 *  - updates the local shell, Born, constraint, pull and rotation data
 *    where those are in use;
 *  - updates the DD statistics, the partitioning step and ddp_count.
 *
 * fplog          log file, handed to the helpers that write to it
 * step           the current step
 * cr             communication record, cr->dd is the DD data that is updated
 * bMasterState   TRUE: distribute state_global over the nodes
 * nstglobalcomm  interval in steps; steps that are a multiple of it are
 *                treated as global communication steps
 * state_global   global state, only read when bMasterState is TRUE
 * top_global     global topology
 * ir             input record
 * state_local    local state, redistributed and/or reordered
 * f              local force array, handed to the helpers that may
 *                reallocate it together with the state
 * mdatoms        local atom data, refilled through atoms2md()
 * top_local      local topology, rebuilt through dd_make_local_top()
 * fr             force record; cutoff_scheme selects the group or Verlet path
 * vsite          virtual site data, may be NULL
 * shellfc        shell data, local shells are made when set
 * constr         constraint data, constraints are set when set
 * nrnb           flop accounting
 * wcycle         cycle counters
 * bVerbose       TRUE: also print the load through dd_print_load_verbose()
 *
 * NOTE(review): the local j is declared but not used in this function.
 */
void dd_partition_system(FILE                *fplog,
                         gmx_large_int_t      step,
                         t_commrec           *cr,
                         gmx_bool             bMasterState,
                         int                  nstglobalcomm,
                         t_state             *state_global,
                         gmx_mtop_t          *top_global,
                         t_inputrec          *ir,
                         t_state             *state_local,
                         rvec               **f,
                         t_mdatoms           *mdatoms,
                         gmx_localtop_t      *top_local,
                         t_forcerec          *fr,
                         gmx_vsite_t         *vsite,
                         gmx_shellfc_t        shellfc,
                         gmx_constr_t         constr,
                         t_nrnb              *nrnb,
                         gmx_wallcycle_t      wcycle,
                         gmx_bool             bVerbose)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    gmx_ddbox_t        ddbox = {0};
    t_block           *cgs_gl;
    gmx_large_int_t    step_pcoupl;
    rvec               cell_ns_x0, cell_ns_x1;
    int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
    gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
    gmx_bool           bRedist, bSortCG, bResortAll;
    ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
    real               grid_density;
    char               sbuf[22];

    dd   = cr->dd;
    comm = dd->comm;

    bBoxChanged = (bMasterState || DEFORM(*ir));
    if (ir->epc != epcNO)
    {
        /* With nstpcouple > 1 pressure coupling happens
         * one step after calculating the pressure.
         * Box scaling happens at the end of the MD step,
         * after the DD partitioning.
         * We therefore have to do DLB in the first partitioning
         * after an MD step where P-coupling occurred.
         * We need to determine the last step in which p-coupling occurred.
         * MRS -- need to validate this for vv?
         */
        n = ir->nstpcouple;
        if (n == 1)
        {
            step_pcoupl = step - 1;
        }
        else
        {
            step_pcoupl = ((step - 1)/n)*n + 1;
        }
        if (step_pcoupl >= comm->partition_step)
        {
            bBoxChanged = TRUE;
        }
    }

    /* NOTE(review): nstglobalcomm is used as a divisor here,
     * this assumes the caller passes a value > 0 - confirm.
     */
    bNStGlobalComm = (step % nstglobalcomm == 0);

    if (!comm->bDynLoadBal)
    {
        bDoDLB = FALSE;
    }
    else
    {
        /* Should we do dynamic load balancing this step?
         * Since it requires (possibly expensive) global communication,
         * we might want to do DLB less frequently.
         */
        if (bBoxChanged || ir->epc != epcNO)
        {
            bDoDLB = bBoxChanged;
        }
        else
        {
            bDoDLB = bNStGlobalComm;
        }
    }

    /* Check if we have recorded loads on the nodes */
    if (comm->bRecordLoad && dd_load_count(comm))
    {
        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
        {
            /* Check if we should use DLB at the second partitioning
             * and every 100 partitionings,
             * so the extra communication cost is negligible.
             */
            n         = max(100, nstglobalcomm);
            bCheckDLB = (comm->n_load_collect == 0 ||
                         comm->n_load_have % n == n-1);
        }
        else
        {
            bCheckDLB = FALSE;
        }

        /* Print load every nstlog, first and last step to the log file */
        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
                    comm->n_load_collect == 0 ||
                    (ir->nsteps >= 0 &&
                     (step + ir->nstlist > ir->init_step + ir->nsteps)));

        /* Avoid extra communication due to verbose screen output
         * when nstglobalcomm is set.
         */
        if (bDoDLB || bLogLoad || bCheckDLB ||
            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
        {
            get_load_distribution(dd, wcycle);
            if (DDMASTER(dd))
            {
                if (bLogLoad)
                {
                    /* The load is reported with the previous step number */
                    dd_print_load(fplog, dd, step-1);
                }
                if (bVerbose)
                {
                    dd_print_load_verbose(dd);
                }
            }
            comm->n_load_collect++;

            if (bCheckDLB)
            {
                /* Since the timings are node dependent, the master decides */
                if (DDMASTER(dd))
                {
                    bTurnOnDLB =
                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
                    if (debug)
                    {
                        fprintf(debug, "step %s, imb loss %f\n",
                                gmx_step_str(step, sbuf),
                                dd_force_imb_perf_loss(dd));
                    }
                }
                /* Only the master has assigned bTurnOnDLB above;
                 * presumably dd_bcast supplies the master's value
                 * to the other nodes - confirm against dd_bcast.
                 */
                dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
                if (bTurnOnDLB)
                {
                    turn_on_dlb(fplog, cr, step);
                    bDoDLB = TRUE;
                }
            }
        }
        comm->n_load_have++;
    }

    cgs_gl = &comm->cgs_gl;

    bRedist = FALSE;
    if (bMasterState)
    {
        /* Clear the old state */
        clear_dd_indices(dd, 0, 0);
        ncgindex_set = 0;

        set_ddbox(dd, bMasterState, cr, ir, state_global->box,
                  TRUE, cgs_gl, state_global->x, &ddbox);

        get_cg_distribution(fplog, step, dd, cgs_gl,
                            state_global->box, &ddbox, state_global->x);

        dd_distribute_state(dd, cgs_gl,
                            state_global, state_local, f);

        dd_make_local_cgs(dd, &top_local->cgs);

        /* Ensure that we have space for the new distribution */
        dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);

        if (fr->cutoff_scheme == ecutsGROUP)
        {
            calc_cgcm(fplog, 0, dd->ncg_home,
                      &top_local->cgs, state_local->x, fr->cg_cm);
        }

        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);

        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
    }
    else if (state_local->ddp_count != dd->ddp_count)
    {
        /* The local state does not belong to the current partitioning,
         * it can only be from an earlier one.
         */
        if (state_local->ddp_count > dd->ddp_count)
        {
            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
        }

        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
        {
            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
        }

        /* Clear the old state */
        clear_dd_indices(dd, 0, 0);

        /* Build the new indices */
        rebuild_cgindex(dd, cgs_gl->index, state_local);
        make_dd_indices(dd, cgs_gl->index, 0);
        /* The indices of all home charge groups have been set now */
        ncgindex_set = dd->ncg_home;

        if (fr->cutoff_scheme == ecutsGROUP)
        {
            /* Redetermine the cg COMs */
            calc_cgcm(fplog, 0, dd->ncg_home,
                      &top_local->cgs, state_local->x, fr->cg_cm);
        }

        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);

        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);

        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
                  TRUE, &top_local->cgs, state_local->x, &ddbox);

        /* In this branch we only redistribute with dynamic load balancing */
        bRedist = comm->bDynLoadBal;
    }
    else
    {
        /* We have the full state, only redistribute the cgs */

        /* Clear the non-home indices */
        clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
        ncgindex_set = 0;

        /* Avoid global communication for dim's without pbc and -gcom */
        if (!bNStGlobalComm)
        {
            /* Start from the values stored at the previous partitioning */
            copy_rvec(comm->box0, ddbox.box0    );
            copy_rvec(comm->box_size, ddbox.box_size);
        }
        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
                  bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);

        bBoxChanged = TRUE;
        bRedist     = TRUE;
    }
    /* For dim's without pbc and -gcom */
    copy_rvec(ddbox.box0, comm->box0    );
    copy_rvec(ddbox.box_size, comm->box_size);

    set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
                      step, wcycle);

    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
    {
        write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
    }

    /* Check if we should sort the charge groups */
    if (comm->nstSortCG > 0)
    {
        bSortCG = (bMasterState ||
                   (bRedist && (step % comm->nstSortCG == 0)));
    }
    else
    {
        bSortCG = FALSE;
    }

    /* Store the home charge group count before redistribution,
     * it is passed to dd_sort_state below unless everything is resorted.
     */
    ncg_home_old = dd->ncg_home;

    ncg_moved = 0;
    if (bRedist)
    {
        wallcycle_sub_start(wcycle, ewcsDD_REDIST);

        dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
                           state_local, f, fr, mdatoms,
                           !bSortCG, nrnb, &ncgindex_set, &ncg_moved);

        wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
    }

    get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
                          dd, &ddbox,
                          &comm->cell_x0, &comm->cell_x1,
                          dd->ncg_home, fr->cg_cm,
                          cell_ns_x0, cell_ns_x1, &grid_density);

    if (bBoxChanged)
    {
        comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
    }

    /* Store the current grid cell counts in ncells_old,
     * they are compared with the new counts when sorting.
     */
    switch (fr->cutoff_scheme)
    {
        case ecutsGROUP:
            copy_ivec(fr->ns.grid->n, ncells_old);
            grid_first(fplog, fr->ns.grid, dd, &ddbox, fr->ePBC,
                       state_local->box, cell_ns_x0, cell_ns_x1,
                       fr->rlistlong, grid_density);
            break;
        case ecutsVERLET:
            /* Only the XX and YY counts are set, ZZ keeps its initial 0 */
            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
            break;
        default:
            gmx_incons("unimplemented");
    }
    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
    copy_ivec(ddbox.tric_dir, comm->tric_dir);

    if (bSortCG)
    {
        wallcycle_sub_start(wcycle, ewcsDD_GRID);

        /* Sort the state on charge group position.
         * This enables exact restarts from this step.
         * It also improves performance by about 15% with larger numbers
         * of atoms per node.
         */

        /* Fill the ns grid with the home cell,
         * so we can sort with the indices.
         */
        set_zones_ncg_home(dd);

        switch (fr->cutoff_scheme)
        {
            case ecutsVERLET:
                set_zones_size(dd, state_local->box, &ddbox, 0, 1);

                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
                                  0,
                                  comm->zones.size[0].bb_x0,
                                  comm->zones.size[0].bb_x1,
                                  0, dd->ncg_home,
                                  comm->zones.dens_zone0,
                                  fr->cginfo,
                                  state_local->x,
                                  ncg_moved, bRedist ? comm->moved : NULL,
                                  fr->nbv->grp[eintLocal].kernel_type,
                                  fr->nbv->grp[eintLocal].nbat);

                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
                break;
            case ecutsGROUP:
                fill_grid(fplog, &comm->zones, fr->ns.grid, dd->ncg_home,
                          0, dd->ncg_home, fr->cg_cm);

                copy_ivec(fr->ns.grid->n, ncells_new);
                break;
            default:
                gmx_incons("unimplemented");
        }

        bResortAll = bMasterState;

        /* Check if we can use the old order and ns grid cell indices
         * of the charge groups to sort the charge groups efficiently.
         */
        if (ncells_new[XX] != ncells_old[XX] ||
            ncells_new[YY] != ncells_old[YY] ||
            ncells_new[ZZ] != ncells_old[ZZ])
        {
            bResortAll = TRUE;
        }

        if (debug)
        {
            fprintf(debug, "Step %s, sorting the %d home charge groups\n",
                    gmx_step_str(step, sbuf), dd->ncg_home);
        }
        /* An old home count of -1 is passed when everything is resorted */
        dd_sort_state(dd, ir->ePBC, fr->cg_cm, fr, state_local,
                      bResortAll ? -1 : ncg_home_old);
        /* Rebuild all the indices */
        ga2la_clear(dd->ga2la);
        ncgindex_set = 0;

        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
    }

    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);

    /* Set up the communication and communicate the coordinates */
    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);

    /* Set the indices */
    make_dd_indices(dd, cgs_gl->index, ncgindex_set);

    /* Set the charge group boundaries for neighbor searching */
    set_cg_boundaries(&comm->zones);

    if (fr->cutoff_scheme == ecutsVERLET)
    {
        /* When sorting, the size of zone 0 was already set above */
        set_zones_size(dd, state_local->box, &ddbox,
                       bSortCG ? 1 : 0, comm->zones.n);
    }

    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);

    /*
       write_dd_pdb("dd_home",step,"dump",top_global,cr,
                 -1,state_local->x,state_local->box);
     */

    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);

    /* Extract a local topology from the global topology */
    /* Only the entries of np for the decomposed dimensions are set */
    for (i = 0; i < dd->ndim; i++)
    {
        np[dd->dim[i]] = comm->cd[i].np;
    }
    dd_make_local_top(fplog, dd, &comm->zones, dd->npbcdim, state_local->box,
                      comm->cellsize_min, np,
                      fr,
                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
                      vsite, top_global, top_local);

    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);

    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);

    /* Set up the special atom communication */
    /* n is the running atom count: each type starts from the count
     * of the previous type and keeps it when it has nothing to add.
     */
    n = comm->nat[ddnatZONE];
    for (i = ddnatZONE+1; i < ddnatNR; i++)
    {
        switch (i)
        {
            case ddnatVSITE:
                if (vsite && vsite->n_intercg_vsite)
                {
                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
                }
                break;
            case ddnatCON:
                if (dd->bInterCGcons || dd->bInterCGsettles)
                {
                    /* Only for inter-cg constraints we need special code */
                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
                                                  constr, ir->nProjOrder,
                                                  top_local->idef.il);
                }
                break;
            default:
                gmx_incons("Unknown special atom type setup");
        }
        comm->nat[i] = n;
    }

    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);

    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);

    /* Make space for the extra coordinates for virtual site
     * or constraint communication.
     */
    state_local->natoms = comm->nat[ddnatNR-1];
    if (state_local->natoms > state_local->nalloc)
    {
        dd_realloc_state(state_local, f, state_local->natoms);
    }

    /* Determine the atom count passed to forcerec_set_ranges
     * as nat_f_novirsum, it is 0 when fr->bF_NoVirSum is not set.
     */
    if (fr->bF_NoVirSum)
    {
        if (vsite && vsite->n_intercg_vsite)
        {
            nat_f_novirsum = comm->nat[ddnatVSITE];
        }
        else
        {
            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
            {
                nat_f_novirsum = dd->nat_tot;
            }
            else
            {
                nat_f_novirsum = dd->nat_home;
            }
        }
    }
    else
    {
        nat_f_novirsum = 0;
    }

    /* Set the number of atoms required for the force calculation.
     * Forces need to be constrained when using a twin-range setup
     * or with energy minimization. For simple simulations we could
     * avoid some allocation, zeroing and copying, but this is
     * probably not worth the complications and checking.
     */
    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);

    /* We make all the mdatoms up to nat_tot_con.
     * We could save some work by only setting invmass
     * between nat_tot and nat_tot_con.
     */
    /* This call also sets the new number of home particles to dd->nat_home */
    atoms2md(top_global, ir,
             comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms);

    /* Now we have the charges we can sort the FE interactions */
    dd_sort_local_top(dd, mdatoms, top_local);

    if (vsite != NULL)
    {
        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
        split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
    }

    if (shellfc)
    {
        /* Make the local shell stuff, currently no communication is done */
        make_local_shells(cr, mdatoms, shellfc);
    }

    if (ir->implicit_solvent)
    {
        make_local_gb(cr, fr->born, ir->gb_algorithm);
    }

    init_bonded_thread_force_reduction(fr, &top_local->idef);

    if (!(cr->duty & DUTY_PME))
    {
        /* Send the charges to our PME only node */
        gmx_pme_send_q(cr, mdatoms->nChargePerturbed,
                       mdatoms->chargeA, mdatoms->chargeB,
                       dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
    }

    if (constr)
    {
        set_constraints(constr, top_local, ir, mdatoms, cr);
    }

    if (ir->ePull != epullNO)
    {
        /* Update the local pull groups */
        dd_make_local_pull_groups(dd, ir->pull, mdatoms);
    }

    if (ir->bRot)
    {
        /* Update the local rotation groups */
        dd_make_local_rotation_groups(dd, ir->rot);
    }


    add_dd_statistics(dd);

    /* Make sure we only count the cycles for this DD partitioning */
    clear_dd_cycle_counts(dd);

    /* Because the order of the atoms might have changed since
     * the last vsite construction, we need to communicate the constructing
     * atom coordinates again (for spreading the forces this MD step).
     */
    dd_move_x_vsites(dd, state_local->box, state_local->x);

    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);

    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
    {
        dd_move_x(dd, state_local->box, state_local->x);
        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
                     -1, state_local->x, state_local->box);
    }

    /* Store the partitioning step */
    comm->partition_step = step;

    /* Increase the DD partitioning counter */
    dd->ddp_count++;
    /* The state currently matches this DD partitioning count, store it */
    state_local->ddp_count = dd->ddp_count;
    if (bMasterState)
    {
        /* The DD master node knows the complete cg distribution,
         * store the count so we can possibly skip the cg info communication.
         */
        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
    }

    if (comm->DD_debug > 0)
    {
        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
                                "after partitioning");
    }
}