src/mdlib/domdec.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 1991-2008
   5  * Copyright (c) 2012,2013, by the GROMACS development team, led by
   6  * David van der Spoel, Berk Hess, Erik Lindahl, and including many
   7  * others, as listed in the AUTHORS file in the top-level source
   8  * directory and at http://www.gromacs.org.
   9  *
  10  * GROMACS is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public License
  12  * as published by the Free Software Foundation; either version 2.1
  13  * of the License, or (at your option) any later version.
  14  *
  15  * GROMACS is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with GROMACS; if not, see
  22  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  23  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  24  *
  25  * If you want to redistribute modifications to GROMACS, please
  26  * consider that scientific software is very special. Version
  27  * control is crucial - bugs must be traceable. We will be happy to
  28  * consider code for inclusion in the official distribution, but
  29  * derived work must not be called official GROMACS. Details are found
  30  * in the README & COPYING files - if they are missing, get the
  31  * official version at http://www.gromacs.org.
  32  *
  33  * To help us fund GROMACS development, we humbly ask that you cite
  34  * the research papers on the package. Check out http://www.gromacs.org.
  35  */
  36
  37 #ifdef HAVE_CONFIG_H
  38 #include <config.h>
  39 #endif
  40
  41 #include <stdio.h>
  42 #include <time.h>
  43 #include <math.h>
  44 #include <string.h>
  45 #include <stdlib.h>
  46 #include "typedefs.h"
  47 #include "smalloc.h"
  48 #include "gmx_fatal.h"
  49 #include "gmx_fatal_collective.h"
  50 #include "vec.h"
  51 #include "domdec.h"
  52 #include "domdec_network.h"
  53 #include "nrnb.h"
  54 #include "pbc.h"
  55 #include "chargegroup.h"
  56 #include "constr.h"
  57 #include "mdatoms.h"
  58 #include "names.h"
  59 #include "pdbio.h"
  60 #include "futil.h"
  61 #include "force.h"
  62 #include "pme.h"
  63 #include "pull.h"
  64 #include "pull_rotation.h"
  65 #include "gmx_wallcycle.h"
  66 #include "mdrun.h"
  67 #include "nsgrid.h"
  68 #include "shellfc.h"
  69 #include "mtop_util.h"
  70 #include "gmxfio.h"
  71 #include "gmx_ga2la.h"
  72 #include "gmx_sort.h"
  73 #include "nbnxn_search.h"
  74 #include "bondf.h"
  75 #include "gmx_omp_nthreads.h"
  76
  77 #ifdef GMX_LIB_MPI
  78 #include <mpi.h>
  79 #endif
  80 #ifdef GMX_THREAD_MPI
  81 #include "tmpi.h"
  82 #endif
  83
  84 #define DDRANK(dd, rank)    (rank)
  85 #define DDMASTERRANK(dd)   (dd->masterrank)
  86
  87 typedef struct gmx_domdec_master
  88 {
  89     /* The cell boundaries */
  90     real **cell_x;
  91     /* The global charge group division */
  92     int   *ncg;    /* Number of home charge groups for each node */
  93     int   *index;  /* Index of nnodes+1 into cg */
  94     int   *cg;     /* Global charge group index */
  95     int   *nat;    /* Number of home atoms for each node. */
  96     int   *ibuf;   /* Buffer for communication */
  97     rvec  *vbuf;   /* Buffer for state scattering and gathering */
  98 } gmx_domdec_master_t;
  99
 100 typedef struct
 101 {
 102     /* The numbers of charge groups to send and receive for each cell
 103      * that requires communication, the last entry contains the total
 104      * number of atoms that needs to be communicated.
 105      */
 106     int  nsend[DD_MAXIZONE+2];
 107     int  nrecv[DD_MAXIZONE+2];
 108     /* The charge groups to send */
 109     int *index;
 110     int  nalloc;
 111     /* The atom range for non-in-place communication */
 112     int  cell2at0[DD_MAXIZONE];
 113     int  cell2at1[DD_MAXIZONE];
 114 } gmx_domdec_ind_t;
 115
 116 typedef struct
 117 {
 118     int               np;       /* Number of grid pulses in this dimension */
 119     int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 120     gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 121     int               np_nalloc;
 122     gmx_bool          bInPlace; /* Can we communicate in place?            */
 123 } gmx_domdec_comm_dim_t;
 124
 125 typedef struct
 126 {
 127     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 128     real     *cell_f;      /* State var.: cell boundaries, box relative      */
 129     real     *old_cell_f;  /* Temp. var.: old cell size                      */
 130     real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 131     real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 132     real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 133     real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 134     gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 135     real     *buf_ncd;     /* Temp. var.                                     */
 136 } gmx_domdec_root_t;
 137
 138 #define DD_NLOAD_MAX 9
 139
 140 /* Here floats are accurate enough, since these variables
 141  * only influence the load balancing, not the actual MD results.
 142  */
 143 typedef struct
 144 {
 145     int    nload;
 146     float *load;
 147     float  sum;
 148     float  max;
 149     float  sum_m;
 150     float  cvol_min;
 151     float  mdf;
 152     float  pme;
 153     int    flags;
 154 } gmx_domdec_load_t;
 155
 156 typedef struct
 157 {
 158     int  nsc;
 159     int  ind_gl;
 160     int  ind;
 161 } gmx_cgsort_t;
 162
 163 typedef struct
 164 {
 165     gmx_cgsort_t *sort;
 166     gmx_cgsort_t *sort2;
 167     int           sort_nalloc;
 168     gmx_cgsort_t *sort_new;
 169     int           sort_new_nalloc;
 170     int          *ibuf;
 171     int           ibuf_nalloc;
 172 } gmx_domdec_sort_t;
 173
 174 typedef struct
 175 {
 176     rvec *v;
 177     int   nalloc;
 178 } vec_rvec_t;
 179
 180 /* This enum determines the order of the coordinates.
 181  * ddnatHOME and ddnatZONE should be first and second,
 182  * the others can be ordered as wanted.
 183  */
 184 enum {
 185     ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 186 };
 187
 188 enum {
 189     edlbAUTO, edlbNO, edlbYES, edlbNR
 190 };
 191 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 192
 193 typedef struct
 194 {
 195     int      dim;       /* The dimension                                          */
 196     gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 197     int      nslab;     /* The number of PME slabs in this dimension              */
 198     real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 199     int     *pp_min;    /* The minimum pp node location, size nslab               */
 200     int     *pp_max;    /* The maximum pp node location,size nslab                */
 201     int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 202 } gmx_ddpme_t;
 203
 204 typedef struct
 205 {
 206     real min0;    /* The minimum bottom of this zone                        */
 207     real max1;    /* The maximum top of this zone                           */
 208     real min1;    /* The minimum top of this zone                           */
 209     real mch0;    /* The maximum bottom communicaton height for this zone   */
 210     real mch1;    /* The maximum top communicaton height for this zone      */
 211     real p1_0;    /* The bottom value of the first cell in this zone        */
 212     real p1_1;    /* The top value of the first cell in this zone           */
 213 } gmx_ddzone_t;
 214
 215 typedef struct
 216 {
 217     gmx_domdec_ind_t ind;
 218     int             *ibuf;
 219     int              ibuf_nalloc;
 220     vec_rvec_t       vbuf;
 221     int              nsend;
 222     int              nat;
 223     int              nsend_zone;
 224 } dd_comm_setup_work_t;
 225
/* The communication setup and state of the domain decomposition:
 * node layout, cut-offs and cell sizes, zone and pulse setup,
 * communication buffers, load balancing data and statistics.
 */
typedef struct gmx_domdec_comm
{
    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
     * unless stated otherwise.
     */

    /* The number of decomposition dimensions for PME, 0: no PME */
    int         npmedecompdim;
    /* The number of nodes doing PME (PP/PME or only PME) */
    int         npmenodes;
    int         npmenodes_x;
    int         npmenodes_y;
    /* The communication setup including the PME only nodes */
    gmx_bool    bCartesianPP_PME;
    ivec        ntot;
    int         cartpmedim;
    int        *pmenodes;          /* size npmenodes                         */
    int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
                                    * but with bCartesianPP_PME              */
    gmx_ddpme_t ddpme[2];

    /* The DD particle-particle nodes only */
    gmx_bool bCartesianPP;
    /* Used by ddcoord2ddnodeid() to map a DD index to a DD node id */
    int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */

    /* The global charge groups */
    t_block cgs_gl;

    /* Should we sort the cgs */
    int                nstSortCG;
    gmx_domdec_sort_t *sort;

    /* Are there charge groups? */
    gmx_bool bCGs;

    /* Are there bonded and multi-body interactions between charge groups? */
    gmx_bool bInterCGBondeds;
    gmx_bool bInterCGMultiBody;

    /* Data for the optional bonded interaction atom communication range */
    gmx_bool  bBondComm;
    t_blocka *cglink;
    char     *bLocalCG;

    /* The DLB option, one of the edlb enum values */
    int      eDLB;
    /* Are we actually using DLB? */
    gmx_bool bDynLoadBal;

    /* Cell sizes for static load balancing, first index cartesian */
    real **slb_frac;

    /* The width of the communicated boundaries */
    real     cutoff_mbody;
    real     cutoff;
    /* The minimum cell size (including triclinic correction) */
    rvec     cellsize_min;
    /* For dlb, for use with edlbAUTO */
    rvec     cellsize_min_dlb;
    /* The lower limit for the DD cell size with DLB */
    real     cellsize_limit;
    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
    gmx_bool bVacDLBNoLimit;

    /* With PME load balancing we set limits on DLB */
    gmx_bool bPMELoadBalDLBLimits;
    /* DLB needs to take into account that we want to allow this maximum
     * cut-off (for PME load balancing), this could limit cell boundaries.
     */
    real PMELoadBal_max_cutoff;

    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
    ivec tric_dir;
    /* box0 and box_size are required with dim's without pbc and -gcom */
    rvec box0;
    rvec box_size;

    /* The cell boundaries */
    rvec cell_x0;
    rvec cell_x1;

    /* The old location of the cell boundaries, to check cg displacements */
    rvec old_cell_x0;
    rvec old_cell_x1;

    /* The communication setup and charge group boundaries for the zones */
    gmx_domdec_zones_t zones;

    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
     * cell boundaries of neighboring cells for dynamic load balancing.
     */
    gmx_ddzone_t zone_d1[2];
    gmx_ddzone_t zone_d2[2][2];

    /* The coordinate/force communication setup and indices */
    gmx_domdec_comm_dim_t cd[DIM];
    /* The maximum number of cells to communicate with in one dimension */
    int                   maxpulse;

    /* Which cg distribution is stored on the master node */
    int master_cg_ddp_count;

    /* The number of cg's received from the direct neighbors */
    int  zone_ncg1[DD_MAXZONE];

    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t];
     * indexed by the ddnat enum values.
     */
    int  nat[ddnatNR];

    /* Array for signalling if atoms have moved to another domain */
    int  *moved;
    int   moved_nalloc;

    /* Communication buffer for general use */
    int  *buf_int;
    int   nalloc_int;

    /* Communication buffer for general use; the packing buffer of
     * dd_move_x, dd_move_f, dd_atom_spread_real and dd_atom_sum_real.
     */
    vec_rvec_t vbuf;

    /* Temporary storage for thread parallel communication setup */
    int                   nth;
    dd_comm_setup_work_t *dth;

    /* Communication buffers only used with multiple grid pulses;
     * vbuf2 is the staging buffer for non-in-place communication.
     */
    int       *buf_int2;
    int        nalloc_int2;
    vec_rvec_t vbuf2;

    /* Communication buffers for local redistribution */
    int  **cggl_flag;
    int    cggl_flag_nalloc[DIM*2];
    rvec **cgcm_state;
    int    cgcm_state_nalloc[DIM*2];

    /* Cell sizes for dynamic load balancing */
    gmx_domdec_root_t **root;
    real               *cell_f_row;
    real                cell_f0[DIM];
    real                cell_f1[DIM];
    real                cell_f_max0[DIM];
    real                cell_f_min1[DIM];

    /* Stuff for load communication */
    gmx_bool           bRecordLoad;
    gmx_domdec_load_t *load;
#ifdef GMX_MPI
    MPI_Comm          *mpi_comm_load;
#endif

    /* Maximum DLB scaling per load balancing step in percent */
    int dlb_scale_lim;

    /* Cycle counters */
    float  cycl[ddCyclNr];
    int    cycl_n[ddCyclNr];
    float  cycl_max[ddCyclNr];
    /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
    int    eFlop;
    double flop;
    int    flop_n;
    /* How often did we have load measurements */
    int    n_load_have;
    /* How often have we collected the load measurements */
    int    n_load_collect;

    /* Statistics */
    double sum_nat[ddnatNR-ddnatZONE];
    int    ndecomp;
    int    nload;
    double load_step;
    double load_sum;
    double load_max;
    ivec   load_lim;
    double load_mdf;
    double load_pme;

    /* The last partition step */
    gmx_large_int_t partition_step;

    /* Debugging */
    int  nstDDDump;
    int  nstDDDumpGrid;
    int  DD_debug;
} gmx_domdec_comm_t;
 410
 411 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 412 #define DD_CGIBS 2
 413
 414 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 415 #define DD_FLAG_NRCG  65535
 416 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 417 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 418
 419 /* Zone permutation required to obtain consecutive charge groups
 420  * for neighbor searching.
 421  */
 422 static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 423
 424 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 425  * components see only j zones with that component 0.
 426  */
 427
 428 /* The DD zone order */
 429 static const ivec dd_zo[DD_MAXZONE] =
 430 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 431
 432 /* The 3D setup */
 433 #define dd_z3n  8
 434 #define dd_zp3n 4
 435 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 436
 437 /* The 2D setup */
 438 #define dd_z2n  4
 439 #define dd_zp2n 2
 440 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 441
 442 /* The 1D setup */
 443 #define dd_z1n  2
 444 #define dd_zp1n 1
 445 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 446
 447 /* Factors used to avoid problems due to rounding issues */
 448 #define DD_CELL_MARGIN       1.0001
 449 #define DD_CELL_MARGIN2      1.00005
 450 /* Factor to account for pressure scaling during nstlist steps */
 451 #define DD_PRES_SCALE_MARGIN 1.02
 452
 453 /* Allowed performance loss before we DLB or warn */
 454 #define DD_PERF_LOSS 0.05
 455
 456 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 457
 458 /* Use separate MPI send and receive commands
 459  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 460  * This saves memory (and some copying for small nnodes).
 461  * For high parallelization scatter and gather calls are used.
 462  */
 463 #define GMX_DD_NNODES_SENDRECV 4
 464
 465
 466 /*
 467    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 468
 469    static void index2xyz(ivec nc,int ind,ivec xyz)
 470    {
 471    xyz[XX] = ind % nc[XX];
 472    xyz[YY] = (ind / nc[XX]) % nc[YY];
 473    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 474    }
 475  */
 476
 477 /* This order is required to minimize the coordinate communication in PME
 478  * which uses decomposition in the x direction.
 479  */
 480 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 481
 482 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 483 {
 484     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 485     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 486     xyz[ZZ] = ind % nc[ZZ];
 487 }
 488
 489 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 490 {
 491     int ddindex;
 492     int ddnodeid = -1;
 493
 494     ddindex = dd_index(dd->nc, c);
 495     if (dd->comm->bCartesianPP_PME)
 496     {
 497         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 498     }
 499     else if (dd->comm->bCartesianPP)
 500     {
 501 #ifdef GMX_MPI
 502         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 503 #endif
 504     }
 505     else
 506     {
 507         ddnodeid = ddindex;
 508     }
 509
 510     return ddnodeid;
 511 }
 512
 513 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 514 {
 515     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 516 }
 517
 518 int ddglatnr(gmx_domdec_t *dd, int i)
 519 {
 520     int atnr;
 521
 522     if (dd == NULL)
 523     {
 524         atnr = i + 1;
 525     }
 526     else
 527     {
 528         if (i >= dd->comm->nat[ddnatNR-1])
 529         {
 530             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 531         }
 532         atnr = dd->gatindex[i] + 1;
 533     }
 534
 535     return atnr;
 536 }
 537
 538 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 539 {
 540     return &dd->comm->cgs_gl;
 541 }
 542
 543 static void vec_rvec_init(vec_rvec_t *v)
 544 {
 545     v->nalloc = 0;
 546     v->v      = NULL;
 547 }
 548
 549 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 550 {
 551     if (n > v->nalloc)
 552     {
 553         v->nalloc = over_alloc_dd(n);
 554         srenew(v->v, v->nalloc);
 555     }
 556 }
 557
 558 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 559 {
 560     int i;
 561
 562     if (state->ddp_count != dd->ddp_count)
 563     {
 564         gmx_incons("The state does not the domain decomposition state");
 565     }
 566
 567     state->ncg_gl = dd->ncg_home;
 568     if (state->ncg_gl > state->cg_gl_nalloc)
 569     {
 570         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 571         srenew(state->cg_gl, state->cg_gl_nalloc);
 572     }
 573     for (i = 0; i < state->ncg_gl; i++)
 574     {
 575         state->cg_gl[i] = dd->index_gl[i];
 576     }
 577
 578     state->ddp_count_cg_gl = dd->ddp_count;
 579 }
 580
 581 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 582 {
 583     return &dd->comm->zones;
 584 }
 585
 586 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 587                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 588 {
 589     gmx_domdec_zones_t *zones;
 590     int                 izone, d, dim;
 591
 592     zones = &dd->comm->zones;
 593
 594     izone = 0;
 595     while (icg >= zones->izone[izone].cg1)
 596     {
 597         izone++;
 598     }
 599
 600     if (izone == 0)
 601     {
 602         *jcg0 = icg;
 603     }
 604     else if (izone < zones->nizone)
 605     {
 606         *jcg0 = zones->izone[izone].jcg0;
 607     }
 608     else
 609     {
 610         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 611                   icg, izone, zones->nizone);
 612     }
 613
 614     *jcg1 = zones->izone[izone].jcg1;
 615
 616     for (d = 0; d < dd->ndim; d++)
 617     {
 618         dim         = dd->dim[d];
 619         shift0[dim] = zones->izone[izone].shift0[dim];
 620         shift1[dim] = zones->izone[izone].shift1[dim];
 621         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 622         {
 623             /* A conservative approach, this can be optimized */
 624             shift0[dim] -= 1;
 625             shift1[dim] += 1;
 626         }
 627     }
 628 }
 629
 630 int dd_natoms_vsite(gmx_domdec_t *dd)
 631 {
 632     return dd->comm->nat[ddnatVSITE];
 633 }
 634
 635 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 636 {
 637     *at_start = dd->comm->nat[ddnatCON-1];
 638     *at_end   = dd->comm->nat[ddnatCON];
 639 }
 640
 641 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 642 {
 643     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 644     int                   *index, *cgindex;
 645     gmx_domdec_comm_t     *comm;
 646     gmx_domdec_comm_dim_t *cd;
 647     gmx_domdec_ind_t      *ind;
 648     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 649     gmx_bool               bPBC, bScrew;
 650
 651     comm = dd->comm;
 652
 653     cgindex = dd->cgindex;
 654
 655     buf = comm->vbuf.v;
 656
 657     nzone   = 1;
 658     nat_tot = dd->nat_home;
 659     for (d = 0; d < dd->ndim; d++)
 660     {
 661         bPBC   = (dd->ci[dd->dim[d]] == 0);
 662         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 663         if (bPBC)
 664         {
 665             copy_rvec(box[dd->dim[d]], shift);
 666         }
 667         cd = &comm->cd[d];
 668         for (p = 0; p < cd->np; p++)
 669         {
 670             ind   = &cd->ind[p];
 671             index = ind->index;
 672             n     = 0;
 673             if (!bPBC)
 674             {
 675                 for (i = 0; i < ind->nsend[nzone]; i++)
 676                 {
 677                     at0 = cgindex[index[i]];
 678                     at1 = cgindex[index[i]+1];
 679                     for (j = at0; j < at1; j++)
 680                     {
 681                         copy_rvec(x[j], buf[n]);
 682                         n++;
 683                     }
 684                 }
 685             }
 686             else if (!bScrew)
 687             {
 688                 for (i = 0; i < ind->nsend[nzone]; i++)
 689                 {
 690                     at0 = cgindex[index[i]];
 691                     at1 = cgindex[index[i]+1];
 692                     for (j = at0; j < at1; j++)
 693                     {
 694                         /* We need to shift the coordinates */
 695                         rvec_add(x[j], shift, buf[n]);
 696                         n++;
 697                     }
 698                 }
 699             }
 700             else
 701             {
 702                 for (i = 0; i < ind->nsend[nzone]; i++)
 703                 {
 704                     at0 = cgindex[index[i]];
 705                     at1 = cgindex[index[i]+1];
 706                     for (j = at0; j < at1; j++)
 707                     {
 708                         /* Shift x */
 709                         buf[n][XX] = x[j][XX] + shift[XX];
 710                         /* Rotate y and z.
 711                          * This operation requires a special shift force
 712                          * treatment, which is performed in calc_vir.
 713                          */
 714                         buf[n][YY] = box[YY][YY] - x[j][YY];
 715                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 716                         n++;
 717                     }
 718                 }
 719             }
 720
 721             if (cd->bInPlace)
 722             {
 723                 rbuf = x + nat_tot;
 724             }
 725             else
 726             {
 727                 rbuf = comm->vbuf2.v;
 728             }
 729             /* Send and receive the coordinates */
 730             dd_sendrecv_rvec(dd, d, dddirBackward,
 731                              buf,  ind->nsend[nzone+1],
 732                              rbuf, ind->nrecv[nzone+1]);
 733             if (!cd->bInPlace)
 734             {
 735                 j = 0;
 736                 for (zone = 0; zone < nzone; zone++)
 737                 {
 738                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 739                     {
 740                         copy_rvec(rbuf[j], x[i]);
 741                         j++;
 742                     }
 743                 }
 744             }
 745             nat_tot += ind->nrecv[nzone+1];
 746         }
 747         nzone += nzone;
 748     }
 749 }
 750
 751 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 752 {
 753     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 754     int                   *index, *cgindex;
 755     gmx_domdec_comm_t     *comm;
 756     gmx_domdec_comm_dim_t *cd;
 757     gmx_domdec_ind_t      *ind;
 758     rvec                  *buf, *sbuf;
 759     ivec                   vis;
 760     int                    is;
 761     gmx_bool               bPBC, bScrew;
 762
 763     comm = dd->comm;
 764
 765     cgindex = dd->cgindex;
 766
 767     buf = comm->vbuf.v;
 768
 769     n       = 0;
 770     nzone   = comm->zones.n/2;
 771     nat_tot = dd->nat_tot;
 772     for (d = dd->ndim-1; d >= 0; d--)
 773     {
 774         bPBC   = (dd->ci[dd->dim[d]] == 0);
 775         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 776         if (fshift == NULL && !bScrew)
 777         {
 778             bPBC = FALSE;
 779         }
 780         /* Determine which shift vector we need */
 781         clear_ivec(vis);
 782         vis[dd->dim[d]] = 1;
 783         is              = IVEC2IS(vis);
 784
 785         cd = &comm->cd[d];
 786         for (p = cd->np-1; p >= 0; p--)
 787         {
 788             ind      = &cd->ind[p];
 789             nat_tot -= ind->nrecv[nzone+1];
 790             if (cd->bInPlace)
 791             {
 792                 sbuf = f + nat_tot;
 793             }
 794             else
 795             {
 796                 sbuf = comm->vbuf2.v;
 797                 j    = 0;
 798                 for (zone = 0; zone < nzone; zone++)
 799                 {
 800                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 801                     {
 802                         copy_rvec(f[i], sbuf[j]);
 803                         j++;
 804                     }
 805                 }
 806             }
 807             /* Communicate the forces */
 808             dd_sendrecv_rvec(dd, d, dddirForward,
 809                              sbuf, ind->nrecv[nzone+1],
 810                              buf,  ind->nsend[nzone+1]);
 811             index = ind->index;
 812             /* Add the received forces */
 813             n = 0;
 814             if (!bPBC)
 815             {
 816                 for (i = 0; i < ind->nsend[nzone]; i++)
 817                 {
 818                     at0 = cgindex[index[i]];
 819                     at1 = cgindex[index[i]+1];
 820                     for (j = at0; j < at1; j++)
 821                     {
 822                         rvec_inc(f[j], buf[n]);
 823                         n++;
 824                     }
 825                 }
 826             }
 827             else if (!bScrew)
 828             {
 829                 for (i = 0; i < ind->nsend[nzone]; i++)
 830                 {
 831                     at0 = cgindex[index[i]];
 832                     at1 = cgindex[index[i]+1];
 833                     for (j = at0; j < at1; j++)
 834                     {
 835                         rvec_inc(f[j], buf[n]);
 836                         /* Add this force to the shift force */
 837                         rvec_inc(fshift[is], buf[n]);
 838                         n++;
 839                     }
 840                 }
 841             }
 842             else
 843             {
 844                 for (i = 0; i < ind->nsend[nzone]; i++)
 845                 {
 846                     at0 = cgindex[index[i]];
 847                     at1 = cgindex[index[i]+1];
 848                     for (j = at0; j < at1; j++)
 849                     {
 850                         /* Rotate the force */
 851                         f[j][XX] += buf[n][XX];
 852                         f[j][YY] -= buf[n][YY];
 853                         f[j][ZZ] -= buf[n][ZZ];
 854                         if (fshift)
 855                         {
 856                             /* Add this force to the shift force */
 857                             rvec_inc(fshift[is], buf[n]);
 858                         }
 859                         n++;
 860                     }
 861                 }
 862             }
 863         }
 864         nzone /= 2;
 865     }
 866 }
 867
 868 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 869 {
 870     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 871     int                   *index, *cgindex;
 872     gmx_domdec_comm_t     *comm;
 873     gmx_domdec_comm_dim_t *cd;
 874     gmx_domdec_ind_t      *ind;
 875     real                  *buf, *rbuf;
 876
 877     comm = dd->comm;
 878
 879     cgindex = dd->cgindex;
 880
 881     buf = &comm->vbuf.v[0][0];
 882
 883     nzone   = 1;
 884     nat_tot = dd->nat_home;
 885     for (d = 0; d < dd->ndim; d++)
 886     {
 887         cd = &comm->cd[d];
 888         for (p = 0; p < cd->np; p++)
 889         {
 890             ind   = &cd->ind[p];
 891             index = ind->index;
 892             n     = 0;
 893             for (i = 0; i < ind->nsend[nzone]; i++)
 894             {
 895                 at0 = cgindex[index[i]];
 896                 at1 = cgindex[index[i]+1];
 897                 for (j = at0; j < at1; j++)
 898                 {
 899                     buf[n] = v[j];
 900                     n++;
 901                 }
 902             }
 903
 904             if (cd->bInPlace)
 905             {
 906                 rbuf = v + nat_tot;
 907             }
 908             else
 909             {
 910                 rbuf = &comm->vbuf2.v[0][0];
 911             }
 912             /* Send and receive the coordinates */
 913             dd_sendrecv_real(dd, d, dddirBackward,
 914                              buf,  ind->nsend[nzone+1],
 915                              rbuf, ind->nrecv[nzone+1]);
 916             if (!cd->bInPlace)
 917             {
 918                 j = 0;
 919                 for (zone = 0; zone < nzone; zone++)
 920                 {
 921                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 922                     {
 923                         v[i] = rbuf[j];
 924                         j++;
 925                     }
 926                 }
 927             }
 928             nat_tot += ind->nrecv[nzone+1];
 929         }
 930         nzone += nzone;
 931     }
 932 }
 933
 934 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 935 {
 936     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 937     int                   *index, *cgindex;
 938     gmx_domdec_comm_t     *comm;
 939     gmx_domdec_comm_dim_t *cd;
 940     gmx_domdec_ind_t      *ind;
 941     real                  *buf, *sbuf;
 942
 943     comm = dd->comm;
 944
 945     cgindex = dd->cgindex;
 946
 947     buf = &comm->vbuf.v[0][0];
 948
 949     n       = 0;
 950     nzone   = comm->zones.n/2;
 951     nat_tot = dd->nat_tot;
 952     for (d = dd->ndim-1; d >= 0; d--)
 953     {
 954         cd = &comm->cd[d];
 955         for (p = cd->np-1; p >= 0; p--)
 956         {
 957             ind      = &cd->ind[p];
 958             nat_tot -= ind->nrecv[nzone+1];
 959             if (cd->bInPlace)
 960             {
 961                 sbuf = v + nat_tot;
 962             }
 963             else
 964             {
 965                 sbuf = &comm->vbuf2.v[0][0];
 966                 j    = 0;
 967                 for (zone = 0; zone < nzone; zone++)
 968                 {
 969                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 970                     {
 971                         sbuf[j] = v[i];
 972                         j++;
 973                     }
 974                 }
 975             }
 976             /* Communicate the forces */
 977             dd_sendrecv_real(dd, d, dddirForward,
 978                              sbuf, ind->nrecv[nzone+1],
 979                              buf,  ind->nsend[nzone+1]);
 980             index = ind->index;
 981             /* Add the received forces */
 982             n = 0;
 983             for (i = 0; i < ind->nsend[nzone]; i++)
 984             {
 985                 at0 = cgindex[index[i]];
 986                 at1 = cgindex[index[i]+1];
 987                 for (j = at0; j < at1; j++)
 988                 {
 989                     v[j] += buf[n];
 990                     n++;
 991                 }
 992             }
 993         }
 994         nzone /= 2;
 995     }
 996 }
 997
 998 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
 999 {
1000     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
1001             d, i, j,
1002             zone->min0, zone->max1,
1003             zone->mch0, zone->mch0,
1004             zone->p1_0, zone->p1_1);
1005 }
1006
1007
1008 #define DDZONECOMM_MAXZONE  5
1009 #define DDZONECOMM_BUFSIZE  3
1010
1011 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
1012                                int ddimind, int direction,
1013                                gmx_ddzone_t *buf_s, int n_s,
1014                                gmx_ddzone_t *buf_r, int n_r)
1015 {
1016 #define ZBS  DDZONECOMM_BUFSIZE
1017     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1018     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1019     int  i;
1020
1021     for (i = 0; i < n_s; i++)
1022     {
1023         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1024         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1025         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1026         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1027         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1028         vbuf_s[i*ZBS+1][2] = 0;
1029         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1030         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1031         vbuf_s[i*ZBS+2][2] = 0;
1032     }
1033
1034     dd_sendrecv_rvec(dd, ddimind, direction,
1035                      vbuf_s, n_s*ZBS,
1036                      vbuf_r, n_r*ZBS);
1037
1038     for (i = 0; i < n_r; i++)
1039     {
1040         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1041         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1042         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1043         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1044         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1045         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1046         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1047     }
1048
1049 #undef ZBS
1050 }
1051
/* Exchange neighbor-search cell boundaries and zone extremes between
 * neighboring DD cells.
 *
 * For every decomposition dimension index d >= 1 the local boundaries
 * cell_ns_x0/cell_ns_x1 are first stored in comm->zone_d1[0] (d == 1) or
 * comm->zone_d2[0][0] (d == 2). Then, for d running from dd->ndim-2 down
 * to 0, the cell-fraction extremes are sent forward with
 * dd_sendrecv_rvec() and the zone data is sent backward with
 * dd_sendrecv_ddzone(), once per pulse, while minima/maxima are
 * accumulated.
 *
 * On return:
 *  - cell_ns_x0/cell_ns_x1 have been widened, along dd->dim[1] and
 *    dd->dim[2], with the min0/max1 values of the zones stored in
 *    comm->zone_d1 and comm->zone_d2;
 *  - comm->cell_f_max0[d] and comm->cell_f_min1[d] are set for d >= 1
 *    from the accumulated extremes.
 *
 * With a single decomposition dimension (dd->ndim == 1) none of the loops
 * execute and nothing is modified.
 *
 * NOTE(review): the locals dim1 and k are declared but never used here.
 */
static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
                          rvec cell_ns_x0, rvec cell_ns_x1)
{
    int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
    gmx_ddzone_t      *zp;
    gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
    gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
    gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
    rvec               extr_s[2], extr_r[2];
    rvec               dh;
    real               dist_d, c = 0, det;
    gmx_domdec_comm_t *comm;
    gmx_bool           bPBC, bUse;

    comm = dd->comm;

    /* Initialize our own zone entry for each higher dimension
     * with the local neighbor-search cell boundaries.
     */
    for (d = 1; d < dd->ndim; d++)
    {
        dim      = dd->dim[d];
        zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
        zp->min0 = cell_ns_x0[dim];
        zp->max1 = cell_ns_x1[dim];
        zp->min1 = cell_ns_x1[dim];
        zp->mch0 = cell_ns_x0[dim];
        zp->mch1 = cell_ns_x1[dim];
        zp->p1_0 = cell_ns_x0[dim];
        zp->p1_1 = cell_ns_x1[dim];
    }

    for (d = dd->ndim-2; d >= 0; d--)
    {
        dim  = dd->dim[d];
        bPBC = (dim < ddbox->npbcdim);

        /* Use an rvec to store the extremes: component 0 is maximized,
         * components 1 and 2 are minimized over the pulses below.
         */
        extr_s[d][0] = comm->cell_f0[d+1];
        extr_s[d][1] = comm->cell_f1[d+1];
        extr_s[d][2] = comm->cell_f1[d+1];

        pos = 0;
        /* Store the extremes in the backward sending buffer,
         * so they get updated separately from the forward communication.
         */
        for (d1 = d; d1 < dd->ndim-1; d1++)
        {
            /* We invert the order to be able to use the same loop for buf_e */
            buf_s[pos].min0 = extr_s[d1][1];
            buf_s[pos].max1 = extr_s[d1][0];
            buf_s[pos].min1 = extr_s[d1][2];
            buf_s[pos].mch0 = 0;
            buf_s[pos].mch1 = 0;
            /* Store the cell corner of the dimension we communicate along */
            buf_s[pos].p1_0 = comm->cell_x0[dim];
            buf_s[pos].p1_1 = 0;
            pos++;
        }

        /* Append the zone data after the extremes; the order used here
         * has to match the order in which buf_e is unpacked further down.
         */
        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
        pos++;

        if (dd->ndim == 3 && d == 0)
        {
            buf_s[pos] = comm->zone_d2[0][1];
            pos++;
            buf_s[pos] = comm->zone_d1[0];
            pos++;
        }

        /* We only need to communicate the extremes
         * in the forward direction
         */
        npulse = comm->cd[d].np;
        if (bPBC)
        {
            /* Take the minimum to avoid double communication */
            npulse_min = min(npulse, dd->nc[dim]-1-npulse);
        }
        else
        {
            /* Without PBC we should really not communicate over
             * the boundaries, but implementing that complicates
             * the communication setup and therefore we simply
             * do all communication, but ignore some data.
             */
            npulse_min = npulse;
        }
        for (p = 0; p < npulse_min; p++)
        {
            /* Communicate the extremes forward */
            bUse = (bPBC || dd->ci[dim] > 0);

            dd_sendrecv_rvec(dd, d, dddirForward,
                             extr_s+d, dd->ndim-d-1,
                             extr_r+d, dd->ndim-d-1);

            if (bUse)
            {
                for (d1 = d; d1 < dd->ndim-1; d1++)
                {
                    extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
                    extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
                    extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
                }
            }
        }

        buf_size = pos;
        for (p = 0; p < npulse; p++)
        {
            /* Communicate all the zone information backward */
            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);

            dd_sendrecv_ddzone(dd, d, dddirBackward,
                               buf_s, buf_size,
                               buf_r, buf_size);

            /* dh is only computed from the second pulse on;
             * for the first pulse it stays zero.
             */
            clear_rvec(dh);
            if (p > 0)
            {
                for (d1 = d+1; d1 < dd->ndim; d1++)
                {
                    /* Determine the decrease of maximum required
                     * communication height along d1 due to the distance along d,
                     * this avoids a lot of useless atom communication.
                     */
                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;

                    if (ddbox->tric_dir[dim])
                    {
                        /* c is the off-diagonal coupling between the cell planes
                         * along directions d and d1.
                         */
                        c = ddbox->v[dim][dd->dim[d1]][dim];
                    }
                    else
                    {
                        c = 0;
                    }
                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
                    if (det > 0)
                    {
                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
                    }
                    else
                    {
                        /* A negative value signals out of range */
                        dh[d1] = -1;
                    }
                }
            }

            /* Accumulate the extremes over all pulses */
            for (i = 0; i < buf_size; i++)
            {
                if (p == 0)
                {
                    buf_e[i] = buf_r[i];
                }
                else
                {
                    if (bUse)
                    {
                        buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
                        buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
                        buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
                    }

                    /* The last entry for ndim == 3, d == 0 is the zone_d1
                     * data, which uses the height correction of d1 = 1;
                     * all other entries use d1 = d + 1.
                     */
                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
                    {
                        d1 = 1;
                    }
                    else
                    {
                        d1 = d + 1;
                    }
                    if (bUse && dh[d1] >= 0)
                    {
                        buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
                        buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
                    }
                }
                /* Copy the received buffer to the send buffer,
                 * to pass the data through with the next pulse.
                 */
                buf_s[i] = buf_r[i];
            }
            /* Store the accumulated data after the last pulse or, without
             * PBC, after the pulse for which ci+1+p is the last cell index
             * (presumably the last pulse that carries usable data).
             */
            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
            {
                /* Store the extremes */
                pos = 0;

                for (d1 = d; d1 < dd->ndim-1; d1++)
                {
                    extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
                    extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
                    extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
                    pos++;
                }

                if (d == 1 || (d == 0 && dd->ndim == 3))
                {
                    for (i = d; i < 2; i++)
                    {
                        comm->zone_d2[1-d][i] = buf_e[pos];
                        pos++;
                    }
                }
                if (d == 0)
                {
                    comm->zone_d1[1] = buf_e[pos];
                    pos++;
                }
            }
        }
    }

    /* Widen the neighbor-search boundaries along the second DD dimension
     * with the extremes of both zone_d1 entries.
     */
    if (dd->ndim >= 2)
    {
        dim = dd->dim[1];
        for (i = 0; i < 2; i++)
        {
            if (debug)
            {
                print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
            }
            cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
            cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
        }
    }
    /* And along the third DD dimension with all four zone_d2 entries */
    if (dd->ndim >= 3)
    {
        dim = dd->dim[2];
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (debug)
                {
                    print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
                }
                cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
                cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
            }
        }
    }
    /* Publish the accumulated cell-fraction extremes */
    for (d = 1; d < dd->ndim; d++)
    {
        comm->cell_f_max0[d] = extr_s[d-1][0];
        comm->cell_f_min1[d] = extr_s[d-1][1];
        if (debug)
        {
            fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
                    d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
        }
    }
}
1309
1310 static void dd_collect_cg(gmx_domdec_t *dd,
1311                           t_state      *state_local)
1312 {
1313     gmx_domdec_master_t *ma = NULL;
1314     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1315     t_block             *cgs_gl;
1316
1317     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1318     {
1319         /* The master has the correct distribution */
1320         return;
1321     }
1322
1323     if (state_local->ddp_count == dd->ddp_count)
1324     {
1325         ncg_home = dd->ncg_home;
1326         cg       = dd->index_gl;
1327         nat_home = dd->nat_home;
1328     }
1329     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1330     {
1331         cgs_gl = &dd->comm->cgs_gl;
1332
1333         ncg_home = state_local->ncg_gl;
1334         cg       = state_local->cg_gl;
1335         nat_home = 0;
1336         for (i = 0; i < ncg_home; i++)
1337         {
1338             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1339         }
1340     }
1341     else
1342     {
1343         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1344     }
1345
1346     buf2[0] = dd->ncg_home;
1347     buf2[1] = dd->nat_home;
1348     if (DDMASTER(dd))
1349     {
1350         ma   = dd->ma;
1351         ibuf = ma->ibuf;
1352     }
1353     else
1354     {
1355         ibuf = NULL;
1356     }
1357     /* Collect the charge group and atom counts on the master */
1358     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1359
1360     if (DDMASTER(dd))
1361     {
1362         ma->index[0] = 0;
1363         for (i = 0; i < dd->nnodes; i++)
1364         {
1365             ma->ncg[i]     = ma->ibuf[2*i];
1366             ma->nat[i]     = ma->ibuf[2*i+1];
1367             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1368
1369         }
1370         /* Make byte counts and indices */
1371         for (i = 0; i < dd->nnodes; i++)
1372         {
1373             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1374             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1375         }
1376         if (debug)
1377         {
1378             fprintf(debug, "Initial charge group distribution: ");
1379             for (i = 0; i < dd->nnodes; i++)
1380             {
1381                 fprintf(debug, " %d", ma->ncg[i]);
1382             }
1383             fprintf(debug, "\n");
1384         }
1385     }
1386
1387     /* Collect the charge group indices on the master */
1388     dd_gatherv(dd,
1389                dd->ncg_home*sizeof(int), dd->index_gl,
1390                DDMASTER(dd) ? ma->ibuf : NULL,
1391                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1392                DDMASTER(dd) ? ma->cg : NULL);
1393
1394     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1395 }
1396
1397 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1398                                     rvec *lv, rvec *v)
1399 {
1400     gmx_domdec_master_t *ma;
1401     int                  n, i, c, a, nalloc = 0;
1402     rvec                *buf = NULL;
1403     t_block             *cgs_gl;
1404
1405     ma = dd->ma;
1406
1407     if (!DDMASTER(dd))
1408     {
1409 #ifdef GMX_MPI
1410         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1411                  dd->rank, dd->mpi_comm_all);
1412 #endif
1413     }
1414     else
1415     {
1416         /* Copy the master coordinates to the global array */
1417         cgs_gl = &dd->comm->cgs_gl;
1418
1419         n = DDMASTERRANK(dd);
1420         a = 0;
1421         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1422         {
1423             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1424             {
1425                 copy_rvec(lv[a++], v[c]);
1426             }
1427         }
1428
1429         for (n = 0; n < dd->nnodes; n++)
1430         {
1431             if (n != dd->rank)
1432             {
1433                 if (ma->nat[n] > nalloc)
1434                 {
1435                     nalloc = over_alloc_dd(ma->nat[n]);
1436                     srenew(buf, nalloc);
1437                 }
1438 #ifdef GMX_MPI
1439                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1440                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1441 #endif
1442                 a = 0;
1443                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1444                 {
1445                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1446                     {
1447                         copy_rvec(buf[a++], v[c]);
1448                     }
1449                 }
1450             }
1451         }
1452         sfree(buf);
1453     }
1454 }
1455
1456 static void get_commbuffer_counts(gmx_domdec_t *dd,
1457                                   int **counts, int **disps)
1458 {
1459     gmx_domdec_master_t *ma;
1460     int                  n;
1461
1462     ma = dd->ma;
1463
1464     /* Make the rvec count and displacment arrays */
1465     *counts  = ma->ibuf;
1466     *disps   = ma->ibuf + dd->nnodes;
1467     for (n = 0; n < dd->nnodes; n++)
1468     {
1469         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1470         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1471     }
1472 }
1473
1474 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1475                                    rvec *lv, rvec *v)
1476 {
1477     gmx_domdec_master_t *ma;
1478     int                 *rcounts = NULL, *disps = NULL;
1479     int                  n, i, c, a;
1480     rvec                *buf = NULL;
1481     t_block             *cgs_gl;
1482
1483     ma = dd->ma;
1484
1485     if (DDMASTER(dd))
1486     {
1487         get_commbuffer_counts(dd, &rcounts, &disps);
1488
1489         buf = ma->vbuf;
1490     }
1491
1492     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1493
1494     if (DDMASTER(dd))
1495     {
1496         cgs_gl = &dd->comm->cgs_gl;
1497
1498         a = 0;
1499         for (n = 0; n < dd->nnodes; n++)
1500         {
1501             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1502             {
1503                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1504                 {
1505                     copy_rvec(buf[a++], v[c]);
1506                 }
1507             }
1508         }
1509     }
1510 }
1511
1512 void dd_collect_vec(gmx_domdec_t *dd,
1513                     t_state *state_local, rvec *lv, rvec *v)
1514 {
1515     gmx_domdec_master_t *ma;
1516     int                  n, i, c, a, nalloc = 0;
1517     rvec                *buf = NULL;
1518
1519     dd_collect_cg(dd, state_local);
1520
1521     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1522     {
1523         dd_collect_vec_sendrecv(dd, lv, v);
1524     }
1525     else
1526     {
1527         dd_collect_vec_gatherv(dd, lv, v);
1528     }
1529 }
1530
1531
1532 void dd_collect_state(gmx_domdec_t *dd,
1533                       t_state *state_local, t_state *state)
1534 {
1535     int est, i, j, nh;
1536
1537     nh = state->nhchainlength;
1538
1539     if (DDMASTER(dd))
1540     {
1541         for (i = 0; i < efptNR; i++)
1542         {
1543             state->lambda[i] = state_local->lambda[i];
1544         }
1545         state->fep_state = state_local->fep_state;
1546         state->veta      = state_local->veta;
1547         state->vol0      = state_local->vol0;
1548         copy_mat(state_local->box, state->box);
1549         copy_mat(state_local->boxv, state->boxv);
1550         copy_mat(state_local->svir_prev, state->svir_prev);
1551         copy_mat(state_local->fvir_prev, state->fvir_prev);
1552         copy_mat(state_local->pres_prev, state->pres_prev);
1553
1554
1555         for (i = 0; i < state_local->ngtc; i++)
1556         {
1557             for (j = 0; j < nh; j++)
1558             {
1559                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1560                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1561             }
1562             state->therm_integral[i] = state_local->therm_integral[i];
1563         }
1564         for (i = 0; i < state_local->nnhpres; i++)
1565         {
1566             for (j = 0; j < nh; j++)
1567             {
1568                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1569                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1570             }
1571         }
1572     }
1573     for (est = 0; est < estNR; est++)
1574     {
1575         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1576         {
1577             switch (est)
1578             {
1579                 case estX:
1580                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1581                     break;
1582                 case estV:
1583                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1584                     break;
1585                 case estSDX:
1586                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1587                     break;
1588                 case estCGP:
1589                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1590                     break;
1591                 case estLD_RNG:
1592                     if (state->nrngi == 1)
1593                     {
1594                         if (DDMASTER(dd))
1595                         {
1596                             for (i = 0; i < state_local->nrng; i++)
1597                             {
1598                                 state->ld_rng[i] = state_local->ld_rng[i];
1599                             }
1600                         }
1601                     }
1602                     else
1603                     {
1604                         dd_gather(dd, state_local->nrng*sizeof(state->ld_rng[0]),
1605                                   state_local->ld_rng, state->ld_rng);
1606                     }
1607                     break;
1608                 case estLD_RNGI:
1609                     if (state->nrngi == 1)
1610                     {
1611                         if (DDMASTER(dd))
1612                         {
1613                             state->ld_rngi[0] = state_local->ld_rngi[0];
1614                         }
1615                     }
1616                     else
1617                     {
1618                         dd_gather(dd, sizeof(state->ld_rngi[0]),
1619                                   state_local->ld_rngi, state->ld_rngi);
1620                     }
1621                     break;
1622                 case estDISRE_INITF:
1623                 case estDISRE_RM3TAV:
1624                 case estORIRE_INITF:
1625                 case estORIRE_DTAV:
1626                     break;
1627                 default:
1628                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1629             }
1630         }
1631     }
1632 }
1633
1634 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1635 {
1636     int est;
1637
1638     if (debug)
1639     {
1640         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1641     }
1642
1643     state->nalloc = over_alloc_dd(nalloc);
1644
1645     for (est = 0; est < estNR; est++)
1646     {
1647         if (EST_DISTR(est) && (state->flags & (1<<est)))
1648         {
1649             switch (est)
1650             {
1651                 case estX:
1652                     srenew(state->x, state->nalloc);
1653                     break;
1654                 case estV:
1655                     srenew(state->v, state->nalloc);
1656                     break;
1657                 case estSDX:
1658                     srenew(state->sd_X, state->nalloc);
1659                     break;
1660                 case estCGP:
1661                     srenew(state->cg_p, state->nalloc);
1662                     break;
1663                 case estLD_RNG:
1664                 case estLD_RNGI:
1665                 case estDISRE_INITF:
1666                 case estDISRE_RM3TAV:
1667                 case estORIRE_INITF:
1668                 case estORIRE_DTAV:
1669                     /* No reallocation required */
1670                     break;
1671                 default:
1672                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1673             }
1674         }
1675     }
1676
1677     if (f != NULL)
1678     {
1679         srenew(*f, state->nalloc);
1680     }
1681 }
1682
1683 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1684                                int nalloc)
1685 {
1686     if (nalloc > fr->cg_nalloc)
1687     {
1688         if (debug)
1689         {
1690             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1691         }
1692         fr->cg_nalloc = over_alloc_dd(nalloc);
1693         srenew(fr->cginfo, fr->cg_nalloc);
1694         if (fr->cutoff_scheme == ecutsGROUP)
1695         {
1696             srenew(fr->cg_cm, fr->cg_nalloc);
1697         }
1698     }
1699     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1700     {
1701         /* We don't use charge groups, we use x in state to set up
1702          * the atom communication.
1703          */
1704         dd_realloc_state(state, f, nalloc);
1705     }
1706 }
1707
1708 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1709                                        rvec *v, rvec *lv)
1710 {
1711     gmx_domdec_master_t *ma;
1712     int                  n, i, c, a, nalloc = 0;
1713     rvec                *buf = NULL;
1714
1715     if (DDMASTER(dd))
1716     {
1717         ma  = dd->ma;
1718
1719         for (n = 0; n < dd->nnodes; n++)
1720         {
1721             if (n != dd->rank)
1722             {
1723                 if (ma->nat[n] > nalloc)
1724                 {
1725                     nalloc = over_alloc_dd(ma->nat[n]);
1726                     srenew(buf, nalloc);
1727                 }
1728                 /* Use lv as a temporary buffer */
1729                 a = 0;
1730                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1731                 {
1732                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1733                     {
1734                         copy_rvec(v[c], buf[a++]);
1735                     }
1736                 }
1737                 if (a != ma->nat[n])
1738                 {
1739                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1740                               a, ma->nat[n]);
1741                 }
1742
1743 #ifdef GMX_MPI
1744                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1745                          DDRANK(dd, n), n, dd->mpi_comm_all);
1746 #endif
1747             }
1748         }
1749         sfree(buf);
1750         n = DDMASTERRANK(dd);
1751         a = 0;
1752         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1753         {
1754             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1755             {
1756                 copy_rvec(v[c], lv[a++]);
1757             }
1758         }
1759     }
1760     else
1761     {
1762 #ifdef GMX_MPI
1763         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1764                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1765 #endif
1766     }
1767 }
1768
1769 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1770                                        rvec *v, rvec *lv)
1771 {
1772     gmx_domdec_master_t *ma;
1773     int                 *scounts = NULL, *disps = NULL;
1774     int                  n, i, c, a, nalloc = 0;
1775     rvec                *buf = NULL;
1776
1777     if (DDMASTER(dd))
1778     {
1779         ma  = dd->ma;
1780
1781         get_commbuffer_counts(dd, &scounts, &disps);
1782
1783         buf = ma->vbuf;
1784         a   = 0;
1785         for (n = 0; n < dd->nnodes; n++)
1786         {
1787             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1788             {
1789                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1790                 {
1791                     copy_rvec(v[c], buf[a++]);
1792                 }
1793             }
1794         }
1795     }
1796
1797     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1798 }
1799
1800 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1801 {
1802     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1803     {
1804         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1805     }
1806     else
1807     {
1808         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1809     }
1810 }
1811
1812 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1813                                 t_state *state, t_state *state_local,
1814                                 rvec **f)
1815 {
1816     int  i, j, nh;
1817
1818     nh = state->nhchainlength;
1819
1820     if (DDMASTER(dd))
1821     {
1822         for (i = 0; i < efptNR; i++)
1823         {
1824             state_local->lambda[i] = state->lambda[i];
1825         }
1826         state_local->fep_state = state->fep_state;
1827         state_local->veta      = state->veta;
1828         state_local->vol0      = state->vol0;
1829         copy_mat(state->box, state_local->box);
1830         copy_mat(state->box_rel, state_local->box_rel);
1831         copy_mat(state->boxv, state_local->boxv);
1832         copy_mat(state->svir_prev, state_local->svir_prev);
1833         copy_mat(state->fvir_prev, state_local->fvir_prev);
1834         for (i = 0; i < state_local->ngtc; i++)
1835         {
1836             for (j = 0; j < nh; j++)
1837             {
1838                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1839                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1840             }
1841             state_local->therm_integral[i] = state->therm_integral[i];
1842         }
1843         for (i = 0; i < state_local->nnhpres; i++)
1844         {
1845             for (j = 0; j < nh; j++)
1846             {
1847                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1848                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1849             }
1850         }
1851     }
1852     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1853     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1854     dd_bcast(dd, sizeof(real), &state_local->veta);
1855     dd_bcast(dd, sizeof(real), &state_local->vol0);
1856     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1857     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1858     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1859     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1860     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1861     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1862     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1863     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1864     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1865     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1866
1867     if (dd->nat_home > state_local->nalloc)
1868     {
1869         dd_realloc_state(state_local, f, dd->nat_home);
1870     }
1871     for (i = 0; i < estNR; i++)
1872     {
1873         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1874         {
1875             switch (i)
1876             {
1877                 case estX:
1878                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1879                     break;
1880                 case estV:
1881                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1882                     break;
1883                 case estSDX:
1884                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1885                     break;
1886                 case estCGP:
1887                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1888                     break;
1889                 case estLD_RNG:
1890                     if (state->nrngi == 1)
1891                     {
1892                         dd_bcastc(dd,
1893                                   state_local->nrng*sizeof(state_local->ld_rng[0]),
1894                                   state->ld_rng, state_local->ld_rng);
1895                     }
1896                     else
1897                     {
1898                         dd_scatter(dd,
1899                                    state_local->nrng*sizeof(state_local->ld_rng[0]),
1900                                    state->ld_rng, state_local->ld_rng);
1901                     }
1902                     break;
1903                 case estLD_RNGI:
1904                     if (state->nrngi == 1)
1905                     {
1906                         dd_bcastc(dd, sizeof(state_local->ld_rngi[0]),
1907                                   state->ld_rngi, state_local->ld_rngi);
1908                     }
1909                     else
1910                     {
1911                         dd_scatter(dd, sizeof(state_local->ld_rngi[0]),
1912                                    state->ld_rngi, state_local->ld_rngi);
1913                     }
1914                     break;
1915                 case estDISRE_INITF:
1916                 case estDISRE_RM3TAV:
1917                 case estORIRE_INITF:
1918                 case estORIRE_DTAV:
1919                     /* Not implemented yet */
1920                     break;
1921                 default:
1922                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1923             }
1924         }
1925     }
1926 }
1927
1928 static char dim2char(int dim)
1929 {
1930     char c = '?';
1931
1932     switch (dim)
1933     {
1934         case XX: c = 'X'; break;
1935         case YY: c = 'Y'; break;
1936         case ZZ: c = 'Z'; break;
1937         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1938     }
1939
1940     return c;
1941 }
1942
1943 static void write_dd_grid_pdb(const char *fn, gmx_large_int_t step,
1944                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1945 {
1946     rvec   grid_s[2], *grid_r = NULL, cx, r;
1947     char   fname[STRLEN], format[STRLEN], buf[22];
1948     FILE  *out;
1949     int    a, i, d, z, y, x;
1950     matrix tric;
1951     real   vol;
1952
1953     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1954     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1955
1956     if (DDMASTER(dd))
1957     {
1958         snew(grid_r, 2*dd->nnodes);
1959     }
1960
1961     dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
1962
1963     if (DDMASTER(dd))
1964     {
1965         for (d = 0; d < DIM; d++)
1966         {
1967             for (i = 0; i < DIM; i++)
1968             {
1969                 if (d == i)
1970                 {
1971                     tric[d][i] = 1;
1972                 }
1973                 else
1974                 {
1975                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1976                     {
1977                         tric[d][i] = box[i][d]/box[i][i];
1978                     }
1979                     else
1980                     {
1981                         tric[d][i] = 0;
1982                     }
1983                 }
1984             }
1985         }
1986         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1987         sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
1988         out = gmx_fio_fopen(fname, "w");
1989         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1990         a = 1;
1991         for (i = 0; i < dd->nnodes; i++)
1992         {
1993             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1994             for (d = 0; d < DIM; d++)
1995             {
1996                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1997             }
1998             for (z = 0; z < 2; z++)
1999             {
2000                 for (y = 0; y < 2; y++)
2001                 {
2002                     for (x = 0; x < 2; x++)
2003                     {
2004                         cx[XX] = grid_r[i*2+x][XX];
2005                         cx[YY] = grid_r[i*2+y][YY];
2006                         cx[ZZ] = grid_r[i*2+z][ZZ];
2007                         mvmul(tric, cx, r);
2008                         fprintf(out, format, "ATOM", a++, "CA", "GLY", ' ', 1+i,
2009                                 10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol);
2010                     }
2011                 }
2012             }
2013             for (d = 0; d < DIM; d++)
2014             {
2015                 for (x = 0; x < 4; x++)
2016                 {
2017                     switch (d)
2018                     {
2019                         case 0: y = 1 + i*8 + 2*x; break;
2020                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
2021                         case 2: y = 1 + i*8 + x; break;
2022                     }
2023                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
2024                 }
2025             }
2026         }
2027         gmx_fio_fclose(out);
2028         sfree(grid_r);
2029     }
2030 }
2031
2032 void write_dd_pdb(const char *fn, gmx_large_int_t step, const char *title,
2033                   gmx_mtop_t *mtop, t_commrec *cr,
2034                   int natoms, rvec x[], matrix box)
2035 {
2036     char          fname[STRLEN], format[STRLEN], format4[STRLEN], buf[22];
2037     FILE         *out;
2038     int           i, ii, resnr, c;
2039     char         *atomname, *resname;
2040     real          b;
2041     gmx_domdec_t *dd;
2042
2043     dd = cr->dd;
2044     if (natoms == -1)
2045     {
2046         natoms = dd->comm->nat[ddnatVSITE];
2047     }
2048
2049     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
2050
2051     sprintf(format, "%s%s\n", pdbformat, "%6.2f%6.2f");
2052     sprintf(format4, "%s%s\n", pdbformat4, "%6.2f%6.2f");
2053
2054     out = gmx_fio_fopen(fname, "w");
2055
2056     fprintf(out, "TITLE     %s\n", title);
2057     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2058     for (i = 0; i < natoms; i++)
2059     {
2060         ii = dd->gatindex[i];
2061         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
2062         if (i < dd->comm->nat[ddnatZONE])
2063         {
2064             c = 0;
2065             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2066             {
2067                 c++;
2068             }
2069             b = c;
2070         }
2071         else if (i < dd->comm->nat[ddnatVSITE])
2072         {
2073             b = dd->comm->zones.n;
2074         }
2075         else
2076         {
2077             b = dd->comm->zones.n + 1;
2078         }
2079         fprintf(out, strlen(atomname) < 4 ? format : format4,
2080                 "ATOM", (ii+1)%100000,
2081                 atomname, resname, ' ', resnr%10000, ' ',
2082                 10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b);
2083     }
2084     fprintf(out, "TER\n");
2085
2086     gmx_fio_fclose(out);
2087 }
2088
2089 real dd_cutoff_mbody(gmx_domdec_t *dd)
2090 {
2091     gmx_domdec_comm_t *comm;
2092     int                di;
2093     real               r;
2094
2095     comm = dd->comm;
2096
2097     r = -1;
2098     if (comm->bInterCGBondeds)
2099     {
2100         if (comm->cutoff_mbody > 0)
2101         {
2102             r = comm->cutoff_mbody;
2103         }
2104         else
2105         {
2106             /* cutoff_mbody=0 means we do not have DLB */
2107             r = comm->cellsize_min[dd->dim[0]];
2108             for (di = 1; di < dd->ndim; di++)
2109             {
2110                 r = min(r, comm->cellsize_min[dd->dim[di]]);
2111             }
2112             if (comm->bBondComm)
2113             {
2114                 r = max(r, comm->cutoff_mbody);
2115             }
2116             else
2117             {
2118                 r = min(r, comm->cutoff);
2119             }
2120         }
2121     }
2122
2123     return r;
2124 }
2125
2126 real dd_cutoff_twobody(gmx_domdec_t *dd)
2127 {
2128     real r_mb;
2129
2130     r_mb = dd_cutoff_mbody(dd);
2131
2132     return max(dd->comm->cutoff, r_mb);
2133 }
2134
2135
2136 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
2137 {
2138     int nc, ntot;
2139
2140     nc   = dd->nc[dd->comm->cartpmedim];
2141     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2142     copy_ivec(coord, coord_pme);
2143     coord_pme[dd->comm->cartpmedim] =
2144         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2145 }
2146
/* Map DD index ddindex (out of ndd) onto a PME index (out of npme).
 *
 * The PME node that communicates with a DD node is chosen by assuming
 * that the major index of both is x. Half of npme is added before the
 * integer division to obtain an even distribution.
 */
static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
{
    int scaled = ddindex*npme;
    int half   = npme/2;

    return (scaled + half)/ndd;
}
2155
2156 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
2157 {
2158     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
2159 }
2160
2161 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
2162 {
2163     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
2164 }
2165
2166 static int *dd_pmenodes(t_commrec *cr)
2167 {
2168     int *pmenodes;
2169     int  n, i, p0, p1;
2170
2171     snew(pmenodes, cr->npmenodes);
2172     n = 0;
2173     for (i = 0; i < cr->dd->nnodes; i++)
2174     {
2175         p0 = cr_ddindex2pmeindex(cr, i);
2176         p1 = cr_ddindex2pmeindex(cr, i+1);
2177         if (i+1 == cr->dd->nnodes || p1 > p0)
2178         {
2179             if (debug)
2180             {
2181                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
2182             }
2183             pmenodes[n] = i + 1 + n;
2184             n++;
2185         }
2186     }
2187
2188     return pmenodes;
2189 }
2190
2191 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
2192 {
2193     gmx_domdec_t *dd;
2194     ivec          coords, coords_pme, nc;
2195     int           slab;
2196
2197     dd = cr->dd;
2198     /*
2199        if (dd->comm->bCartesian) {
2200        gmx_ddindex2xyz(dd->nc,ddindex,coords);
2201        dd_coords2pmecoords(dd,coords,coords_pme);
2202        copy_ivec(dd->ntot,nc);
2203        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2204        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2205
2206        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2207        } else {
2208        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2209        }
2210      */
2211     coords[XX] = x;
2212     coords[YY] = y;
2213     coords[ZZ] = z;
2214     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
2215
2216     return slab;
2217 }
2218
2219 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
2220 {
2221     gmx_domdec_comm_t *comm;
2222     ivec               coords;
2223     int                ddindex, nodeid = -1;
2224
2225     comm = cr->dd->comm;
2226
2227     coords[XX] = x;
2228     coords[YY] = y;
2229     coords[ZZ] = z;
2230     if (comm->bCartesianPP_PME)
2231     {
2232 #ifdef GMX_MPI
2233         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
2234 #endif
2235     }
2236     else
2237     {
2238         ddindex = dd_index(cr->dd->nc, coords);
2239         if (comm->bCartesianPP)
2240         {
2241             nodeid = comm->ddindex2simnodeid[ddindex];
2242         }
2243         else
2244         {
2245             if (comm->pmenodes)
2246             {
2247                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
2248             }
2249             else
2250             {
2251                 nodeid = ddindex;
2252             }
2253         }
2254     }
2255
2256     return nodeid;
2257 }
2258
2259 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
2260 {
2261     gmx_domdec_t      *dd;
2262     gmx_domdec_comm_t *comm;
2263     ivec               coord, coord_pme;
2264     int                i;
2265     int                pmenode = -1;
2266
2267     dd   = cr->dd;
2268     comm = dd->comm;
2269
2270     /* This assumes a uniform x domain decomposition grid cell size */
2271     if (comm->bCartesianPP_PME)
2272     {
2273 #ifdef GMX_MPI
2274         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2275         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2276         {
2277             /* This is a PP node */
2278             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2279             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2280         }
2281 #endif
2282     }
2283     else if (comm->bCartesianPP)
2284     {
2285         if (sim_nodeid < dd->nnodes)
2286         {
2287             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2288         }
2289     }
2290     else
2291     {
2292         /* This assumes DD cells with identical x coordinates
2293          * are numbered sequentially.
2294          */
2295         if (dd->comm->pmenodes == NULL)
2296         {
2297             if (sim_nodeid < dd->nnodes)
2298             {
2299                 /* The DD index equals the nodeid */
2300                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2301             }
2302         }
2303         else
2304         {
2305             i = 0;
2306             while (sim_nodeid > dd->comm->pmenodes[i])
2307             {
2308                 i++;
2309             }
2310             if (sim_nodeid < dd->comm->pmenodes[i])
2311             {
2312                 pmenode = dd->comm->pmenodes[i];
2313             }
2314         }
2315     }
2316
2317     return pmenode;
2318 }
2319
2320 gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
2321 {
2322     gmx_bool bPMEOnlyNode;
2323
2324     if (DOMAINDECOMP(cr))
2325     {
2326         bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
2327     }
2328     else
2329     {
2330         bPMEOnlyNode = FALSE;
2331     }
2332
2333     return bPMEOnlyNode;
2334 }
2335
2336 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2337                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2338 {
2339     gmx_domdec_t *dd;
2340     int           x, y, z;
2341     ivec          coord, coord_pme;
2342
2343     dd = cr->dd;
2344
2345     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2346
2347     *nmy_ddnodes = 0;
2348     for (x = 0; x < dd->nc[XX]; x++)
2349     {
2350         for (y = 0; y < dd->nc[YY]; y++)
2351         {
2352             for (z = 0; z < dd->nc[ZZ]; z++)
2353             {
2354                 if (dd->comm->bCartesianPP_PME)
2355                 {
2356                     coord[XX] = x;
2357                     coord[YY] = y;
2358                     coord[ZZ] = z;
2359                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2360                     if (dd->ci[XX] == coord_pme[XX] &&
2361                         dd->ci[YY] == coord_pme[YY] &&
2362                         dd->ci[ZZ] == coord_pme[ZZ])
2363                     {
2364                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2365                     }
2366                 }
2367                 else
2368                 {
2369                     /* The slab corresponds to the nodeid in the PME group */
2370                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2371                     {
2372                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2373                     }
2374                 }
2375             }
2376         }
2377     }
2378
2379     /* The last PP-only node is the peer node */
2380     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2381
2382     if (debug)
2383     {
2384         fprintf(debug, "Receive coordinates from PP nodes:");
2385         for (x = 0; x < *nmy_ddnodes; x++)
2386         {
2387             fprintf(debug, " %d", (*my_ddnodes)[x]);
2388         }
2389         fprintf(debug, "\n");
2390     }
2391 }
2392
2393 static gmx_bool receive_vir_ener(t_commrec *cr)
2394 {
2395     gmx_domdec_comm_t *comm;
2396     int                pmenode, coords[DIM], rank;
2397     gmx_bool           bReceive;
2398
2399     bReceive = TRUE;
2400     if (cr->npmenodes < cr->dd->nnodes)
2401     {
2402         comm = cr->dd->comm;
2403         if (comm->bCartesianPP_PME)
2404         {
2405             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2406 #ifdef GMX_MPI
2407             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2408             coords[comm->cartpmedim]++;
2409             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2410             {
2411                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2412                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2413                 {
2414                     /* This is not the last PP node for pmenode */
2415                     bReceive = FALSE;
2416                 }
2417             }
2418 #endif
2419         }
2420         else
2421         {
2422             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2423             if (cr->sim_nodeid+1 < cr->nnodes &&
2424                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2425             {
2426                 /* This is not the last PP node for pmenode */
2427                 bReceive = FALSE;
2428             }
2429         }
2430     }
2431
2432     return bReceive;
2433 }
2434
2435 static void set_zones_ncg_home(gmx_domdec_t *dd)
2436 {
2437     gmx_domdec_zones_t *zones;
2438     int                 i;
2439
2440     zones = &dd->comm->zones;
2441
2442     zones->cg_range[0] = 0;
2443     for (i = 1; i < zones->n+1; i++)
2444     {
2445         zones->cg_range[i] = dd->ncg_home;
2446     }
2447 }
2448
2449 static void rebuild_cgindex(gmx_domdec_t *dd,
2450                             const int *gcgs_index, t_state *state)
2451 {
2452     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2453
2454     ind        = state->cg_gl;
2455     dd_cg_gl   = dd->index_gl;
2456     cgindex    = dd->cgindex;
2457     nat        = 0;
2458     cgindex[0] = nat;
2459     for (i = 0; i < state->ncg_gl; i++)
2460     {
2461         cgindex[i]  = nat;
2462         cg_gl       = ind[i];
2463         dd_cg_gl[i] = cg_gl;
2464         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2465     }
2466     cgindex[i] = nat;
2467
2468     dd->ncg_home = state->ncg_gl;
2469     dd->nat_home = nat;
2470
2471     set_zones_ncg_home(dd);
2472 }
2473
2474 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2475 {
2476     while (cg >= cginfo_mb->cg_end)
2477     {
2478         cginfo_mb++;
2479     }
2480
2481     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2482 }
2483
2484 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2485                           t_forcerec *fr, char *bLocalCG)
2486 {
2487     cginfo_mb_t *cginfo_mb;
2488     int         *cginfo;
2489     int          cg;
2490
2491     if (fr != NULL)
2492     {
2493         cginfo_mb = fr->cginfo_mb;
2494         cginfo    = fr->cginfo;
2495
2496         for (cg = cg0; cg < cg1; cg++)
2497         {
2498             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2499         }
2500     }
2501
2502     if (bLocalCG != NULL)
2503     {
2504         for (cg = cg0; cg < cg1; cg++)
2505         {
2506             bLocalCG[index_gl[cg]] = TRUE;
2507         }
2508     }
2509 }
2510
2511 static void make_dd_indices(gmx_domdec_t *dd,
2512                             const int *gcgs_index, int cg_start)
2513 {
2514     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2515     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2516     gmx_ga2la_t *ga2la;
2517     char        *bLocalCG;
2518     gmx_bool     bCGs;
2519
2520     bLocalCG = dd->comm->bLocalCG;
2521
2522     if (dd->nat_tot > dd->gatindex_nalloc)
2523     {
2524         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2525         srenew(dd->gatindex, dd->gatindex_nalloc);
2526     }
2527
2528     nzone      = dd->comm->zones.n;
2529     zone2cg    = dd->comm->zones.cg_range;
2530     zone_ncg1  = dd->comm->zone_ncg1;
2531     index_gl   = dd->index_gl;
2532     gatindex   = dd->gatindex;
2533     bCGs       = dd->comm->bCGs;
2534
2535     if (zone2cg[1] != dd->ncg_home)
2536     {
2537         gmx_incons("dd->ncg_zone is not up to date");
2538     }
2539
2540     /* Make the local to global and global to local atom index */
2541     a = dd->cgindex[cg_start];
2542     for (zone = 0; zone < nzone; zone++)
2543     {
2544         if (zone == 0)
2545         {
2546             cg0 = cg_start;
2547         }
2548         else
2549         {
2550             cg0 = zone2cg[zone];
2551         }
2552         cg1    = zone2cg[zone+1];
2553         cg1_p1 = cg0 + zone_ncg1[zone];
2554
2555         for (cg = cg0; cg < cg1; cg++)
2556         {
2557             zone1 = zone;
2558             if (cg >= cg1_p1)
2559             {
2560                 /* Signal that this cg is from more than one pulse away */
2561                 zone1 += nzone;
2562             }
2563             cg_gl = index_gl[cg];
2564             if (bCGs)
2565             {
2566                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2567                 {
2568                     gatindex[a] = a_gl;
2569                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2570                     a++;
2571                 }
2572             }
2573             else
2574             {
2575                 gatindex[a] = cg_gl;
2576                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2577                 a++;
2578             }
2579         }
2580     }
2581 }
2582
2583 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2584                           const char *where)
2585 {
2586     int ncg, i, ngl, nerr;
2587
2588     nerr = 0;
2589     if (bLocalCG == NULL)
2590     {
2591         return nerr;
2592     }
2593     for (i = 0; i < dd->ncg_tot; i++)
2594     {
2595         if (!bLocalCG[dd->index_gl[i]])
2596         {
2597             fprintf(stderr,
2598                     "DD node %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2599             nerr++;
2600         }
2601     }
2602     ngl = 0;
2603     for (i = 0; i < ncg_sys; i++)
2604     {
2605         if (bLocalCG[i])
2606         {
2607             ngl++;
2608         }
2609     }
2610     if (ngl != dd->ncg_tot)
2611     {
2612         fprintf(stderr, "DD node %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2613         nerr++;
2614     }
2615
2616     return nerr;
2617 }
2618
2619 static void check_index_consistency(gmx_domdec_t *dd,
2620                                     int natoms_sys, int ncg_sys,
2621                                     const char *where)
2622 {
2623     int   nerr, ngl, i, a, cell;
2624     int  *have;
2625
2626     nerr = 0;
2627
2628     if (dd->comm->DD_debug > 1)
2629     {
2630         snew(have, natoms_sys);
2631         for (a = 0; a < dd->nat_tot; a++)
2632         {
2633             if (have[dd->gatindex[a]] > 0)
2634             {
2635                 fprintf(stderr, "DD node %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2636             }
2637             else
2638             {
2639                 have[dd->gatindex[a]] = a + 1;
2640             }
2641         }
2642         sfree(have);
2643     }
2644
2645     snew(have, dd->nat_tot);
2646
2647     ngl  = 0;
2648     for (i = 0; i < natoms_sys; i++)
2649     {
2650         if (ga2la_get(dd->ga2la, i, &a, &cell))
2651         {
2652             if (a >= dd->nat_tot)
2653             {
2654                 fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2655                 nerr++;
2656             }
2657             else
2658             {
2659                 have[a] = 1;
2660                 if (dd->gatindex[a] != i)
2661                 {
2662                     fprintf(stderr, "DD node %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2663                     nerr++;
2664                 }
2665             }
2666             ngl++;
2667         }
2668     }
2669     if (ngl != dd->nat_tot)
2670     {
2671         fprintf(stderr,
2672                 "DD node %d, %s: %d global atom indices, %d local atoms\n",
2673                 dd->rank, where, ngl, dd->nat_tot);
2674     }
2675     for (a = 0; a < dd->nat_tot; a++)
2676     {
2677         if (have[a] == 0)
2678         {
2679             fprintf(stderr,
2680                     "DD node %d, %s: local atom %d, global %d has no global index\n",
2681                     dd->rank, where, a+1, dd->gatindex[a]+1);
2682         }
2683     }
2684     sfree(have);
2685
2686     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2687
2688     if (nerr > 0)
2689     {
2690         gmx_fatal(FARGS, "DD node %d, %s: %d atom/cg index inconsistencies",
2691                   dd->rank, where, nerr);
2692     }
2693 }
2694
2695 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2696 {
2697     int   i;
2698     char *bLocalCG;
2699
2700     if (a_start == 0)
2701     {
2702         /* Clear the whole list without searching */
2703         ga2la_clear(dd->ga2la);
2704     }
2705     else
2706     {
2707         for (i = a_start; i < dd->nat_tot; i++)
2708         {
2709             ga2la_del(dd->ga2la, dd->gatindex[i]);
2710         }
2711     }
2712
2713     bLocalCG = dd->comm->bLocalCG;
2714     if (bLocalCG)
2715     {
2716         for (i = cg_start; i < dd->ncg_tot; i++)
2717         {
2718             bLocalCG[dd->index_gl[i]] = FALSE;
2719         }
2720     }
2721
2722     dd_clear_local_vsite_indices(dd);
2723
2724     if (dd->constraints)
2725     {
2726         dd_clear_local_constraint_indices(dd);
2727     }
2728 }
2729
2730 /* This function should be used for moving the domain boudaries during DLB,
2731  * for obtaining the minimum cell size. It checks the initially set limit
2732  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2733  * and, possibly, a longer cut-off limit set for PME load balancing.
2734  */
2735 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2736 {
2737     real cellsize_min;
2738
2739     cellsize_min = comm->cellsize_min[dim];
2740
2741     if (!comm->bVacDLBNoLimit && comm->bPMELoadBalDLBLimits)
2742     {
2743         cellsize_min = max(cellsize_min,
2744                            comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2745     }
2746
2747     return cellsize_min;
2748 }
2749
2750 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2751                             int dim_ind)
2752 {
2753     real grid_jump_limit;
2754
2755     /* The distance between the boundaries of cells at distance
2756      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2757      * and by the fact that cells should not be shifted by more than
2758      * half their size, such that cg's only shift by one cell
2759      * at redecomposition.
2760      */
2761     grid_jump_limit = comm->cellsize_limit;
2762     if (!comm->bVacDLBNoLimit)
2763     {
2764         if (comm->bPMELoadBalDLBLimits)
2765         {
2766             cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
2767         }
2768         grid_jump_limit = max(grid_jump_limit,
2769                               cutoff/comm->cd[dim_ind].np);
2770     }
2771
2772     return grid_jump_limit;
2773 }
2774
2775 static gmx_bool check_grid_jump(gmx_large_int_t step,
2776                                 gmx_domdec_t   *dd,
2777                                 real            cutoff,
2778                                 gmx_ddbox_t    *ddbox,
2779                                 gmx_bool        bFatal)
2780 {
2781     gmx_domdec_comm_t *comm;
2782     int                d, dim;
2783     real               limit, bfac;
2784     gmx_bool           bInvalid;
2785
2786     bInvalid = FALSE;
2787
2788     comm = dd->comm;
2789
2790     for (d = 1; d < dd->ndim; d++)
2791     {
2792         dim   = dd->dim[d];
2793         limit = grid_jump_limit(comm, cutoff, d);
2794         bfac  = ddbox->box_size[dim];
2795         if (ddbox->tric_dir[dim])
2796         {
2797             bfac *= ddbox->skew_fac[dim];
2798         }
2799         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2800                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2801         {
2802             bInvalid = TRUE;
2803
2804             if (bFatal)
2805             {
2806                 char buf[22];
2807
2808                 /* This error should never be triggered under normal
2809                  * circumstances, but you never know ...
2810                  */
2811                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with less nodes might avoid this issue.",
2812                           gmx_step_str(step, buf),
2813                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2814             }
2815         }
2816     }
2817
2818     return bInvalid;
2819 }
2820
2821 static int dd_load_count(gmx_domdec_comm_t *comm)
2822 {
2823     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2824 }
2825
2826 static float dd_force_load(gmx_domdec_comm_t *comm)
2827 {
2828     float load;
2829
2830     if (comm->eFlop)
2831     {
2832         load = comm->flop;
2833         if (comm->eFlop > 1)
2834         {
2835             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2836         }
2837     }
2838     else
2839     {
2840         load = comm->cycl[ddCyclF];
2841         if (comm->cycl_n[ddCyclF] > 1)
2842         {
2843             /* Subtract the maximum of the last n cycle counts
2844              * to get rid of possible high counts due to other soures,
2845              * for instance system activity, that would otherwise
2846              * affect the dynamic load balancing.
2847              */
2848             load -= comm->cycl_max[ddCyclF];
2849         }
2850     }
2851
2852     return load;
2853 }
2854
2855 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2856 {
2857     gmx_domdec_comm_t *comm;
2858     int                i;
2859
2860     comm = dd->comm;
2861
2862     snew(*dim_f, dd->nc[dim]+1);
2863     (*dim_f)[0] = 0;
2864     for (i = 1; i < dd->nc[dim]; i++)
2865     {
2866         if (comm->slb_frac[dim])
2867         {
2868             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2869         }
2870         else
2871         {
2872             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2873         }
2874     }
2875     (*dim_f)[dd->nc[dim]] = 1;
2876 }
2877
2878 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2879 {
2880     int  pmeindex, slab, nso, i;
2881     ivec xyz;
2882
2883     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2884     {
2885         ddpme->dim = YY;
2886     }
2887     else
2888     {
2889         ddpme->dim = dimind;
2890     }
2891     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2892
2893     ddpme->nslab = (ddpme->dim == 0 ?
2894                     dd->comm->npmenodes_x :
2895                     dd->comm->npmenodes_y);
2896
2897     if (ddpme->nslab <= 1)
2898     {
2899         return;
2900     }
2901
2902     nso = dd->comm->npmenodes/ddpme->nslab;
2903     /* Determine for each PME slab the PP location range for dimension dim */
2904     snew(ddpme->pp_min, ddpme->nslab);
2905     snew(ddpme->pp_max, ddpme->nslab);
2906     for (slab = 0; slab < ddpme->nslab; slab++)
2907     {
2908         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2909         ddpme->pp_max[slab] = 0;
2910     }
2911     for (i = 0; i < dd->nnodes; i++)
2912     {
2913         ddindex2xyz(dd->nc, i, xyz);
2914         /* For y only use our y/z slab.
2915          * This assumes that the PME x grid size matches the DD grid size.
2916          */
2917         if (dimind == 0 || xyz[XX] == dd->ci[XX])
2918         {
2919             pmeindex = ddindex2pmeindex(dd, i);
2920             if (dimind == 0)
2921             {
2922                 slab = pmeindex/nso;
2923             }
2924             else
2925             {
2926                 slab = pmeindex % ddpme->nslab;
2927             }
2928             ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
2929             ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
2930         }
2931     }
2932
2933     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2934 }
2935
2936 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2937 {
2938     if (dd->comm->ddpme[0].dim == XX)
2939     {
2940         return dd->comm->ddpme[0].maxshift;
2941     }
2942     else
2943     {
2944         return 0;
2945     }
2946 }
2947
2948 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2949 {
2950     if (dd->comm->ddpme[0].dim == YY)
2951     {
2952         return dd->comm->ddpme[0].maxshift;
2953     }
2954     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2955     {
2956         return dd->comm->ddpme[1].maxshift;
2957     }
2958     else
2959     {
2960         return 0;
2961     }
2962 }
2963
/* Determine the PME slab communication range along ddpme->dim
 * and store it in ddpme->maxshift.
 *
 * bUniform is only used to detect the optimal case where the number
 * of slabs equals the number of DD cells.
 * cell_f is read as the array of relative DD cell boundaries along
 * this dimension; it is indexed with the PP cell index ranges stored
 * in ddpme->pp_min and ddpme->pp_max.
 */
static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
                             gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
{
    gmx_domdec_comm_t *comm;
    int                nc, ns, s;
    int               *xmin, *xmax;
    real               range, pme_boundary;
    int                sh;

    comm = dd->comm;
    nc   = dd->nc[ddpme->dim];
    ns   = ddpme->nslab;

    if (!ddpme->dim_match)
    {
        /* PP decomposition is not along dim: the worst situation */
        sh = ns/2;
    }
    else if (ns <= 3 || (bUniform && ns == nc))
    {
        /* The optimal situation */
        sh = 1;
    }
    else
    {
        /* We need to check for all pme nodes which nodes they
         * could possibly need to communicate with.
         */
        xmin = ddpme->pp_min;
        xmax = ddpme->pp_max;
        /* Allow for atoms to be maximally 2/3 times the cut-off
         * out of their DD cell. This is a reasonable balance
         * between performance and support for most charge-group/cut-off
         * combinations.
         */
        range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
        /* Avoid extra communication when we are exactly at a boundary */
        range *= 0.999;

        /* sh only ever grows: the final value is the largest shift
         * required by any of the slabs, capped at ns-1.
         */
        sh = 1;
        for (s = 0; s < ns; s++)
        {
            /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
            pme_boundary = (real)s/ns;
            /* Look at the slab sh+1 below s; when that index is negative
             * it is wrapped by +ns and its boundary is shifted by -1.
             */
            while (sh+1 < ns &&
                   ((s-(sh+1) >= 0 &&
                     cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
                    (s-(sh+1) <  0 &&
                     cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
            {
                sh++;
            }
            pme_boundary = (real)(s+1)/ns;
            /* Look at the slab sh+1 above s; when that index is ns or
             * larger it is wrapped by -ns and its boundary is shifted by +1.
             */
            while (sh+1 < ns &&
                   ((s+(sh+1) <  ns &&
                     cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
                    (s+(sh+1) >= ns &&
                     cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
            {
                sh++;
            }
        }
    }

    ddpme->maxshift = sh;

    if (debug)
    {
        fprintf(debug, "PME slab communication range for dim %d is %d\n",
                ddpme->dim, ddpme->maxshift);
    }
}
3036
3037 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3038 {
3039     int d, dim;
3040
3041     for (d = 0; d < dd->ndim; d++)
3042     {
3043         dim = dd->dim[d];
3044         if (dim < ddbox->nboundeddim &&
3045             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3046             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3047         {
3048             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3049                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3050                       dd->nc[dim], dd->comm->cellsize_limit);
3051         }
3052     }
3053 }
3054
3055 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
3056                                   gmx_bool bMaster, ivec npulse)
3057 {
3058     gmx_domdec_comm_t *comm;
3059     int                d, j;
3060     rvec               cellsize_min;
3061     real              *cell_x, cell_dx, cellsize;
3062
3063     comm = dd->comm;
3064
3065     for (d = 0; d < DIM; d++)
3066     {
3067         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3068         npulse[d]       = 1;
3069         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3070         {
3071             /* Uniform grid */
3072             cell_dx = ddbox->box_size[d]/dd->nc[d];
3073             if (bMaster)
3074             {
3075                 for (j = 0; j < dd->nc[d]+1; j++)
3076                 {
3077                     dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3078                 }
3079             }
3080             else
3081             {
3082                 comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3083                 comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3084             }
3085             cellsize = cell_dx*ddbox->skew_fac[d];
3086             while (cellsize*npulse[d] < comm->cutoff && npulse[d] < dd->nc[d]-1)
3087             {
3088                 npulse[d]++;
3089             }
3090             cellsize_min[d] = cellsize;
3091         }
3092         else
3093         {
3094             /* Statically load balanced grid */
3095             /* Also when we are not doing a master distribution we determine
3096              * all cell borders in a loop to obtain identical values
3097              * to the master distribution case and to determine npulse.
3098              */
3099             if (bMaster)
3100             {
3101                 cell_x = dd->ma->cell_x[d];
3102             }
3103             else
3104             {
3105                 snew(cell_x, dd->nc[d]+1);
3106             }
3107             cell_x[0] = ddbox->box0[d];
3108             for (j = 0; j < dd->nc[d]; j++)
3109             {
3110                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
3111                 cell_x[j+1] = cell_x[j] + cell_dx;
3112                 cellsize    = cell_dx*ddbox->skew_fac[d];
3113                 while (cellsize*npulse[d] < comm->cutoff &&
3114                        npulse[d] < dd->nc[d]-1)
3115                 {
3116                     npulse[d]++;
3117                 }
3118                 cellsize_min[d] = min(cellsize_min[d], cellsize);
3119             }
3120             if (!bMaster)
3121             {
3122                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3123                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3124                 sfree(cell_x);
3125             }
3126         }
3127         /* The following limitation is to avoid that a cell would receive
3128          * some of its own home charge groups back over the periodic boundary.
3129          * Double charge groups cause trouble with the global indices.
3130          */
3131         if (d < ddbox->npbcdim &&
3132             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3133         {
3134             gmx_fatal_collective(FARGS, NULL, dd,
3135                                  "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3136                                  dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
3137                                  comm->cutoff,
3138                                  dd->nc[d], dd->nc[d],
3139                                  dd->nnodes > dd->nc[d] ? "cells" : "processors");
3140         }
3141     }
3142
3143     if (!comm->bDynLoadBal)
3144     {
3145         copy_rvec(cellsize_min, comm->cellsize_min);
3146     }
3147
3148     for (d = 0; d < comm->npmedecompdim; d++)
3149     {
3150         set_pme_maxshift(dd, &comm->ddpme[d],
3151                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
3152                          comm->ddpme[d].slb_dim_f);
3153     }
3154 }
3155
3156
/* Turn the tentative cell sizes in root->buf_ncd into cell boundaries
 * in root->cell_f for the cells range[0] <= i < range[1], while
 * enforcing the cell size and boundary limits.
 *
 * The outer boundaries root->cell_f[range[0]] and root->cell_f[range[1]]
 * are taken as given; the sizes in between are rescaled to fill that
 * region. Cells that would end up smaller than cellsize_limit_f are
 * pinned to the limit and flagged in root->bCellMin.
 * root->bLimited is set when any cell was pinned or when range does not
 * span all dd->nc[dim] cells.
 *
 * For decomposition index d > 0 and a non-uniform grid, boundaries that
 * violate root->bound_min/bound_max are fixed at the bound and this
 * function calls itself on the sub-ranges on either side.
 * Terminates through gmx_fatal() when, with PBC, the last cell of the
 * range ends up below the size limit.
 */
static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
                                                  int d, int dim, gmx_domdec_root_t *root,
                                                  gmx_ddbox_t *ddbox,
                                                  gmx_bool bUniform, gmx_large_int_t step, real cellsize_limit_f, int range[])
{
    gmx_domdec_comm_t *comm;
    int                ncd, i, j, nmin, nmin_old;
    gmx_bool           bLimLo, bLimHi;
    real              *cell_size;
    real               fac, halfway, cellsize_limit_f_i, region_size;
    gmx_bool           bPBC, bLastHi = FALSE;
    int                nrange[] = {range[0], range[1]};

    /* The relative size the cells in this range have to fill together */
    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];

    comm = dd->comm;

    ncd = dd->nc[dim];

    bPBC = (dim < ddbox->npbcdim);

    cell_size = root->buf_ncd;

    if (debug)
    {
        fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
    }

    /* First we need to check if the scaling does not make cells
     * smaller than the smallest allowed size.
     * We need to do this iteratively, since if a cell is too small,
     * it needs to be enlarged, which makes all the other cells smaller,
     * which could in turn make another cell smaller than allowed.
     */
    for (i = range[0]; i < range[1]; i++)
    {
        root->bCellMin[i] = FALSE;
    }
    nmin = 0;
    do
    {
        nmin_old = nmin;
        /* We need the total for normalization */
        fac = 0;
        for (i = range[0]; i < range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                fac += cell_size[i];
            }
        }
        fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
        /* Determine the cell boundaries */
        for (i = range[0]; i < range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                cell_size[i] *= fac;
                /* Without PBC the first and last cell of the grid
                 * have no lower size limit.
                 */
                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
                {
                    cellsize_limit_f_i = 0;
                }
                else
                {
                    cellsize_limit_f_i = cellsize_limit_f;
                }
                if (cell_size[i] < cellsize_limit_f_i)
                {
                    root->bCellMin[i] = TRUE;
                    cell_size[i]      = cellsize_limit_f_i;
                    nmin++;
                }
            }
            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
        }
    }
    /* Repeat as long as the last pass pinned additional cells */
    while (nmin > nmin_old);

    i            = range[1]-1;
    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
    /* For this check we should not use DD_CELL_MARGIN,
     * but a slightly smaller factor,
     * since rounding could get us below the limit.
     */
    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
    {
        char buf[22];
        gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
                  gmx_step_str(step, buf),
                  dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
                  ncd, comm->cellsize_min[dim]);
    }

    root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);

    if (!bUniform)
    {
        /* Check if the boundary did not displace more than halfway
         * each of the cells it bounds, as this could cause problems,
         * especially when the differences between cell sizes are large.
         * If changes are applied, they will not make cells smaller
         * than the cut-off, as we check all the boundaries which
         * might be affected by a change and if the old state was ok,
         * the cells will at most be shrunk back to their old size.
         */
        for (i = range[0]+1; i < range[1]; i++)
        {
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
            if (root->cell_f[i] < halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the next boundaries */
                for (j = i+1; j < range[1]; j++)
                {
                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
                    {
                        root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
                    }
                }
            }
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
            if (root->cell_f[i] > halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the previous boundaries */
                for (j = i-1; j >= range[0]+1; j--)
                {
                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
                    {
                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
                    }
                }
            }
        }
    }

    /* nrange is defined as [lower, upper) range for new call to enforce_limits */
    /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
     * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
     * for a and b nrange is used */
    if (d > 0)
    {
        /* Take care of the staggering of the cell boundaries */
        if (bUniform)
        {
            for (i = range[0]; i < range[1]; i++)
            {
                root->cell_f_max0[i] = root->cell_f[i];
                root->cell_f_min1[i] = root->cell_f[i+1];
            }
        }
        else
        {
            for (i = range[0]+1; i < range[1]; i++)
            {
                bLimLo = (root->cell_f[i] < root->bound_min[i]);
                bLimHi = (root->cell_f[i] > root->bound_max[i]);
                if (bLimLo && bLimHi)
                {
                    /* Both limits violated, try the best we can */
                    /* For this case we split the original range (range) in two parts and care about the other limitations in the next iteration. */
                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
                    nrange[0]       = range[0];
                    nrange[1]       = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    nrange[0] = i;
                    nrange[1] = range[1];
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    /* The two recursive calls have handled the whole range */
                    return;
                }
                else if (bLimLo)
                {
                    /* root->cell_f[i] = root->bound_min[i]; */
                    nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with a higher index */
                    bLastHi   = FALSE;
                }
                else if (bLimHi && !bLastHi)
                {
                    bLastHi = TRUE;
                    if (nrange[1] < range[1])   /* found a LimLo before */
                    {
                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                        nrange[0] = nrange[1];
                    }
                    root->cell_f[i] = root->bound_max[i];
                    nrange[1]       = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                    nrange[0] = i;
                    nrange[1] = range[1];
                }
            }
            if (nrange[1] < range[1])   /* found last a LimLo */
            {
                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                nrange[0] = nrange[1];
                nrange[1] = range[1];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
            else if (nrange[0] > range[0]) /* found at least one LimHi */
            {
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
        }
    }
}
3366
3367
/* Compute new relative cell boundaries in root->cell_f for the row of
 * dd->nc[dim] cells along decomposition index d.
 *
 * The current boundaries are saved in root->old_cell_f. The tentative
 * cell sizes are uniform when bUniform is set; otherwise, when load
 * data has been counted, each current cell size is scaled according to
 * the relative load imbalance of its cell, with the scaling limited by
 * comm->dlb_scale_lim. The limits are then applied by
 * dd_cell_sizes_dlb_root_enforce_limits().
 *
 * After the ncd+1 boundaries, root->cell_f additionally receives the
 * cell fractions of the lower decomposition indices and the PME
 * maxshift value(s).
 */
static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
                                       int d, int dim, gmx_domdec_root_t *root,
                                       gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
                                       gmx_bool bUniform, gmx_large_int_t step)
{
    gmx_domdec_comm_t *comm;
    int                ncd, d1, i, j, pos;
    real              *cell_size;
    real               load_aver, load_i, imbalance, change, change_max, sc;
    real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
    real               change_limit;
    real               relax = 0.5;
    gmx_bool           bPBC;
    int                range[] = { 0, 0 };

    comm = dd->comm;

    /* Convert the maximum change from the input percentage to a fraction */
    change_limit = comm->dlb_scale_lim*0.01;

    ncd = dd->nc[dim];

    bPBC = (dim < ddbox->npbcdim);

    cell_size = root->buf_ncd;

    /* Store the original boundaries */
    for (i = 0; i < ncd+1; i++)
    {
        root->old_cell_f[i] = root->cell_f[i];
    }
    /* NOTE(review): when bUniform is FALSE and dd_load_count() returns 0,
     * neither branch below runs and cell_size keeps the contents
     * root->buf_ncd had on entry - confirm this is intended.
     */
    if (bUniform)
    {
        for (i = 0; i < ncd; i++)
        {
            cell_size[i] = 1.0/ncd;
        }
    }
    else if (dd_load_count(comm))
    {
        load_aver  = comm->load[d].sum_m/ncd;
        change_max = 0;
        /* First pass: find the largest absolute change over all cells */
        for (i = 0; i < ncd; i++)
        {
            /* Determine the relative imbalance of cell i */
            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
            /* Determine the change of the cell size using underrelaxation */
            change     = -relax*imbalance;
            change_max = max(change_max, max(change, -change));
        }
        /* Limit the amount of scaling.
         * We need to use the same rescaling for all cells in one row,
         * otherwise the load balancing might not converge.
         */
        sc = relax;
        if (change_max > change_limit)
        {
            sc *= change_limit/change_max;
        }
        /* Second pass: apply the (possibly reduced) scaling factor */
        for (i = 0; i < ncd; i++)
        {
            /* Determine the relative imbalance of cell i */
            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
            /* Determine the change of the cell size using underrelaxation */
            change       = -sc*imbalance;
            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
        }
    }

    /* Convert the size and distance limits to fractions of the box size */
    cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
    cellsize_limit_f *= DD_CELL_MARGIN;
    dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
    if (ddbox->tric_dir[dim])
    {
        cellsize_limit_f /= ddbox->skew_fac[dim];
        dist_min_f       /= ddbox->skew_fac[dim];
    }
    if (bDynamicBox && d > 0)
    {
        dist_min_f *= DD_PRES_SCALE_MARGIN;
    }
    if (d > 0 && !bUniform)
    {
        /* Make sure that the grid is not shifted too much */
        for (i = 1; i < ncd; i++)
        {
            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
            {
                gmx_incons("Inconsistent DD boundary staggering limits!");
            }
            /* The lower bound lies dist_min_f above cell_f_max0[i-1],
             * plus half of the space up to the current boundary, if any.
             */
            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
            space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
            if (space > 0)
            {
                root->bound_min[i] += 0.5*space;
            }
            /* The upper bound lies dist_min_f below cell_f_min1[i],
             * minus half of the space down to the current boundary, if any.
             */
            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
            space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
            if (space < 0)
            {
                root->bound_max[i] += 0.5*space;
            }
            if (debug)
            {
                fprintf(debug,
                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
                        d, i,
                        root->cell_f_max0[i-1] + dist_min_f,
                        root->bound_min[i], root->cell_f[i], root->bound_max[i],
                        root->cell_f_min1[i] - dist_min_f);
            }
        }
    }
    /* Enforce the limits over the full row, with the outer boundaries
     * fixed at 0 and 1.
     */
    range[1]          = ncd;
    root->cell_f[0]   = 0;
    root->cell_f[ncd] = 1;
    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);


    /* After the checks above, the cells should obey the cut-off
     * restrictions, but it does not hurt to check.
     */
    for (i = 0; i < ncd; i++)
    {
        if (debug)
        {
            fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
                    dim, i, root->cell_f[i], root->cell_f[i+1]);
        }

        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
            root->cell_f[i+1] - root->cell_f[i] <
            cellsize_limit_f/DD_CELL_MARGIN)
        {
            char buf[22];
            fprintf(stderr,
                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
                    gmx_step_str(step, buf), dim2char(dim), i,
                    (root->cell_f[i+1] - root->cell_f[i])
                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
        }
    }

    pos = ncd + 1;
    /* Store the cell boundaries of the lower dimensions at the end */
    for (d1 = 0; d1 < d; d1++)
    {
        root->cell_f[pos++] = comm->cell_f0[d1];
        root->cell_f[pos++] = comm->cell_f1[d1];
    }

    if (d < comm->npmedecompdim)
    {
        /* The master determines the maximum shift for
         * the coordinate communication between separate PME nodes.
         */
        set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
    }
    /* The integer maxshift values are stored in the real array as well */
    root->cell_f[pos++] = comm->ddpme[0].maxshift;
    if (d >= 1)
    {
        root->cell_f[pos++] = comm->ddpme[1].maxshift;
    }
}
3535
3536 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3537                                              gmx_ddbox_t *ddbox, int dimind)
3538 {
3539     gmx_domdec_comm_t *comm;
3540     int                dim;
3541
3542     comm = dd->comm;
3543
3544     /* Set the cell dimensions */
3545     dim                = dd->dim[dimind];
3546     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3547     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3548     if (dim >= ddbox->nboundeddim)
3549     {
3550         comm->cell_x0[dim] += ddbox->box0[dim];
3551         comm->cell_x1[dim] += ddbox->box0[dim];
3552     }
3553 }
3554
3555 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3556                                          int d, int dim, real *cell_f_row,
3557                                          gmx_ddbox_t *ddbox)
3558 {
3559     gmx_domdec_comm_t *comm;
3560     int                d1, dim1, pos;
3561
3562     comm = dd->comm;
3563
3564 #ifdef GMX_MPI
3565     /* Each node would only need to know two fractions,
3566      * but it is probably cheaper to broadcast the whole array.
3567      */
3568     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3569               0, comm->mpi_comm_load[d]);
3570 #endif
3571     /* Copy the fractions for this dimension from the buffer */
3572     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3573     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3574     /* The whole array was communicated, so set the buffer position */
3575     pos = dd->nc[dim] + 1;
3576     for (d1 = 0; d1 <= d; d1++)
3577     {
3578         if (d1 < d)
3579         {
3580             /* Copy the cell fractions of the lower dimensions */
3581             comm->cell_f0[d1] = cell_f_row[pos++];
3582             comm->cell_f1[d1] = cell_f_row[pos++];
3583         }
3584         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3585     }
3586     /* Convert the communicated shift from float to int */
3587     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3588     if (d >= 1)
3589     {
3590         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3591     }
3592 }
3593
3594 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3595                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3596                                          gmx_bool bUniform, gmx_large_int_t step)
3597 {
3598     gmx_domdec_comm_t *comm;
3599     int                d, dim, d1;
3600     gmx_bool           bRowMember, bRowRoot;
3601     real              *cell_f_row;
3602
3603     comm = dd->comm;
3604
3605     for (d = 0; d < dd->ndim; d++)
3606     {
3607         dim        = dd->dim[d];
3608         bRowMember = TRUE;
3609         bRowRoot   = TRUE;
3610         for (d1 = d; d1 < dd->ndim; d1++)
3611         {
3612             if (dd->ci[dd->dim[d1]] > 0)
3613             {
3614                 if (d1 > d)
3615                 {
3616                     bRowMember = FALSE;
3617                 }
3618                 bRowRoot = FALSE;
3619             }
3620         }
3621         if (bRowMember)
3622         {
3623             if (bRowRoot)
3624             {
3625                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3626                                            ddbox, bDynamicBox, bUniform, step);
3627                 cell_f_row = comm->root[d]->cell_f;
3628             }
3629             else
3630             {
3631                 cell_f_row = comm->cell_f_row;
3632             }
3633             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3634         }
3635     }
3636 }
3637
3638 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3639 {
3640     int d;
3641
3642     /* This function assumes the box is static and should therefore
3643      * not be called when the box has changed since the last
3644      * call to dd_partition_system.
3645      */
3646     for (d = 0; d < dd->ndim; d++)
3647     {
3648         relative_to_absolute_cell_bounds(dd, ddbox, d);
3649     }
3650 }
3651
3652
3653
3654 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3655                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3656                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3657                                   gmx_wallcycle_t wcycle)
3658 {
3659     gmx_domdec_comm_t *comm;
3660     int                dim;
3661
3662     comm = dd->comm;
3663
3664     if (bDoDLB)
3665     {
3666         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3667         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3668         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3669     }
3670     else if (bDynamicBox)
3671     {
3672         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3673     }
3674
3675     /* Set the dimensions for which no DD is used */
3676     for (dim = 0; dim < DIM; dim++)
3677     {
3678         if (dd->nc[dim] == 1)
3679         {
3680             comm->cell_x0[dim] = 0;
3681             comm->cell_x1[dim] = ddbox->box_size[dim];
3682             if (dim >= ddbox->nboundeddim)
3683             {
3684                 comm->cell_x0[dim] += ddbox->box0[dim];
3685                 comm->cell_x1[dim] += ddbox->box0[dim];
3686             }
3687         }
3688     }
3689 }
3690
3691 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3692 {
3693     int                    d, np, i;
3694     gmx_domdec_comm_dim_t *cd;
3695
3696     for (d = 0; d < dd->ndim; d++)
3697     {
3698         cd = &dd->comm->cd[d];
3699         np = npulse[dd->dim[d]];
3700         if (np > cd->np_nalloc)
3701         {
3702             if (debug)
3703             {
3704                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3705                         dim2char(dd->dim[d]), np);
3706             }
3707             if (DDMASTER(dd) && cd->np_nalloc > 0)
3708             {
3709                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3710             }
3711             srenew(cd->ind, np);
3712             for (i = cd->np_nalloc; i < np; i++)
3713             {
3714                 cd->ind[i].index  = NULL;
3715                 cd->ind[i].nalloc = 0;
3716             }
3717             cd->np_nalloc = np;
3718         }
3719         cd->np = np;
3720     }
3721 }
3722
3723
3724 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3725                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3726                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_large_int_t step,
3727                               gmx_wallcycle_t wcycle)
3728 {
3729     gmx_domdec_comm_t *comm;
3730     int                d;
3731     ivec               npulse;
3732
3733     comm = dd->comm;
3734
3735     /* Copy the old cell boundaries for the cg displacement check */
3736     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3737     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3738
3739     if (comm->bDynLoadBal)
3740     {
3741         if (DDMASTER(dd))
3742         {
3743             check_box_size(dd, ddbox);
3744         }
3745         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3746     }
3747     else
3748     {
3749         set_dd_cell_sizes_slb(dd, ddbox, FALSE, npulse);
3750         realloc_comm_ind(dd, npulse);
3751     }
3752
3753     if (debug)
3754     {
3755         for (d = 0; d < DIM; d++)
3756         {
3757             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3758                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3759         }
3760     }
3761 }
3762
3763 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3764                                   gmx_ddbox_t *ddbox,
3765                                   rvec cell_ns_x0, rvec cell_ns_x1,
3766                                   gmx_large_int_t step)
3767 {
3768     gmx_domdec_comm_t *comm;
3769     int                dim_ind, dim;
3770
3771     comm = dd->comm;
3772
3773     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3774     {
3775         dim = dd->dim[dim_ind];
3776
3777         /* Without PBC we don't have restrictions on the outer cells */
3778         if (!(dim >= ddbox->npbcdim &&
3779               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3780             comm->bDynLoadBal &&
3781             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3782             comm->cellsize_min[dim])
3783         {
3784             char buf[22];
3785             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3786                       gmx_step_str(step, buf), dim2char(dim),
3787                       comm->cell_x1[dim] - comm->cell_x0[dim],
3788                       ddbox->skew_fac[dim],
3789                       dd->comm->cellsize_min[dim],
3790                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3791         }
3792     }
3793
3794     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3795     {
3796         /* Communicate the boundaries and update cell_ns_x0/1 */
3797         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3798         if (dd->bGridJump && dd->ndim > 1)
3799         {
3800             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3801         }
3802     }
3803 }
3804
3805 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3806 {
3807     if (YY < npbcdim)
3808     {
3809         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3810     }
3811     else
3812     {
3813         tcm[YY][XX] = 0;
3814     }
3815     if (ZZ < npbcdim)
3816     {
3817         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3818         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3819     }
3820     else
3821     {
3822         tcm[ZZ][XX] = 0;
3823         tcm[ZZ][YY] = 0;
3824     }
3825 }
3826
3827 static void check_screw_box(matrix box)
3828 {
3829     /* Mathematical limitation */
3830     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3831     {
3832         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3833     }
3834
3835     /* Limitation due to the asymmetry of the eighth shell method */
3836     if (box[ZZ][YY] != 0)
3837     {
3838         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3839     }
3840 }
3841
3842 static void distribute_cg(FILE *fplog, gmx_large_int_t step,
3843                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3844                           gmx_domdec_t *dd)
3845 {
3846     gmx_domdec_master_t *ma;
3847     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3848     int                  i, icg, j, k, k0, k1, d, npbcdim;
3849     matrix               tcm;
3850     rvec                 box_size, cg_cm;
3851     ivec                 ind;
3852     real                 nrcg, inv_ncg, pos_d;
3853     atom_id             *cgindex;
3854     gmx_bool             bUnbounded, bScrew;
3855
3856     ma = dd->ma;
3857
3858     if (tmp_ind == NULL)
3859     {
3860         snew(tmp_nalloc, dd->nnodes);
3861         snew(tmp_ind, dd->nnodes);
3862         for (i = 0; i < dd->nnodes; i++)
3863         {
3864             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3865             snew(tmp_ind[i], tmp_nalloc[i]);
3866         }
3867     }
3868
3869     /* Clear the count */
3870     for (i = 0; i < dd->nnodes; i++)
3871     {
3872         ma->ncg[i] = 0;
3873         ma->nat[i] = 0;
3874     }
3875
3876     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3877
3878     cgindex = cgs->index;
3879
3880     /* Compute the center of geometry for all charge groups */
3881     for (icg = 0; icg < cgs->nr; icg++)
3882     {
3883         k0      = cgindex[icg];
3884         k1      = cgindex[icg+1];
3885         nrcg    = k1 - k0;
3886         if (nrcg == 1)
3887         {
3888             copy_rvec(pos[k0], cg_cm);
3889         }
3890         else
3891         {
3892             inv_ncg = 1.0/nrcg;
3893
3894             clear_rvec(cg_cm);
3895             for (k = k0; (k < k1); k++)
3896             {
3897                 rvec_inc(cg_cm, pos[k]);
3898             }
3899             for (d = 0; (d < DIM); d++)
3900             {
3901                 cg_cm[d] *= inv_ncg;
3902             }
3903         }
3904         /* Put the charge group in the box and determine the cell index */
3905         for (d = DIM-1; d >= 0; d--)
3906         {
3907             pos_d = cg_cm[d];
3908             if (d < dd->npbcdim)
3909             {
3910                 bScrew = (dd->bScrewPBC && d == XX);
3911                 if (tric_dir[d] && dd->nc[d] > 1)
3912                 {
3913                     /* Use triclinic coordintates for this dimension */
3914                     for (j = d+1; j < DIM; j++)
3915                     {
3916                         pos_d += cg_cm[j]*tcm[j][d];
3917                     }
3918                 }
3919                 while (pos_d >= box[d][d])
3920                 {
3921                     pos_d -= box[d][d];
3922                     rvec_dec(cg_cm, box[d]);
3923                     if (bScrew)
3924                     {
3925                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3926                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3927                     }
3928                     for (k = k0; (k < k1); k++)
3929                     {
3930                         rvec_dec(pos[k], box[d]);
3931                         if (bScrew)
3932                         {
3933                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3934                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3935                         }
3936                     }
3937                 }
3938                 while (pos_d < 0)
3939                 {
3940                     pos_d += box[d][d];
3941                     rvec_inc(cg_cm, box[d]);
3942                     if (bScrew)
3943                     {
3944                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3945                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3946                     }
3947                     for (k = k0; (k < k1); k++)
3948                     {
3949                         rvec_inc(pos[k], box[d]);
3950                         if (bScrew)
3951                         {
3952                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3953                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3954                         }
3955                     }
3956                 }
3957             }
3958             /* This could be done more efficiently */
3959             ind[d] = 0;
3960             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
3961             {
3962                 ind[d]++;
3963             }
3964         }
3965         i = dd_index(dd->nc, ind);
3966         if (ma->ncg[i] == tmp_nalloc[i])
3967         {
3968             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
3969             srenew(tmp_ind[i], tmp_nalloc[i]);
3970         }
3971         tmp_ind[i][ma->ncg[i]] = icg;
3972         ma->ncg[i]++;
3973         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
3974     }
3975
3976     k1 = 0;
3977     for (i = 0; i < dd->nnodes; i++)
3978     {
3979         ma->index[i] = k1;
3980         for (k = 0; k < ma->ncg[i]; k++)
3981         {
3982             ma->cg[k1++] = tmp_ind[i][k];
3983         }
3984     }
3985     ma->index[dd->nnodes] = k1;
3986
3987     for (i = 0; i < dd->nnodes; i++)
3988     {
3989         sfree(tmp_ind[i]);
3990     }
3991     sfree(tmp_ind);
3992     sfree(tmp_nalloc);
3993
3994     if (fplog)
3995     {
3996         char buf[22];
3997         fprintf(fplog, "Charge group distribution at step %s:",
3998                 gmx_step_str(step, buf));
3999         for (i = 0; i < dd->nnodes; i++)
4000         {
4001             fprintf(fplog, " %d", ma->ncg[i]);
4002         }
4003         fprintf(fplog, "\n");
4004     }
4005 }
4006
4007 static void get_cg_distribution(FILE *fplog, gmx_large_int_t step, gmx_domdec_t *dd,
4008                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
4009                                 rvec pos[])
4010 {
4011     gmx_domdec_master_t *ma = NULL;
4012     ivec                 npulse;
4013     int                  i, cg_gl;
4014     int                 *ibuf, buf2[2] = { 0, 0 };
4015     gmx_bool             bMaster = DDMASTER(dd);
4016     if (bMaster)
4017     {
4018         ma = dd->ma;
4019
4020         if (dd->bScrewPBC)
4021         {
4022             check_screw_box(box);
4023         }
4024
4025         set_dd_cell_sizes_slb(dd, ddbox, TRUE, npulse);
4026
4027         distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
4028         for (i = 0; i < dd->nnodes; i++)
4029         {
4030             ma->ibuf[2*i]   = ma->ncg[i];
4031             ma->ibuf[2*i+1] = ma->nat[i];
4032         }
4033         ibuf = ma->ibuf;
4034     }
4035     else
4036     {
4037         ibuf = NULL;
4038     }
4039     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
4040
4041     dd->ncg_home = buf2[0];
4042     dd->nat_home = buf2[1];
4043     dd->ncg_tot  = dd->ncg_home;
4044     dd->nat_tot  = dd->nat_home;
4045     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4046     {
4047         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4048         srenew(dd->index_gl, dd->cg_nalloc);
4049         srenew(dd->cgindex, dd->cg_nalloc+1);
4050     }
4051     if (bMaster)
4052     {
4053         for (i = 0; i < dd->nnodes; i++)
4054         {
4055             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
4056             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4057         }
4058     }
4059
4060     dd_scatterv(dd,
4061                 DDMASTER(dd) ? ma->ibuf : NULL,
4062                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
4063                 DDMASTER(dd) ? ma->cg : NULL,
4064                 dd->ncg_home*sizeof(int), dd->index_gl);
4065
4066     /* Determine the home charge group sizes */
4067     dd->cgindex[0] = 0;
4068     for (i = 0; i < dd->ncg_home; i++)
4069     {
4070         cg_gl            = dd->index_gl[i];
4071         dd->cgindex[i+1] =
4072             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4073     }
4074
4075     if (debug)
4076     {
4077         fprintf(debug, "Home charge groups:\n");
4078         for (i = 0; i < dd->ncg_home; i++)
4079         {
4080             fprintf(debug, " %d", dd->index_gl[i]);
4081             if (i % 10 == 9)
4082             {
4083                 fprintf(debug, "\n");
4084             }
4085         }
4086         fprintf(debug, "\n");
4087     }
4088 }
4089
4090 static int compact_and_copy_vec_at(int ncg, int *move,
4091                                    int *cgindex,
4092                                    int nvec, int vec,
4093                                    rvec *src, gmx_domdec_comm_t *comm,
4094                                    gmx_bool bCompact)
4095 {
4096     int m, icg, i, i0, i1, nrcg;
4097     int home_pos;
4098     int pos_vec[DIM*2];
4099
4100     home_pos = 0;
4101
4102     for (m = 0; m < DIM*2; m++)
4103     {
4104         pos_vec[m] = 0;
4105     }
4106
4107     i0 = 0;
4108     for (icg = 0; icg < ncg; icg++)
4109     {
4110         i1 = cgindex[icg+1];
4111         m  = move[icg];
4112         if (m == -1)
4113         {
4114             if (bCompact)
4115             {
4116                 /* Compact the home array in place */
4117                 for (i = i0; i < i1; i++)
4118                 {
4119                     copy_rvec(src[i], src[home_pos++]);
4120                 }
4121             }
4122         }
4123         else
4124         {
4125             /* Copy to the communication buffer */
4126             nrcg        = i1 - i0;
4127             pos_vec[m] += 1 + vec*nrcg;
4128             for (i = i0; i < i1; i++)
4129             {
4130                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
4131             }
4132             pos_vec[m] += (nvec - vec - 1)*nrcg;
4133         }
4134         if (!bCompact)
4135         {
4136             home_pos += i1 - i0;
4137         }
4138         i0 = i1;
4139     }
4140
4141     return home_pos;
4142 }
4143
4144 static int compact_and_copy_vec_cg(int ncg, int *move,
4145                                    int *cgindex,
4146                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
4147                                    gmx_bool bCompact)
4148 {
4149     int m, icg, i0, i1, nrcg;
4150     int home_pos;
4151     int pos_vec[DIM*2];
4152
4153     home_pos = 0;
4154
4155     for (m = 0; m < DIM*2; m++)
4156     {
4157         pos_vec[m] = 0;
4158     }
4159
4160     i0 = 0;
4161     for (icg = 0; icg < ncg; icg++)
4162     {
4163         i1 = cgindex[icg+1];
4164         m  = move[icg];
4165         if (m == -1)
4166         {
4167             if (bCompact)
4168             {
4169                 /* Compact the home array in place */
4170                 copy_rvec(src[icg], src[home_pos++]);
4171             }
4172         }
4173         else
4174         {
4175             nrcg = i1 - i0;
4176             /* Copy to the communication buffer */
4177             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
4178             pos_vec[m] += 1 + nrcg*nvec;
4179         }
4180         i0 = i1;
4181     }
4182     if (!bCompact)
4183     {
4184         home_pos = ncg;
4185     }
4186
4187     return home_pos;
4188 }
4189
4190 static int compact_ind(int ncg, int *move,
4191                        int *index_gl, int *cgindex,
4192                        int *gatindex,
4193                        gmx_ga2la_t ga2la, char *bLocalCG,
4194                        int *cginfo)
4195 {
4196     int cg, nat, a0, a1, a, a_gl;
4197     int home_pos;
4198
4199     home_pos = 0;
4200     nat      = 0;
4201     for (cg = 0; cg < ncg; cg++)
4202     {
4203         a0 = cgindex[cg];
4204         a1 = cgindex[cg+1];
4205         if (move[cg] == -1)
4206         {
4207             /* Compact the home arrays in place.
4208              * Anything that can be done here avoids access to global arrays.
4209              */
4210             cgindex[home_pos] = nat;
4211             for (a = a0; a < a1; a++)
4212             {
4213                 a_gl          = gatindex[a];
4214                 gatindex[nat] = a_gl;
4215                 /* The cell number stays 0, so we don't need to set it */
4216                 ga2la_change_la(ga2la, a_gl, nat);
4217                 nat++;
4218             }
4219             index_gl[home_pos] = index_gl[cg];
4220             cginfo[home_pos]   = cginfo[cg];
4221             /* The charge group remains local, so bLocalCG does not change */
4222             home_pos++;
4223         }
4224         else
4225         {
4226             /* Clear the global indices */
4227             for (a = a0; a < a1; a++)
4228             {
4229                 ga2la_del(ga2la, gatindex[a]);
4230             }
4231             if (bLocalCG)
4232             {
4233                 bLocalCG[index_gl[cg]] = FALSE;
4234             }
4235         }
4236     }
4237     cgindex[home_pos] = nat;
4238
4239     return home_pos;
4240 }
4241
4242 static void clear_and_mark_ind(int ncg, int *move,
4243                                int *index_gl, int *cgindex, int *gatindex,
4244                                gmx_ga2la_t ga2la, char *bLocalCG,
4245                                int *cell_index)
4246 {
4247     int cg, a0, a1, a;
4248
4249     for (cg = 0; cg < ncg; cg++)
4250     {
4251         if (move[cg] >= 0)
4252         {
4253             a0 = cgindex[cg];
4254             a1 = cgindex[cg+1];
4255             /* Clear the global indices */
4256             for (a = a0; a < a1; a++)
4257             {
4258                 ga2la_del(ga2la, gatindex[a]);
4259             }
4260             if (bLocalCG)
4261             {
4262                 bLocalCG[index_gl[cg]] = FALSE;
4263             }
4264             /* Signal that this cg has moved using the ns cell index.
4265              * Here we set it to -1. fill_grid will change it
4266              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4267              */
4268             cell_index[cg] = -1;
4269         }
4270     }
4271 }
4272
4273 static void print_cg_move(FILE *fplog,
4274                           gmx_domdec_t *dd,
4275                           gmx_large_int_t step, int cg, int dim, int dir,
4276                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4277                           rvec cm_old, rvec cm_new, real pos_d)
4278 {
4279     gmx_domdec_comm_t *comm;
4280     char               buf[22];
4281
4282     comm = dd->comm;
4283
4284     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4285     if (bHaveLimitdAndCMOld)
4286     {
4287         fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4288                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4289     }
4290     else
4291     {
4292         fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4293                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4294     }
4295     fprintf(fplog, "distance out of cell %f\n",
4296             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4297     if (bHaveLimitdAndCMOld)
4298     {
4299         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4300                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4301     }
4302     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4303             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4304     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4305             dim2char(dim),
4306             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4307     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4308             dim2char(dim),
4309             comm->cell_x0[dim], comm->cell_x1[dim]);
4310 }
4311
4312 static void cg_move_error(FILE *fplog,
4313                           gmx_domdec_t *dd,
4314                           gmx_large_int_t step, int cg, int dim, int dir,
4315                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4316                           rvec cm_old, rvec cm_new, real pos_d)
4317 {
4318     if (fplog)
4319     {
4320         print_cg_move(fplog, dd, step, cg, dim, dir,
4321                       bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4322     }
4323     print_cg_move(stderr, dd, step, cg, dim, dir,
4324                   bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4325     gmx_fatal(FARGS,
4326               "A charge group moved too far between two domain decomposition steps\n"
4327               "This usually means that your system is not well equilibrated");
4328 }
4329
4330 static void rotate_state_atom(t_state *state, int a)
4331 {
4332     int est;
4333
4334     for (est = 0; est < estNR; est++)
4335     {
4336         if (EST_DISTR(est) && (state->flags & (1<<est)))
4337         {
4338             switch (est)
4339             {
4340                 case estX:
4341                     /* Rotate the complete state; for a rectangular box only */
4342                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4343                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4344                     break;
4345                 case estV:
4346                     state->v[a][YY] = -state->v[a][YY];
4347                     state->v[a][ZZ] = -state->v[a][ZZ];
4348                     break;
4349                 case estSDX:
4350                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4351                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4352                     break;
4353                 case estCGP:
4354                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4355                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4356                     break;
4357                 case estDISRE_INITF:
4358                 case estDISRE_RM3TAV:
4359                 case estORIRE_INITF:
4360                 case estORIRE_DTAV:
4361                     /* These are distances, so not affected by rotation */
4362                     break;
4363                 default:
4364                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4365             }
4366         }
4367     }
4368 }
4369
4370 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4371 {
4372     if (natoms > comm->moved_nalloc)
4373     {
4374         /* Contents should be preserved here */
4375         comm->moved_nalloc = over_alloc_dd(natoms);
4376         srenew(comm->moved, comm->moved_nalloc);
4377     }
4378
4379     return comm->moved;
4380 }
4381
4382 static void calc_cg_move(FILE *fplog, gmx_large_int_t step,
4383                          gmx_domdec_t *dd,
4384                          t_state *state,
4385                          ivec tric_dir, matrix tcm,
4386                          rvec cell_x0, rvec cell_x1,
4387                          rvec limitd, rvec limit0, rvec limit1,
4388                          const int *cgindex,
4389                          int cg_start, int cg_end,
4390                          rvec *cg_cm,
4391                          int *move)
4392 {
4393     int      npbcdim;
4394     int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4395     int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4396     int      flag;
4397     gmx_bool bScrew;
4398     ivec     dev;
4399     real     inv_ncg, pos_d;
4400     rvec     cm_new;
4401
4402     npbcdim = dd->npbcdim;
4403
4404     for (cg = cg_start; cg < cg_end; cg++)
4405     {
4406         k0   = cgindex[cg];
4407         k1   = cgindex[cg+1];
4408         nrcg = k1 - k0;
4409         if (nrcg == 1)
4410         {
4411             copy_rvec(state->x[k0], cm_new);
4412         }
4413         else
4414         {
4415             inv_ncg = 1.0/nrcg;
4416
4417             clear_rvec(cm_new);
4418             for (k = k0; (k < k1); k++)
4419             {
4420                 rvec_inc(cm_new, state->x[k]);
4421             }
4422             for (d = 0; (d < DIM); d++)
4423             {
4424                 cm_new[d] = inv_ncg*cm_new[d];
4425             }
4426         }
4427
4428         clear_ivec(dev);
4429         /* Do pbc and check DD cell boundary crossings */
4430         for (d = DIM-1; d >= 0; d--)
4431         {
4432             if (dd->nc[d] > 1)
4433             {
4434                 bScrew = (dd->bScrewPBC && d == XX);
4435                 /* Determine the location of this cg in lattice coordinates */
4436                 pos_d = cm_new[d];
4437                 if (tric_dir[d])
4438                 {
4439                     for (d2 = d+1; d2 < DIM; d2++)
4440                     {
4441                         pos_d += cm_new[d2]*tcm[d2][d];
4442                     }
4443                 }
4444                 /* Put the charge group in the triclinic unit-cell */
4445                 if (pos_d >= cell_x1[d])
4446                 {
4447                     if (pos_d >= limit1[d])
4448                     {
4449                         cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
4450                                       cg_cm[cg], cm_new, pos_d);
4451                     }
4452                     dev[d] = 1;
4453                     if (dd->ci[d] == dd->nc[d] - 1)
4454                     {
4455                         rvec_dec(cm_new, state->box[d]);
4456                         if (bScrew)
4457                         {
4458                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4459                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4460                         }
4461                         for (k = k0; (k < k1); k++)
4462                         {
4463                             rvec_dec(state->x[k], state->box[d]);
4464                             if (bScrew)
4465                             {
4466                                 rotate_state_atom(state, k);
4467                             }
4468                         }
4469                     }
4470                 }
4471                 else if (pos_d < cell_x0[d])
4472                 {
4473                     if (pos_d < limit0[d])
4474                     {
4475                         cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
4476                                       cg_cm[cg], cm_new, pos_d);
4477                     }
4478                     dev[d] = -1;
4479                     if (dd->ci[d] == 0)
4480                     {
4481                         rvec_inc(cm_new, state->box[d]);
4482                         if (bScrew)
4483                         {
4484                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4485                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4486                         }
4487                         for (k = k0; (k < k1); k++)
4488                         {
4489                             rvec_inc(state->x[k], state->box[d]);
4490                             if (bScrew)
4491                             {
4492                                 rotate_state_atom(state, k);
4493                             }
4494                         }
4495                     }
4496                 }
4497             }
4498             else if (d < npbcdim)
4499             {
4500                 /* Put the charge group in the rectangular unit-cell */
4501                 while (cm_new[d] >= state->box[d][d])
4502                 {
4503                     rvec_dec(cm_new, state->box[d]);
4504                     for (k = k0; (k < k1); k++)
4505                     {
4506                         rvec_dec(state->x[k], state->box[d]);
4507                     }
4508                 }
4509                 while (cm_new[d] < 0)
4510                 {
4511                     rvec_inc(cm_new, state->box[d]);
4512                     for (k = k0; (k < k1); k++)
4513                     {
4514                         rvec_inc(state->x[k], state->box[d]);
4515                     }
4516                 }
4517             }
4518         }
4519
4520         copy_rvec(cm_new, cg_cm[cg]);
4521
4522         /* Determine where this cg should go */
4523         flag = 0;
4524         mc   = -1;
4525         for (d = 0; d < dd->ndim; d++)
4526         {
4527             dim = dd->dim[d];
4528             if (dev[dim] == 1)
4529             {
4530                 flag |= DD_FLAG_FW(d);
4531                 if (mc == -1)
4532                 {
4533                     mc = d*2;
4534                 }
4535             }
4536             else if (dev[dim] == -1)
4537             {
4538                 flag |= DD_FLAG_BW(d);
4539                 if (mc == -1)
4540                 {
4541                     if (dd->nc[dim] > 2)
4542                     {
4543                         mc = d*2 + 1;
4544                     }
4545                     else
4546                     {
4547                         mc = d*2;
4548                     }
4549                 }
4550             }
4551         }
4552         /* Temporarily store the flag in move */
4553         move[cg] = mc + flag;
4554     }
4555 }
4556
4557 static void dd_redistribute_cg(FILE *fplog, gmx_large_int_t step,
4558                                gmx_domdec_t *dd, ivec tric_dir,
4559                                t_state *state, rvec **f,
4560                                t_forcerec *fr, t_mdatoms *md,
4561                                gmx_bool bCompact,
4562                                t_nrnb *nrnb,
4563                                int *ncg_stay_home,
4564                                int *ncg_moved)
4565 {
4566     int               *move;
4567     int                npbcdim;
4568     int                ncg[DIM*2], nat[DIM*2];
4569     int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4570     int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4571     int                sbuf[2], rbuf[2];
4572     int                home_pos_cg, home_pos_at, buf_pos;
4573     int                flag;
4574     gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4575     gmx_bool           bScrew;
4576     ivec               dev;
4577     real               inv_ncg, pos_d;
4578     matrix             tcm;
4579     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
4580     atom_id           *cgindex;
4581     cginfo_mb_t       *cginfo_mb;
4582     gmx_domdec_comm_t *comm;
4583     int               *moved;
4584     int                nthread, thread;
4585
4586     if (dd->bScrewPBC)
4587     {
4588         check_screw_box(state->box);
4589     }
4590
4591     comm  = dd->comm;
4592     if (fr->cutoff_scheme == ecutsGROUP)
4593     {
4594         cg_cm = fr->cg_cm;
4595     }
4596
4597     for (i = 0; i < estNR; i++)
4598     {
4599         if (EST_DISTR(i))
4600         {
4601             switch (i)
4602             {
4603                 case estX: /* Always present */ break;
4604                 case estV:   bV   = (state->flags & (1<<i)); break;
4605                 case estSDX: bSDX = (state->flags & (1<<i)); break;
4606                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4607                 case estLD_RNG:
4608                 case estLD_RNGI:
4609                 case estDISRE_INITF:
4610                 case estDISRE_RM3TAV:
4611                 case estORIRE_INITF:
4612                 case estORIRE_DTAV:
4613                     /* No processing required */
4614                     break;
4615                 default:
4616                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4617             }
4618         }
4619     }
4620
4621     if (dd->ncg_tot > comm->nalloc_int)
4622     {
4623         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4624         srenew(comm->buf_int, comm->nalloc_int);
4625     }
4626     move = comm->buf_int;
4627
4628     /* Clear the count */
4629     for (c = 0; c < dd->ndim*2; c++)
4630     {
4631         ncg[c] = 0;
4632         nat[c] = 0;
4633     }
4634
4635     npbcdim = dd->npbcdim;
4636
4637     for (d = 0; (d < DIM); d++)
4638     {
4639         limitd[d] = dd->comm->cellsize_min[d];
4640         if (d >= npbcdim && dd->ci[d] == 0)
4641         {
4642             cell_x0[d] = -GMX_FLOAT_MAX;
4643         }
4644         else
4645         {
4646             cell_x0[d] = comm->cell_x0[d];
4647         }
4648         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4649         {
4650             cell_x1[d] = GMX_FLOAT_MAX;
4651         }
4652         else
4653         {
4654             cell_x1[d] = comm->cell_x1[d];
4655         }
4656         if (d < npbcdim)
4657         {
4658             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4659             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4660         }
4661         else
4662         {
4663             /* We check after communication if a charge group moved
4664              * more than one cell. Set the pre-comm check limit to float_max.
4665              */
4666             limit0[d] = -GMX_FLOAT_MAX;
4667             limit1[d] =  GMX_FLOAT_MAX;
4668         }
4669     }
4670
4671     make_tric_corr_matrix(npbcdim, state->box, tcm);
4672
4673     cgindex = dd->cgindex;
4674
4675     nthread = gmx_omp_nthreads_get(emntDomdec);
4676
4677     /* Compute the center of geometry for all home charge groups
4678      * and put them in the box and determine where they should go.
4679      */
4680 #pragma omp parallel for num_threads(nthread) schedule(static)
4681     for (thread = 0; thread < nthread; thread++)
4682     {
4683         calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4684                      cell_x0, cell_x1, limitd, limit0, limit1,
4685                      cgindex,
4686                      ( thread   *dd->ncg_home)/nthread,
4687                      ((thread+1)*dd->ncg_home)/nthread,
4688                      fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4689                      move);
4690     }
4691
4692     for (cg = 0; cg < dd->ncg_home; cg++)
4693     {
4694         if (move[cg] >= 0)
4695         {
4696             mc       = move[cg];
4697             flag     = mc & ~DD_FLAG_NRCG;
4698             mc       = mc & DD_FLAG_NRCG;
4699             move[cg] = mc;
4700
4701             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4702             {
4703                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4704                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4705             }
4706             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4707             /* We store the cg size in the lower 16 bits
4708              * and the place where the charge group should go
4709              * in the next 6 bits. This saves some communication volume.
4710              */
4711             nrcg = cgindex[cg+1] - cgindex[cg];
4712             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4713             ncg[mc] += 1;
4714             nat[mc] += nrcg;
4715         }
4716     }
4717
4718     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4719     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4720
4721     *ncg_moved = 0;
4722     for (i = 0; i < dd->ndim*2; i++)
4723     {
4724         *ncg_moved += ncg[i];
4725     }
4726
4727     nvec = 1;
4728     if (bV)
4729     {
4730         nvec++;
4731     }
4732     if (bSDX)
4733     {
4734         nvec++;
4735     }
4736     if (bCGP)
4737     {
4738         nvec++;
4739     }
4740
4741     /* Make sure the communication buffers are large enough */
4742     for (mc = 0; mc < dd->ndim*2; mc++)
4743     {
4744         nvr = ncg[mc] + nat[mc]*nvec;
4745         if (nvr > comm->cgcm_state_nalloc[mc])
4746         {
4747             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4748             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4749         }
4750     }
4751
4752     switch (fr->cutoff_scheme)
4753     {
4754         case ecutsGROUP:
4755             /* Recalculating cg_cm might be cheaper than communicating,
4756              * but that could give rise to rounding issues.
4757              */
4758             home_pos_cg =
4759                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4760                                         nvec, cg_cm, comm, bCompact);
4761             break;
4762         case ecutsVERLET:
4763             /* Without charge groups we send the moved atom coordinates
4764              * over twice. This is so the code below can be used without
4765              * many conditionals for both for with and without charge groups.
4766              */
4767             home_pos_cg =
4768                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4769                                         nvec, state->x, comm, FALSE);
4770             if (bCompact)
4771             {
4772                 home_pos_cg -= *ncg_moved;
4773             }
4774             break;
4775         default:
4776             gmx_incons("unimplemented");
4777             home_pos_cg = 0;
4778     }
4779
4780     vec         = 0;
4781     home_pos_at =
4782         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4783                                 nvec, vec++, state->x, comm, bCompact);
4784     if (bV)
4785     {
4786         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4787                                 nvec, vec++, state->v, comm, bCompact);
4788     }
4789     if (bSDX)
4790     {
4791         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4792                                 nvec, vec++, state->sd_X, comm, bCompact);
4793     }
4794     if (bCGP)
4795     {
4796         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4797                                 nvec, vec++, state->cg_p, comm, bCompact);
4798     }
4799
4800     if (bCompact)
4801     {
4802         compact_ind(dd->ncg_home, move,
4803                     dd->index_gl, dd->cgindex, dd->gatindex,
4804                     dd->ga2la, comm->bLocalCG,
4805                     fr->cginfo);
4806     }
4807     else
4808     {
4809         if (fr->cutoff_scheme == ecutsVERLET)
4810         {
4811             moved = get_moved(comm, dd->ncg_home);
4812
4813             for (k = 0; k < dd->ncg_home; k++)
4814             {
4815                 moved[k] = 0;
4816             }
4817         }
4818         else
4819         {
4820             moved = fr->ns.grid->cell_index;
4821         }
4822
4823         clear_and_mark_ind(dd->ncg_home, move,
4824                            dd->index_gl, dd->cgindex, dd->gatindex,
4825                            dd->ga2la, comm->bLocalCG,
4826                            moved);
4827     }
4828
4829     cginfo_mb = fr->cginfo_mb;
4830
4831     *ncg_stay_home = home_pos_cg;
4832     for (d = 0; d < dd->ndim; d++)
4833     {
4834         dim      = dd->dim[d];
4835         ncg_recv = 0;
4836         nat_recv = 0;
4837         nvr      = 0;
4838         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4839         {
4840             cdd = d*2 + dir;
4841             /* Communicate the cg and atom counts */
4842             sbuf[0] = ncg[cdd];
4843             sbuf[1] = nat[cdd];
4844             if (debug)
4845             {
4846                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4847                         d, dir, sbuf[0], sbuf[1]);
4848             }
4849             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4850
4851             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4852             {
4853                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4854                 srenew(comm->buf_int, comm->nalloc_int);
4855             }
4856
4857             /* Communicate the charge group indices, sizes and flags */
4858             dd_sendrecv_int(dd, d, dir,
4859                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4860                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4861
4862             nvs = ncg[cdd] + nat[cdd]*nvec;
4863             i   = rbuf[0]  + rbuf[1] *nvec;
4864             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4865
4866             /* Communicate cgcm and state */
4867             dd_sendrecv_rvec(dd, d, dir,
4868                              comm->cgcm_state[cdd], nvs,
4869                              comm->vbuf.v+nvr, i);
4870             ncg_recv += rbuf[0];
4871             nat_recv += rbuf[1];
4872             nvr      += i;
4873         }
4874
4875         /* Process the received charge groups */
4876         buf_pos = 0;
4877         for (cg = 0; cg < ncg_recv; cg++)
4878         {
4879             flag = comm->buf_int[cg*DD_CGIBS+1];
4880
4881             if (dim >= npbcdim && dd->nc[dim] > 2)
4882             {
4883                 /* No pbc in this dim and more than one domain boundary.
4884                  * We do a separate check if a charge group didn't move too far.
4885                  */
4886                 if (((flag & DD_FLAG_FW(d)) &&
4887                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4888                     ((flag & DD_FLAG_BW(d)) &&
4889                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4890                 {
4891                     cg_move_error(fplog, dd, step, cg, dim,
4892                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4893                                   FALSE, 0,
4894                                   comm->vbuf.v[buf_pos],
4895                                   comm->vbuf.v[buf_pos],
4896                                   comm->vbuf.v[buf_pos][dim]);
4897                 }
4898             }
4899
4900             mc = -1;
4901             if (d < dd->ndim-1)
4902             {
4903                 /* Check which direction this cg should go */
4904                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4905                 {
4906                     if (dd->bGridJump)
4907                     {
4908                         /* The cell boundaries for dimension d2 are not equal
4909                          * for each cell row of the lower dimension(s),
4910                          * therefore we might need to redetermine where
4911                          * this cg should go.
4912                          */
4913                         dim2 = dd->dim[d2];
4914                         /* If this cg crosses the box boundary in dimension d2
4915                          * we can use the communicated flag, so we do not
4916                          * have to worry about pbc.
4917                          */
4918                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4919                                (flag & DD_FLAG_FW(d2))) ||
4920                               (dd->ci[dim2] == 0 &&
4921                                (flag & DD_FLAG_BW(d2)))))
4922                         {
4923                             /* Clear the two flags for this dimension */
4924                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4925                             /* Determine the location of this cg
4926                              * in lattice coordinates
4927                              */
4928                             pos_d = comm->vbuf.v[buf_pos][dim2];
4929                             if (tric_dir[dim2])
4930                             {
4931                                 for (d3 = dim2+1; d3 < DIM; d3++)
4932                                 {
4933                                     pos_d +=
4934                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4935                                 }
4936                             }
4937                             /* Check of we are not at the box edge.
4938                              * pbc is only handled in the first step above,
4939                              * but this check could move over pbc while
4940                              * the first step did not due to different rounding.
4941                              */
4942                             if (pos_d >= cell_x1[dim2] &&
4943                                 dd->ci[dim2] != dd->nc[dim2]-1)
4944                             {
4945                                 flag |= DD_FLAG_FW(d2);
4946                             }
4947                             else if (pos_d < cell_x0[dim2] &&
4948                                      dd->ci[dim2] != 0)
4949                             {
4950                                 flag |= DD_FLAG_BW(d2);
4951                             }
4952                             comm->buf_int[cg*DD_CGIBS+1] = flag;
4953                         }
4954                     }
4955                     /* Set to which neighboring cell this cg should go */
4956                     if (flag & DD_FLAG_FW(d2))
4957                     {
4958                         mc = d2*2;
4959                     }
4960                     else if (flag & DD_FLAG_BW(d2))
4961                     {
4962                         if (dd->nc[dd->dim[d2]] > 2)
4963                         {
4964                             mc = d2*2+1;
4965                         }
4966                         else
4967                         {
4968                             mc = d2*2;
4969                         }
4970                     }
4971                 }
4972             }
4973
4974             nrcg = flag & DD_FLAG_NRCG;
4975             if (mc == -1)
4976             {
4977                 if (home_pos_cg+1 > dd->cg_nalloc)
4978                 {
4979                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
4980                     srenew(dd->index_gl, dd->cg_nalloc);
4981                     srenew(dd->cgindex, dd->cg_nalloc+1);
4982                 }
4983                 /* Set the global charge group index and size */
4984                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
4985                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
4986                 /* Copy the state from the buffer */
4987                 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
4988                 if (fr->cutoff_scheme == ecutsGROUP)
4989                 {
4990                     cg_cm = fr->cg_cm;
4991                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
4992                 }
4993                 buf_pos++;
4994
4995                 /* Set the cginfo */
4996                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
4997                                                    dd->index_gl[home_pos_cg]);
4998                 if (comm->bLocalCG)
4999                 {
5000                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
5001                 }
5002
5003                 if (home_pos_at+nrcg > state->nalloc)
5004                 {
5005                     dd_realloc_state(state, f, home_pos_at+nrcg);
5006                 }
5007                 for (i = 0; i < nrcg; i++)
5008                 {
5009                     copy_rvec(comm->vbuf.v[buf_pos++],
5010                               state->x[home_pos_at+i]);
5011                 }
5012                 if (bV)
5013                 {
5014                     for (i = 0; i < nrcg; i++)
5015                     {
5016                         copy_rvec(comm->vbuf.v[buf_pos++],
5017                                   state->v[home_pos_at+i]);
5018                     }
5019                 }
5020                 if (bSDX)
5021                 {
5022                     for (i = 0; i < nrcg; i++)
5023                     {
5024                         copy_rvec(comm->vbuf.v[buf_pos++],
5025                                   state->sd_X[home_pos_at+i]);
5026                     }
5027                 }
5028                 if (bCGP)
5029                 {
5030                     for (i = 0; i < nrcg; i++)
5031                     {
5032                         copy_rvec(comm->vbuf.v[buf_pos++],
5033                                   state->cg_p[home_pos_at+i]);
5034                     }
5035                 }
5036                 home_pos_cg += 1;
5037                 home_pos_at += nrcg;
5038             }
5039             else
5040             {
5041                 /* Reallocate the buffers if necessary  */
5042                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
5043                 {
5044                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
5045                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
5046                 }
5047                 nvr = ncg[mc] + nat[mc]*nvec;
5048                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
5049                 {
5050                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
5051                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
5052                 }
5053                 /* Copy from the receive to the send buffers */
5054                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
5055                        comm->buf_int + cg*DD_CGIBS,
5056                        DD_CGIBS*sizeof(int));
5057                 memcpy(comm->cgcm_state[mc][nvr],
5058                        comm->vbuf.v[buf_pos],
5059                        (1+nrcg*nvec)*sizeof(rvec));
5060                 buf_pos += 1 + nrcg*nvec;
5061                 ncg[mc] += 1;
5062                 nat[mc] += nrcg;
5063             }
5064         }
5065     }
5066
5067     /* With sorting (!bCompact) the indices are now only partially up to date
5068      * and ncg_home and nat_home are not the real count, since there are
5069      * "holes" in the arrays for the charge groups that moved to neighbors.
5070      */
5071     if (fr->cutoff_scheme == ecutsVERLET)
5072     {
5073         moved = get_moved(comm, home_pos_cg);
5074
5075         for (i = dd->ncg_home; i < home_pos_cg; i++)
5076         {
5077             moved[i] = 0;
5078         }
5079     }
5080     dd->ncg_home = home_pos_cg;
5081     dd->nat_home = home_pos_at;
5082
5083     if (debug)
5084     {
5085         fprintf(debug,
5086                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5087                 *ncg_moved, dd->ncg_home-*ncg_moved);
5088
5089     }
5090 }
5091
5092 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
5093 {
5094     dd->comm->cycl[ddCycl] += cycles;
5095     dd->comm->cycl_n[ddCycl]++;
5096     if (cycles > dd->comm->cycl_max[ddCycl])
5097     {
5098         dd->comm->cycl_max[ddCycl] = cycles;
5099     }
5100 }
5101
5102 static double force_flop_count(t_nrnb *nrnb)
5103 {
5104     int         i;
5105     double      sum;
5106     const char *name;
5107
5108     sum = 0;
5109     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
5110     {
5111         /* To get closer to the real timings, we half the count
5112          * for the normal loops and again half it for water loops.
5113          */
5114         name = nrnb_str(i);
5115         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5116         {
5117             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5118         }
5119         else
5120         {
5121             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5122         }
5123     }
5124     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
5125     {
5126         name = nrnb_str(i);
5127         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5128         {
5129             sum += nrnb->n[i]*cost_nrnb(i);
5130         }
5131     }
5132     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
5133     {
5134         sum += nrnb->n[i]*cost_nrnb(i);
5135     }
5136
5137     return sum;
5138 }
5139
5140 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
5141 {
5142     if (dd->comm->eFlop)
5143     {
5144         dd->comm->flop -= force_flop_count(nrnb);
5145     }
5146 }
5147 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
5148 {
5149     if (dd->comm->eFlop)
5150     {
5151         dd->comm->flop += force_flop_count(nrnb);
5152         dd->comm->flop_n++;
5153     }
5154 }
5155
5156 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5157 {
5158     int i;
5159
5160     for (i = 0; i < ddCyclNr; i++)
5161     {
5162         dd->comm->cycl[i]     = 0;
5163         dd->comm->cycl_n[i]   = 0;
5164         dd->comm->cycl_max[i] = 0;
5165     }
5166     dd->comm->flop   = 0;
5167     dd->comm->flop_n = 0;
5168 }
5169
5170 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
5171 {
5172     gmx_domdec_comm_t *comm;
5173     gmx_domdec_load_t *load;
5174     gmx_domdec_root_t *root = NULL;
5175     int                d, dim, cid, i, pos;
5176     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
5177     gmx_bool           bSepPME;
5178
5179     if (debug)
5180     {
5181         fprintf(debug, "get_load_distribution start\n");
5182     }
5183
5184     wallcycle_start(wcycle, ewcDDCOMMLOAD);
5185
5186     comm = dd->comm;
5187
5188     bSepPME = (dd->pme_nodeid >= 0);
5189
5190     for (d = dd->ndim-1; d >= 0; d--)
5191     {
5192         dim = dd->dim[d];
5193         /* Check if we participate in the communication in this dimension */
5194         if (d == dd->ndim-1 ||
5195             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5196         {
5197             load = &comm->load[d];
5198             if (dd->bGridJump)
5199             {
5200                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5201             }
5202             pos = 0;
5203             if (d == dd->ndim-1)
5204             {
5205                 sbuf[pos++] = dd_force_load(comm);
5206                 sbuf[pos++] = sbuf[0];
5207                 if (dd->bGridJump)
5208                 {
5209                     sbuf[pos++] = sbuf[0];
5210                     sbuf[pos++] = cell_frac;
5211                     if (d > 0)
5212                     {
5213                         sbuf[pos++] = comm->cell_f_max0[d];
5214                         sbuf[pos++] = comm->cell_f_min1[d];
5215                     }
5216                 }
5217                 if (bSepPME)
5218                 {
5219                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5220                     sbuf[pos++] = comm->cycl[ddCyclPME];
5221                 }
5222             }
5223             else
5224             {
5225                 sbuf[pos++] = comm->load[d+1].sum;
5226                 sbuf[pos++] = comm->load[d+1].max;
5227                 if (dd->bGridJump)
5228                 {
5229                     sbuf[pos++] = comm->load[d+1].sum_m;
5230                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5231                     sbuf[pos++] = comm->load[d+1].flags;
5232                     if (d > 0)
5233                     {
5234                         sbuf[pos++] = comm->cell_f_max0[d];
5235                         sbuf[pos++] = comm->cell_f_min1[d];
5236                     }
5237                 }
5238                 if (bSepPME)
5239                 {
5240                     sbuf[pos++] = comm->load[d+1].mdf;
5241                     sbuf[pos++] = comm->load[d+1].pme;
5242                 }
5243             }
5244             load->nload = pos;
5245             /* Communicate a row in DD direction d.
5246              * The communicators are setup such that the root always has rank 0.
5247              */
5248 #ifdef GMX_MPI
5249             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5250                        load->load, load->nload*sizeof(float), MPI_BYTE,
5251                        0, comm->mpi_comm_load[d]);
5252 #endif
5253             if (dd->ci[dim] == dd->master_ci[dim])
5254             {
5255                 /* We are the root, process this row */
5256                 if (comm->bDynLoadBal)
5257                 {
5258                     root = comm->root[d];
5259                 }
5260                 load->sum      = 0;
5261                 load->max      = 0;
5262                 load->sum_m    = 0;
5263                 load->cvol_min = 1;
5264                 load->flags    = 0;
5265                 load->mdf      = 0;
5266                 load->pme      = 0;
5267                 pos            = 0;
5268                 for (i = 0; i < dd->nc[dim]; i++)
5269                 {
5270                     load->sum += load->load[pos++];
5271                     load->max  = max(load->max, load->load[pos]);
5272                     pos++;
5273                     if (dd->bGridJump)
5274                     {
5275                         if (root->bLimited)
5276                         {
5277                             /* This direction could not be load balanced properly,
5278                              * therefore we need to use the maximum iso the average load.
5279                              */
5280                             load->sum_m = max(load->sum_m, load->load[pos]);
5281                         }
5282                         else
5283                         {
5284                             load->sum_m += load->load[pos];
5285                         }
5286                         pos++;
5287                         load->cvol_min = min(load->cvol_min, load->load[pos]);
5288                         pos++;
5289                         if (d < dd->ndim-1)
5290                         {
5291                             load->flags = (int)(load->load[pos++] + 0.5);
5292                         }
5293                         if (d > 0)
5294                         {
5295                             root->cell_f_max0[i] = load->load[pos++];
5296                             root->cell_f_min1[i] = load->load[pos++];
5297                         }
5298                     }
5299                     if (bSepPME)
5300                     {
5301                         load->mdf = max(load->mdf, load->load[pos]);
5302                         pos++;
5303                         load->pme = max(load->pme, load->load[pos]);
5304                         pos++;
5305                     }
5306                 }
5307                 if (comm->bDynLoadBal && root->bLimited)
5308                 {
5309                     load->sum_m *= dd->nc[dim];
5310                     load->flags |= (1<<d);
5311                 }
5312             }
5313         }
5314     }
5315
5316     if (DDMASTER(dd))
5317     {
5318         comm->nload      += dd_load_count(comm);
5319         comm->load_step  += comm->cycl[ddCyclStep];
5320         comm->load_sum   += comm->load[0].sum;
5321         comm->load_max   += comm->load[0].max;
5322         if (comm->bDynLoadBal)
5323         {
5324             for (d = 0; d < dd->ndim; d++)
5325             {
5326                 if (comm->load[0].flags & (1<<d))
5327                 {
5328                     comm->load_lim[d]++;
5329                 }
5330             }
5331         }
5332         if (bSepPME)
5333         {
5334             comm->load_mdf += comm->load[0].mdf;
5335             comm->load_pme += comm->load[0].pme;
5336         }
5337     }
5338
5339     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5340
5341     if (debug)
5342     {
5343         fprintf(debug, "get_load_distribution finished\n");
5344     }
5345 }
5346
5347 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5348 {
5349     /* Return the relative performance loss on the total run time
5350      * due to the force calculation load imbalance.
5351      */
5352     if (dd->comm->nload > 0)
5353     {
5354         return
5355             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5356             (dd->comm->load_step*dd->nnodes);
5357     }
5358     else
5359     {
5360         return 0;
5361     }
5362 }
5363
5364 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5365 {
5366     char               buf[STRLEN];
5367     int                npp, npme, nnodes, d, limp;
5368     float              imbal, pme_f_ratio, lossf, lossp = 0;
5369     gmx_bool           bLim;
5370     gmx_domdec_comm_t *comm;
5371
5372     comm = dd->comm;
5373     if (DDMASTER(dd) && comm->nload > 0)
5374     {
5375         npp    = dd->nnodes;
5376         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5377         nnodes = npp + npme;
5378         imbal  = comm->load_max*npp/comm->load_sum - 1;
5379         lossf  = dd_force_imb_perf_loss(dd);
5380         sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5381         fprintf(fplog, "%s", buf);
5382         fprintf(stderr, "\n");
5383         fprintf(stderr, "%s", buf);
5384         sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5385         fprintf(fplog, "%s", buf);
5386         fprintf(stderr, "%s", buf);
5387         bLim = FALSE;
5388         if (comm->bDynLoadBal)
5389         {
5390             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5391             for (d = 0; d < dd->ndim; d++)
5392             {
5393                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5394                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5395                 if (limp >= 50)
5396                 {
5397                     bLim = TRUE;
5398                 }
5399             }
5400             sprintf(buf+strlen(buf), "\n");
5401             fprintf(fplog, "%s", buf);
5402             fprintf(stderr, "%s", buf);
5403         }
5404         if (npme > 0)
5405         {
5406             pme_f_ratio = comm->load_pme/comm->load_mdf;
5407             lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
5408             if (lossp <= 0)
5409             {
5410                 lossp *= (float)npme/(float)nnodes;
5411             }
5412             else
5413             {
5414                 lossp *= (float)npp/(float)nnodes;
5415             }
5416             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5417             fprintf(fplog, "%s", buf);
5418             fprintf(stderr, "%s", buf);
5419             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5420             fprintf(fplog, "%s", buf);
5421             fprintf(stderr, "%s", buf);
5422         }
5423         fprintf(fplog, "\n");
5424         fprintf(stderr, "\n");
5425
5426         if (lossf >= DD_PERF_LOSS)
5427         {
5428             sprintf(buf,
5429                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5430                     "      in the domain decomposition.\n", lossf*100);
5431             if (!comm->bDynLoadBal)
5432             {
5433                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5434             }
5435             else if (bLim)
5436             {
5437                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5438             }
5439             fprintf(fplog, "%s\n", buf);
5440             fprintf(stderr, "%s\n", buf);
5441         }
5442         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS)
5443         {
5444             sprintf(buf,
5445                     "NOTE: %.1f %% performance was lost because the PME nodes\n"
5446                     "      had %s work to do than the PP nodes.\n"
5447                     "      You might want to %s the number of PME nodes\n"
5448                     "      or %s the cut-off and the grid spacing.\n",
5449                     fabs(lossp*100),
5450                     (lossp < 0) ? "less"     : "more",
5451                     (lossp < 0) ? "decrease" : "increase",
5452                     (lossp < 0) ? "decrease" : "increase");
5453             fprintf(fplog, "%s\n", buf);
5454             fprintf(stderr, "%s\n", buf);
5455         }
5456     }
5457 }
5458
5459 static float dd_vol_min(gmx_domdec_t *dd)
5460 {
5461     return dd->comm->load[0].cvol_min*dd->nnodes;
5462 }
5463
5464 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5465 {
5466     return dd->comm->load[0].flags;
5467 }
5468
5469 static float dd_f_imbal(gmx_domdec_t *dd)
5470 {
5471     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5472 }
5473
5474 float dd_pme_f_ratio(gmx_domdec_t *dd)
5475 {
5476     if (dd->comm->cycl_n[ddCyclPME] > 0)
5477     {
5478         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5479     }
5480     else
5481     {
5482         return -1.0;
5483     }
5484 }
5485
5486 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_large_int_t step)
5487 {
5488     int  flags, d;
5489     char buf[22];
5490
5491     flags = dd_load_flags(dd);
5492     if (flags)
5493     {
5494         fprintf(fplog,
5495                 "DD  load balancing is limited by minimum cell size in dimension");
5496         for (d = 0; d < dd->ndim; d++)
5497         {
5498             if (flags & (1<<d))
5499             {
5500                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5501             }
5502         }
5503         fprintf(fplog, "\n");
5504     }
5505     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5506     if (dd->comm->bDynLoadBal)
5507     {
5508         fprintf(fplog, "  vol min/aver %5.3f%c",
5509                 dd_vol_min(dd), flags ? '!' : ' ');
5510     }
5511     fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5512     if (dd->comm->cycl_n[ddCyclPME])
5513     {
5514         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5515     }
5516     fprintf(fplog, "\n\n");
5517 }
5518
5519 static void dd_print_load_verbose(gmx_domdec_t *dd)
5520 {
5521     if (dd->comm->bDynLoadBal)
5522     {
5523         fprintf(stderr, "vol %4.2f%c ",
5524                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5525     }
5526     fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5527     if (dd->comm->cycl_n[ddCyclPME])
5528     {
5529         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5530     }
5531 }
5532
5533 #ifdef GMX_MPI
5534 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5535 {
5536     MPI_Comm           c_row;
5537     int                dim, i, rank;
5538     ivec               loc_c;
5539     gmx_domdec_root_t *root;
5540     gmx_bool           bPartOfGroup = FALSE;
5541
5542     dim = dd->dim[dim_ind];
5543     copy_ivec(loc, loc_c);
5544     for (i = 0; i < dd->nc[dim]; i++)
5545     {
5546         loc_c[dim] = i;
5547         rank       = dd_index(dd->nc, loc_c);
5548         if (rank == dd->rank)
5549         {
5550             /* This process is part of the group */
5551             bPartOfGroup = TRUE;
5552         }
5553     }
5554     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5555                    &c_row);
5556     if (bPartOfGroup)
5557     {
5558         dd->comm->mpi_comm_load[dim_ind] = c_row;
5559         if (dd->comm->eDLB != edlbNO)
5560         {
5561             if (dd->ci[dim] == dd->master_ci[dim])
5562             {
5563                 /* This is the root process of this row */
5564                 snew(dd->comm->root[dim_ind], 1);
5565                 root = dd->comm->root[dim_ind];
5566                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5567                 snew(root->old_cell_f, dd->nc[dim]+1);
5568                 snew(root->bCellMin, dd->nc[dim]);
5569                 if (dim_ind > 0)
5570                 {
5571                     snew(root->cell_f_max0, dd->nc[dim]);
5572                     snew(root->cell_f_min1, dd->nc[dim]);
5573                     snew(root->bound_min, dd->nc[dim]);
5574                     snew(root->bound_max, dd->nc[dim]);
5575                 }
5576                 snew(root->buf_ncd, dd->nc[dim]);
5577             }
5578             else
5579             {
5580                 /* This is not a root process, we only need to receive cell_f */
5581                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5582             }
5583         }
5584         if (dd->ci[dim] == dd->master_ci[dim])
5585         {
5586             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5587         }
5588     }
5589 }
5590 #endif
5591
5592 static void make_load_communicators(gmx_domdec_t *dd)
5593 {
5594 #ifdef GMX_MPI
5595     int  dim0, dim1, i, j;
5596     ivec loc;
5597
5598     if (debug)
5599     {
5600         fprintf(debug, "Making load communicators\n");
5601     }
5602
5603     snew(dd->comm->load, dd->ndim);
5604     snew(dd->comm->mpi_comm_load, dd->ndim);
5605
5606     clear_ivec(loc);
5607     make_load_communicator(dd, 0, loc);
5608     if (dd->ndim > 1)
5609     {
5610         dim0 = dd->dim[0];
5611         for (i = 0; i < dd->nc[dim0]; i++)
5612         {
5613             loc[dim0] = i;
5614             make_load_communicator(dd, 1, loc);
5615         }
5616     }
5617     if (dd->ndim > 2)
5618     {
5619         dim0 = dd->dim[0];
5620         for (i = 0; i < dd->nc[dim0]; i++)
5621         {
5622             loc[dim0] = i;
5623             dim1      = dd->dim[1];
5624             for (j = 0; j < dd->nc[dim1]; j++)
5625             {
5626                 loc[dim1] = j;
5627                 make_load_communicator(dd, 2, loc);
5628             }
5629         }
5630     }
5631
5632     if (debug)
5633     {
5634         fprintf(debug, "Finished making load communicators\n");
5635     }
5636 #endif
5637 }
5638
5639 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5640 {
5641     gmx_bool                bZYX;
5642     int                     d, dim, i, j, m;
5643     ivec                    tmp, s;
5644     int                     nzone, nzonep;
5645     ivec                    dd_zp[DD_MAXIZONE];
5646     gmx_domdec_zones_t     *zones;
5647     gmx_domdec_ns_ranges_t *izone;
5648
5649     for (d = 0; d < dd->ndim; d++)
5650     {
5651         dim = dd->dim[d];
5652         copy_ivec(dd->ci, tmp);
5653         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5654         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5655         copy_ivec(dd->ci, tmp);
5656         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5657         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5658         if (debug)
5659         {
5660             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5661                     dd->rank, dim,
5662                     dd->neighbor[d][0],
5663                     dd->neighbor[d][1]);
5664         }
5665     }
5666
5667     if (fplog)
5668     {
5669         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5670                 dd->ndim,
5671                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5672                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5673     }
5674     switch (dd->ndim)
5675     {
5676         case 3:
5677             nzone  = dd_z3n;
5678             nzonep = dd_zp3n;
5679             for (i = 0; i < nzonep; i++)
5680             {
5681                 copy_ivec(dd_zp3[i], dd_zp[i]);
5682             }
5683             break;
5684         case 2:
5685             nzone  = dd_z2n;
5686             nzonep = dd_zp2n;
5687             for (i = 0; i < nzonep; i++)
5688             {
5689                 copy_ivec(dd_zp2[i], dd_zp[i]);
5690             }
5691             break;
5692         case 1:
5693             nzone  = dd_z1n;
5694             nzonep = dd_zp1n;
5695             for (i = 0; i < nzonep; i++)
5696             {
5697                 copy_ivec(dd_zp1[i], dd_zp[i]);
5698             }
5699             break;
5700         default:
5701             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5702             nzone  = 0;
5703             nzonep = 0;
5704     }
5705
5706     zones = &dd->comm->zones;
5707
5708     for (i = 0; i < nzone; i++)
5709     {
5710         m = 0;
5711         clear_ivec(zones->shift[i]);
5712         for (d = 0; d < dd->ndim; d++)
5713         {
5714             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5715         }
5716     }
5717
5718     zones->n = nzone;
5719     for (i = 0; i < nzone; i++)
5720     {
5721         for (d = 0; d < DIM; d++)
5722         {
5723             s[d] = dd->ci[d] - zones->shift[i][d];
5724             if (s[d] < 0)
5725             {
5726                 s[d] += dd->nc[d];
5727             }
5728             else if (s[d] >= dd->nc[d])
5729             {
5730                 s[d] -= dd->nc[d];
5731             }
5732         }
5733     }
5734     zones->nizone = nzonep;
5735     for (i = 0; i < zones->nizone; i++)
5736     {
5737         if (dd_zp[i][0] != i)
5738         {
5739             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5740         }
5741         izone     = &zones->izone[i];
5742         izone->j0 = dd_zp[i][1];
5743         izone->j1 = dd_zp[i][2];
5744         for (dim = 0; dim < DIM; dim++)
5745         {
5746             if (dd->nc[dim] == 1)
5747             {
5748                 /* All shifts should be allowed */
5749                 izone->shift0[dim] = -1;
5750                 izone->shift1[dim] = 1;
5751             }
5752             else
5753             {
5754                 /*
5755                    izone->shift0[d] = 0;
5756                    izone->shift1[d] = 0;
5757                    for(j=izone->j0; j<izone->j1; j++) {
5758                    if (dd->shift[j][d] > dd->shift[i][d])
5759                    izone->shift0[d] = -1;
5760                    if (dd->shift[j][d] < dd->shift[i][d])
5761                    izone->shift1[d] = 1;
5762                    }
5763                  */
5764
5765                 int shift_diff;
5766
5767                 /* Assume the shift are not more than 1 cell */
5768                 izone->shift0[dim] = 1;
5769                 izone->shift1[dim] = -1;
5770                 for (j = izone->j0; j < izone->j1; j++)
5771                 {
5772                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5773                     if (shift_diff < izone->shift0[dim])
5774                     {
5775                         izone->shift0[dim] = shift_diff;
5776                     }
5777                     if (shift_diff > izone->shift1[dim])
5778                     {
5779                         izone->shift1[dim] = shift_diff;
5780                     }
5781                 }
5782             }
5783         }
5784     }
5785
5786     if (dd->comm->eDLB != edlbNO)
5787     {
5788         snew(dd->comm->root, dd->ndim);
5789     }
5790
5791     if (dd->comm->bRecordLoad)
5792     {
5793         make_load_communicators(dd);
5794     }
5795 }
5796
5797 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int reorder)
5798 {
5799     gmx_domdec_t      *dd;
5800     gmx_domdec_comm_t *comm;
5801     int                i, rank, *buf;
5802     ivec               periods;
5803 #ifdef GMX_MPI
5804     MPI_Comm           comm_cart;
5805 #endif
5806
5807     dd   = cr->dd;
5808     comm = dd->comm;
5809
5810 #ifdef GMX_MPI
5811     if (comm->bCartesianPP)
5812     {
5813         /* Set up cartesian communication for the particle-particle part */
5814         if (fplog)
5815         {
5816             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5817                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5818         }
5819
5820         for (i = 0; i < DIM; i++)
5821         {
5822             periods[i] = TRUE;
5823         }
5824         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5825                         &comm_cart);
5826         /* We overwrite the old communicator with the new cartesian one */
5827         cr->mpi_comm_mygroup = comm_cart;
5828     }
5829
5830     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5831     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5832
5833     if (comm->bCartesianPP_PME)
5834     {
5835         /* Since we want to use the original cartesian setup for sim,
5836          * and not the one after split, we need to make an index.
5837          */
5838         snew(comm->ddindex2ddnodeid, dd->nnodes);
5839         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5840         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5841         /* Get the rank of the DD master,
5842          * above we made sure that the master node is a PP node.
5843          */
5844         if (MASTER(cr))
5845         {
5846             rank = dd->rank;
5847         }
5848         else
5849         {
5850             rank = 0;
5851         }
5852         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5853     }
5854     else if (comm->bCartesianPP)
5855     {
5856         if (cr->npmenodes == 0)
5857         {
5858             /* The PP communicator is also
5859              * the communicator for this simulation
5860              */
5861             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5862         }
5863         cr->nodeid = dd->rank;
5864
5865         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5866
5867         /* We need to make an index to go from the coordinates
5868          * to the nodeid of this simulation.
5869          */
5870         snew(comm->ddindex2simnodeid, dd->nnodes);
5871         snew(buf, dd->nnodes);
5872         if (cr->duty & DUTY_PP)
5873         {
5874             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5875         }
5876         /* Communicate the ddindex to simulation nodeid index */
5877         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5878                       cr->mpi_comm_mysim);
5879         sfree(buf);
5880
5881         /* Determine the master coordinates and rank.
5882          * The DD master should be the same node as the master of this sim.
5883          */
5884         for (i = 0; i < dd->nnodes; i++)
5885         {
5886             if (comm->ddindex2simnodeid[i] == 0)
5887             {
5888                 ddindex2xyz(dd->nc, i, dd->master_ci);
5889                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
5890             }
5891         }
5892         if (debug)
5893         {
5894             fprintf(debug, "The master rank is %d\n", dd->masterrank);
5895         }
5896     }
5897     else
5898     {
5899         /* No Cartesian communicators */
5900         /* We use the rank in dd->comm->all as DD index */
5901         ddindex2xyz(dd->nc, dd->rank, dd->ci);
5902         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
5903         dd->masterrank = 0;
5904         clear_ivec(dd->master_ci);
5905     }
5906 #endif
5907
5908     if (fplog)
5909     {
5910         fprintf(fplog,
5911                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5912                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5913     }
5914     if (debug)
5915     {
5916         fprintf(debug,
5917                 "Domain decomposition nodeid %d, coordinates %d %d %d\n\n",
5918                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5919     }
5920 }
5921
5922 static void receive_ddindex2simnodeid(t_commrec *cr)
5923 {
5924     gmx_domdec_t      *dd;
5925
5926     gmx_domdec_comm_t *comm;
5927     int               *buf;
5928
5929     dd   = cr->dd;
5930     comm = dd->comm;
5931
5932 #ifdef GMX_MPI
5933     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
5934     {
5935         snew(comm->ddindex2simnodeid, dd->nnodes);
5936         snew(buf, dd->nnodes);
5937         if (cr->duty & DUTY_PP)
5938         {
5939             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5940         }
5941 #ifdef GMX_MPI
5942         /* Communicate the ddindex to simulation nodeid index */
5943         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5944                       cr->mpi_comm_mysim);
5945 #endif
5946         sfree(buf);
5947     }
5948 #endif
5949 }
5950
5951 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
5952                                                      int ncg, int natoms)
5953 {
5954     gmx_domdec_master_t *ma;
5955     int                  i;
5956
5957     snew(ma, 1);
5958
5959     snew(ma->ncg, dd->nnodes);
5960     snew(ma->index, dd->nnodes+1);
5961     snew(ma->cg, ncg);
5962     snew(ma->nat, dd->nnodes);
5963     snew(ma->ibuf, dd->nnodes*2);
5964     snew(ma->cell_x, DIM);
5965     for (i = 0; i < DIM; i++)
5966     {
5967         snew(ma->cell_x[i], dd->nc[i]+1);
5968     }
5969
5970     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
5971     {
5972         ma->vbuf = NULL;
5973     }
5974     else
5975     {
5976         snew(ma->vbuf, natoms);
5977     }
5978
5979     return ma;
5980 }
5981
5982 static void split_communicator(FILE *fplog, t_commrec *cr, int dd_node_order,
5983                                int reorder)
5984 {
5985     gmx_domdec_t      *dd;
5986     gmx_domdec_comm_t *comm;
5987     int                i, rank;
5988     gmx_bool           bDiv[DIM];
5989     ivec               periods;
5990 #ifdef GMX_MPI
5991     MPI_Comm           comm_cart;
5992 #endif
5993
5994     dd   = cr->dd;
5995     comm = dd->comm;
5996
5997     if (comm->bCartesianPP)
5998     {
5999         for (i = 1; i < DIM; i++)
6000         {
6001             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
6002         }
6003         if (bDiv[YY] || bDiv[ZZ])
6004         {
6005             comm->bCartesianPP_PME = TRUE;
6006             /* If we have 2D PME decomposition, which is always in x+y,
6007              * we stack the PME only nodes in z.
6008              * Otherwise we choose the direction that provides the thinnest slab
6009              * of PME only nodes as this will have the least effect
6010              * on the PP communication.
6011              * But for the PME communication the opposite might be better.
6012              */
6013             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
6014                              !bDiv[YY] ||
6015                              dd->nc[YY] > dd->nc[ZZ]))
6016             {
6017                 comm->cartpmedim = ZZ;
6018             }
6019             else
6020             {
6021                 comm->cartpmedim = YY;
6022             }
6023             comm->ntot[comm->cartpmedim]
6024                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
6025         }
6026         else if (fplog)
6027         {
6028             fprintf(fplog, "#pmenodes (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
6029             fprintf(fplog,
6030                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
6031         }
6032     }
6033
6034 #ifdef GMX_MPI
6035     if (comm->bCartesianPP_PME)
6036     {
6037         if (fplog)
6038         {
6039             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
6040         }
6041
6042         for (i = 0; i < DIM; i++)
6043         {
6044             periods[i] = TRUE;
6045         }
6046         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
6047                         &comm_cart);
6048
6049         MPI_Comm_rank(comm_cart, &rank);
6050         if (MASTERNODE(cr) && rank != 0)
6051         {
6052             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
6053         }
6054
6055         /* With this assigment we loose the link to the original communicator
6056          * which will usually be MPI_COMM_WORLD, unless have multisim.
6057          */
6058         cr->mpi_comm_mysim = comm_cart;
6059         cr->sim_nodeid     = rank;
6060
6061         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
6062
6063         if (fplog)
6064         {
6065             fprintf(fplog, "Cartesian nodeid %d, coordinates %d %d %d\n\n",
6066                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6067         }
6068
6069         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
6070         {
6071             cr->duty = DUTY_PP;
6072         }
6073         if (cr->npmenodes == 0 ||
6074             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
6075         {
6076             cr->duty = DUTY_PME;
6077         }
6078
6079         /* Split the sim communicator into PP and PME only nodes */
6080         MPI_Comm_split(cr->mpi_comm_mysim,
6081                        cr->duty,
6082                        dd_index(comm->ntot, dd->ci),
6083                        &cr->mpi_comm_mygroup);
6084     }
6085     else
6086     {
6087         switch (dd_node_order)
6088         {
6089             case ddnoPP_PME:
6090                 if (fplog)
6091                 {
6092                     fprintf(fplog, "Order of the nodes: PP first, PME last\n");
6093                 }
6094                 break;
6095             case ddnoINTERLEAVE:
6096                 /* Interleave the PP-only and PME-only nodes,
6097                  * as on clusters with dual-core machines this will double
6098                  * the communication bandwidth of the PME processes
6099                  * and thus speed up the PP <-> PME and inter PME communication.
6100                  */
6101                 if (fplog)
6102                 {
6103                     fprintf(fplog, "Interleaving PP and PME nodes\n");
6104                 }
6105                 comm->pmenodes = dd_pmenodes(cr);
6106                 break;
6107             case ddnoCARTESIAN:
6108                 break;
6109             default:
6110                 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
6111         }
6112
6113         if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6114         {
6115             cr->duty = DUTY_PME;
6116         }
6117         else
6118         {
6119             cr->duty = DUTY_PP;
6120         }
6121
6122         /* Split the sim communicator into PP and PME only nodes */
6123         MPI_Comm_split(cr->mpi_comm_mysim,
6124                        cr->duty,
6125                        cr->nodeid,
6126                        &cr->mpi_comm_mygroup);
6127         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6128     }
6129 #endif
6130
6131     if (fplog)
6132     {
6133         fprintf(fplog, "This is a %s only node\n\n",
6134                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6135     }
6136 }
6137
6138 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6139 {
6140     gmx_domdec_t      *dd;
6141     gmx_domdec_comm_t *comm;
6142     int                CartReorder;
6143
6144     dd   = cr->dd;
6145     comm = dd->comm;
6146
6147     copy_ivec(dd->nc, comm->ntot);
6148
6149     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6150     comm->bCartesianPP_PME = FALSE;
6151
6152     /* Reorder the nodes by default. This might change the MPI ranks.
6153      * Real reordering is only supported on very few architectures,
6154      * Blue Gene is one of them.
6155      */
6156     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6157
6158     if (cr->npmenodes > 0)
6159     {
6160         /* Split the communicator into a PP and PME part */
6161         split_communicator(fplog, cr, dd_node_order, CartReorder);
6162         if (comm->bCartesianPP_PME)
6163         {
6164             /* We (possibly) reordered the nodes in split_communicator,
6165              * so it is no longer required in make_pp_communicator.
6166              */
6167             CartReorder = FALSE;
6168         }
6169     }
6170     else
6171     {
6172         /* All nodes do PP and PME */
6173 #ifdef GMX_MPI
6174         /* We do not require separate communicators */
6175         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6176 #endif
6177     }
6178
6179     if (cr->duty & DUTY_PP)
6180     {
6181         /* Copy or make a new PP communicator */
6182         make_pp_communicator(fplog, cr, CartReorder);
6183     }
6184     else
6185     {
6186         receive_ddindex2simnodeid(cr);
6187     }
6188
6189     if (!(cr->duty & DUTY_PME))
6190     {
6191         /* Set up the commnuication to our PME node */
6192         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6193         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6194         if (debug)
6195         {
6196             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6197                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6198         }
6199     }
6200     else
6201     {
6202         dd->pme_nodeid = -1;
6203     }
6204
6205     if (DDMASTER(dd))
6206     {
6207         dd->ma = init_gmx_domdec_master_t(dd,
6208                                           comm->cgs_gl.nr,
6209                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6210     }
6211 }
6212
6213 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6214 {
6215     real  *slb_frac, tot;
6216     int    i, n;
6217     double dbl;
6218
6219     slb_frac = NULL;
6220     if (nc > 1 && size_string != NULL)
6221     {
6222         if (fplog)
6223         {
6224             fprintf(fplog, "Using static load balancing for the %s direction\n",
6225                     dir);
6226         }
6227         snew(slb_frac, nc);
6228         tot = 0;
6229         for (i = 0; i < nc; i++)
6230         {
6231             dbl = 0;
6232             sscanf(size_string, "%lf%n", &dbl, &n);
6233             if (dbl == 0)
6234             {
6235                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6236             }
6237             slb_frac[i]  = dbl;
6238             size_string += n;
6239             tot         += slb_frac[i];
6240         }
6241         /* Normalize */
6242         if (fplog)
6243         {
6244             fprintf(fplog, "Relative cell sizes:");
6245         }
6246         for (i = 0; i < nc; i++)
6247         {
6248             slb_frac[i] /= tot;
6249             if (fplog)
6250             {
6251                 fprintf(fplog, " %5.3f", slb_frac[i]);
6252             }
6253         }
6254         if (fplog)
6255         {
6256             fprintf(fplog, "\n");
6257         }
6258     }
6259
6260     return slb_frac;
6261 }
6262
6263 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6264 {
6265     int                  n, nmol, ftype;
6266     gmx_mtop_ilistloop_t iloop;
6267     t_ilist             *il;
6268
6269     n     = 0;
6270     iloop = gmx_mtop_ilistloop_init(mtop);
6271     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6272     {
6273         for (ftype = 0; ftype < F_NRE; ftype++)
6274         {
6275             if ((interaction_function[ftype].flags & IF_BOND) &&
6276                 NRAL(ftype) >  2)
6277             {
6278                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6279             }
6280         }
6281     }
6282
6283     return n;
6284 }
6285
/* Returns an integer setting taken from the environment variable env_var.
 *
 * When the variable is not set, def is returned. When it is set, its
 * leading integer is returned, or 1 when no integer can be read from it.
 * A set variable is reported to fplog when fplog is not NULL.
 */
static int dd_nst_env(FILE *fplog, const char *env_var, int def)
{
    char *env_value;
    int   result;

    result    = def;
    env_value = getenv(env_var);
    if (env_value == NULL)
    {
        return result;
    }

    /* A variable that is set without a readable number counts as 1 */
    if (sscanf(env_value, "%d", &result) < 1)
    {
        result = 1;
    }
    if (fplog != NULL)
    {
        fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
                env_var, env_value, result);
    }

    return result;
}
6308
6309 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6310 {
6311     if (MASTER(cr))
6312     {
6313         fprintf(stderr, "\n%s\n", warn_string);
6314     }
6315     if (fplog)
6316     {
6317         fprintf(fplog, "\n%s\n", warn_string);
6318     }
6319 }
6320
6321 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6322                                   t_inputrec *ir, FILE *fplog)
6323 {
6324     if (ir->ePBC == epbcSCREW &&
6325         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6326     {
6327         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6328     }
6329
6330     if (ir->ns_type == ensSIMPLE)
6331     {
6332         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or use particle decomposition");
6333     }
6334
6335     if (ir->nstlist == 0)
6336     {
6337         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6338     }
6339
6340     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6341     {
6342         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6343     }
6344 }
6345
6346 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6347 {
6348     int  di, d;
6349     real r;
6350
6351     r = ddbox->box_size[XX];
6352     for (di = 0; di < dd->ndim; di++)
6353     {
6354         d = dd->dim[di];
6355         /* Check using the initial average cell size */
6356         r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6357     }
6358
6359     return r;
6360 }
6361
6362 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6363                              const char *dlb_opt, gmx_bool bRecordLoad,
6364                              unsigned long Flags, t_inputrec *ir)
6365 {
6366     gmx_domdec_t *dd;
6367     int           eDLB = -1;
6368     char          buf[STRLEN];
6369
6370     switch (dlb_opt[0])
6371     {
6372         case 'a': eDLB = edlbAUTO; break;
6373         case 'n': eDLB = edlbNO;   break;
6374         case 'y': eDLB = edlbYES;  break;
6375         default: gmx_incons("Unknown dlb_opt");
6376     }
6377
6378     if (Flags & MD_RERUN)
6379     {
6380         return edlbNO;
6381     }
6382
6383     if (!EI_DYNAMICS(ir->eI))
6384     {
6385         if (eDLB == edlbYES)
6386         {
6387             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6388             dd_warning(cr, fplog, buf);
6389         }
6390
6391         return edlbNO;
6392     }
6393
6394     if (!bRecordLoad)
6395     {
6396         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6397
6398         return edlbNO;
6399     }
6400
6401     if (Flags & MD_REPRODUCIBLE)
6402     {
6403         switch (eDLB)
6404         {
6405             case edlbNO:
6406                 break;
6407             case edlbAUTO:
6408                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6409                 eDLB = edlbNO;
6410                 break;
6411             case edlbYES:
6412                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6413                 break;
6414             default:
6415                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
6416                 break;
6417         }
6418     }
6419
6420     return eDLB;
6421 }
6422
6423 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6424 {
6425     int dim;
6426
6427     dd->ndim = 0;
6428     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6429     {
6430         /* Decomposition order z,y,x */
6431         if (fplog)
6432         {
6433             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6434         }
6435         for (dim = DIM-1; dim >= 0; dim--)
6436         {
6437             if (dd->nc[dim] > 1)
6438             {
6439                 dd->dim[dd->ndim++] = dim;
6440             }
6441         }
6442     }
6443     else
6444     {
6445         /* Decomposition order x,y,z */
6446         for (dim = 0; dim < DIM; dim++)
6447         {
6448             if (dd->nc[dim] > 1)
6449             {
6450                 dd->dim[dd->ndim++] = dim;
6451             }
6452         }
6453     }
6454 }
6455
6456 static gmx_domdec_comm_t *init_dd_comm()
6457 {
6458     gmx_domdec_comm_t *comm;
6459     int                i;
6460
6461     snew(comm, 1);
6462     snew(comm->cggl_flag, DIM*2);
6463     snew(comm->cgcm_state, DIM*2);
6464     for (i = 0; i < DIM*2; i++)
6465     {
6466         comm->cggl_flag_nalloc[i]  = 0;
6467         comm->cgcm_state_nalloc[i] = 0;
6468     }
6469
6470     comm->nalloc_int = 0;
6471     comm->buf_int    = NULL;
6472
6473     vec_rvec_init(&comm->vbuf);
6474
6475     comm->n_load_have    = 0;
6476     comm->n_load_collect = 0;
6477
6478     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6479     {
6480         comm->sum_nat[i] = 0;
6481     }
6482     comm->ndecomp   = 0;
6483     comm->nload     = 0;
6484     comm->load_step = 0;
6485     comm->load_sum  = 0;
6486     comm->load_max  = 0;
6487     clear_ivec(comm->load_lim);
6488     comm->load_mdf  = 0;
6489     comm->load_pme  = 0;
6490
6491     return comm;
6492 }
6493
6494 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6495                                         unsigned long Flags,
6496                                         ivec nc,
6497                                         real comm_distance_min, real rconstr,
6498                                         const char *dlb_opt, real dlb_scale,
6499                                         const char *sizex, const char *sizey, const char *sizez,
6500                                         gmx_mtop_t *mtop, t_inputrec *ir,
6501                                         matrix box, rvec *x,
6502                                         gmx_ddbox_t *ddbox,
6503                                         int *npme_x, int *npme_y)
6504 {
6505     gmx_domdec_t      *dd;
6506     gmx_domdec_comm_t *comm;
6507     int                recload;
6508     int                d, i, j;
6509     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6510     gmx_bool           bC;
6511     char               buf[STRLEN];
6512
6513     if (fplog)
6514     {
6515         fprintf(fplog,
6516                 "\nInitializing Domain Decomposition on %d nodes\n", cr->nnodes);
6517     }
6518
6519     snew(dd, 1);
6520
6521     dd->comm = init_dd_comm();
6522     comm     = dd->comm;
6523     snew(comm->cggl_flag, DIM*2);
6524     snew(comm->cgcm_state, DIM*2);
6525
6526     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6527     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6528
6529     dd->bSendRecv2      = dd_nst_env(fplog, "GMX_DD_SENDRECV2", 0);
6530     comm->dlb_scale_lim = dd_nst_env(fplog, "GMX_DLB_MAX", 10);
6531     comm->eFlop         = dd_nst_env(fplog, "GMX_DLB_FLOP", 0);
6532     recload             = dd_nst_env(fplog, "GMX_DD_LOAD", 1);
6533     comm->nstSortCG     = dd_nst_env(fplog, "GMX_DD_SORT", 1);
6534     comm->nstDDDump     = dd_nst_env(fplog, "GMX_DD_DUMP", 0);
6535     comm->nstDDDumpGrid = dd_nst_env(fplog, "GMX_DD_DUMP_GRID", 0);
6536     comm->DD_debug      = dd_nst_env(fplog, "GMX_DD_DEBUG", 0);
6537
6538     dd->pme_recv_f_alloc = 0;
6539     dd->pme_recv_f_buf   = NULL;
6540
6541     if (dd->bSendRecv2 && fplog)
6542     {
6543         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6544     }
6545     if (comm->eFlop)
6546     {
6547         if (fplog)
6548         {
6549             fprintf(fplog, "Will load balance based on FLOP count\n");
6550         }
6551         if (comm->eFlop > 1)
6552         {
6553             srand(1+cr->nodeid);
6554         }
6555         comm->bRecordLoad = TRUE;
6556     }
6557     else
6558     {
6559         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6560
6561     }
6562
6563     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6564
6565     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6566     if (fplog)
6567     {
6568         fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
6569     }
6570     dd->bGridJump              = comm->bDynLoadBal;
6571     comm->bPMELoadBalDLBLimits = FALSE;
6572
6573     if (comm->nstSortCG)
6574     {
6575         if (fplog)
6576         {
6577             if (comm->nstSortCG == 1)
6578             {
6579                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6580             }
6581             else
6582             {
6583                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6584                         comm->nstSortCG);
6585             }
6586         }
6587         snew(comm->sort, 1);
6588     }
6589     else
6590     {
6591         if (fplog)
6592         {
6593             fprintf(fplog, "Will not sort the charge groups\n");
6594         }
6595     }
6596
6597     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6598
6599     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6600     if (comm->bInterCGBondeds)
6601     {
6602         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6603     }
6604     else
6605     {
6606         comm->bInterCGMultiBody = FALSE;
6607     }
6608
6609     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6610     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6611
6612     if (ir->rlistlong == 0)
6613     {
6614         /* Set the cut-off to some very large value,
6615          * so we don't need if statements everywhere in the code.
6616          * We use sqrt, since the cut-off is squared in some places.
6617          */
6618         comm->cutoff   = GMX_CUTOFF_INF;
6619     }
6620     else
6621     {
6622         comm->cutoff   = ir->rlistlong;
6623     }
6624     comm->cutoff_mbody = 0;
6625
6626     comm->cellsize_limit = 0;
6627     comm->bBondComm      = FALSE;
6628
6629     if (comm->bInterCGBondeds)
6630     {
6631         if (comm_distance_min > 0)
6632         {
6633             comm->cutoff_mbody = comm_distance_min;
6634             if (Flags & MD_DDBONDCOMM)
6635             {
6636                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6637             }
6638             else
6639             {
6640                 comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
6641             }
6642             r_bonded_limit = comm->cutoff_mbody;
6643         }
6644         else if (ir->bPeriodicMols)
6645         {
6646             /* Can not easily determine the required cut-off */
6647             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6648             comm->cutoff_mbody = comm->cutoff/2;
6649             r_bonded_limit     = comm->cutoff_mbody;
6650         }
6651         else
6652         {
6653             if (MASTER(cr))
6654             {
6655                 dd_bonded_cg_distance(fplog, dd, mtop, ir, x, box,
6656                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6657             }
6658             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6659             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6660
6661             /* We use an initial margin of 10% for the minimum cell size,
6662              * except when we are just below the non-bonded cut-off.
6663              */
6664             if (Flags & MD_DDBONDCOMM)
6665             {
6666                 if (max(r_2b, r_mb) > comm->cutoff)
6667                 {
6668                     r_bonded        = max(r_2b, r_mb);
6669                     r_bonded_limit  = 1.1*r_bonded;
6670                     comm->bBondComm = TRUE;
6671                 }
6672                 else
6673                 {
6674                     r_bonded       = r_mb;
6675                     r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
6676                 }
6677                 /* We determine cutoff_mbody later */
6678             }
6679             else
6680             {
6681                 /* No special bonded communication,
6682                  * simply increase the DD cut-off.
6683                  */
6684                 r_bonded_limit     = 1.1*max(r_2b, r_mb);
6685                 comm->cutoff_mbody = r_bonded_limit;
6686                 comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
6687             }
6688         }
6689         comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
6690         if (fplog)
6691         {
6692             fprintf(fplog,
6693                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6694                     comm->cellsize_limit);
6695         }
6696     }
6697
6698     if (dd->bInterCGcons && rconstr <= 0)
6699     {
6700         /* There is a cell size limit due to the constraints (P-LINCS) */
6701         rconstr = constr_r_max(fplog, mtop, ir);
6702         if (fplog)
6703         {
6704             fprintf(fplog,
6705                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6706                     rconstr);
6707             if (rconstr > comm->cellsize_limit)
6708             {
6709                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6710             }
6711         }
6712     }
6713     else if (rconstr > 0 && fplog)
6714     {
6715         /* Here we do not check for dd->bInterCGcons,
6716          * because one can also set a cell size limit for virtual sites only
6717          * and at this point we don't know yet if there are intercg v-sites.
6718          */
6719         fprintf(fplog,
6720                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6721                 rconstr);
6722     }
6723     comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
6724
6725     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6726
6727     if (nc[XX] > 0)
6728     {
6729         copy_ivec(nc, dd->nc);
6730         set_dd_dim(fplog, dd);
6731         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6732
6733         if (cr->npmenodes == -1)
6734         {
6735             cr->npmenodes = 0;
6736         }
6737         acs = average_cellsize_min(dd, ddbox);
6738         if (acs < comm->cellsize_limit)
6739         {
6740             if (fplog)
6741             {
6742                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6743             }
6744             gmx_fatal_collective(FARGS, cr, NULL,
6745                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6746                                  acs, comm->cellsize_limit);
6747         }
6748     }
6749     else
6750     {
6751         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6752
6753         /* We need to choose the optimal DD grid and possibly PME nodes */
6754         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6755                                comm->eDLB != edlbNO, dlb_scale,
6756                                comm->cellsize_limit, comm->cutoff,
6757                                comm->bInterCGBondeds, comm->bInterCGMultiBody);
6758
6759         if (dd->nc[XX] == 0)
6760         {
6761             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6762             sprintf(buf, "Change the number of nodes or mdrun option %s%s%s",
6763                     !bC ? "-rdd" : "-rcon",
6764                     comm->eDLB != edlbNO ? " or -dds" : "",
6765                     bC ? " or your LINCS settings" : "");
6766
6767             gmx_fatal_collective(FARGS, cr, NULL,
6768                                  "There is no domain decomposition for %d nodes that is compatible with the given box and a minimum cell size of %g nm\n"
6769                                  "%s\n"
6770                                  "Look in the log file for details on the domain decomposition",
6771                                  cr->nnodes-cr->npmenodes, limit, buf);
6772         }
6773         set_dd_dim(fplog, dd);
6774     }
6775
6776     if (fplog)
6777     {
6778         fprintf(fplog,
6779                 "Domain decomposition grid %d x %d x %d, separate PME nodes %d\n",
6780                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6781     }
6782
6783     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6784     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6785     {
6786         gmx_fatal_collective(FARGS, cr, NULL,
6787                              "The size of the domain decomposition grid (%d) does not match the number of nodes (%d). The total number of nodes is %d",
6788                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6789     }
6790     if (cr->npmenodes > dd->nnodes)
6791     {
6792         gmx_fatal_collective(FARGS, cr, NULL,
6793                              "The number of separate PME nodes (%d) is larger than the number of PP nodes (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6794     }
6795     if (cr->npmenodes > 0)
6796     {
6797         comm->npmenodes = cr->npmenodes;
6798     }
6799     else
6800     {
6801         comm->npmenodes = dd->nnodes;
6802     }
6803
6804     if (EEL_PME(ir->coulombtype))
6805     {
6806         /* The following choices should match those
6807          * in comm_cost_est in domdec_setup.c.
6808          * Note that here the checks have to take into account
6809          * that the decomposition might occur in a different order than xyz
6810          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6811          * in which case they will not match those in comm_cost_est,
6812          * but since that is mainly for testing purposes that's fine.
6813          */
6814         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6815             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6816             getenv("GMX_PMEONEDD") == NULL)
6817         {
6818             comm->npmedecompdim = 2;
6819             comm->npmenodes_x   = dd->nc[XX];
6820             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6821         }
6822         else
6823         {
6824             /* In case nc is 1 in both x and y we could still choose to
6825              * decompose pme in y instead of x, but we use x for simplicity.
6826              */
6827             comm->npmedecompdim = 1;
6828             if (dd->dim[0] == YY)
6829             {
6830                 comm->npmenodes_x = 1;
6831                 comm->npmenodes_y = comm->npmenodes;
6832             }
6833             else
6834             {
6835                 comm->npmenodes_x = comm->npmenodes;
6836                 comm->npmenodes_y = 1;
6837             }
6838         }
6839         if (fplog)
6840         {
6841             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6842                     comm->npmenodes_x, comm->npmenodes_y, 1);
6843         }
6844     }
6845     else
6846     {
6847         comm->npmedecompdim = 0;
6848         comm->npmenodes_x   = 0;
6849         comm->npmenodes_y   = 0;
6850     }
6851
6852     /* Technically we don't need both of these,
6853      * but it simplifies code not having to recalculate it.
6854      */
6855     *npme_x = comm->npmenodes_x;
6856     *npme_y = comm->npmenodes_y;
6857
6858     snew(comm->slb_frac, DIM);
6859     if (comm->eDLB == edlbNO)
6860     {
6861         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6862         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6863         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6864     }
6865
6866     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6867     {
6868         if (comm->bBondComm || comm->eDLB != edlbNO)
6869         {
6870             /* Set the bonded communication distance to halfway
6871              * the minimum and the maximum,
6872              * since the extra communication cost is nearly zero.
6873              */
6874             acs                = average_cellsize_min(dd, ddbox);
6875             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6876             if (comm->eDLB != edlbNO)
6877             {
6878                 /* Check if this does not limit the scaling */
6879                 comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
6880             }
6881             if (!comm->bBondComm)
6882             {
6883                 /* Without bBondComm do not go beyond the n.b. cut-off */
6884                 comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
6885                 if (comm->cellsize_limit >= comm->cutoff)
6886                 {
6887                     /* We don't loose a lot of efficieny
6888                      * when increasing it to the n.b. cut-off.
6889                      * It can even be slightly faster, because we need
6890                      * less checks for the communication setup.
6891                      */
6892                     comm->cutoff_mbody = comm->cutoff;
6893                 }
6894             }
6895             /* Check if we did not end up below our original limit */
6896             comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
6897
6898             if (comm->cutoff_mbody > comm->cellsize_limit)
6899             {
6900                 comm->cellsize_limit = comm->cutoff_mbody;
6901             }
6902         }
6903         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
6904     }
6905
6906     if (debug)
6907     {
6908         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
6909                 "cellsize limit %f\n",
6910                 comm->bBondComm, comm->cellsize_limit);
6911     }
6912
6913     if (MASTER(cr))
6914     {
6915         check_dd_restrictions(cr, dd, ir, fplog);
6916     }
6917
6918     comm->partition_step = INT_MIN;
6919     dd->ddp_count        = 0;
6920
6921     clear_dd_cycle_counts(dd);
6922
6923     return dd;
6924 }
6925
6926 static void set_dlb_limits(gmx_domdec_t *dd)
6927
6928 {
6929     int d;
6930
6931     for (d = 0; d < dd->ndim; d++)
6932     {
6933         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
6934         dd->comm->cellsize_min[dd->dim[d]] =
6935             dd->comm->cellsize_min_dlb[dd->dim[d]];
6936     }
6937 }
6938
6939
6940 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_large_int_t step)
6941 {
6942     gmx_domdec_t      *dd;
6943     gmx_domdec_comm_t *comm;
6944     real               cellsize_min;
6945     int                d, nc, i;
6946     char               buf[STRLEN];
6947
6948     dd   = cr->dd;
6949     comm = dd->comm;
6950
6951     if (fplog)
6952     {
6953         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
6954     }
6955
6956     cellsize_min = comm->cellsize_min[dd->dim[0]];
6957     for (d = 1; d < dd->ndim; d++)
6958     {
6959         cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
6960     }
6961
6962     if (cellsize_min < comm->cellsize_limit*1.05)
6963     {
6964         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
6965
6966         /* Change DLB from "auto" to "no". */
6967         comm->eDLB = edlbNO;
6968
6969         return;
6970     }
6971
6972     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
6973     comm->bDynLoadBal = TRUE;
6974     dd->bGridJump     = TRUE;
6975
6976     set_dlb_limits(dd);
6977
6978     /* We can set the required cell size info here,
6979      * so we do not need to communicate this.
6980      * The grid is completely uniform.
6981      */
6982     for (d = 0; d < dd->ndim; d++)
6983     {
6984         if (comm->root[d])
6985         {
6986             comm->load[d].sum_m = comm->load[d].sum;
6987
6988             nc = dd->nc[dd->dim[d]];
6989             for (i = 0; i < nc; i++)
6990             {
6991                 comm->root[d]->cell_f[i]    = i/(real)nc;
6992                 if (d > 0)
6993                 {
6994                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
6995                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
6996                 }
6997             }
6998             comm->root[d]->cell_f[nc] = 1.0;
6999         }
7000     }
7001 }
7002
7003 static char *init_bLocalCG(gmx_mtop_t *mtop)
7004 {
7005     int   ncg, cg;
7006     char *bLocalCG;
7007
7008     ncg = ncg_mtop(mtop);
7009     snew(bLocalCG, ncg);
7010     for (cg = 0; cg < ncg; cg++)
7011     {
7012         bLocalCG[cg] = FALSE;
7013     }
7014
7015     return bLocalCG;
7016 }
7017
7018 void dd_init_bondeds(FILE *fplog,
7019                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
7020                      gmx_vsite_t *vsite, gmx_constr_t constr,
7021                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
7022 {
7023     gmx_domdec_comm_t *comm;
7024     gmx_bool           bBondComm;
7025     int                d;
7026
7027     dd_make_reverse_top(fplog, dd, mtop, vsite, constr, ir, bBCheck);
7028
7029     comm = dd->comm;
7030
7031     if (comm->bBondComm)
7032     {
7033         /* Communicate atoms beyond the cut-off for bonded interactions */
7034         comm = dd->comm;
7035
7036         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
7037
7038         comm->bLocalCG = init_bLocalCG(mtop);
7039     }
7040     else
7041     {
7042         /* Only communicate atoms based on cut-off */
7043         comm->cglink   = NULL;
7044         comm->bLocalCG = NULL;
7045     }
7046 }
7047
7048 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
7049                               t_inputrec *ir,
7050                               gmx_bool bDynLoadBal, real dlb_scale,
7051                               gmx_ddbox_t *ddbox)
7052 {
7053     gmx_domdec_comm_t *comm;
7054     int                d;
7055     ivec               np;
7056     real               limit, shrink;
7057     char               buf[64];
7058
7059     if (fplog == NULL)
7060     {
7061         return;
7062     }
7063
7064     comm = dd->comm;
7065
7066     if (bDynLoadBal)
7067     {
7068         fprintf(fplog, "The maximum number of communication pulses is:");
7069         for (d = 0; d < dd->ndim; d++)
7070         {
7071             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
7072         }
7073         fprintf(fplog, "\n");
7074         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
7075         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
7076         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
7077         for (d = 0; d < DIM; d++)
7078         {
7079             if (dd->nc[d] > 1)
7080             {
7081                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7082                 {
7083                     shrink = 0;
7084                 }
7085                 else
7086                 {
7087                     shrink =
7088                         comm->cellsize_min_dlb[d]/
7089                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7090                 }
7091                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
7092             }
7093         }
7094         fprintf(fplog, "\n");
7095     }
7096     else
7097     {
7098         set_dd_cell_sizes_slb(dd, ddbox, FALSE, np);
7099         fprintf(fplog, "The initial number of communication pulses is:");
7100         for (d = 0; d < dd->ndim; d++)
7101         {
7102             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
7103         }
7104         fprintf(fplog, "\n");
7105         fprintf(fplog, "The initial domain decomposition cell size is:");
7106         for (d = 0; d < DIM; d++)
7107         {
7108             if (dd->nc[d] > 1)
7109             {
7110                 fprintf(fplog, " %c %.2f nm",
7111                         dim2char(d), dd->comm->cellsize_min[d]);
7112             }
7113         }
7114         fprintf(fplog, "\n\n");
7115     }
7116
7117     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7118     {
7119         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7120         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7121                 "non-bonded interactions", "", comm->cutoff);
7122
7123         if (bDynLoadBal)
7124         {
7125             limit = dd->comm->cellsize_limit;
7126         }
7127         else
7128         {
7129             if (dynamic_dd_box(ddbox, ir))
7130             {
7131                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7132             }
7133             limit = dd->comm->cellsize_min[XX];
7134             for (d = 1; d < DIM; d++)
7135             {
7136                 limit = min(limit, dd->comm->cellsize_min[d]);
7137             }
7138         }
7139
7140         if (comm->bInterCGBondeds)
7141         {
7142             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7143                     "two-body bonded interactions", "(-rdd)",
7144                     max(comm->cutoff, comm->cutoff_mbody));
7145             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7146                     "multi-body bonded interactions", "(-rdd)",
7147                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
7148         }
7149         if (dd->vsite_comm)
7150         {
7151             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7152                     "virtual site constructions", "(-rcon)", limit);
7153         }
7154         if (dd->constraint_comm)
7155         {
7156             sprintf(buf, "atoms separated by up to %d constraints",
7157                     1+ir->nProjOrder);
7158             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7159                     buf, "(-rcon)", limit);
7160         }
7161         fprintf(fplog, "\n");
7162     }
7163
7164     fflush(fplog);
7165 }
7166
7167 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7168                                 real               dlb_scale,
7169                                 const t_inputrec  *ir,
7170                                 const gmx_ddbox_t *ddbox)
7171 {
7172     gmx_domdec_comm_t *comm;
7173     int                d, dim, npulse, npulse_d_max, npulse_d;
7174     gmx_bool           bNoCutOff;
7175
7176     comm = dd->comm;
7177
7178     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7179
7180     /* Determine the maximum number of comm. pulses in one dimension */
7181
7182     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7183
7184     /* Determine the maximum required number of grid pulses */
7185     if (comm->cellsize_limit >= comm->cutoff)
7186     {
7187         /* Only a single pulse is required */
7188         npulse = 1;
7189     }
7190     else if (!bNoCutOff && comm->cellsize_limit > 0)
7191     {
7192         /* We round down slightly here to avoid overhead due to the latency
7193          * of extra communication calls when the cut-off
7194          * would be only slightly longer than the cell size.
7195          * Later cellsize_limit is redetermined,
7196          * so we can not miss interactions due to this rounding.
7197          */
7198         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7199     }
7200     else
7201     {
7202         /* There is no cell size limit */
7203         npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7204     }
7205
7206     if (!bNoCutOff && npulse > 1)
7207     {
7208         /* See if we can do with less pulses, based on dlb_scale */
7209         npulse_d_max = 0;
7210         for (d = 0; d < dd->ndim; d++)
7211         {
7212             dim      = dd->dim[d];
7213             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7214                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7215             npulse_d_max = max(npulse_d_max, npulse_d);
7216         }
7217         npulse = min(npulse, npulse_d_max);
7218     }
7219
7220     /* This env var can override npulse */
7221     d = dd_nst_env(debug, "GMX_DD_NPULSE", 0);
7222     if (d > 0)
7223     {
7224         npulse = d;
7225     }
7226
7227     comm->maxpulse       = 1;
7228     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7229     for (d = 0; d < dd->ndim; d++)
7230     {
7231         comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
7232         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7233         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7234         comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
7235         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7236         {
7237             comm->bVacDLBNoLimit = FALSE;
7238         }
7239     }
7240
7241     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7242     if (!comm->bVacDLBNoLimit)
7243     {
7244         comm->cellsize_limit = max(comm->cellsize_limit,
7245                                    comm->cutoff/comm->maxpulse);
7246     }
7247     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7248     /* Set the minimum cell size for each DD dimension */
7249     for (d = 0; d < dd->ndim; d++)
7250     {
7251         if (comm->bVacDLBNoLimit ||
7252             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7253         {
7254             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7255         }
7256         else
7257         {
7258             comm->cellsize_min_dlb[dd->dim[d]] =
7259                 comm->cutoff/comm->cd[d].np_dlb;
7260         }
7261     }
7262     if (comm->cutoff_mbody <= 0)
7263     {
7264         comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
7265     }
7266     if (comm->bDynLoadBal)
7267     {
7268         set_dlb_limits(dd);
7269     }
7270 }
7271
7272 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7273 {
7274     /* If each molecule is a single charge group
7275      * or we use domain decomposition for each periodic dimension,
7276      * we do not need to take pbc into account for the bonded interactions.
7277      */
7278     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7279             !(dd->nc[XX] > 1 &&
7280               dd->nc[YY] > 1 &&
7281               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7282 }
7283
7284 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7285                        t_inputrec *ir, t_forcerec *fr,
7286                        gmx_ddbox_t *ddbox)
7287 {
7288     gmx_domdec_comm_t *comm;
7289     int                natoms_tot;
7290     real               vol_frac;
7291
7292     comm = dd->comm;
7293
7294     /* Initialize the thread data.
7295      * This can not be done in init_domain_decomposition,
7296      * as the numbers of threads is determined later.
7297      */
7298     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7299     if (comm->nth > 1)
7300     {
7301         snew(comm->dth, comm->nth);
7302     }
7303
7304     if (EEL_PME(ir->coulombtype))
7305     {
7306         init_ddpme(dd, &comm->ddpme[0], 0);
7307         if (comm->npmedecompdim >= 2)
7308         {
7309             init_ddpme(dd, &comm->ddpme[1], 1);
7310         }
7311     }
7312     else
7313     {
7314         comm->npmenodes = 0;
7315         if (dd->pme_nodeid >= 0)
7316         {
7317             gmx_fatal_collective(FARGS, NULL, dd,
7318                                  "Can not have separate PME nodes without PME electrostatics");
7319         }
7320     }
7321
7322     if (debug)
7323     {
7324         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7325     }
7326     if (comm->eDLB != edlbNO)
7327     {
7328         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7329     }
7330
7331     print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
7332     if (comm->eDLB == edlbAUTO)
7333     {
7334         if (fplog)
7335         {
7336             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7337         }
7338         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7339     }
7340
7341     if (ir->ePBC == epbcNONE)
7342     {
7343         vol_frac = 1 - 1/(double)dd->nnodes;
7344     }
7345     else
7346     {
7347         vol_frac =
7348             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7349     }
7350     if (debug)
7351     {
7352         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7353     }
7354     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7355
7356     dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
7357 }
7358
7359 static gmx_bool test_dd_cutoff(t_commrec *cr,
7360                                t_state *state, t_inputrec *ir,
7361                                real cutoff_req)
7362 {
7363     gmx_domdec_t *dd;
7364     gmx_ddbox_t   ddbox;
7365     int           d, dim, np;
7366     real          inv_cell_size;
7367     int           LocallyLimited;
7368
7369     dd = cr->dd;
7370
7371     set_ddbox(dd, FALSE, cr, ir, state->box,
7372               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7373
7374     LocallyLimited = 0;
7375
7376     for (d = 0; d < dd->ndim; d++)
7377     {
7378         dim = dd->dim[d];
7379
7380         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7381         if (dynamic_dd_box(&ddbox, ir))
7382         {
7383             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7384         }
7385
7386         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7387
7388         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7389             dd->comm->cd[d].np_dlb > 0)
7390         {
7391             if (np > dd->comm->cd[d].np_dlb)
7392             {
7393                 return FALSE;
7394             }
7395
7396             /* If a current local cell size is smaller than the requested
7397              * cut-off, we could still fix it, but this gets very complicated.
7398              * Without fixing here, we might actually need more checks.
7399              */
7400             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7401             {
7402                 LocallyLimited = 1;
7403             }
7404         }
7405     }
7406
7407     if (dd->comm->eDLB != edlbNO)
7408     {
7409         /* If DLB is not active yet, we don't need to check the grid jumps.
7410          * Actually we shouldn't, because then the grid jump data is not set.
7411          */
7412         if (dd->comm->bDynLoadBal &&
7413             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7414         {
7415             LocallyLimited = 1;
7416         }
7417
7418         gmx_sumi(1, &LocallyLimited, cr);
7419
7420         if (LocallyLimited > 0)
7421         {
7422             return FALSE;
7423         }
7424     }
7425
7426     return TRUE;
7427 }
7428
7429 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7430                           real cutoff_req)
7431 {
7432     gmx_bool bCutoffAllowed;
7433
7434     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7435
7436     if (bCutoffAllowed)
7437     {
7438         cr->dd->comm->cutoff = cutoff_req;
7439     }
7440
7441     return bCutoffAllowed;
7442 }
7443
7444 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7445 {
7446     gmx_domdec_comm_t *comm;
7447
7448     comm = cr->dd->comm;
7449
7450     /* Turn on the DLB limiting (might have been on already) */
7451     comm->bPMELoadBalDLBLimits = TRUE;
7452
7453     /* Change the cut-off limit */
7454     comm->PMELoadBal_max_cutoff = comm->cutoff;
7455 }
7456
7457 static void merge_cg_buffers(int ncell,
7458                              gmx_domdec_comm_dim_t *cd, int pulse,
7459                              int  *ncg_cell,
7460                              int  *index_gl, int  *recv_i,
7461                              rvec *cg_cm,    rvec *recv_vr,
7462                              int *cgindex,
7463                              cginfo_mb_t *cginfo_mb, int *cginfo)
7464 {
7465     gmx_domdec_ind_t *ind, *ind_p;
7466     int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
7467     int               shift, shift_at;
7468
7469     ind = &cd->ind[pulse];
7470
7471     /* First correct the already stored data */
7472     shift = ind->nrecv[ncell];
7473     for (cell = ncell-1; cell >= 0; cell--)
7474     {
7475         shift -= ind->nrecv[cell];
7476         if (shift > 0)
7477         {
7478             /* Move the cg's present from previous grid pulses */
7479             cg0                = ncg_cell[ncell+cell];
7480             cg1                = ncg_cell[ncell+cell+1];
7481             cgindex[cg1+shift] = cgindex[cg1];
7482             for (cg = cg1-1; cg >= cg0; cg--)
7483             {
7484                 index_gl[cg+shift] = index_gl[cg];
7485                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7486                 cgindex[cg+shift] = cgindex[cg];
7487                 cginfo[cg+shift]  = cginfo[cg];
7488             }
7489             /* Correct the already stored send indices for the shift */
7490             for (p = 1; p <= pulse; p++)
7491             {
7492                 ind_p = &cd->ind[p];
7493                 cg0   = 0;
7494                 for (c = 0; c < cell; c++)
7495                 {
7496                     cg0 += ind_p->nsend[c];
7497                 }
7498                 cg1 = cg0 + ind_p->nsend[cell];
7499                 for (cg = cg0; cg < cg1; cg++)
7500                 {
7501                     ind_p->index[cg] += shift;
7502                 }
7503             }
7504         }
7505     }
7506
7507     /* Merge in the communicated buffers */
7508     shift    = 0;
7509     shift_at = 0;
7510     cg0      = 0;
7511     for (cell = 0; cell < ncell; cell++)
7512     {
7513         cg1 = ncg_cell[ncell+cell+1] + shift;
7514         if (shift_at > 0)
7515         {
7516             /* Correct the old cg indices */
7517             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7518             {
7519                 cgindex[cg+1] += shift_at;
7520             }
7521         }
7522         for (cg = 0; cg < ind->nrecv[cell]; cg++)
7523         {
7524             /* Copy this charge group from the buffer */
7525             index_gl[cg1] = recv_i[cg0];
7526             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7527             /* Add it to the cgindex */
7528             cg_gl          = index_gl[cg1];
7529             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
7530             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
7531             cgindex[cg1+1] = cgindex[cg1] + nat;
7532             cg0++;
7533             cg1++;
7534             shift_at += nat;
7535         }
7536         shift                 += ind->nrecv[cell];
7537         ncg_cell[ncell+cell+1] = cg1;
7538     }
7539 }
7540
7541 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7542                                int nzone, int cg0, const int *cgindex)
7543 {
7544     int cg, zone, p;
7545
7546     /* Store the atom block boundaries for easy copying of communication buffers
7547      */
7548     cg = cg0;
7549     for (zone = 0; zone < nzone; zone++)
7550     {
7551         for (p = 0; p < cd->np; p++)
7552         {
7553             cd->ind[p].cell2at0[zone] = cgindex[cg];
7554             cg += cd->ind[p].nrecv[zone];
7555             cd->ind[p].cell2at1[zone] = cgindex[cg];
7556         }
7557     }
7558 }
7559
7560 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7561 {
7562     int      i;
7563     gmx_bool bMiss;
7564
7565     bMiss = FALSE;
7566     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7567     {
7568         if (!bLocalCG[link->a[i]])
7569         {
7570             bMiss = TRUE;
7571         }
7572     }
7573
7574     return bMiss;
7575 }
7576
7577 /* Domain corners for communication, a maximum of 4 i-zones see a j domain.
      *
      * The fields are filled by set_dd_corners() and read by
      * get_zone_pulse_cgs(). c and bc are indexed with the DD dimension
      * index; the second index of c and the index of cr1 is a zone index.
      * bc and bcr1 are only used for the multi-body bonded distance.
      */
7578 typedef struct {
7579     real c[DIM][4]; /* the corners for the non-bonded communication */
7580     real cr0;       /* corner for rounding, taken from cell_x1 of the first DD dimension */
7581     real cr1[4];    /* corners for rounding, taken from cell_x1 of the second DD dimension */
7582     real bc[DIM];   /* corners for bonded communication */
7583     real bcr1;      /* corner for rounding for bonded communication */
7584 } dd_corners_t;
7585
7586 /* Determine the corners of the domain(s) we are communicating with */
7587 static void
7588 set_dd_corners(const gmx_domdec_t *dd,
7589                int dim0, int dim1, int dim2,
7590                gmx_bool bDistMB,
7591                dd_corners_t *c)
7592 {
7593     const gmx_domdec_comm_t  *comm;
7594     const gmx_domdec_zones_t *zones;
7595     int i, j;
7596
7597     comm = dd->comm;
7598
7599     zones = &comm->zones;
7600
7601     /* Keep the compiler happy */
7602     c->cr0  = 0;
7603     c->bcr1 = 0;
7604
7605     /* The first dimension is equal for all cells */
7606     c->c[0][0] = comm->cell_x0[dim0];
7607     if (bDistMB)
7608     {
7609         c->bc[0] = c->c[0][0];
7610     }
7611     if (dd->ndim >= 2)
7612     {
7613         dim1 = dd->dim[1];
7614         /* This cell row is only seen from the first row */
7615         c->c[1][0] = comm->cell_x0[dim1];
7616         /* All rows can see this row */
7617         c->c[1][1] = comm->cell_x0[dim1];
7618         if (dd->bGridJump)
7619         {
7620             c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7621             if (bDistMB)
7622             {
7623                 /* For the multi-body distance we need the maximum */
7624                 c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7625             }
7626         }
7627         /* Set the upper-right corner for rounding */
7628         c->cr0 = comm->cell_x1[dim0];
7629
7630         if (dd->ndim >= 3)
7631         {
7632             dim2 = dd->dim[2];
7633             for (j = 0; j < 4; j++)
7634             {
7635                 c->c[2][j] = comm->cell_x0[dim2];
7636             }
7637             if (dd->bGridJump)
7638             {
7639                 /* Use the maximum of the i-cells that see a j-cell */
7640                 for (i = 0; i < zones->nizone; i++)
7641                 {
7642                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7643                     {
7644                         if (j >= 4)
7645                         {
7646                             c->c[2][j-4] =
7647                                 max(c->c[2][j-4],
7648                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7649                         }
7650                     }
7651                 }
7652                 if (bDistMB)
7653                 {
7654                     /* For the multi-body distance we need the maximum */
7655                     c->bc[2] = comm->cell_x0[dim2];
7656                     for (i = 0; i < 2; i++)
7657                     {
7658                         for (j = 0; j < 2; j++)
7659                         {
7660                             c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
7661                         }
7662                     }
7663                 }
7664             }
7665
7666             /* Set the upper-right corner for rounding */
7667             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7668              * Only cell (0,0,0) can see cell 7 (1,1,1)
7669              */
7670             c->cr1[0] = comm->cell_x1[dim1];
7671             c->cr1[3] = comm->cell_x1[dim1];
7672             if (dd->bGridJump)
7673             {
7674                 c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7675                 if (bDistMB)
7676                 {
7677                     /* For the multi-body distance we need the maximum */
7678                     c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7679                 }
7680             }
7681         }
7682     }
7683 }
7684
7685 /* Determine which cg's we need to send in this pulse from this zone.
      *
      * Loops over the charge groups cg0 <= cg < cg1 and computes, from cg_cm
      * and the corners in c, the squared distance r2 used for the non-bonded
      * check and, when both bDistMB and bDistBonded are set, the squared
      * distance rb2 used for the multi-body bonded check. The rectangular or
      * the triclinic distance code is used depending on tric_dist[dim_ind].
      *
      * A charge group is selected when r2 < r_comm2, or when bDistBonded is
      * set and the bonded distance (rb2 with bDistMB, r2 with bDist2B) is
      * below r_bcomm2; with bBondComm this second criterion additionally
      * requires GET_CGINFO_BOND_INTER() and missing_link() to hold.
      *
      * For every selected charge group the local index is stored in
      * ind->index, the global index in *ibuf and the position in vbuf;
      * these buffers are grown when needed. When dd->ci[dim] == 0 the stored
      * position is shifted by box[dim] and, with screw pbc along x,
      * mirrored in y and z.
      *
      * On entry *nsend_ptr and *nat_ptr hold the running totals of charge
      * groups and atoms; both are updated on return. *nsend_z_ptr returns
      * the number of charge groups selected in this call only.
      */
7686 static void
7687 get_zone_pulse_cgs(gmx_domdec_t *dd,
7688                    int zonei, int zone,
7689                    int cg0, int cg1,
7690                    const int *index_gl,
7691                    const int *cgindex,
7692                    int dim, int dim_ind,
7693                    int dim0, int dim1, int dim2,
7694                    real r_comm2, real r_bcomm2,
7695                    matrix box,
7696                    ivec tric_dist,
7697                    rvec *normal,
7698                    real skew_fac2_d, real skew_fac_01,
7699                    rvec *v_d, rvec *v_0, rvec *v_1,
7700                    const dd_corners_t *c,
7701                    rvec sf2_round,
7702                    gmx_bool bDistBonded,
7703                    gmx_bool bBondComm,
7704                    gmx_bool bDist2B,
7705                    gmx_bool bDistMB,
7706                    rvec *cg_cm,
7707                    int *cginfo,
7708                    gmx_domdec_ind_t *ind,
7709                    int **ibuf, int *ibuf_nalloc,
7710                    vec_rvec_t *vbuf,
7711                    int *nsend_ptr,
7712                    int *nat_ptr,
7713                    int *nsend_z_ptr)
7714 {
7715     gmx_domdec_comm_t *comm;
7716     gmx_bool           bScrew;
7717     gmx_bool           bDistMB_pulse;
7718     int                cg, i;
7719     real               r2, rb2, r, tric_sh;
7720     rvec               rn, rb;
7721     int                dimd;
7722     int                nsend_z, nsend, nat;
7723
7724     comm = dd->comm;
7725
7726     bScrew = (dd->bScrewPBC && dim == XX);
7727
         /* The multi-body distance rb2 is only computed when both flags are set */
7728     bDistMB_pulse = (bDistMB && bDistBonded);
7729
         /* nsend and nat continue from the totals passed in by the caller,
          * nsend_z counts the charge groups selected in this call.
          */
7730     nsend_z = 0;
7731     nsend   = *nsend_ptr;
7732     nat     = *nat_ptr;
7733
7734     for (cg = cg0; cg < cg1; cg++)
7735     {
7736         r2  = 0;
7737         rb2 = 0;
7738         if (tric_dist[dim_ind] == 0)
7739         {
7740             /* Rectangular direction, easy */
7741             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7742             if (r > 0)
7743             {
7744                 r2 += r*r;
7745             }
7746             if (bDistMB_pulse)
7747             {
7748                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7749                 if (r > 0)
7750                 {
7751                     rb2 += r*r;
7752                 }
7753             }
7754             /* Rounding gives at most a 16% reduction
7755              * in communicated atoms
7756              */
7757             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7758             {
7759                 r = cg_cm[cg][dim0] - c->cr0;
7760                 /* This is the first dimension, so always r >= 0 */
7761                 r2 += r*r;
7762                 if (bDistMB_pulse)
7763                 {
7764                     rb2 += r*r;
7765                 }
7766             }
7767             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7768             {
7769                 r = cg_cm[cg][dim1] - c->cr1[zone];
7770                 if (r > 0)
7771                 {
7772                     r2 += r*r;
7773                 }
7774                 if (bDistMB_pulse)
7775                 {
7776                     r = cg_cm[cg][dim1] - c->bcr1;
7777                     if (r > 0)
7778                     {
7779                         rb2 += r*r;
7780                     }
7781                 }
7782             }
7783         }
7784         else
7785         {
7786             /* Triclinic direction, more complicated */
                 /* rn and rb collect the per-dimension distance components
                  * for the non-bonded and the multi-body bonded distance.
                  */
7787             clear_rvec(rn);
7788             clear_rvec(rb);
7789             /* Rounding, conservative as the skew_fac multiplication
7790              * will slightly underestimate the distance.
7791              */
7792             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7793             {
7794                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7795                 for (i = dim0+1; i < DIM; i++)
7796                 {
7797                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7798                 }
7799                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7800                 if (bDistMB_pulse)
7801                 {
7802                     rb[dim0] = rn[dim0];
7803                     rb2      = r2;
7804                 }
7805                 /* Take care that the cell planes along dim0 might not
7806                  * be orthogonal to those along dim1 and dim2.
7807                  */
7808                 for (i = 1; i <= dim_ind; i++)
7809                 {
7810                     dimd = dd->dim[i];
7811                     if (normal[dim0][dimd] > 0)
7812                     {
7813                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7814                         if (bDistMB_pulse)
7815                         {
7816                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7817                         }
7818                     }
7819                 }
7820             }
7821             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7822             {
7823                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7824                 tric_sh   = 0;
7825                 for (i = dim1+1; i < DIM; i++)
7826                 {
7827                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7828                 }
7829                 rn[dim1] += tric_sh;
7830                 if (rn[dim1] > 0)
7831                 {
7832                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7833                     /* Take care of coupling of the distances
7834                      * to the planes along dim0 and dim1 through dim2.
7835                      */
7836                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7837                     /* Take care that the cell planes along dim1
7838                      * might not be orthogonal to that along dim2.
7839                      */
7840                     if (normal[dim1][dim2] > 0)
7841                     {
7842                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7843                     }
7844                 }
7845                 if (bDistMB_pulse)
7846                 {
7847                     rb[dim1] +=
7848                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7849                     if (rb[dim1] > 0)
7850                     {
7851                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7852                         /* Take care of coupling of the distances
7853                          * to the planes along dim0 and dim1 through dim2.
7854                          */
7855                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7856                         /* Take care that the cell planes along dim1
7857                          * might not be orthogonal to that along dim2.
7858                          */
7859                         if (normal[dim1][dim2] > 0)
7860                         {
7861                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7862                         }
7863                     }
7864                 }
7865             }
7866             /* The distance along the communication direction */
7867             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7868             tric_sh  = 0;
7869             for (i = dim+1; i < DIM; i++)
7870             {
7871                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7872             }
7873             rn[dim] += tric_sh;
7874             if (rn[dim] > 0)
7875             {
7876                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7877                 /* Take care of coupling of the distances
7878                  * to the planes along dim0 and dim1 through dim2.
7879                  */
7880                 if (dim_ind == 1 && zonei == 1)
7881                 {
7882                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
7883                 }
7884             }
7885             if (bDistMB_pulse)
7886             {
                     /* NOTE(review): this clears the rb components set in the
                      * rounding code above (rb2 itself is kept), so rb[dim0]
                      * in the coupling term below is zero whenever
                      * dim != dim0. The non-bonded code does not reset rn
                      * at this point. Check whether this is intended.
                      */
7887                 clear_rvec(rb);
7888                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
7889                 if (rb[dim] > 0)
7890                 {
7891                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
7892                     /* Take care of coupling of the distances
7893                      * to the planes along dim0 and dim1 through dim2.
7894                      */
7895                     if (dim_ind == 1 && zonei == 1)
7896                     {
7897                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
7898                     }
7899                 }
7900             }
7901         }
7902
             /* Select the cg when it is within the non-bonded distance, or
              * when bonded distances are checked and it is within the bonded
              * distance (and, with bBondComm, it has a missing link).
              */
7903         if (r2 < r_comm2 ||
7904             (bDistBonded &&
7905              ((bDistMB && rb2 < r_bcomm2) ||
7906               (bDist2B && r2  < r_bcomm2)) &&
7907              (!bBondComm ||
7908               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
7909                missing_link(comm->cglink, index_gl[cg],
7910                             comm->bLocalCG)))))
7911         {
7912             /* Make an index to the local charge groups */
7913             if (nsend+1 > ind->nalloc)
7914             {
7915                 ind->nalloc = over_alloc_large(nsend+1);
7916                 srenew(ind->index, ind->nalloc);
7917             }
7918             if (nsend+1 > *ibuf_nalloc)
7919             {
7920                 *ibuf_nalloc = over_alloc_large(nsend+1);
7921                 srenew(*ibuf, *ibuf_nalloc);
7922             }
                 /* Store the local index for later pulses/steps and
                  * the global index in the integer send buffer.
                  */
7923             ind->index[nsend] = cg;
7924             (*ibuf)[nsend]    = index_gl[cg];
7925             nsend_z++;
7926             vec_rvec_check_alloc(vbuf, nsend+1);
7927
7928             if (dd->ci[dim] == 0)
7929             {
7930                 /* Correct cg_cm for pbc */
7931                 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
7932                 if (bScrew)
7933                 {
7934                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
7935                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
7936                 }
7937             }
7938             else
7939             {
7940                 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
7941             }
7942             nsend++;
                 /* Add the number of atoms in this charge group */
7943             nat += cgindex[cg+1] - cgindex[cg];
7944         }
7945     }
7946
         /* Return the updated totals and the count for this zone */
7947     *nsend_ptr   = nsend;
7948     *nat_ptr     = nat;
7949     *nsend_z_ptr = nsend_z;
7950 }
7951
7952 static void setup_dd_communication(gmx_domdec_t *dd,
7953                                    matrix box, gmx_ddbox_t *ddbox,
7954                                    t_forcerec *fr, t_state *state, rvec **f)
7955 {
7956     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
7957     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
7958     int                    c, i, j, cg, cg_gl, nrcg;
7959     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
7960     gmx_domdec_comm_t     *comm;
7961     gmx_domdec_zones_t    *zones;
7962     gmx_domdec_comm_dim_t *cd;
7963     gmx_domdec_ind_t      *ind;
7964     cginfo_mb_t           *cginfo_mb;
7965     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
7966     real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
7967     dd_corners_t           corners;
7968     ivec                   tric_dist;
7969     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
7970     real                   skew_fac2_d, skew_fac_01;
7971     rvec                   sf2_round;
7972     int                    nsend, nat;
7973     int                    th;
7974
7975     if (debug)
7976     {
7977         fprintf(debug, "Setting up DD communication\n");
7978     }
7979
7980     comm  = dd->comm;
7981
7982     switch (fr->cutoff_scheme)
7983     {
7984         case ecutsGROUP:
7985             cg_cm = fr->cg_cm;
7986             break;
7987         case ecutsVERLET:
7988             cg_cm = state->x;
7989             break;
7990         default:
7991             gmx_incons("unimplemented");
7992             cg_cm = NULL;
7993     }
7994
7995     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
7996     {
7997         dim = dd->dim[dim_ind];
7998
7999         /* Check if we need to use triclinic distances */
8000         tric_dist[dim_ind] = 0;
8001         for (i = 0; i <= dim_ind; i++)
8002         {
8003             if (ddbox->tric_dir[dd->dim[i]])
8004             {
8005                 tric_dist[dim_ind] = 1;
8006             }
8007         }
8008     }
8009
8010     bBondComm = comm->bBondComm;
8011
8012     /* Do we need to determine extra distances for multi-body bondeds? */
8013     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8014
8015     /* Do we need to determine extra distances for only two-body bondeds? */
8016     bDist2B = (bBondComm && !bDistMB);
8017
8018     r_comm2  = sqr(comm->cutoff);
8019     r_bcomm2 = sqr(comm->cutoff_mbody);
8020
8021     if (debug)
8022     {
8023         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
8024     }
8025
8026     zones = &comm->zones;
8027
8028     dim0 = dd->dim[0];
8029     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8030     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8031
8032     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8033
8034     /* Triclinic stuff */
8035     normal      = ddbox->normal;
8036     skew_fac_01 = 0;
8037     if (dd->ndim >= 2)
8038     {
8039         v_0 = ddbox->v[dim0];
8040         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8041         {
8042             /* Determine the coupling coefficient for the distances
8043              * to the cell planes along dim0 and dim1 through dim2.
8044              * This is required for correct rounding.
8045              */
8046             skew_fac_01 =
8047                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8048             if (debug)
8049             {
8050                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8051             }
8052         }
8053     }
8054     if (dd->ndim >= 3)
8055     {
8056         v_1 = ddbox->v[dim1];
8057     }
8058
8059     zone_cg_range = zones->cg_range;
8060     index_gl      = dd->index_gl;
8061     cgindex       = dd->cgindex;
8062     cginfo_mb     = fr->cginfo_mb;
8063
8064     zone_cg_range[0]   = 0;
8065     zone_cg_range[1]   = dd->ncg_home;
8066     comm->zone_ncg1[0] = dd->ncg_home;
8067     pos_cg             = dd->ncg_home;
8068
8069     nat_tot = dd->nat_home;
8070     nzone   = 1;
8071     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8072     {
8073         dim = dd->dim[dim_ind];
8074         cd  = &comm->cd[dim_ind];
8075
8076         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8077         {
8078             /* No pbc in this dimension, the first node should not comm. */
8079             nzone_send = 0;
8080         }
8081         else
8082         {
8083             nzone_send = nzone;
8084         }
8085
8086         v_d         = ddbox->v[dim];
8087         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8088
8089         cd->bInPlace = TRUE;
8090         for (p = 0; p < cd->np; p++)
8091         {
8092             /* Only atoms communicated in the first pulse are used
8093              * for multi-body bonded interactions or for bBondComm.
8094              */
8095             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8096
8097             ind   = &cd->ind[p];
8098             nsend = 0;
8099             nat   = 0;
8100             for (zone = 0; zone < nzone_send; zone++)
8101             {
8102                 if (tric_dist[dim_ind] && dim_ind > 0)
8103                 {
8104                     /* Determine slightly more optimized skew_fac's
8105                      * for rounding.
8106                      * This reduces the number of communicated atoms
8107                      * by about 10% for 3D DD of rhombic dodecahedra.
8108                      */
8109                     for (dimd = 0; dimd < dim; dimd++)
8110                     {
8111                         sf2_round[dimd] = 1;
8112                         if (ddbox->tric_dir[dimd])
8113                         {
8114                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8115                             {
8116                                 /* If we are shifted in dimension i
8117                                  * and the cell plane is tilted forward
8118                                  * in dimension i, skip this coupling.
8119                                  */
8120                                 if (!(zones->shift[nzone+zone][i] &&
8121                                       ddbox->v[dimd][i][dimd] >= 0))
8122                                 {
8123                                     sf2_round[dimd] +=
8124                                         sqr(ddbox->v[dimd][i][dimd]);
8125                                 }
8126                             }
8127                             sf2_round[dimd] = 1/sf2_round[dimd];
8128                         }
8129                     }
8130                 }
8131
8132                 zonei = zone_perm[dim_ind][zone];
8133                 if (p == 0)
8134                 {
8135                     /* Here we permutate the zones to obtain a convenient order
8136                      * for neighbor searching
8137                      */
8138                     cg0 = zone_cg_range[zonei];
8139                     cg1 = zone_cg_range[zonei+1];
8140                 }
8141                 else
8142                 {
8143                     /* Look only at the cg's received in the previous grid pulse
8144                      */
8145                     cg1 = zone_cg_range[nzone+zone+1];
8146                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8147                 }
8148
8149 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8150                 for (th = 0; th < comm->nth; th++)
8151                 {
8152                     gmx_domdec_ind_t *ind_p;
8153                     int             **ibuf_p, *ibuf_nalloc_p;
8154                     vec_rvec_t       *vbuf_p;
8155                     int              *nsend_p, *nat_p;
8156                     int              *nsend_zone_p;
8157                     int               cg0_th, cg1_th;
8158
8159                     if (th == 0)
8160                     {
8161                         /* Thread 0 writes in the comm buffers */
8162                         ind_p         = ind;
8163                         ibuf_p        = &comm->buf_int;
8164                         ibuf_nalloc_p = &comm->nalloc_int;
8165                         vbuf_p        = &comm->vbuf;
8166                         nsend_p       = &nsend;
8167                         nat_p         = &nat;
8168                         nsend_zone_p  = &ind->nsend[zone];
8169                     }
8170                     else
8171                     {
8172                         /* Other threads write into temp buffers */
8173                         ind_p         = &comm->dth[th].ind;
8174                         ibuf_p        = &comm->dth[th].ibuf;
8175                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8176                         vbuf_p        = &comm->dth[th].vbuf;
8177                         nsend_p       = &comm->dth[th].nsend;
8178                         nat_p         = &comm->dth[th].nat;
8179                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8180
8181                         comm->dth[th].nsend      = 0;
8182                         comm->dth[th].nat        = 0;
8183                         comm->dth[th].nsend_zone = 0;
8184                     }
8185
8186                     if (comm->nth == 1)
8187                     {
8188                         cg0_th = cg0;
8189                         cg1_th = cg1;
8190                     }
8191                     else
8192                     {
8193                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8194                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8195                     }
8196
8197                     /* Get the cg's for this pulse in this zone */
8198                     get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8199                                        index_gl, cgindex,
8200                                        dim, dim_ind, dim0, dim1, dim2,
8201                                        r_comm2, r_bcomm2,
8202                                        box, tric_dist,
8203                                        normal, skew_fac2_d, skew_fac_01,
8204                                        v_d, v_0, v_1, &corners, sf2_round,
8205                                        bDistBonded, bBondComm,
8206                                        bDist2B, bDistMB,
8207                                        cg_cm, fr->cginfo,
8208                                        ind_p,
8209                                        ibuf_p, ibuf_nalloc_p,
8210                                        vbuf_p,
8211                                        nsend_p, nat_p,
8212                                        nsend_zone_p);
8213                 }
8214
8215                 /* Append data of threads>=1 to the communication buffers */
8216                 for (th = 1; th < comm->nth; th++)
8217                 {
8218                     dd_comm_setup_work_t *dth;
8219                     int                   i, ns1;
8220
8221                     dth = &comm->dth[th];
8222
8223                     ns1 = nsend + dth->nsend_zone;
8224                     if (ns1 > ind->nalloc)
8225                     {
8226                         ind->nalloc = over_alloc_dd(ns1);
8227                         srenew(ind->index, ind->nalloc);
8228                     }
8229                     if (ns1 > comm->nalloc_int)
8230                     {
8231                         comm->nalloc_int = over_alloc_dd(ns1);
8232                         srenew(comm->buf_int, comm->nalloc_int);
8233                     }
8234                     if (ns1 > comm->vbuf.nalloc)
8235                     {
8236                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8237                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8238                     }
8239
8240                     for (i = 0; i < dth->nsend_zone; i++)
8241                     {
8242                         ind->index[nsend]    = dth->ind.index[i];
8243                         comm->buf_int[nsend] = dth->ibuf[i];
8244                         copy_rvec(dth->vbuf.v[i],
8245                                   comm->vbuf.v[nsend]);
8246                         nsend++;
8247                     }
8248                     nat              += dth->nat;
8249                     ind->nsend[zone] += dth->nsend_zone;
8250                 }
8251             }
8252             /* Clear the counts in case we do not have pbc */
8253             for (zone = nzone_send; zone < nzone; zone++)
8254             {
8255                 ind->nsend[zone] = 0;
8256             }
8257             ind->nsend[nzone]   = nsend;
8258             ind->nsend[nzone+1] = nat;
8259             /* Communicate the number of cg's and atoms to receive */
8260             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8261                             ind->nsend, nzone+2,
8262                             ind->nrecv, nzone+2);
8263
8264             /* The rvec buffer is also required for atom buffers of size nsend
8265              * in dd_move_x and dd_move_f.
8266              */
8267             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8268
8269             if (p > 0)
8270             {
8271                 /* We can receive in place if only the last zone is not empty */
8272                 for (zone = 0; zone < nzone-1; zone++)
8273                 {
8274                     if (ind->nrecv[zone] > 0)
8275                     {
8276                         cd->bInPlace = FALSE;
8277                     }
8278                 }
8279                 if (!cd->bInPlace)
8280                 {
8281                     /* The int buffer is only required here for the cg indices */
8282                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8283                     {
8284                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8285                         srenew(comm->buf_int2, comm->nalloc_int2);
8286                     }
8287                     /* The rvec buffer is also required for atom buffers
8288                      * of size nrecv in dd_move_x and dd_move_f.
8289                      */
8290                     i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8291                     vec_rvec_check_alloc(&comm->vbuf2, i);
8292                 }
8293             }
8294
8295             /* Make space for the global cg indices */
8296             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8297                 || dd->cg_nalloc == 0)
8298             {
8299                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8300                 srenew(index_gl, dd->cg_nalloc);
8301                 srenew(cgindex, dd->cg_nalloc+1);
8302             }
8303             /* Communicate the global cg indices */
8304             if (cd->bInPlace)
8305             {
8306                 recv_i = index_gl + pos_cg;
8307             }
8308             else
8309             {
8310                 recv_i = comm->buf_int2;
8311             }
8312             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8313                             comm->buf_int, nsend,
8314                             recv_i,        ind->nrecv[nzone]);
8315
8316             /* Make space for cg_cm */
8317             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8318             if (fr->cutoff_scheme == ecutsGROUP)
8319             {
8320                 cg_cm = fr->cg_cm;
8321             }
8322             else
8323             {
8324                 cg_cm = state->x;
8325             }
8326             /* Communicate cg_cm */
8327             if (cd->bInPlace)
8328             {
8329                 recv_vr = cg_cm + pos_cg;
8330             }
8331             else
8332             {
8333                 recv_vr = comm->vbuf2.v;
8334             }
8335             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8336                              comm->vbuf.v, nsend,
8337                              recv_vr,      ind->nrecv[nzone]);
8338
8339             /* Make the charge group index */
8340             if (cd->bInPlace)
8341             {
8342                 zone = (p == 0 ? 0 : nzone - 1);
8343                 while (zone < nzone)
8344                 {
8345                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8346                     {
8347                         cg_gl              = index_gl[pos_cg];
8348                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8349                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8350                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8351                         if (bBondComm)
8352                         {
8353                             /* Update the charge group presence,
8354                              * so we can use it in the next pass of the loop.
8355                              */
8356                             comm->bLocalCG[cg_gl] = TRUE;
8357                         }
8358                         pos_cg++;
8359                     }
8360                     if (p == 0)
8361                     {
8362                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8363                     }
8364                     zone++;
8365                     zone_cg_range[nzone+zone] = pos_cg;
8366                 }
8367             }
8368             else
8369             {
8370                 /* This part of the code is never executed with bBondComm. */
8371                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8372                                  index_gl, recv_i, cg_cm, recv_vr,
8373                                  cgindex, fr->cginfo_mb, fr->cginfo);
8374                 pos_cg += ind->nrecv[nzone];
8375             }
8376             nat_tot += ind->nrecv[nzone+1];
8377         }
8378         if (!cd->bInPlace)
8379         {
8380             /* Store the atom block for easy copying of communication buffers */
8381             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8382         }
8383         nzone += nzone;
8384     }
8385     dd->index_gl = index_gl;
8386     dd->cgindex  = cgindex;
8387
8388     dd->ncg_tot          = zone_cg_range[zones->n];
8389     dd->nat_tot          = nat_tot;
8390     comm->nat[ddnatHOME] = dd->nat_home;
8391     for (i = ddnatZONE; i < ddnatNR; i++)
8392     {
8393         comm->nat[i] = dd->nat_tot;
8394     }
8395
8396     if (!bBondComm)
8397     {
8398         /* We don't need to update cginfo, since that was alrady done above.
8399          * So we pass NULL for the forcerec.
8400          */
8401         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8402                       NULL, comm->bLocalCG);
8403     }
8404
8405     if (debug)
8406     {
8407         fprintf(debug, "Finished setting up DD communication, zones:");
8408         for (c = 0; c < zones->n; c++)
8409         {
8410             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8411         }
8412         fprintf(debug, "\n");
8413     }
8414 }
8415
8416 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8417 {
8418     int c;
8419
8420     for (c = 0; c < zones->nizone; c++)
8421     {
8422         zones->izone[c].cg1  = zones->cg_range[c+1];
8423         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8424         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8425     }
8426 }
8427
8428 static void set_zones_size(gmx_domdec_t *dd,
8429                            matrix box, const gmx_ddbox_t *ddbox,
8430                            int zone_start, int zone_end)
8431 {
8432     gmx_domdec_comm_t  *comm;
8433     gmx_domdec_zones_t *zones;
8434     gmx_bool            bDistMB;
8435     int                 z, zi, zj0, zj1, d, dim;
8436     real                rcs, rcmbs;
8437     int                 i, j;
8438     real                size_j, add_tric;
8439     real                vol;
8440
8441     comm = dd->comm;
8442
8443     zones = &comm->zones;
8444
8445     /* Do we need to determine extra distances for multi-body bondeds? */
8446     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8447
8448     for (z = zone_start; z < zone_end; z++)
8449     {
8450         /* Copy cell limits to zone limits.
8451          * Valid for non-DD dims and non-shifted dims.
8452          */
8453         copy_rvec(comm->cell_x0, zones->size[z].x0);
8454         copy_rvec(comm->cell_x1, zones->size[z].x1);
8455     }
8456
8457     for (d = 0; d < dd->ndim; d++)
8458     {
8459         dim = dd->dim[d];
8460
8461         for (z = 0; z < zones->n; z++)
8462         {
8463             /* With a staggered grid we have different sizes
8464              * for non-shifted dimensions.
8465              */
8466             if (dd->bGridJump && zones->shift[z][dim] == 0)
8467             {
8468                 if (d == 1)
8469                 {
8470                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8471                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8472                 }
8473                 else if (d == 2)
8474                 {
8475                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8476                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8477                 }
8478             }
8479         }
8480
8481         rcs   = comm->cutoff;
8482         rcmbs = comm->cutoff_mbody;
8483         if (ddbox->tric_dir[dim])
8484         {
8485             rcs   /= ddbox->skew_fac[dim];
8486             rcmbs /= ddbox->skew_fac[dim];
8487         }
8488
8489         /* Set the lower limit for the shifted zone dimensions */
8490         for (z = zone_start; z < zone_end; z++)
8491         {
8492             if (zones->shift[z][dim] > 0)
8493             {
8494                 dim = dd->dim[d];
8495                 if (!dd->bGridJump || d == 0)
8496                 {
8497                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8498                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8499                 }
8500                 else
8501                 {
8502                     /* Here we take the lower limit of the zone from
8503                      * the lowest domain of the zone below.
8504                      */
8505                     if (z < 4)
8506                     {
8507                         zones->size[z].x0[dim] =
8508                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8509                     }
8510                     else
8511                     {
8512                         if (d == 1)
8513                         {
8514                             zones->size[z].x0[dim] =
8515                                 zones->size[zone_perm[2][z-4]].x0[dim];
8516                         }
8517                         else
8518                         {
8519                             zones->size[z].x0[dim] =
8520                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8521                         }
8522                     }
8523                     /* A temporary limit, is updated below */
8524                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8525
8526                     if (bDistMB)
8527                     {
8528                         for (zi = 0; zi < zones->nizone; zi++)
8529                         {
8530                             if (zones->shift[zi][dim] == 0)
8531                             {
8532                                 /* This takes the whole zone into account.
8533                                  * With multiple pulses this will lead
8534                                  * to a larger zone then strictly necessary.
8535                                  */
8536                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8537                                                              zones->size[zi].x1[dim]+rcmbs);
8538                             }
8539                         }
8540                     }
8541                 }
8542             }
8543         }
8544
8545         /* Loop over the i-zones to set the upper limit of each
8546          * j-zone they see.
8547          */
8548         for (zi = 0; zi < zones->nizone; zi++)
8549         {
8550             if (zones->shift[zi][dim] == 0)
8551             {
8552                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8553                 {
8554                     if (zones->shift[z][dim] > 0)
8555                     {
8556                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8557                                                      zones->size[zi].x1[dim]+rcs);
8558                     }
8559                 }
8560             }
8561         }
8562     }
8563
8564     for (z = zone_start; z < zone_end; z++)
8565     {
8566         /* Initialization only required to keep the compiler happy */
8567         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8568         int  nc, c;
8569
8570         /* To determine the bounding box for a zone we need to find
8571          * the extreme corners of 4, 2 or 1 corners.
8572          */
8573         nc = 1 << (ddbox->npbcdim - 1);
8574
8575         for (c = 0; c < nc; c++)
8576         {
8577             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8578             corner[XX] = 0;
8579             if ((c & 1) == 0)
8580             {
8581                 corner[YY] = zones->size[z].x0[YY];
8582             }
8583             else
8584             {
8585                 corner[YY] = zones->size[z].x1[YY];
8586             }
8587             if ((c & 2) == 0)
8588             {
8589                 corner[ZZ] = zones->size[z].x0[ZZ];
8590             }
8591             else
8592             {
8593                 corner[ZZ] = zones->size[z].x1[ZZ];
8594             }
8595             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8596             {
8597                 /* With 1D domain decomposition the cg's are not in
8598                  * the triclinic box, but triclinic x-y and rectangular y-z.
8599                  * Shift y back, so it will later end up at 0.
8600                  */
8601                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8602             }
8603             /* Apply the triclinic couplings */
8604             for (i = YY; i < ddbox->npbcdim; i++)
8605             {
8606                 for (j = XX; j < i; j++)
8607                 {
8608                     corner[j] += corner[i]*box[i][j]/box[i][i];
8609                 }
8610             }
8611             if (c == 0)
8612             {
8613                 copy_rvec(corner, corner_min);
8614                 copy_rvec(corner, corner_max);
8615             }
8616             else
8617             {
8618                 for (i = 0; i < DIM; i++)
8619                 {
8620                     corner_min[i] = min(corner_min[i], corner[i]);
8621                     corner_max[i] = max(corner_max[i], corner[i]);
8622                 }
8623             }
8624         }
8625         /* Copy the extreme cornes without offset along x */
8626         for (i = 0; i < DIM; i++)
8627         {
8628             zones->size[z].bb_x0[i] = corner_min[i];
8629             zones->size[z].bb_x1[i] = corner_max[i];
8630         }
8631         /* Add the offset along x */
8632         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8633         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8634     }
8635
8636     if (zone_start == 0)
8637     {
8638         vol = 1;
8639         for (dim = 0; dim < DIM; dim++)
8640         {
8641             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8642         }
8643         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8644     }
8645
8646     if (debug)
8647     {
8648         for (z = zone_start; z < zone_end; z++)
8649         {
8650             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8651                     z,
8652                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8653                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8654                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8655             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8656                     z,
8657                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8658                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8659                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8660         }
8661     }
8662 }
8663
8664 static int comp_cgsort(const void *a, const void *b)
8665 {
8666     int           comp;
8667
8668     gmx_cgsort_t *cga, *cgb;
8669     cga = (gmx_cgsort_t *)a;
8670     cgb = (gmx_cgsort_t *)b;
8671
8672     comp = cga->nsc - cgb->nsc;
8673     if (comp == 0)
8674     {
8675         comp = cga->ind_gl - cgb->ind_gl;
8676     }
8677
8678     return comp;
8679 }
8680
8681 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8682                          int *a, int *buf)
8683 {
8684     int i;
8685
8686     /* Order the data */
8687     for (i = 0; i < n; i++)
8688     {
8689         buf[i] = a[sort[i].ind];
8690     }
8691
8692     /* Copy back to the original array */
8693     for (i = 0; i < n; i++)
8694     {
8695         a[i] = buf[i];
8696     }
8697 }
8698
8699 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8700                          rvec *v, rvec *buf)
8701 {
8702     int i;
8703
8704     /* Order the data */
8705     for (i = 0; i < n; i++)
8706     {
8707         copy_rvec(v[sort[i].ind], buf[i]);
8708     }
8709
8710     /* Copy back to the original array */
8711     for (i = 0; i < n; i++)
8712     {
8713         copy_rvec(buf[i], v[i]);
8714     }
8715 }
8716
8717 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8718                            rvec *v, rvec *buf)
8719 {
8720     int a, atot, cg, cg0, cg1, i;
8721
8722     if (cgindex == NULL)
8723     {
8724         /* Avoid the useless loop of the atoms within a cg */
8725         order_vec_cg(ncg, sort, v, buf);
8726
8727         return;
8728     }
8729
8730     /* Order the data */
8731     a = 0;
8732     for (cg = 0; cg < ncg; cg++)
8733     {
8734         cg0 = cgindex[sort[cg].ind];
8735         cg1 = cgindex[sort[cg].ind+1];
8736         for (i = cg0; i < cg1; i++)
8737         {
8738             copy_rvec(v[i], buf[a]);
8739             a++;
8740         }
8741     }
8742     atot = a;
8743
8744     /* Copy back to the original array */
8745     for (a = 0; a < atot; a++)
8746     {
8747         copy_rvec(buf[a], v[a]);
8748     }
8749 }
8750
8751 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8752                          int nsort_new, gmx_cgsort_t *sort_new,
8753                          gmx_cgsort_t *sort1)
8754 {
8755     int i1, i2, i_new;
8756
8757     /* The new indices are not very ordered, so we qsort them */
8758     qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8759
8760     /* sort2 is already ordered, so now we can merge the two arrays */
8761     i1    = 0;
8762     i2    = 0;
8763     i_new = 0;
8764     while (i2 < nsort2 || i_new < nsort_new)
8765     {
8766         if (i2 == nsort2)
8767         {
8768             sort1[i1++] = sort_new[i_new++];
8769         }
8770         else if (i_new == nsort_new)
8771         {
8772             sort1[i1++] = sort2[i2++];
8773         }
8774         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8775                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8776                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8777         {
8778             sort1[i1++] = sort2[i2++];
8779         }
8780         else
8781         {
8782             sort1[i1++] = sort_new[i_new++];
8783         }
8784     }
8785 }
8786
8787 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8788 {
8789     gmx_domdec_sort_t *sort;
8790     gmx_cgsort_t      *cgsort, *sort_i;
8791     int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
8792     int                sort_last, sort_skip;
8793
8794     sort = dd->comm->sort;
8795
8796     a = fr->ns.grid->cell_index;
8797
8798     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8799
8800     if (ncg_home_old >= 0)
8801     {
8802         /* The charge groups that remained in the same ns grid cell
8803          * are completely ordered. So we can sort efficiently by sorting
8804          * the charge groups that did move into the stationary list.
8805          */
8806         ncg_new   = 0;
8807         nsort2    = 0;
8808         nsort_new = 0;
8809         for (i = 0; i < dd->ncg_home; i++)
8810         {
8811             /* Check if this cg did not move to another node */
8812             if (a[i] < moved)
8813             {
8814                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8815                 {
8816                     /* This cg is new on this node or moved ns grid cell */
8817                     if (nsort_new >= sort->sort_new_nalloc)
8818                     {
8819                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8820                         srenew(sort->sort_new, sort->sort_new_nalloc);
8821                     }
8822                     sort_i = &(sort->sort_new[nsort_new++]);
8823                 }
8824                 else
8825                 {
8826                     /* This cg did not move */
8827                     sort_i = &(sort->sort2[nsort2++]);
8828                 }
8829                 /* Sort on the ns grid cell indices
8830                  * and the global topology index.
8831                  * index_gl is irrelevant with cell ns,
8832                  * but we set it here anyhow to avoid a conditional.
8833                  */
8834                 sort_i->nsc    = a[i];
8835                 sort_i->ind_gl = dd->index_gl[i];
8836                 sort_i->ind    = i;
8837                 ncg_new++;
8838             }
8839         }
8840         if (debug)
8841         {
8842             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8843                     nsort2, nsort_new);
8844         }
8845         /* Sort efficiently */
8846         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8847                      sort->sort);
8848     }
8849     else
8850     {
8851         cgsort  = sort->sort;
8852         ncg_new = 0;
8853         for (i = 0; i < dd->ncg_home; i++)
8854         {
8855             /* Sort on the ns grid cell indices
8856              * and the global topology index
8857              */
8858             cgsort[i].nsc    = a[i];
8859             cgsort[i].ind_gl = dd->index_gl[i];
8860             cgsort[i].ind    = i;
8861             if (cgsort[i].nsc < moved)
8862             {
8863                 ncg_new++;
8864             }
8865         }
8866         if (debug)
8867         {
8868             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8869         }
8870         /* Determine the order of the charge groups using qsort */
8871         qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8872     }
8873
8874     return ncg_new;
8875 }
8876
8877 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
8878 {
8879     gmx_cgsort_t *sort;
8880     int           ncg_new, i, *a, na;
8881
8882     sort = dd->comm->sort->sort;
8883
8884     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
8885
8886     ncg_new = 0;
8887     for (i = 0; i < na; i++)
8888     {
8889         if (a[i] >= 0)
8890         {
8891             sort[ncg_new].ind = a[i];
8892             ncg_new++;
8893         }
8894     }
8895
8896     return ncg_new;
8897 }
8898
8899 static void dd_sort_state(gmx_domdec_t *dd, int ePBC,
8900                           rvec *cgcm, t_forcerec *fr, t_state *state,
8901                           int ncg_home_old)
8902 {
8903     gmx_domdec_sort_t *sort;
8904     gmx_cgsort_t      *cgsort, *sort_i;
8905     int               *cgindex;
8906     int                ncg_new, i, *ibuf, cgsize;
8907     rvec              *vbuf;
8908
8909     sort = dd->comm->sort;
8910
8911     if (dd->ncg_home > sort->sort_nalloc)
8912     {
8913         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
8914         srenew(sort->sort, sort->sort_nalloc);
8915         srenew(sort->sort2, sort->sort_nalloc);
8916     }
8917     cgsort = sort->sort;
8918
8919     switch (fr->cutoff_scheme)
8920     {
8921         case ecutsGROUP:
8922             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
8923             break;
8924         case ecutsVERLET:
8925             ncg_new = dd_sort_order_nbnxn(dd, fr);
8926             break;
8927         default:
8928             gmx_incons("unimplemented");
8929             ncg_new = 0;
8930     }
8931
8932     /* We alloc with the old size, since cgindex is still old */
8933     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
8934     vbuf = dd->comm->vbuf.v;
8935
8936     if (dd->comm->bCGs)
8937     {
8938         cgindex = dd->cgindex;
8939     }
8940     else
8941     {
8942         cgindex = NULL;
8943     }
8944
8945     /* Remove the charge groups which are no longer at home here */
8946     dd->ncg_home = ncg_new;
8947     if (debug)
8948     {
8949         fprintf(debug, "Set the new home charge group count to %d\n",
8950                 dd->ncg_home);
8951     }
8952
8953     /* Reorder the state */
8954     for (i = 0; i < estNR; i++)
8955     {
8956         if (EST_DISTR(i) && (state->flags & (1<<i)))
8957         {
8958             switch (i)
8959             {
8960                 case estX:
8961                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
8962                     break;
8963                 case estV:
8964                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
8965                     break;
8966                 case estSDX:
8967                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
8968                     break;
8969                 case estCGP:
8970                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
8971                     break;
8972                 case estLD_RNG:
8973                 case estLD_RNGI:
8974                 case estDISRE_INITF:
8975                 case estDISRE_RM3TAV:
8976                 case estORIRE_INITF:
8977                 case estORIRE_DTAV:
8978                     /* No ordering required */
8979                     break;
8980                 default:
8981                     gmx_incons("Unknown state entry encountered in dd_sort_state");
8982                     break;
8983             }
8984         }
8985     }
8986     if (fr->cutoff_scheme == ecutsGROUP)
8987     {
8988         /* Reorder cgcm */
8989         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
8990     }
8991
8992     if (dd->ncg_home+1 > sort->ibuf_nalloc)
8993     {
8994         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
8995         srenew(sort->ibuf, sort->ibuf_nalloc);
8996     }
8997     ibuf = sort->ibuf;
8998     /* Reorder the global cg index */
8999     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9000     /* Reorder the cginfo */
9001     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9002     /* Rebuild the local cg index */
9003     if (dd->comm->bCGs)
9004     {
9005         ibuf[0] = 0;
9006         for (i = 0; i < dd->ncg_home; i++)
9007         {
9008             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9009             ibuf[i+1] = ibuf[i] + cgsize;
9010         }
9011         for (i = 0; i < dd->ncg_home+1; i++)
9012         {
9013             dd->cgindex[i] = ibuf[i];
9014         }
9015     }
9016     else
9017     {
9018         for (i = 0; i < dd->ncg_home+1; i++)
9019         {
9020             dd->cgindex[i] = i;
9021         }
9022     }
9023     /* Set the home atom number */
9024     dd->nat_home = dd->cgindex[dd->ncg_home];
9025
9026     if (fr->cutoff_scheme == ecutsVERLET)
9027     {
9028         /* The atoms are now exactly in grid order, update the grid order */
9029         nbnxn_set_atomorder(fr->nbv->nbs);
9030     }
9031     else
9032     {
9033         /* Copy the sorted ns cell indices back to the ns grid struct */
9034         for (i = 0; i < dd->ncg_home; i++)
9035         {
9036             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9037         }
9038         fr->ns.grid->nr = dd->ncg_home;
9039     }
9040 }
9041
9042 static void add_dd_statistics(gmx_domdec_t *dd)
9043 {
9044     gmx_domdec_comm_t *comm;
9045     int                ddnat;
9046
9047     comm = dd->comm;
9048
9049     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9050     {
9051         comm->sum_nat[ddnat-ddnatZONE] +=
9052             comm->nat[ddnat] - comm->nat[ddnat-1];
9053     }
9054     comm->ndecomp++;
9055 }
9056
9057 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9058 {
9059     gmx_domdec_comm_t *comm;
9060     int                ddnat;
9061
9062     comm = dd->comm;
9063
9064     /* Reset all the statistics and counters for total run counting */
9065     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9066     {
9067         comm->sum_nat[ddnat-ddnatZONE] = 0;
9068     }
9069     comm->ndecomp   = 0;
9070     comm->nload     = 0;
9071     comm->load_step = 0;
9072     comm->load_sum  = 0;
9073     comm->load_max  = 0;
9074     clear_ivec(comm->load_lim);
9075     comm->load_mdf = 0;
9076     comm->load_pme = 0;
9077 }
9078
9079 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9080 {
9081     gmx_domdec_comm_t *comm;
9082     int                ddnat;
9083     double             av;
9084
9085     comm = cr->dd->comm;
9086
9087     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9088
9089     if (fplog == NULL)
9090     {
9091         return;
9092     }
9093
9094     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9095
9096     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9097     {
9098         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9099         switch (ddnat)
9100         {
9101             case ddnatZONE:
9102                 fprintf(fplog,
9103                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9104                         2, av);
9105                 break;
9106             case ddnatVSITE:
9107                 if (cr->dd->vsite_comm)
9108                 {
9109                     fprintf(fplog,
9110                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9111                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9112                             av);
9113                 }
9114                 break;
9115             case ddnatCON:
9116                 if (cr->dd->constraint_comm)
9117                 {
9118                     fprintf(fplog,
9119                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9120                             1 + ir->nLincsIter, av);
9121                 }
9122                 break;
9123             default:
9124                 gmx_incons(" Unknown type for DD statistics");
9125         }
9126     }
9127     fprintf(fplog, "\n");
9128
9129     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9130     {
9131         print_dd_load_av(fplog, cr->dd);
9132     }
9133 }
9134
/* (Re)partition the system over the domain decomposition cells at step
 * "step" and rebuild all local data that depends on the local atom set.
 *
 * In order, this routine:
 *  - decides whether the box is to be treated as changed and whether
 *    dynamic load balancing (DLB) is done this step; when loads have been
 *    recorded it may collect/print the load distribution and, with
 *    automatic DLB, turn DLB on based on the decision of the DD master;
 *  - obtains the home charge groups in one of three ways:
 *      bMasterState: distribute state_global from the master,
 *      state_local->ddp_count != dd->ddp_count: rebuild the indices from
 *        state_local,
 *      otherwise: keep the home set and mark it for redistribution;
 *  - sets the cell sizes, optionally redistributes the charge groups and
 *    optionally sorts the home charge groups (see dd_sort_state);
 *  - sets up the communication, the indices, the local topology and the
 *    special atom ranges for virtual sites and constraints;
 *  - updates mdatoms and the local shell, GB, constraint, pull and
 *    rotation data where those are in use;
 *  - accumulates the DD statistics and updates the partitioning counters
 *    in dd, comm and state_local.
 *
 * nstglobalcomm is used as a modulus on step and must therefore be
 * non-zero. vsite, shellfc and constr are tested before use and may be
 * NULL/zero. state_global is only accessed when bMasterState is set.
 */
void dd_partition_system(FILE                *fplog,
                         gmx_large_int_t      step,
                         t_commrec           *cr,
                         gmx_bool             bMasterState,
                         int                  nstglobalcomm,
                         t_state             *state_global,
                         gmx_mtop_t          *top_global,
                         t_inputrec          *ir,
                         t_state             *state_local,
                         rvec               **f,
                         t_mdatoms           *mdatoms,
                         gmx_localtop_t      *top_local,
                         t_forcerec          *fr,
                         gmx_vsite_t         *vsite,
                         gmx_shellfc_t        shellfc,
                         gmx_constr_t         constr,
                         t_nrnb              *nrnb,
                         gmx_wallcycle_t      wcycle,
                         gmx_bool             bVerbose)
{
    gmx_domdec_t      *dd;
    gmx_domdec_comm_t *comm;
    gmx_ddbox_t        ddbox = {0};
    t_block           *cgs_gl;
    gmx_large_int_t    step_pcoupl;
    rvec               cell_ns_x0, cell_ns_x1;
    int                i, j, n, cg0 = 0, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
    gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
    gmx_bool           bRedist, bSortCG, bResortAll;
    ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
    real               grid_density;
    char               sbuf[22];

    dd   = cr->dd;
    comm = dd->comm;

    bBoxChanged = (bMasterState || DEFORM(*ir));
    if (ir->epc != epcNO)
    {
        /* With nstpcouple > 1 pressure coupling happens
         * one step after calculating the pressure.
         * Box scaling happens at the end of the MD step,
         * after the DD partitioning.
         * We therefore have to do DLB in the first partitioning
         * after an MD step where P-coupling occurred.
         * We need to determine the last step in which p-coupling occurred.
         * MRS -- need to validate this for vv?
         */
        n = ir->nstpcouple;
        if (n == 1)
        {
            step_pcoupl = step - 1;
        }
        else
        {
            step_pcoupl = ((step - 1)/n)*n + 1;
        }
        if (step_pcoupl >= comm->partition_step)
        {
            bBoxChanged = TRUE;
        }
    }

    bNStGlobalComm = (step % nstglobalcomm == 0);

    if (!comm->bDynLoadBal)
    {
        bDoDLB = FALSE;
    }
    else
    {
        /* Should we do dynamic load balancing this step?
         * Since it requires (possibly expensive) global communication,
         * we might want to do DLB less frequently.
         */
        if (bBoxChanged || ir->epc != epcNO)
        {
            bDoDLB = bBoxChanged;
        }
        else
        {
            bDoDLB = bNStGlobalComm;
        }
    }

    /* Check if we have recorded loads on the nodes */
    if (comm->bRecordLoad && dd_load_count(comm))
    {
        if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
        {
            /* Check if we should use DLB at the second partitioning
             * and every 100 partitionings,
             * so the extra communication cost is negligible.
             */
            n         = max(100, nstglobalcomm);
            bCheckDLB = (comm->n_load_collect == 0 ||
                         comm->n_load_have % n == n-1);
        }
        else
        {
            bCheckDLB = FALSE;
        }

        /* Print load every nstlog, first and last step to the log file */
        bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
                    comm->n_load_collect == 0 ||
                    (ir->nsteps >= 0 &&
                     (step + ir->nstlist > ir->init_step + ir->nsteps)));

        /* Avoid extra communication due to verbose screen output
         * when nstglobalcomm is set.
         */
        if (bDoDLB || bLogLoad || bCheckDLB ||
            (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
        {
            get_load_distribution(dd, wcycle);
            if (DDMASTER(dd))
            {
                if (bLogLoad)
                {
                    dd_print_load(fplog, dd, step-1);
                }
                if (bVerbose)
                {
                    dd_print_load_verbose(dd);
                }
            }
            comm->n_load_collect++;

            if (bCheckDLB)
            {
                /* Since the timings are node dependent, the master decides */
                if (DDMASTER(dd))
                {
                    bTurnOnDLB =
                        (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS);
                    if (debug)
                    {
                        fprintf(debug, "step %s, imb loss %f\n",
                                gmx_step_str(step, sbuf),
                                dd_force_imb_perf_loss(dd));
                    }
                }
                /* bTurnOnDLB is only assigned on the master above;
                 * the other nodes get their value from this broadcast.
                 */
                dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
                if (bTurnOnDLB)
                {
                    turn_on_dlb(fplog, cr, step);
                    bDoDLB = TRUE;
                }
            }
        }
        comm->n_load_have++;
    }

    cgs_gl = &comm->cgs_gl;

    bRedist = FALSE;
    if (bMasterState)
    {
        /* Clear the old state */
        clear_dd_indices(dd, 0, 0);

        set_ddbox(dd, bMasterState, cr, ir, state_global->box,
                  TRUE, cgs_gl, state_global->x, &ddbox);

        get_cg_distribution(fplog, step, dd, cgs_gl,
                            state_global->box, &ddbox, state_global->x);

        dd_distribute_state(dd, cgs_gl,
                            state_global, state_local, f);

        dd_make_local_cgs(dd, &top_local->cgs);

        /* Ensure that we have space for the new distribution */
        dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);

        if (fr->cutoff_scheme == ecutsGROUP)
        {
            calc_cgcm(fplog, 0, dd->ncg_home,
                      &top_local->cgs, state_local->x, fr->cg_cm);
        }

        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);

        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);

        cg0 = 0;
    }
    else if (state_local->ddp_count != dd->ddp_count)
    {
        /* The local state does not belong to the current partitioning
         * count of dd; the indices are rebuilt from state_local below.
         */
        if (state_local->ddp_count > dd->ddp_count)
        {
            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
        }

        if (state_local->ddp_count_cg_gl != state_local->ddp_count)
        {
            gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
        }

        /* Clear the old state */
        clear_dd_indices(dd, 0, 0);

        /* Build the new indices */
        rebuild_cgindex(dd, cgs_gl->index, state_local);
        make_dd_indices(dd, cgs_gl->index, 0);

        if (fr->cutoff_scheme == ecutsGROUP)
        {
            /* Redetermine the cg COMs */
            calc_cgcm(fplog, 0, dd->ncg_home,
                      &top_local->cgs, state_local->x, fr->cg_cm);
        }

        inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);

        dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);

        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
                  TRUE, &top_local->cgs, state_local->x, &ddbox);

        /* In this branch redistribution is only done with DLB active */
        bRedist = comm->bDynLoadBal;
    }
    else
    {
        /* We have the full state, only redistribute the cgs */

        /* Clear the non-home indices */
        clear_dd_indices(dd, dd->ncg_home, dd->nat_home);

        /* Avoid global communication for dim's without pbc and -gcom */
        if (!bNStGlobalComm)
        {
            copy_rvec(comm->box0, ddbox.box0    );
            copy_rvec(comm->box_size, ddbox.box_size);
        }
        set_ddbox(dd, bMasterState, cr, ir, state_local->box,
                  bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);

        bBoxChanged = TRUE;
        bRedist     = TRUE;
    }
    /* For dim's without pbc and -gcom */
    copy_rvec(ddbox.box0, comm->box0    );
    copy_rvec(ddbox.box_size, comm->box_size);

    set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
                      step, wcycle);

    if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
    {
        write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
    }

    /* Check if we should sort the charge groups */
    if (comm->nstSortCG > 0)
    {
        bSortCG = (bMasterState ||
                   (bRedist && (step % comm->nstSortCG == 0)));
    }
    else
    {
        bSortCG = FALSE;
    }

    /* Remember the home count before redistribution; it is passed to
     * dd_sort_state below when not everything needs to be resorted.
     */
    ncg_home_old = dd->ncg_home;

    ncg_moved = 0;
    if (bRedist)
    {
        wallcycle_sub_start(wcycle, ewcsDD_REDIST);

        dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
                           state_local, f, fr, mdatoms,
                           !bSortCG, nrnb, &cg0, &ncg_moved);

        wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
    }

    get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
                          dd, &ddbox,
                          &comm->cell_x0, &comm->cell_x1,
                          dd->ncg_home, fr->cg_cm,
                          cell_ns_x0, cell_ns_x1, &grid_density);

    if (bBoxChanged)
    {
        comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
    }

    /* Store the current grid dimensions in ncells_old; with the Verlet
     * scheme only the XX and YY entries are set, ZZ keeps its initial 0.
     */
    switch (fr->cutoff_scheme)
    {
        case ecutsGROUP:
            copy_ivec(fr->ns.grid->n, ncells_old);
            grid_first(fplog, fr->ns.grid, dd, &ddbox, fr->ePBC,
                       state_local->box, cell_ns_x0, cell_ns_x1,
                       fr->rlistlong, grid_density);
            break;
        case ecutsVERLET:
            nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
            break;
        default:
            gmx_incons("unimplemented");
    }
    /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
    copy_ivec(ddbox.tric_dir, comm->tric_dir);

    if (bSortCG)
    {
        wallcycle_sub_start(wcycle, ewcsDD_GRID);

        /* Sort the state on charge group position.
         * This enables exact restarts from this step.
         * It also improves performance by about 15% with larger numbers
         * of atoms per node.
         */

        /* Fill the ns grid with the home cell,
         * so we can sort with the indices.
         */
        set_zones_ncg_home(dd);

        switch (fr->cutoff_scheme)
        {
            case ecutsVERLET:
                set_zones_size(dd, state_local->box, &ddbox, 0, 1);

                nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
                                  0,
                                  comm->zones.size[0].bb_x0,
                                  comm->zones.size[0].bb_x1,
                                  0, dd->ncg_home,
                                  comm->zones.dens_zone0,
                                  fr->cginfo,
                                  state_local->x,
                                  ncg_moved, bRedist ? comm->moved : NULL,
                                  fr->nbv->grp[eintLocal].kernel_type,
                                  fr->nbv->grp[eintLocal].nbat);

                nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
                break;
            case ecutsGROUP:
                fill_grid(fplog, &comm->zones, fr->ns.grid, dd->ncg_home,
                          0, dd->ncg_home, fr->cg_cm);

                copy_ivec(fr->ns.grid->n, ncells_new);
                break;
            default:
                gmx_incons("unimplemented");
        }

        bResortAll = bMasterState;

        /* Check if we can use the old order and ns grid cell indices
         * of the charge groups to sort the charge groups efficiently.
         */
        if (ncells_new[XX] != ncells_old[XX] ||
            ncells_new[YY] != ncells_old[YY] ||
            ncells_new[ZZ] != ncells_old[ZZ])
        {
            bResortAll = TRUE;
        }

        if (debug)
        {
            fprintf(debug, "Step %s, sorting the %d home charge groups\n",
                    gmx_step_str(step, sbuf), dd->ncg_home);
        }
        /* An old home count of -1 is passed when everything is resorted */
        dd_sort_state(dd, ir->ePBC, fr->cg_cm, fr, state_local,
                      bResortAll ? -1 : ncg_home_old);
        /* Rebuild all the indices */
        cg0 = 0;
        ga2la_clear(dd->ga2la);

        wallcycle_sub_stop(wcycle, ewcsDD_GRID);
    }

    wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);

    /* Set up the communication and communicate the coordinates */
    setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);

    /* Set the indices */
    make_dd_indices(dd, cgs_gl->index, cg0);

    /* Set the charge group boundaries for neighbor searching */
    set_cg_boundaries(&comm->zones);

    if (fr->cutoff_scheme == ecutsVERLET)
    {
        /* When sorting was done, zone 0 was already sized above,
         * so start from zone 1 in that case.
         */
        set_zones_size(dd, state_local->box, &ddbox,
                       bSortCG ? 1 : 0, comm->zones.n);
    }

    wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);

    /*
       write_dd_pdb("dd_home",step,"dump",top_global,cr,
                 -1,state_local->x,state_local->box);
     */

    wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);

    /* Extract a local topology from the global topology */
    /* Note: np is only assigned for the dd->ndim decomposed dimensions,
     * the other entries are left unset.
     */
    for (i = 0; i < dd->ndim; i++)
    {
        np[dd->dim[i]] = comm->cd[i].np;
    }
    dd_make_local_top(fplog, dd, &comm->zones, dd->npbcdim, state_local->box,
                      comm->cellsize_min, np,
                      fr,
                      fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
                      vsite, top_global, top_local);

    wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);

    wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);

    /* Set up the special atom communication */
    /* n carries the running atom count: each range that is not set up
     * simply inherits the end of the previous range.
     */
    n = comm->nat[ddnatZONE];
    for (i = ddnatZONE+1; i < ddnatNR; i++)
    {
        switch (i)
        {
            case ddnatVSITE:
                if (vsite && vsite->n_intercg_vsite)
                {
                    n = dd_make_local_vsites(dd, n, top_local->idef.il);
                }
                break;
            case ddnatCON:
                if (dd->bInterCGcons || dd->bInterCGsettles)
                {
                    /* Only for inter-cg constraints we need special code */
                    n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
                                                  constr, ir->nProjOrder,
                                                  top_local->idef.il);
                }
                break;
            default:
                gmx_incons("Unknown special atom type setup");
        }
        comm->nat[i] = n;
    }

    wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);

    wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);

    /* Make space for the extra coordinates for virtual site
     * or constraint communication.
     */
    state_local->natoms = comm->nat[ddnatNR-1];
    if (state_local->natoms > state_local->nalloc)
    {
        dd_realloc_state(state_local, f, state_local->natoms);
    }

    if (fr->bF_NoVirSum)
    {
        if (vsite && vsite->n_intercg_vsite)
        {
            nat_f_novirsum = comm->nat[ddnatVSITE];
        }
        else
        {
            if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
            {
                nat_f_novirsum = dd->nat_tot;
            }
            else
            {
                nat_f_novirsum = dd->nat_home;
            }
        }
    }
    else
    {
        nat_f_novirsum = 0;
    }

    /* Set the number of atoms required for the force calculation.
     * Forces need to be constrained when using a twin-range setup
     * or with energy minimization. For simple simulations we could
     * avoid some allocation, zeroing and copying, but this is
     * probably not worth the complications and checking.
     */
    forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
                        dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);

    /* We make all the mdatoms up to nat_tot_con.
     * We could save some work by only setting invmass
     * between nat_tot and nat_tot_con.
     */
    /* This call also sets the new number of home particles to dd->nat_home */
    atoms2md(top_global, ir,
             comm->nat[ddnatCON], dd->gatindex, 0, dd->nat_home, mdatoms);

    /* Now we have the charges we can sort the FE interactions */
    dd_sort_local_top(dd, mdatoms, top_local);

    if (vsite != NULL)
    {
        /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
        split_vsites_over_threads(top_local->idef.il, mdatoms, FALSE, vsite);
    }

    if (shellfc)
    {
        /* Make the local shell stuff, currently no communication is done */
        make_local_shells(cr, mdatoms, shellfc);
    }

    if (ir->implicit_solvent)
    {
        make_local_gb(cr, fr->born, ir->gb_algorithm);
    }

    init_bonded_thread_force_reduction(fr, &top_local->idef);

    if (!(cr->duty & DUTY_PME))
    {
        /* Send the charges to our PME only node */
        gmx_pme_send_q(cr, mdatoms->nChargePerturbed,
                       mdatoms->chargeA, mdatoms->chargeB,
                       dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
    }

    if (constr)
    {
        set_constraints(constr, top_local, ir, mdatoms, cr);
    }

    if (ir->ePull != epullNO)
    {
        /* Update the local pull groups */
        dd_make_local_pull_groups(dd, ir->pull, mdatoms);
    }

    if (ir->bRot)
    {
        /* Update the local rotation groups */
        dd_make_local_rotation_groups(dd, ir->rot);
    }


    add_dd_statistics(dd);

    /* Make sure we only count the cycles for this DD partitioning */
    clear_dd_cycle_counts(dd);

    /* Because the order of the atoms might have changed since
     * the last vsite construction, we need to communicate the constructing
     * atom coordinates again (for spreading the forces this MD step).
     */
    dd_move_x_vsites(dd, state_local->box, state_local->x);

    wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);

    if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
    {
        dd_move_x(dd, state_local->box, state_local->x);
        write_dd_pdb("dd_dump", step, "dump", top_global, cr,
                     -1, state_local->x, state_local->box);
    }

    /* Store the partitioning step */
    comm->partition_step = step;

    /* Increase the DD partitioning counter */
    dd->ddp_count++;
    /* The state currently matches this DD partitioning count, store it */
    state_local->ddp_count = dd->ddp_count;
    if (bMasterState)
    {
        /* The DD master node knows the complete cg distribution,
         * store the count so we can possibly skip the cg info communication.
         */
        comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
    }

    if (comm->DD_debug > 0)
    {
        /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
        check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
                                "after partitioning");
    }
}