src/gromacs/mdlib/domdec.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "gromacs/legacyheaders/domdec.h"
  39
  40 #include "config.h"
  41
  42 #include <assert.h>
  43 #include <math.h>
  44 #include <stdio.h>
  45 #include <stdlib.h>
  46 #include <string.h>
  47 #include <time.h>
  48
  49 #include "gromacs/bonded/bonded.h"
  50 #include "gromacs/fileio/gmxfio.h"
  51 #include "gromacs/fileio/pdbio.h"
  52 #include "gromacs/imd/imd.h"
  53 #include "gromacs/legacyheaders/bonded-threading.h"
  54 #include "gromacs/legacyheaders/chargegroup.h"
  55 #include "gromacs/legacyheaders/constr.h"
  56 #include "gromacs/legacyheaders/domdec_network.h"
  57 #include "gromacs/legacyheaders/force.h"
  58 #include "gromacs/legacyheaders/gmx_ga2la.h"
  59 #include "gromacs/legacyheaders/gmx_omp_nthreads.h"
  60 #include "gromacs/legacyheaders/gpu_utils.h"
  61 #include "gromacs/legacyheaders/macros.h"
  62 #include "gromacs/legacyheaders/mdatoms.h"
  63 #include "gromacs/legacyheaders/mdrun.h"
  64 #include "gromacs/legacyheaders/names.h"
  65 #include "gromacs/legacyheaders/network.h"
  66 #include "gromacs/legacyheaders/nrnb.h"
  67 #include "gromacs/legacyheaders/nsgrid.h"
  68 #include "gromacs/legacyheaders/pme.h"
  69 #include "gromacs/legacyheaders/shellfc.h"
  70 #include "gromacs/legacyheaders/typedefs.h"
  71 #include "gromacs/math/vec.h"
  72 #include "gromacs/mdlib/nb_verlet.h"
  73 #include "gromacs/mdlib/nbnxn_search.h"
  74 #include "gromacs/pbcutil/ishift.h"
  75 #include "gromacs/pbcutil/pbc.h"
  76 #include "gromacs/pulling/pull.h"
  77 #include "gromacs/pulling/pull_rotation.h"
  78 #include "gromacs/swap/swapcoords.h"
  79 #include "gromacs/timing/wallcycle.h"
  80 #include "gromacs/topology/mtop_util.h"
  81 #include "gromacs/utility/basenetwork.h"
  82 #include "gromacs/utility/fatalerror.h"
  83 #include "gromacs/utility/futil.h"
  84 #include "gromacs/utility/gmxmpi.h"
  85 #include "gromacs/utility/qsort_threadsafe.h"
  86 #include "gromacs/utility/smalloc.h"
  87
  88 #define DDRANK(dd, rank)    (rank)
  89 #define DDMASTERRANK(dd)   (dd->masterrank)
  90
  91 typedef struct gmx_domdec_master
  92 {
  93     /* The cell boundaries */
  94     real **cell_x;
  95     /* The global charge group division */
  96     int   *ncg;    /* Number of home charge groups for each node */
  97     int   *index;  /* Index of nnodes+1 into cg */
  98     int   *cg;     /* Global charge group index */
  99     int   *nat;    /* Number of home atoms for each node. */
 100     int   *ibuf;   /* Buffer for communication */
 101     rvec  *vbuf;   /* Buffer for state scattering and gathering */
 102 } gmx_domdec_master_t;
 103
 104 typedef struct
 105 {
 106     /* The numbers of charge groups to send and receive for each cell
 107      * that requires communication, the last entry contains the total
 108      * number of atoms that needs to be communicated.
 109      */
 110     int  nsend[DD_MAXIZONE+2];
 111     int  nrecv[DD_MAXIZONE+2];
 112     /* The charge groups to send */
 113     int *index;
 114     int  nalloc;
 115     /* The atom range for non-in-place communication */
 116     int  cell2at0[DD_MAXIZONE];
 117     int  cell2at1[DD_MAXIZONE];
 118 } gmx_domdec_ind_t;
 119
 120 typedef struct
 121 {
 122     int               np;       /* Number of grid pulses in this dimension */
 123     int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 124     gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 125     int               np_nalloc;
 126     gmx_bool          bInPlace; /* Can we communicate in place?            */
 127 } gmx_domdec_comm_dim_t;
 128
 129 typedef struct
 130 {
 131     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 132     real     *cell_f;      /* State var.: cell boundaries, box relative      */
 133     real     *old_cell_f;  /* Temp. var.: old cell size                      */
 134     real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 135     real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 136     real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 137     real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 138     gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 139     real     *buf_ncd;     /* Temp. var.                                     */
 140 } gmx_domdec_root_t;
 141
 142 #define DD_NLOAD_MAX 9
 143
 144 /* Here floats are accurate enough, since these variables
 145  * only influence the load balancing, not the actual MD results.
 146  */
 147 typedef struct
 148 {
 149     int    nload;
 150     float *load;
 151     float  sum;
 152     float  max;
 153     float  sum_m;
 154     float  cvol_min;
 155     float  mdf;
 156     float  pme;
 157     int    flags;
 158 } gmx_domdec_load_t;
 159
 160 typedef struct
 161 {
 162     int  nsc;
 163     int  ind_gl;
 164     int  ind;
 165 } gmx_cgsort_t;
 166
 167 typedef struct
 168 {
 169     gmx_cgsort_t *sort;
 170     gmx_cgsort_t *sort2;
 171     int           sort_nalloc;
 172     gmx_cgsort_t *sort_new;
 173     int           sort_new_nalloc;
 174     int          *ibuf;
 175     int           ibuf_nalloc;
 176 } gmx_domdec_sort_t;
 177
 178 typedef struct
 179 {
 180     rvec *v;
 181     int   nalloc;
 182 } vec_rvec_t;
 183
 184 /* This enum determines the order of the coordinates.
 185  * ddnatHOME and ddnatZONE should be first and second,
 186  * the others can be ordered as wanted.
 187  */
 188 enum {
 189     ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 190 };
 191
 192 enum {
 193     edlbAUTO, edlbNO, edlbYES, edlbNR
 194 };
 195 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 196
 197 typedef struct
 198 {
 199     int      dim;       /* The dimension                                          */
 200     gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 201     int      nslab;     /* The number of PME slabs in this dimension              */
 202     real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 203     int     *pp_min;    /* The minimum pp node location, size nslab               */
 204     int     *pp_max;    /* The maximum pp node location,size nslab                */
 205     int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 206 } gmx_ddpme_t;
 207
 208 typedef struct
 209 {
 210     real min0;    /* The minimum bottom of this zone                        */
 211     real max1;    /* The maximum top of this zone                           */
 212     real min1;    /* The minimum top of this zone                           */
 213     real mch0;    /* The maximum bottom communicaton height for this zone   */
 214     real mch1;    /* The maximum top communicaton height for this zone      */
 215     real p1_0;    /* The bottom value of the first cell in this zone        */
 216     real p1_1;    /* The top value of the first cell in this zone           */
 217 } gmx_ddzone_t;
 218
 219 typedef struct
 220 {
 221     gmx_domdec_ind_t ind;
 222     int             *ibuf;
 223     int              ibuf_nalloc;
 224     vec_rvec_t       vbuf;
 225     int              nsend;
 226     int              nat;
 227     int              nsend_zone;
 228 } dd_comm_setup_work_t;
 229
 230 typedef struct gmx_domdec_comm
 231 {
 232     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 233      * unless stated otherwise.
 234      */
 235
 236     /* The number of decomposition dimensions for PME, 0: no PME */
 237     int         npmedecompdim;
 238     /* The number of nodes doing PME (PP/PME or only PME) */
 239     int         npmenodes;
 240     int         npmenodes_x;
 241     int         npmenodes_y;
 242     /* The communication setup including the PME only nodes */
 243     gmx_bool    bCartesianPP_PME;
 244     ivec        ntot;
 245     int         cartpmedim;
 246     int        *pmenodes;          /* size npmenodes                         */
 247     int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 248                                     * but with bCartesianPP_PME              */
 249     gmx_ddpme_t ddpme[2];
 250
 251     /* The DD particle-particle nodes only */
 252     gmx_bool bCartesianPP;
 253     int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 254
 255     /* The global charge groups */
 256     t_block cgs_gl;
 257
 258     /* Should we sort the cgs */
 259     int                nstSortCG;
 260     gmx_domdec_sort_t *sort;
 261
 262     /* Are there charge groups? */
 263     gmx_bool bCGs;
 264
 265     /* Are there bonded and multi-body interactions between charge groups? */
 266     gmx_bool bInterCGBondeds;
 267     gmx_bool bInterCGMultiBody;
 268
 269     /* Data for the optional bonded interaction atom communication range */
 270     gmx_bool  bBondComm;
 271     t_blocka *cglink;
 272     char     *bLocalCG;
 273
 274     /* The DLB option */
 275     int      eDLB;
 276     /* Are we actually using DLB? */
 277     gmx_bool bDynLoadBal;
 278
 279     /* Cell sizes for static load balancing, first index cartesian */
 280     real **slb_frac;
 281
 282     /* The width of the communicated boundaries */
 283     real     cutoff_mbody;
 284     real     cutoff;
 285     /* The minimum cell size (including triclinic correction) */
 286     rvec     cellsize_min;
 287     /* For dlb, for use with edlbAUTO */
 288     rvec     cellsize_min_dlb;
 289     /* The lower limit for the DD cell size with DLB */
 290     real     cellsize_limit;
 291     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 292     gmx_bool bVacDLBNoLimit;
 293
 294     /* With PME load balancing we set limits on DLB */
 295     gmx_bool bPMELoadBalDLBLimits;
 296     /* DLB needs to take into account that we want to allow this maximum
 297      * cut-off (for PME load balancing), this could limit cell boundaries.
 298      */
 299     real PMELoadBal_max_cutoff;
 300
 301     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 302     ivec tric_dir;
 303     /* box0 and box_size are required with dim's without pbc and -gcom */
 304     rvec box0;
 305     rvec box_size;
 306
 307     /* The cell boundaries */
 308     rvec cell_x0;
 309     rvec cell_x1;
 310
 311     /* The old location of the cell boundaries, to check cg displacements */
 312     rvec old_cell_x0;
 313     rvec old_cell_x1;
 314
 315     /* The communication setup and charge group boundaries for the zones */
 316     gmx_domdec_zones_t zones;
 317
 318     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 319      * cell boundaries of neighboring cells for dynamic load balancing.
 320      */
 321     gmx_ddzone_t zone_d1[2];
 322     gmx_ddzone_t zone_d2[2][2];
 323
 324     /* The coordinate/force communication setup and indices */
 325     gmx_domdec_comm_dim_t cd[DIM];
 326     /* The maximum number of cells to communicate with in one dimension */
 327     int                   maxpulse;
 328
 329     /* Which cg distribution is stored on the master node */
 330     int master_cg_ddp_count;
 331
 332     /* The number of cg's received from the direct neighbors */
 333     int  zone_ncg1[DD_MAXZONE];
 334
 335     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 336     int  nat[ddnatNR];
 337
 338     /* Array for signalling if atoms have moved to another domain */
 339     int  *moved;
 340     int   moved_nalloc;
 341
 342     /* Communication buffer for general use */
 343     int  *buf_int;
 344     int   nalloc_int;
 345
 346     /* Communication buffer for general use */
 347     vec_rvec_t vbuf;
 348
 349     /* Temporary storage for thread parallel communication setup */
 350     int                   nth;
 351     dd_comm_setup_work_t *dth;
 352
 353     /* Communication buffers only used with multiple grid pulses */
 354     int       *buf_int2;
 355     int        nalloc_int2;
 356     vec_rvec_t vbuf2;
 357
 358     /* Communication buffers for local redistribution */
 359     int  **cggl_flag;
 360     int    cggl_flag_nalloc[DIM*2];
 361     rvec **cgcm_state;
 362     int    cgcm_state_nalloc[DIM*2];
 363
 364     /* Cell sizes for dynamic load balancing */
 365     gmx_domdec_root_t **root;
 366     real               *cell_f_row;
 367     real                cell_f0[DIM];
 368     real                cell_f1[DIM];
 369     real                cell_f_max0[DIM];
 370     real                cell_f_min1[DIM];
 371
 372     /* Stuff for load communication */
 373     gmx_bool           bRecordLoad;
 374     gmx_domdec_load_t *load;
 375     int                nrank_gpu_shared;
 376 #ifdef GMX_MPI
 377     MPI_Comm          *mpi_comm_load;
 378     MPI_Comm           mpi_comm_gpu_shared;
 379 #endif
 380
 381     /* Maximum DLB scaling per load balancing step in percent */
 382     int dlb_scale_lim;
 383
 384     /* Cycle counters */
 385     float  cycl[ddCyclNr];
 386     int    cycl_n[ddCyclNr];
 387     float  cycl_max[ddCyclNr];
 388     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 389     int    eFlop;
 390     double flop;
 391     int    flop_n;
 392     /* Have often have did we have load measurements */
 393     int    n_load_have;
 394     /* Have often have we collected the load measurements */
 395     int    n_load_collect;
 396
 397     /* Statistics */
 398     double sum_nat[ddnatNR-ddnatZONE];
 399     int    ndecomp;
 400     int    nload;
 401     double load_step;
 402     double load_sum;
 403     double load_max;
 404     ivec   load_lim;
 405     double load_mdf;
 406     double load_pme;
 407
 408     /* The last partition step */
 409     gmx_int64_t partition_step;
 410
 411     /* Debugging */
 412     int  nstDDDump;
 413     int  nstDDDumpGrid;
 414     int  DD_debug;
 415 } gmx_domdec_comm_t;
 416
 417 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 418 #define DD_CGIBS 2
 419
 420 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 421 #define DD_FLAG_NRCG  65535
 422 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 423 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 424
 425 /* Zone permutation required to obtain consecutive charge groups
 426  * for neighbor searching.
 427  */
 428 static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 429
 430 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 431  * components see only j zones with that component 0.
 432  */
 433
 434 /* The DD zone order */
 435 static const ivec dd_zo[DD_MAXZONE] =
 436 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 437
 438 /* The 3D setup */
 439 #define dd_z3n  8
 440 #define dd_zp3n 4
 441 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 442
 443 /* The 2D setup */
 444 #define dd_z2n  4
 445 #define dd_zp2n 2
 446 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 447
 448 /* The 1D setup */
 449 #define dd_z1n  2
 450 #define dd_zp1n 1
 451 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 452
 453 /* Factors used to avoid problems due to rounding issues */
 454 #define DD_CELL_MARGIN       1.0001
 455 #define DD_CELL_MARGIN2      1.00005
 456 /* Factor to account for pressure scaling during nstlist steps */
 457 #define DD_PRES_SCALE_MARGIN 1.02
 458
 459 /* Turn on DLB when the load imbalance causes this amount of total loss.
 460  * There is a bit of overhead with DLB and it's difficult to achieve
 461  * a load imbalance of less than 2% with DLB.
 462  */
 463 #define DD_PERF_LOSS_DLB_ON  0.02
 464
 465 /* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
 466 #define DD_PERF_LOSS_WARN    0.05
 467
 468 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 469
 470 /* Use separate MPI send and receive commands
 471  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 472  * This saves memory (and some copying for small nnodes).
 473  * For high parallelization scatter and gather calls are used.
 474  */
 475 #define GMX_DD_NNODES_SENDRECV 4
 476
 477
 478 /*
 479    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 480
 481    static void index2xyz(ivec nc,int ind,ivec xyz)
 482    {
 483    xyz[XX] = ind % nc[XX];
 484    xyz[YY] = (ind / nc[XX]) % nc[YY];
 485    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 486    }
 487  */
 488
 489 /* This order is required to minimize the coordinate communication in PME
 490  * which uses decomposition in the x direction.
 491  */
 492 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 493
 494 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 495 {
 496     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 497     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 498     xyz[ZZ] = ind % nc[ZZ];
 499 }
 500
 501 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 502 {
 503     int ddindex;
 504     int ddnodeid = -1;
 505
 506     ddindex = dd_index(dd->nc, c);
 507     if (dd->comm->bCartesianPP_PME)
 508     {
 509         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 510     }
 511     else if (dd->comm->bCartesianPP)
 512     {
 513 #ifdef GMX_MPI
 514         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 515 #endif
 516     }
 517     else
 518     {
 519         ddnodeid = ddindex;
 520     }
 521
 522     return ddnodeid;
 523 }
 524
 525 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 526 {
 527     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 528 }
 529
 530 int ddglatnr(gmx_domdec_t *dd, int i)
 531 {
 532     int atnr;
 533
 534     if (dd == NULL)
 535     {
 536         atnr = i + 1;
 537     }
 538     else
 539     {
 540         if (i >= dd->comm->nat[ddnatNR-1])
 541         {
 542             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 543         }
 544         atnr = dd->gatindex[i] + 1;
 545     }
 546
 547     return atnr;
 548 }
 549
 550 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 551 {
 552     return &dd->comm->cgs_gl;
 553 }
 554
 555 static void vec_rvec_init(vec_rvec_t *v)
 556 {
 557     v->nalloc = 0;
 558     v->v      = NULL;
 559 }
 560
 561 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 562 {
 563     if (n > v->nalloc)
 564     {
 565         v->nalloc = over_alloc_dd(n);
 566         srenew(v->v, v->nalloc);
 567     }
 568 }
 569
 570 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 571 {
 572     int i;
 573
 574     if (state->ddp_count != dd->ddp_count)
 575     {
 576         gmx_incons("The state does not the domain decomposition state");
 577     }
 578
 579     state->ncg_gl = dd->ncg_home;
 580     if (state->ncg_gl > state->cg_gl_nalloc)
 581     {
 582         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 583         srenew(state->cg_gl, state->cg_gl_nalloc);
 584     }
 585     for (i = 0; i < state->ncg_gl; i++)
 586     {
 587         state->cg_gl[i] = dd->index_gl[i];
 588     }
 589
 590     state->ddp_count_cg_gl = dd->ddp_count;
 591 }
 592
 593 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 594 {
 595     return &dd->comm->zones;
 596 }
 597
 598 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 599                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 600 {
 601     gmx_domdec_zones_t *zones;
 602     int                 izone, d, dim;
 603
 604     zones = &dd->comm->zones;
 605
 606     izone = 0;
 607     while (icg >= zones->izone[izone].cg1)
 608     {
 609         izone++;
 610     }
 611
 612     if (izone == 0)
 613     {
 614         *jcg0 = icg;
 615     }
 616     else if (izone < zones->nizone)
 617     {
 618         *jcg0 = zones->izone[izone].jcg0;
 619     }
 620     else
 621     {
 622         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 623                   icg, izone, zones->nizone);
 624     }
 625
 626     *jcg1 = zones->izone[izone].jcg1;
 627
 628     for (d = 0; d < dd->ndim; d++)
 629     {
 630         dim         = dd->dim[d];
 631         shift0[dim] = zones->izone[izone].shift0[dim];
 632         shift1[dim] = zones->izone[izone].shift1[dim];
 633         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 634         {
 635             /* A conservative approach, this can be optimized */
 636             shift0[dim] -= 1;
 637             shift1[dim] += 1;
 638         }
 639     }
 640 }
 641
 642 int dd_natoms_vsite(gmx_domdec_t *dd)
 643 {
 644     return dd->comm->nat[ddnatVSITE];
 645 }
 646
 647 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 648 {
 649     *at_start = dd->comm->nat[ddnatCON-1];
 650     *at_end   = dd->comm->nat[ddnatCON];
 651 }
 652
 653 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 654 {
 655     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 656     int                   *index, *cgindex;
 657     gmx_domdec_comm_t     *comm;
 658     gmx_domdec_comm_dim_t *cd;
 659     gmx_domdec_ind_t      *ind;
 660     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 661     gmx_bool               bPBC, bScrew;
 662
 663     comm = dd->comm;
 664
 665     cgindex = dd->cgindex;
 666
 667     buf = comm->vbuf.v;
 668
 669     nzone   = 1;
 670     nat_tot = dd->nat_home;
 671     for (d = 0; d < dd->ndim; d++)
 672     {
 673         bPBC   = (dd->ci[dd->dim[d]] == 0);
 674         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 675         if (bPBC)
 676         {
 677             copy_rvec(box[dd->dim[d]], shift);
 678         }
 679         cd = &comm->cd[d];
 680         for (p = 0; p < cd->np; p++)
 681         {
 682             ind   = &cd->ind[p];
 683             index = ind->index;
 684             n     = 0;
 685             if (!bPBC)
 686             {
 687                 for (i = 0; i < ind->nsend[nzone]; i++)
 688                 {
 689                     at0 = cgindex[index[i]];
 690                     at1 = cgindex[index[i]+1];
 691                     for (j = at0; j < at1; j++)
 692                     {
 693                         copy_rvec(x[j], buf[n]);
 694                         n++;
 695                     }
 696                 }
 697             }
 698             else if (!bScrew)
 699             {
 700                 for (i = 0; i < ind->nsend[nzone]; i++)
 701                 {
 702                     at0 = cgindex[index[i]];
 703                     at1 = cgindex[index[i]+1];
 704                     for (j = at0; j < at1; j++)
 705                     {
 706                         /* We need to shift the coordinates */
 707                         rvec_add(x[j], shift, buf[n]);
 708                         n++;
 709                     }
 710                 }
 711             }
 712             else
 713             {
 714                 for (i = 0; i < ind->nsend[nzone]; i++)
 715                 {
 716                     at0 = cgindex[index[i]];
 717                     at1 = cgindex[index[i]+1];
 718                     for (j = at0; j < at1; j++)
 719                     {
 720                         /* Shift x */
 721                         buf[n][XX] = x[j][XX] + shift[XX];
 722                         /* Rotate y and z.
 723                          * This operation requires a special shift force
 724                          * treatment, which is performed in calc_vir.
 725                          */
 726                         buf[n][YY] = box[YY][YY] - x[j][YY];
 727                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 728                         n++;
 729                     }
 730                 }
 731             }
 732
 733             if (cd->bInPlace)
 734             {
 735                 rbuf = x + nat_tot;
 736             }
 737             else
 738             {
 739                 rbuf = comm->vbuf2.v;
 740             }
 741             /* Send and receive the coordinates */
 742             dd_sendrecv_rvec(dd, d, dddirBackward,
 743                              buf,  ind->nsend[nzone+1],
 744                              rbuf, ind->nrecv[nzone+1]);
 745             if (!cd->bInPlace)
 746             {
 747                 j = 0;
 748                 for (zone = 0; zone < nzone; zone++)
 749                 {
 750                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 751                     {
 752                         copy_rvec(rbuf[j], x[i]);
 753                         j++;
 754                     }
 755                 }
 756             }
 757             nat_tot += ind->nrecv[nzone+1];
 758         }
 759         nzone += nzone;
 760     }
 761 }
 762
 763 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 764 {
 765     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 766     int                   *index, *cgindex;
 767     gmx_domdec_comm_t     *comm;
 768     gmx_domdec_comm_dim_t *cd;
 769     gmx_domdec_ind_t      *ind;
 770     rvec                  *buf, *sbuf;
 771     ivec                   vis;
 772     int                    is;
 773     gmx_bool               bPBC, bScrew;
 774
 775     comm = dd->comm;
 776
 777     cgindex = dd->cgindex;
 778
 779     buf = comm->vbuf.v;
 780
 781     n       = 0;
 782     nzone   = comm->zones.n/2;
 783     nat_tot = dd->nat_tot;
 784     for (d = dd->ndim-1; d >= 0; d--)
 785     {
 786         bPBC   = (dd->ci[dd->dim[d]] == 0);
 787         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 788         if (fshift == NULL && !bScrew)
 789         {
 790             bPBC = FALSE;
 791         }
 792         /* Determine which shift vector we need */
 793         clear_ivec(vis);
 794         vis[dd->dim[d]] = 1;
 795         is              = IVEC2IS(vis);
 796
 797         cd = &comm->cd[d];
 798         for (p = cd->np-1; p >= 0; p--)
 799         {
 800             ind      = &cd->ind[p];
 801             nat_tot -= ind->nrecv[nzone+1];
 802             if (cd->bInPlace)
 803             {
 804                 sbuf = f + nat_tot;
 805             }
 806             else
 807             {
 808                 sbuf = comm->vbuf2.v;
 809                 j    = 0;
 810                 for (zone = 0; zone < nzone; zone++)
 811                 {
 812                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 813                     {
 814                         copy_rvec(f[i], sbuf[j]);
 815                         j++;
 816                     }
 817                 }
 818             }
 819             /* Communicate the forces */
 820             dd_sendrecv_rvec(dd, d, dddirForward,
 821                              sbuf, ind->nrecv[nzone+1],
 822                              buf,  ind->nsend[nzone+1]);
 823             index = ind->index;
 824             /* Add the received forces */
 825             n = 0;
 826             if (!bPBC)
 827             {
 828                 for (i = 0; i < ind->nsend[nzone]; i++)
 829                 {
 830                     at0 = cgindex[index[i]];
 831                     at1 = cgindex[index[i]+1];
 832                     for (j = at0; j < at1; j++)
 833                     {
 834                         rvec_inc(f[j], buf[n]);
 835                         n++;
 836                     }
 837                 }
 838             }
 839             else if (!bScrew)
 840             {
 841                 for (i = 0; i < ind->nsend[nzone]; i++)
 842                 {
 843                     at0 = cgindex[index[i]];
 844                     at1 = cgindex[index[i]+1];
 845                     for (j = at0; j < at1; j++)
 846                     {
 847                         rvec_inc(f[j], buf[n]);
 848                         /* Add this force to the shift force */
 849                         rvec_inc(fshift[is], buf[n]);
 850                         n++;
 851                     }
 852                 }
 853             }
 854             else
 855             {
 856                 for (i = 0; i < ind->nsend[nzone]; i++)
 857                 {
 858                     at0 = cgindex[index[i]];
 859                     at1 = cgindex[index[i]+1];
 860                     for (j = at0; j < at1; j++)
 861                     {
 862                         /* Rotate the force */
 863                         f[j][XX] += buf[n][XX];
 864                         f[j][YY] -= buf[n][YY];
 865                         f[j][ZZ] -= buf[n][ZZ];
 866                         if (fshift)
 867                         {
 868                             /* Add this force to the shift force */
 869                             rvec_inc(fshift[is], buf[n]);
 870                         }
 871                         n++;
 872                     }
 873                 }
 874             }
 875         }
 876         nzone /= 2;
 877     }
 878 }
 879
 880 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 881 {
 882     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 883     int                   *index, *cgindex;
 884     gmx_domdec_comm_t     *comm;
 885     gmx_domdec_comm_dim_t *cd;
 886     gmx_domdec_ind_t      *ind;
 887     real                  *buf, *rbuf;
 888
 889     comm = dd->comm;
 890
 891     cgindex = dd->cgindex;
 892
 893     buf = &comm->vbuf.v[0][0];
 894
 895     nzone   = 1;
 896     nat_tot = dd->nat_home;
 897     for (d = 0; d < dd->ndim; d++)
 898     {
 899         cd = &comm->cd[d];
 900         for (p = 0; p < cd->np; p++)
 901         {
 902             ind   = &cd->ind[p];
 903             index = ind->index;
 904             n     = 0;
 905             for (i = 0; i < ind->nsend[nzone]; i++)
 906             {
 907                 at0 = cgindex[index[i]];
 908                 at1 = cgindex[index[i]+1];
 909                 for (j = at0; j < at1; j++)
 910                 {
 911                     buf[n] = v[j];
 912                     n++;
 913                 }
 914             }
 915
 916             if (cd->bInPlace)
 917             {
 918                 rbuf = v + nat_tot;
 919             }
 920             else
 921             {
 922                 rbuf = &comm->vbuf2.v[0][0];
 923             }
 924             /* Send and receive the coordinates */
 925             dd_sendrecv_real(dd, d, dddirBackward,
 926                              buf,  ind->nsend[nzone+1],
 927                              rbuf, ind->nrecv[nzone+1]);
 928             if (!cd->bInPlace)
 929             {
 930                 j = 0;
 931                 for (zone = 0; zone < nzone; zone++)
 932                 {
 933                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 934                     {
 935                         v[i] = rbuf[j];
 936                         j++;
 937                     }
 938                 }
 939             }
 940             nat_tot += ind->nrecv[nzone+1];
 941         }
 942         nzone += nzone;
 943     }
 944 }
 945
 946 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 947 {
 948     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 949     int                   *index, *cgindex;
 950     gmx_domdec_comm_t     *comm;
 951     gmx_domdec_comm_dim_t *cd;
 952     gmx_domdec_ind_t      *ind;
 953     real                  *buf, *sbuf;
 954
 955     comm = dd->comm;
 956
 957     cgindex = dd->cgindex;
 958
 959     buf = &comm->vbuf.v[0][0];
 960
 961     n       = 0;
 962     nzone   = comm->zones.n/2;
 963     nat_tot = dd->nat_tot;
 964     for (d = dd->ndim-1; d >= 0; d--)
 965     {
 966         cd = &comm->cd[d];
 967         for (p = cd->np-1; p >= 0; p--)
 968         {
 969             ind      = &cd->ind[p];
 970             nat_tot -= ind->nrecv[nzone+1];
 971             if (cd->bInPlace)
 972             {
 973                 sbuf = v + nat_tot;
 974             }
 975             else
 976             {
 977                 sbuf = &comm->vbuf2.v[0][0];
 978                 j    = 0;
 979                 for (zone = 0; zone < nzone; zone++)
 980                 {
 981                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 982                     {
 983                         sbuf[j] = v[i];
 984                         j++;
 985                     }
 986                 }
 987             }
 988             /* Communicate the forces */
 989             dd_sendrecv_real(dd, d, dddirForward,
 990                              sbuf, ind->nrecv[nzone+1],
 991                              buf,  ind->nsend[nzone+1]);
 992             index = ind->index;
 993             /* Add the received forces */
 994             n = 0;
 995             for (i = 0; i < ind->nsend[nzone]; i++)
 996             {
 997                 at0 = cgindex[index[i]];
 998                 at1 = cgindex[index[i]+1];
 999                 for (j = at0; j < at1; j++)
1000                 {
1001                     v[j] += buf[n];
1002                     n++;
1003                 }
1004             }
1005         }
1006         nzone /= 2;
1007     }
1008 }
1009
1010 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
1011 {
1012     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
1013             d, i, j,
1014             zone->min0, zone->max1,
1015             zone->mch0, zone->mch0,
1016             zone->p1_0, zone->p1_1);
1017 }
1018
1019
1020 #define DDZONECOMM_MAXZONE  5
1021 #define DDZONECOMM_BUFSIZE  3
1022
1023 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
1024                                int ddimind, int direction,
1025                                gmx_ddzone_t *buf_s, int n_s,
1026                                gmx_ddzone_t *buf_r, int n_r)
1027 {
1028 #define ZBS  DDZONECOMM_BUFSIZE
1029     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1030     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1031     int  i;
1032
1033     for (i = 0; i < n_s; i++)
1034     {
1035         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1036         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1037         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1038         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1039         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1040         vbuf_s[i*ZBS+1][2] = 0;
1041         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1042         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1043         vbuf_s[i*ZBS+2][2] = 0;
1044     }
1045
1046     dd_sendrecv_rvec(dd, ddimind, direction,
1047                      vbuf_s, n_s*ZBS,
1048                      vbuf_r, n_r*ZBS);
1049
1050     for (i = 0; i < n_r; i++)
1051     {
1052         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1053         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1054         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1055         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1056         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1057         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1058         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1059     }
1060
1061 #undef ZBS
1062 }
1063
1064 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
1065                           rvec cell_ns_x0, rvec cell_ns_x1)
1066 {
1067     int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
1068     gmx_ddzone_t      *zp;
1069     gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
1070     gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
1071     gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
1072     rvec               extr_s[2], extr_r[2];
1073     rvec               dh;
1074     real               dist_d, c = 0, det;
1075     gmx_domdec_comm_t *comm;
1076     gmx_bool           bPBC, bUse;
1077
1078     comm = dd->comm;
1079
1080     for (d = 1; d < dd->ndim; d++)
1081     {
1082         dim      = dd->dim[d];
1083         zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1084         zp->min0 = cell_ns_x0[dim];
1085         zp->max1 = cell_ns_x1[dim];
1086         zp->min1 = cell_ns_x1[dim];
1087         zp->mch0 = cell_ns_x0[dim];
1088         zp->mch1 = cell_ns_x1[dim];
1089         zp->p1_0 = cell_ns_x0[dim];
1090         zp->p1_1 = cell_ns_x1[dim];
1091     }
1092
1093     for (d = dd->ndim-2; d >= 0; d--)
1094     {
1095         dim  = dd->dim[d];
1096         bPBC = (dim < ddbox->npbcdim);
1097
1098         /* Use an rvec to store two reals */
1099         extr_s[d][0] = comm->cell_f0[d+1];
1100         extr_s[d][1] = comm->cell_f1[d+1];
1101         extr_s[d][2] = comm->cell_f1[d+1];
1102
1103         pos = 0;
1104         /* Store the extremes in the backward sending buffer,
1105          * so the get updated separately from the forward communication.
1106          */
1107         for (d1 = d; d1 < dd->ndim-1; d1++)
1108         {
1109             /* We invert the order to be able to use the same loop for buf_e */
1110             buf_s[pos].min0 = extr_s[d1][1];
1111             buf_s[pos].max1 = extr_s[d1][0];
1112             buf_s[pos].min1 = extr_s[d1][2];
1113             buf_s[pos].mch0 = 0;
1114             buf_s[pos].mch1 = 0;
1115             /* Store the cell corner of the dimension we communicate along */
1116             buf_s[pos].p1_0 = comm->cell_x0[dim];
1117             buf_s[pos].p1_1 = 0;
1118             pos++;
1119         }
1120
1121         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1122         pos++;
1123
1124         if (dd->ndim == 3 && d == 0)
1125         {
1126             buf_s[pos] = comm->zone_d2[0][1];
1127             pos++;
1128             buf_s[pos] = comm->zone_d1[0];
1129             pos++;
1130         }
1131
1132         /* We only need to communicate the extremes
1133          * in the forward direction
1134          */
1135         npulse = comm->cd[d].np;
1136         if (bPBC)
1137         {
1138             /* Take the minimum to avoid double communication */
1139             npulse_min = min(npulse, dd->nc[dim]-1-npulse);
1140         }
1141         else
1142         {
1143             /* Without PBC we should really not communicate over
1144              * the boundaries, but implementing that complicates
1145              * the communication setup and therefore we simply
1146              * do all communication, but ignore some data.
1147              */
1148             npulse_min = npulse;
1149         }
1150         for (p = 0; p < npulse_min; p++)
1151         {
1152             /* Communicate the extremes forward */
1153             bUse = (bPBC || dd->ci[dim] > 0);
1154
1155             dd_sendrecv_rvec(dd, d, dddirForward,
1156                              extr_s+d, dd->ndim-d-1,
1157                              extr_r+d, dd->ndim-d-1);
1158
1159             if (bUse)
1160             {
1161                 for (d1 = d; d1 < dd->ndim-1; d1++)
1162                 {
1163                     extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
1164                     extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
1165                     extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
1166                 }
1167             }
1168         }
1169
1170         buf_size = pos;
1171         for (p = 0; p < npulse; p++)
1172         {
1173             /* Communicate all the zone information backward */
1174             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1175
1176             dd_sendrecv_ddzone(dd, d, dddirBackward,
1177                                buf_s, buf_size,
1178                                buf_r, buf_size);
1179
1180             clear_rvec(dh);
1181             if (p > 0)
1182             {
1183                 for (d1 = d+1; d1 < dd->ndim; d1++)
1184                 {
1185                     /* Determine the decrease of maximum required
1186                      * communication height along d1 due to the distance along d,
1187                      * this avoids a lot of useless atom communication.
1188                      */
1189                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1190
1191                     if (ddbox->tric_dir[dim])
1192                     {
1193                         /* c is the off-diagonal coupling between the cell planes
1194                          * along directions d and d1.
1195                          */
1196                         c = ddbox->v[dim][dd->dim[d1]][dim];
1197                     }
1198                     else
1199                     {
1200                         c = 0;
1201                     }
1202                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1203                     if (det > 0)
1204                     {
1205                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1206                     }
1207                     else
1208                     {
1209                         /* A negative value signals out of range */
1210                         dh[d1] = -1;
1211                     }
1212                 }
1213             }
1214
1215             /* Accumulate the extremes over all pulses */
1216             for (i = 0; i < buf_size; i++)
1217             {
1218                 if (p == 0)
1219                 {
1220                     buf_e[i] = buf_r[i];
1221                 }
1222                 else
1223                 {
1224                     if (bUse)
1225                     {
1226                         buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
1227                         buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
1228                         buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
1229                     }
1230
1231                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1232                     {
1233                         d1 = 1;
1234                     }
1235                     else
1236                     {
1237                         d1 = d + 1;
1238                     }
1239                     if (bUse && dh[d1] >= 0)
1240                     {
1241                         buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
1242                         buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
1243                     }
1244                 }
1245                 /* Copy the received buffer to the send buffer,
1246                  * to pass the data through with the next pulse.
1247                  */
1248                 buf_s[i] = buf_r[i];
1249             }
1250             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1251                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1252             {
1253                 /* Store the extremes */
1254                 pos = 0;
1255
1256                 for (d1 = d; d1 < dd->ndim-1; d1++)
1257                 {
1258                     extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
1259                     extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
1260                     extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
1261                     pos++;
1262                 }
1263
1264                 if (d == 1 || (d == 0 && dd->ndim == 3))
1265                 {
1266                     for (i = d; i < 2; i++)
1267                     {
1268                         comm->zone_d2[1-d][i] = buf_e[pos];
1269                         pos++;
1270                     }
1271                 }
1272                 if (d == 0)
1273                 {
1274                     comm->zone_d1[1] = buf_e[pos];
1275                     pos++;
1276                 }
1277             }
1278         }
1279     }
1280
1281     if (dd->ndim >= 2)
1282     {
1283         dim = dd->dim[1];
1284         for (i = 0; i < 2; i++)
1285         {
1286             if (debug)
1287             {
1288                 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1289             }
1290             cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1291             cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1292         }
1293     }
1294     if (dd->ndim >= 3)
1295     {
1296         dim = dd->dim[2];
1297         for (i = 0; i < 2; i++)
1298         {
1299             for (j = 0; j < 2; j++)
1300             {
1301                 if (debug)
1302                 {
1303                     print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1304                 }
1305                 cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1306                 cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1307             }
1308         }
1309     }
1310     for (d = 1; d < dd->ndim; d++)
1311     {
1312         comm->cell_f_max0[d] = extr_s[d-1][0];
1313         comm->cell_f_min1[d] = extr_s[d-1][1];
1314         if (debug)
1315         {
1316             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1317                     d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1318         }
1319     }
1320 }
1321
1322 static void dd_collect_cg(gmx_domdec_t *dd,
1323                           t_state      *state_local)
1324 {
1325     gmx_domdec_master_t *ma = NULL;
1326     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1327
1328     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1329     {
1330         /* The master has the correct distribution */
1331         return;
1332     }
1333
1334     if (state_local->ddp_count == dd->ddp_count)
1335     {
1336         /* The local state and DD are in sync, use the DD indices */
1337         ncg_home = dd->ncg_home;
1338         cg       = dd->index_gl;
1339         nat_home = dd->nat_home;
1340     }
1341     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1342     {
1343         /* The DD is out of sync with the local state, but we have stored
1344          * the cg indices with the local state, so we can use those.
1345          */
1346         t_block *cgs_gl;
1347
1348         cgs_gl = &dd->comm->cgs_gl;
1349
1350         ncg_home = state_local->ncg_gl;
1351         cg       = state_local->cg_gl;
1352         nat_home = 0;
1353         for (i = 0; i < ncg_home; i++)
1354         {
1355             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1356         }
1357     }
1358     else
1359     {
1360         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1361     }
1362
1363     buf2[0] = ncg_home;
1364     buf2[1] = nat_home;
1365     if (DDMASTER(dd))
1366     {
1367         ma   = dd->ma;
1368         ibuf = ma->ibuf;
1369     }
1370     else
1371     {
1372         ibuf = NULL;
1373     }
1374     /* Collect the charge group and atom counts on the master */
1375     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1376
1377     if (DDMASTER(dd))
1378     {
1379         ma->index[0] = 0;
1380         for (i = 0; i < dd->nnodes; i++)
1381         {
1382             ma->ncg[i]     = ma->ibuf[2*i];
1383             ma->nat[i]     = ma->ibuf[2*i+1];
1384             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1385
1386         }
1387         /* Make byte counts and indices */
1388         for (i = 0; i < dd->nnodes; i++)
1389         {
1390             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1391             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1392         }
1393         if (debug)
1394         {
1395             fprintf(debug, "Initial charge group distribution: ");
1396             for (i = 0; i < dd->nnodes; i++)
1397             {
1398                 fprintf(debug, " %d", ma->ncg[i]);
1399             }
1400             fprintf(debug, "\n");
1401         }
1402     }
1403
1404     /* Collect the charge group indices on the master */
1405     dd_gatherv(dd,
1406                ncg_home*sizeof(int), cg,
1407                DDMASTER(dd) ? ma->ibuf : NULL,
1408                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1409                DDMASTER(dd) ? ma->cg : NULL);
1410
1411     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1412 }
1413
1414 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1415                                     rvec *lv, rvec *v)
1416 {
1417     gmx_domdec_master_t *ma;
1418     int                  n, i, c, a, nalloc = 0;
1419     rvec                *buf = NULL;
1420     t_block             *cgs_gl;
1421
1422     ma = dd->ma;
1423
1424     if (!DDMASTER(dd))
1425     {
1426 #ifdef GMX_MPI
1427         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1428                  dd->rank, dd->mpi_comm_all);
1429 #endif
1430     }
1431     else
1432     {
1433         /* Copy the master coordinates to the global array */
1434         cgs_gl = &dd->comm->cgs_gl;
1435
1436         n = DDMASTERRANK(dd);
1437         a = 0;
1438         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1439         {
1440             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1441             {
1442                 copy_rvec(lv[a++], v[c]);
1443             }
1444         }
1445
1446         for (n = 0; n < dd->nnodes; n++)
1447         {
1448             if (n != dd->rank)
1449             {
1450                 if (ma->nat[n] > nalloc)
1451                 {
1452                     nalloc = over_alloc_dd(ma->nat[n]);
1453                     srenew(buf, nalloc);
1454                 }
1455 #ifdef GMX_MPI
1456                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1457                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1458 #endif
1459                 a = 0;
1460                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1461                 {
1462                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1463                     {
1464                         copy_rvec(buf[a++], v[c]);
1465                     }
1466                 }
1467             }
1468         }
1469         sfree(buf);
1470     }
1471 }
1472
1473 static void get_commbuffer_counts(gmx_domdec_t *dd,
1474                                   int **counts, int **disps)
1475 {
1476     gmx_domdec_master_t *ma;
1477     int                  n;
1478
1479     ma = dd->ma;
1480
1481     /* Make the rvec count and displacment arrays */
1482     *counts  = ma->ibuf;
1483     *disps   = ma->ibuf + dd->nnodes;
1484     for (n = 0; n < dd->nnodes; n++)
1485     {
1486         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1487         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1488     }
1489 }
1490
1491 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1492                                    rvec *lv, rvec *v)
1493 {
1494     gmx_domdec_master_t *ma;
1495     int                 *rcounts = NULL, *disps = NULL;
1496     int                  n, i, c, a;
1497     rvec                *buf = NULL;
1498     t_block             *cgs_gl;
1499
1500     ma = dd->ma;
1501
1502     if (DDMASTER(dd))
1503     {
1504         get_commbuffer_counts(dd, &rcounts, &disps);
1505
1506         buf = ma->vbuf;
1507     }
1508
1509     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1510
1511     if (DDMASTER(dd))
1512     {
1513         cgs_gl = &dd->comm->cgs_gl;
1514
1515         a = 0;
1516         for (n = 0; n < dd->nnodes; n++)
1517         {
1518             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1519             {
1520                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1521                 {
1522                     copy_rvec(buf[a++], v[c]);
1523                 }
1524             }
1525         }
1526     }
1527 }
1528
1529 void dd_collect_vec(gmx_domdec_t *dd,
1530                     t_state *state_local, rvec *lv, rvec *v)
1531 {
1532     gmx_domdec_master_t *ma;
1533     int                  n, i, c, a, nalloc = 0;
1534     rvec                *buf = NULL;
1535
1536     dd_collect_cg(dd, state_local);
1537
1538     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1539     {
1540         dd_collect_vec_sendrecv(dd, lv, v);
1541     }
1542     else
1543     {
1544         dd_collect_vec_gatherv(dd, lv, v);
1545     }
1546 }
1547
1548
1549 void dd_collect_state(gmx_domdec_t *dd,
1550                       t_state *state_local, t_state *state)
1551 {
1552     int est, i, j, nh;
1553
1554     nh = state->nhchainlength;
1555
1556     if (DDMASTER(dd))
1557     {
1558         for (i = 0; i < efptNR; i++)
1559         {
1560             state->lambda[i] = state_local->lambda[i];
1561         }
1562         state->fep_state = state_local->fep_state;
1563         state->veta      = state_local->veta;
1564         state->vol0      = state_local->vol0;
1565         copy_mat(state_local->box, state->box);
1566         copy_mat(state_local->boxv, state->boxv);
1567         copy_mat(state_local->svir_prev, state->svir_prev);
1568         copy_mat(state_local->fvir_prev, state->fvir_prev);
1569         copy_mat(state_local->pres_prev, state->pres_prev);
1570
1571         for (i = 0; i < state_local->ngtc; i++)
1572         {
1573             for (j = 0; j < nh; j++)
1574             {
1575                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1576                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1577             }
1578             state->therm_integral[i] = state_local->therm_integral[i];
1579         }
1580         for (i = 0; i < state_local->nnhpres; i++)
1581         {
1582             for (j = 0; j < nh; j++)
1583             {
1584                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1585                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1586             }
1587         }
1588     }
1589     for (est = 0; est < estNR; est++)
1590     {
1591         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1592         {
1593             switch (est)
1594             {
1595                 case estX:
1596                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1597                     break;
1598                 case estV:
1599                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1600                     break;
1601                 case estSDX:
1602                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1603                     break;
1604                 case estCGP:
1605                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1606                     break;
1607                 case estDISRE_INITF:
1608                 case estDISRE_RM3TAV:
1609                 case estORIRE_INITF:
1610                 case estORIRE_DTAV:
1611                     break;
1612                 default:
1613                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1614             }
1615         }
1616     }
1617 }
1618
1619 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1620 {
1621     int est;
1622
1623     if (debug)
1624     {
1625         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1626     }
1627
1628     state->nalloc = over_alloc_dd(nalloc);
1629
1630     for (est = 0; est < estNR; est++)
1631     {
1632         if (EST_DISTR(est) && (state->flags & (1<<est)))
1633         {
1634             switch (est)
1635             {
1636                 case estX:
1637                     srenew(state->x, state->nalloc);
1638                     break;
1639                 case estV:
1640                     srenew(state->v, state->nalloc);
1641                     break;
1642                 case estSDX:
1643                     srenew(state->sd_X, state->nalloc);
1644                     break;
1645                 case estCGP:
1646                     srenew(state->cg_p, state->nalloc);
1647                     break;
1648                 case estDISRE_INITF:
1649                 case estDISRE_RM3TAV:
1650                 case estORIRE_INITF:
1651                 case estORIRE_DTAV:
1652                     /* No reallocation required */
1653                     break;
1654                 default:
1655                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1656             }
1657         }
1658     }
1659
1660     if (f != NULL)
1661     {
1662         srenew(*f, state->nalloc);
1663     }
1664 }
1665
1666 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1667                                int nalloc)
1668 {
1669     if (nalloc > fr->cg_nalloc)
1670     {
1671         if (debug)
1672         {
1673             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1674         }
1675         fr->cg_nalloc = over_alloc_dd(nalloc);
1676         srenew(fr->cginfo, fr->cg_nalloc);
1677         if (fr->cutoff_scheme == ecutsGROUP)
1678         {
1679             srenew(fr->cg_cm, fr->cg_nalloc);
1680         }
1681     }
1682     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1683     {
1684         /* We don't use charge groups, we use x in state to set up
1685          * the atom communication.
1686          */
1687         dd_realloc_state(state, f, nalloc);
1688     }
1689 }
1690
1691 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1692                                        rvec *v, rvec *lv)
1693 {
1694     gmx_domdec_master_t *ma;
1695     int                  n, i, c, a, nalloc = 0;
1696     rvec                *buf = NULL;
1697
1698     if (DDMASTER(dd))
1699     {
1700         ma  = dd->ma;
1701
1702         for (n = 0; n < dd->nnodes; n++)
1703         {
1704             if (n != dd->rank)
1705             {
1706                 if (ma->nat[n] > nalloc)
1707                 {
1708                     nalloc = over_alloc_dd(ma->nat[n]);
1709                     srenew(buf, nalloc);
1710                 }
1711                 /* Use lv as a temporary buffer */
1712                 a = 0;
1713                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1714                 {
1715                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1716                     {
1717                         copy_rvec(v[c], buf[a++]);
1718                     }
1719                 }
1720                 if (a != ma->nat[n])
1721                 {
1722                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1723                               a, ma->nat[n]);
1724                 }
1725
1726 #ifdef GMX_MPI
1727                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1728                          DDRANK(dd, n), n, dd->mpi_comm_all);
1729 #endif
1730             }
1731         }
1732         sfree(buf);
1733         n = DDMASTERRANK(dd);
1734         a = 0;
1735         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1736         {
1737             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1738             {
1739                 copy_rvec(v[c], lv[a++]);
1740             }
1741         }
1742     }
1743     else
1744     {
1745 #ifdef GMX_MPI
1746         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1747                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1748 #endif
1749     }
1750 }
1751
1752 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1753                                        rvec *v, rvec *lv)
1754 {
1755     gmx_domdec_master_t *ma;
1756     int                 *scounts = NULL, *disps = NULL;
1757     int                  n, i, c, a, nalloc = 0;
1758     rvec                *buf = NULL;
1759
1760     if (DDMASTER(dd))
1761     {
1762         ma  = dd->ma;
1763
1764         get_commbuffer_counts(dd, &scounts, &disps);
1765
1766         buf = ma->vbuf;
1767         a   = 0;
1768         for (n = 0; n < dd->nnodes; n++)
1769         {
1770             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1771             {
1772                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1773                 {
1774                     copy_rvec(v[c], buf[a++]);
1775                 }
1776             }
1777         }
1778     }
1779
1780     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1781 }
1782
1783 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1784 {
1785     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1786     {
1787         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1788     }
1789     else
1790     {
1791         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1792     }
1793 }
1794
1795 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1796 {
1797     int i;
1798     dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1799     dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1800     dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1801
1802     if (dfhist->nlambda > 0)
1803     {
1804         int nlam = dfhist->nlambda;
1805         dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1806         dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1807         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1808         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1809         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1810         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1811
1812         for (i = 0; i < nlam; i++)
1813         {
1814             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1815             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1816             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1817             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1818             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1819             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
1820         }
1821     }
1822 }
1823
1824 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1825                                 t_state *state, t_state *state_local,
1826                                 rvec **f)
1827 {
1828     int  i, j, nh;
1829
1830     nh = state->nhchainlength;
1831
1832     if (DDMASTER(dd))
1833     {
1834         for (i = 0; i < efptNR; i++)
1835         {
1836             state_local->lambda[i] = state->lambda[i];
1837         }
1838         state_local->fep_state = state->fep_state;
1839         state_local->veta      = state->veta;
1840         state_local->vol0      = state->vol0;
1841         copy_mat(state->box, state_local->box);
1842         copy_mat(state->box_rel, state_local->box_rel);
1843         copy_mat(state->boxv, state_local->boxv);
1844         copy_mat(state->svir_prev, state_local->svir_prev);
1845         copy_mat(state->fvir_prev, state_local->fvir_prev);
1846         copy_df_history(&state_local->dfhist, &state->dfhist);
1847         for (i = 0; i < state_local->ngtc; i++)
1848         {
1849             for (j = 0; j < nh; j++)
1850             {
1851                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1852                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1853             }
1854             state_local->therm_integral[i] = state->therm_integral[i];
1855         }
1856         for (i = 0; i < state_local->nnhpres; i++)
1857         {
1858             for (j = 0; j < nh; j++)
1859             {
1860                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1861                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1862             }
1863         }
1864     }
1865     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1866     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1867     dd_bcast(dd, sizeof(real), &state_local->veta);
1868     dd_bcast(dd, sizeof(real), &state_local->vol0);
1869     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1870     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1871     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1872     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1873     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1874     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1875     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1876     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1877     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1878     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1879
1880     /* communicate df_history -- required for restarting from checkpoint */
1881     dd_distribute_dfhist(dd, &state_local->dfhist);
1882
1883     if (dd->nat_home > state_local->nalloc)
1884     {
1885         dd_realloc_state(state_local, f, dd->nat_home);
1886     }
1887     for (i = 0; i < estNR; i++)
1888     {
1889         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1890         {
1891             switch (i)
1892             {
1893                 case estX:
1894                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1895                     break;
1896                 case estV:
1897                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1898                     break;
1899                 case estSDX:
1900                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1901                     break;
1902                 case estCGP:
1903                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1904                     break;
1905                 case estDISRE_INITF:
1906                 case estDISRE_RM3TAV:
1907                 case estORIRE_INITF:
1908                 case estORIRE_DTAV:
1909                     /* Not implemented yet */
1910                     break;
1911                 default:
1912                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1913             }
1914         }
1915     }
1916 }
1917
1918 static char dim2char(int dim)
1919 {
1920     char c = '?';
1921
1922     switch (dim)
1923     {
1924         case XX: c = 'X'; break;
1925         case YY: c = 'Y'; break;
1926         case ZZ: c = 'Z'; break;
1927         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1928     }
1929
1930     return c;
1931 }
1932
1933 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1934                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1935 {
1936     rvec   grid_s[2], *grid_r = NULL, cx, r;
1937     char   fname[STRLEN], buf[22];
1938     FILE  *out;
1939     int    a, i, d, z, y, x;
1940     matrix tric;
1941     real   vol;
1942
1943     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1944     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1945
1946     if (DDMASTER(dd))
1947     {
1948         snew(grid_r, 2*dd->nnodes);
1949     }
1950
1951     dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : NULL);
1952
1953     if (DDMASTER(dd))
1954     {
1955         for (d = 0; d < DIM; d++)
1956         {
1957             for (i = 0; i < DIM; i++)
1958             {
1959                 if (d == i)
1960                 {
1961                     tric[d][i] = 1;
1962                 }
1963                 else
1964                 {
1965                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1966                     {
1967                         tric[d][i] = box[i][d]/box[i][i];
1968                     }
1969                     else
1970                     {
1971                         tric[d][i] = 0;
1972                     }
1973                 }
1974             }
1975         }
1976         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1977         out = gmx_fio_fopen(fname, "w");
1978         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1979         a = 1;
1980         for (i = 0; i < dd->nnodes; i++)
1981         {
1982             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1983             for (d = 0; d < DIM; d++)
1984             {
1985                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1986             }
1987             for (z = 0; z < 2; z++)
1988             {
1989                 for (y = 0; y < 2; y++)
1990                 {
1991                     for (x = 0; x < 2; x++)
1992                     {
1993                         cx[XX] = grid_r[i*2+x][XX];
1994                         cx[YY] = grid_r[i*2+y][YY];
1995                         cx[ZZ] = grid_r[i*2+z][ZZ];
1996                         mvmul(tric, cx, r);
1997                         gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
1998                                                  10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
1999                     }
2000                 }
2001             }
2002             for (d = 0; d < DIM; d++)
2003             {
2004                 for (x = 0; x < 4; x++)
2005                 {
2006                     switch (d)
2007                     {
2008                         case 0: y = 1 + i*8 + 2*x; break;
2009                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
2010                         case 2: y = 1 + i*8 + x; break;
2011                     }
2012                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
2013                 }
2014             }
2015         }
2016         gmx_fio_fclose(out);
2017         sfree(grid_r);
2018     }
2019 }
2020
2021 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
2022                   gmx_mtop_t *mtop, t_commrec *cr,
2023                   int natoms, rvec x[], matrix box)
2024 {
2025     char          fname[STRLEN], buf[22];
2026     FILE         *out;
2027     int           i, ii, resnr, c;
2028     char         *atomname, *resname;
2029     real          b;
2030     gmx_domdec_t *dd;
2031
2032     dd = cr->dd;
2033     if (natoms == -1)
2034     {
2035         natoms = dd->comm->nat[ddnatVSITE];
2036     }
2037
2038     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
2039
2040     out = gmx_fio_fopen(fname, "w");
2041
2042     fprintf(out, "TITLE     %s\n", title);
2043     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2044     for (i = 0; i < natoms; i++)
2045     {
2046         ii = dd->gatindex[i];
2047         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
2048         if (i < dd->comm->nat[ddnatZONE])
2049         {
2050             c = 0;
2051             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2052             {
2053                 c++;
2054             }
2055             b = c;
2056         }
2057         else if (i < dd->comm->nat[ddnatVSITE])
2058         {
2059             b = dd->comm->zones.n;
2060         }
2061         else
2062         {
2063             b = dd->comm->zones.n + 1;
2064         }
2065         gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
2066                                  10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
2067     }
2068     fprintf(out, "TER\n");
2069
2070     gmx_fio_fclose(out);
2071 }
2072
2073 real dd_cutoff_mbody(gmx_domdec_t *dd)
2074 {
2075     gmx_domdec_comm_t *comm;
2076     int                di;
2077     real               r;
2078
2079     comm = dd->comm;
2080
2081     r = -1;
2082     if (comm->bInterCGBondeds)
2083     {
2084         if (comm->cutoff_mbody > 0)
2085         {
2086             r = comm->cutoff_mbody;
2087         }
2088         else
2089         {
2090             /* cutoff_mbody=0 means we do not have DLB */
2091             r = comm->cellsize_min[dd->dim[0]];
2092             for (di = 1; di < dd->ndim; di++)
2093             {
2094                 r = min(r, comm->cellsize_min[dd->dim[di]]);
2095             }
2096             if (comm->bBondComm)
2097             {
2098                 r = max(r, comm->cutoff_mbody);
2099             }
2100             else
2101             {
2102                 r = min(r, comm->cutoff);
2103             }
2104         }
2105     }
2106
2107     return r;
2108 }
2109
2110 real dd_cutoff_twobody(gmx_domdec_t *dd)
2111 {
2112     real r_mb;
2113
2114     r_mb = dd_cutoff_mbody(dd);
2115
2116     return max(dd->comm->cutoff, r_mb);
2117 }
2118
2119
2120 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
2121 {
2122     int nc, ntot;
2123
2124     nc   = dd->nc[dd->comm->cartpmedim];
2125     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2126     copy_ivec(coord, coord_pme);
2127     coord_pme[dd->comm->cartpmedim] =
2128         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2129 }
2130
2131 static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
2132 {
2133     /* Here we assign a PME node to communicate with this DD node
2134      * by assuming that the major index of both is x.
2135      * We add cr->npmenodes/2 to obtain an even distribution.
2136      */
2137     return (ddindex*npme + npme/2)/ndd;
2138 }
2139
2140 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
2141 {
2142     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
2143 }
2144
2145 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
2146 {
2147     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
2148 }
2149
2150 static int *dd_pmenodes(t_commrec *cr)
2151 {
2152     int *pmenodes;
2153     int  n, i, p0, p1;
2154
2155     snew(pmenodes, cr->npmenodes);
2156     n = 0;
2157     for (i = 0; i < cr->dd->nnodes; i++)
2158     {
2159         p0 = cr_ddindex2pmeindex(cr, i);
2160         p1 = cr_ddindex2pmeindex(cr, i+1);
2161         if (i+1 == cr->dd->nnodes || p1 > p0)
2162         {
2163             if (debug)
2164             {
2165                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
2166             }
2167             pmenodes[n] = i + 1 + n;
2168             n++;
2169         }
2170     }
2171
2172     return pmenodes;
2173 }
2174
2175 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
2176 {
2177     gmx_domdec_t *dd;
2178     ivec          coords, coords_pme, nc;
2179     int           slab;
2180
2181     dd = cr->dd;
2182     /*
2183        if (dd->comm->bCartesian) {
2184        gmx_ddindex2xyz(dd->nc,ddindex,coords);
2185        dd_coords2pmecoords(dd,coords,coords_pme);
2186        copy_ivec(dd->ntot,nc);
2187        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2188        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2189
2190        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2191        } else {
2192        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2193        }
2194      */
2195     coords[XX] = x;
2196     coords[YY] = y;
2197     coords[ZZ] = z;
2198     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
2199
2200     return slab;
2201 }
2202
2203 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
2204 {
2205     gmx_domdec_comm_t *comm;
2206     ivec               coords;
2207     int                ddindex, nodeid = -1;
2208
2209     comm = cr->dd->comm;
2210
2211     coords[XX] = x;
2212     coords[YY] = y;
2213     coords[ZZ] = z;
2214     if (comm->bCartesianPP_PME)
2215     {
2216 #ifdef GMX_MPI
2217         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
2218 #endif
2219     }
2220     else
2221     {
2222         ddindex = dd_index(cr->dd->nc, coords);
2223         if (comm->bCartesianPP)
2224         {
2225             nodeid = comm->ddindex2simnodeid[ddindex];
2226         }
2227         else
2228         {
2229             if (comm->pmenodes)
2230             {
2231                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
2232             }
2233             else
2234             {
2235                 nodeid = ddindex;
2236             }
2237         }
2238     }
2239
2240     return nodeid;
2241 }
2242
2243 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
2244 {
2245     gmx_domdec_t      *dd;
2246     gmx_domdec_comm_t *comm;
2247     ivec               coord, coord_pme;
2248     int                i;
2249     int                pmenode = -1;
2250
2251     dd   = cr->dd;
2252     comm = dd->comm;
2253
2254     /* This assumes a uniform x domain decomposition grid cell size */
2255     if (comm->bCartesianPP_PME)
2256     {
2257 #ifdef GMX_MPI
2258         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2259         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2260         {
2261             /* This is a PP node */
2262             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2263             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2264         }
2265 #endif
2266     }
2267     else if (comm->bCartesianPP)
2268     {
2269         if (sim_nodeid < dd->nnodes)
2270         {
2271             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2272         }
2273     }
2274     else
2275     {
2276         /* This assumes DD cells with identical x coordinates
2277          * are numbered sequentially.
2278          */
2279         if (dd->comm->pmenodes == NULL)
2280         {
2281             if (sim_nodeid < dd->nnodes)
2282             {
2283                 /* The DD index equals the nodeid */
2284                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2285             }
2286         }
2287         else
2288         {
2289             i = 0;
2290             while (sim_nodeid > dd->comm->pmenodes[i])
2291             {
2292                 i++;
2293             }
2294             if (sim_nodeid < dd->comm->pmenodes[i])
2295             {
2296                 pmenode = dd->comm->pmenodes[i];
2297             }
2298         }
2299     }
2300
2301     return pmenode;
2302 }
2303
2304 void get_pme_nnodes(const gmx_domdec_t *dd,
2305                     int *npmenodes_x, int *npmenodes_y)
2306 {
2307     if (dd != NULL)
2308     {
2309         *npmenodes_x = dd->comm->npmenodes_x;
2310         *npmenodes_y = dd->comm->npmenodes_y;
2311     }
2312     else
2313     {
2314         *npmenodes_x = 1;
2315         *npmenodes_y = 1;
2316     }
2317 }
2318
2319 gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
2320 {
2321     gmx_bool bPMEOnlyNode;
2322
2323     if (DOMAINDECOMP(cr))
2324     {
2325         bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
2326     }
2327     else
2328     {
2329         bPMEOnlyNode = FALSE;
2330     }
2331
2332     return bPMEOnlyNode;
2333 }
2334
2335 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2336                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2337 {
2338     gmx_domdec_t *dd;
2339     int           x, y, z;
2340     ivec          coord, coord_pme;
2341
2342     dd = cr->dd;
2343
2344     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2345
2346     *nmy_ddnodes = 0;
2347     for (x = 0; x < dd->nc[XX]; x++)
2348     {
2349         for (y = 0; y < dd->nc[YY]; y++)
2350         {
2351             for (z = 0; z < dd->nc[ZZ]; z++)
2352             {
2353                 if (dd->comm->bCartesianPP_PME)
2354                 {
2355                     coord[XX] = x;
2356                     coord[YY] = y;
2357                     coord[ZZ] = z;
2358                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2359                     if (dd->ci[XX] == coord_pme[XX] &&
2360                         dd->ci[YY] == coord_pme[YY] &&
2361                         dd->ci[ZZ] == coord_pme[ZZ])
2362                     {
2363                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2364                     }
2365                 }
2366                 else
2367                 {
2368                     /* The slab corresponds to the nodeid in the PME group */
2369                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2370                     {
2371                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2372                     }
2373                 }
2374             }
2375         }
2376     }
2377
2378     /* The last PP-only node is the peer node */
2379     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2380
2381     if (debug)
2382     {
2383         fprintf(debug, "Receive coordinates from PP ranks:");
2384         for (x = 0; x < *nmy_ddnodes; x++)
2385         {
2386             fprintf(debug, " %d", (*my_ddnodes)[x]);
2387         }
2388         fprintf(debug, "\n");
2389     }
2390 }
2391
2392 static gmx_bool receive_vir_ener(t_commrec *cr)
2393 {
2394     gmx_domdec_comm_t *comm;
2395     int                pmenode, coords[DIM], rank;
2396     gmx_bool           bReceive;
2397
2398     bReceive = TRUE;
2399     if (cr->npmenodes < cr->dd->nnodes)
2400     {
2401         comm = cr->dd->comm;
2402         if (comm->bCartesianPP_PME)
2403         {
2404             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2405 #ifdef GMX_MPI
2406             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2407             coords[comm->cartpmedim]++;
2408             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2409             {
2410                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2411                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2412                 {
2413                     /* This is not the last PP node for pmenode */
2414                     bReceive = FALSE;
2415                 }
2416             }
2417 #endif
2418         }
2419         else
2420         {
2421             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2422             if (cr->sim_nodeid+1 < cr->nnodes &&
2423                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2424             {
2425                 /* This is not the last PP node for pmenode */
2426                 bReceive = FALSE;
2427             }
2428         }
2429     }
2430
2431     return bReceive;
2432 }
2433
2434 static void set_zones_ncg_home(gmx_domdec_t *dd)
2435 {
2436     gmx_domdec_zones_t *zones;
2437     int                 i;
2438
2439     zones = &dd->comm->zones;
2440
2441     zones->cg_range[0] = 0;
2442     for (i = 1; i < zones->n+1; i++)
2443     {
2444         zones->cg_range[i] = dd->ncg_home;
2445     }
2446     /* zone_ncg1[0] should always be equal to ncg_home */
2447     dd->comm->zone_ncg1[0] = dd->ncg_home;
2448 }
2449
2450 static void rebuild_cgindex(gmx_domdec_t *dd,
2451                             const int *gcgs_index, t_state *state)
2452 {
2453     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2454
2455     ind        = state->cg_gl;
2456     dd_cg_gl   = dd->index_gl;
2457     cgindex    = dd->cgindex;
2458     nat        = 0;
2459     cgindex[0] = nat;
2460     for (i = 0; i < state->ncg_gl; i++)
2461     {
2462         cgindex[i]  = nat;
2463         cg_gl       = ind[i];
2464         dd_cg_gl[i] = cg_gl;
2465         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2466     }
2467     cgindex[i] = nat;
2468
2469     dd->ncg_home = state->ncg_gl;
2470     dd->nat_home = nat;
2471
2472     set_zones_ncg_home(dd);
2473 }
2474
2475 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2476 {
2477     while (cg >= cginfo_mb->cg_end)
2478     {
2479         cginfo_mb++;
2480     }
2481
2482     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2483 }
2484
2485 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2486                           t_forcerec *fr, char *bLocalCG)
2487 {
2488     cginfo_mb_t *cginfo_mb;
2489     int         *cginfo;
2490     int          cg;
2491
2492     if (fr != NULL)
2493     {
2494         cginfo_mb = fr->cginfo_mb;
2495         cginfo    = fr->cginfo;
2496
2497         for (cg = cg0; cg < cg1; cg++)
2498         {
2499             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2500         }
2501     }
2502
2503     if (bLocalCG != NULL)
2504     {
2505         for (cg = cg0; cg < cg1; cg++)
2506         {
2507             bLocalCG[index_gl[cg]] = TRUE;
2508         }
2509     }
2510 }
2511
2512 static void make_dd_indices(gmx_domdec_t *dd,
2513                             const int *gcgs_index, int cg_start)
2514 {
2515     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2516     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2517     gmx_ga2la_t *ga2la;
2518     char        *bLocalCG;
2519     gmx_bool     bCGs;
2520
2521     bLocalCG = dd->comm->bLocalCG;
2522
2523     if (dd->nat_tot > dd->gatindex_nalloc)
2524     {
2525         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2526         srenew(dd->gatindex, dd->gatindex_nalloc);
2527     }
2528
2529     nzone      = dd->comm->zones.n;
2530     zone2cg    = dd->comm->zones.cg_range;
2531     zone_ncg1  = dd->comm->zone_ncg1;
2532     index_gl   = dd->index_gl;
2533     gatindex   = dd->gatindex;
2534     bCGs       = dd->comm->bCGs;
2535
2536     if (zone2cg[1] != dd->ncg_home)
2537     {
2538         gmx_incons("dd->ncg_zone is not up to date");
2539     }
2540
2541     /* Make the local to global and global to local atom index */
2542     a = dd->cgindex[cg_start];
2543     for (zone = 0; zone < nzone; zone++)
2544     {
2545         if (zone == 0)
2546         {
2547             cg0 = cg_start;
2548         }
2549         else
2550         {
2551             cg0 = zone2cg[zone];
2552         }
2553         cg1    = zone2cg[zone+1];
2554         cg1_p1 = cg0 + zone_ncg1[zone];
2555
2556         for (cg = cg0; cg < cg1; cg++)
2557         {
2558             zone1 = zone;
2559             if (cg >= cg1_p1)
2560             {
2561                 /* Signal that this cg is from more than one pulse away */
2562                 zone1 += nzone;
2563             }
2564             cg_gl = index_gl[cg];
2565             if (bCGs)
2566             {
2567                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2568                 {
2569                     gatindex[a] = a_gl;
2570                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2571                     a++;
2572                 }
2573             }
2574             else
2575             {
2576                 gatindex[a] = cg_gl;
2577                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2578                 a++;
2579             }
2580         }
2581     }
2582 }
2583
2584 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2585                           const char *where)
2586 {
2587     int ncg, i, ngl, nerr;
2588
2589     nerr = 0;
2590     if (bLocalCG == NULL)
2591     {
2592         return nerr;
2593     }
2594     for (i = 0; i < dd->ncg_tot; i++)
2595     {
2596         if (!bLocalCG[dd->index_gl[i]])
2597         {
2598             fprintf(stderr,
2599                     "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2600             nerr++;
2601         }
2602     }
2603     ngl = 0;
2604     for (i = 0; i < ncg_sys; i++)
2605     {
2606         if (bLocalCG[i])
2607         {
2608             ngl++;
2609         }
2610     }
2611     if (ngl != dd->ncg_tot)
2612     {
2613         fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2614         nerr++;
2615     }
2616
2617     return nerr;
2618 }
2619
2620 static void check_index_consistency(gmx_domdec_t *dd,
2621                                     int natoms_sys, int ncg_sys,
2622                                     const char *where)
2623 {
2624     int   nerr, ngl, i, a, cell;
2625     int  *have;
2626
2627     nerr = 0;
2628
2629     if (dd->comm->DD_debug > 1)
2630     {
2631         snew(have, natoms_sys);
2632         for (a = 0; a < dd->nat_tot; a++)
2633         {
2634             if (have[dd->gatindex[a]] > 0)
2635             {
2636                 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2637             }
2638             else
2639             {
2640                 have[dd->gatindex[a]] = a + 1;
2641             }
2642         }
2643         sfree(have);
2644     }
2645
2646     snew(have, dd->nat_tot);
2647
2648     ngl  = 0;
2649     for (i = 0; i < natoms_sys; i++)
2650     {
2651         if (ga2la_get(dd->ga2la, i, &a, &cell))
2652         {
2653             if (a >= dd->nat_tot)
2654             {
2655                 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2656                 nerr++;
2657             }
2658             else
2659             {
2660                 have[a] = 1;
2661                 if (dd->gatindex[a] != i)
2662                 {
2663                     fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2664                     nerr++;
2665                 }
2666             }
2667             ngl++;
2668         }
2669     }
2670     if (ngl != dd->nat_tot)
2671     {
2672         fprintf(stderr,
2673                 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2674                 dd->rank, where, ngl, dd->nat_tot);
2675     }
2676     for (a = 0; a < dd->nat_tot; a++)
2677     {
2678         if (have[a] == 0)
2679         {
2680             fprintf(stderr,
2681                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
2682                     dd->rank, where, a+1, dd->gatindex[a]+1);
2683         }
2684     }
2685     sfree(have);
2686
2687     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2688
2689     if (nerr > 0)
2690     {
2691         gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2692                   dd->rank, where, nerr);
2693     }
2694 }
2695
2696 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2697 {
2698     int   i;
2699     char *bLocalCG;
2700
2701     if (a_start == 0)
2702     {
2703         /* Clear the whole list without searching */
2704         ga2la_clear(dd->ga2la);
2705     }
2706     else
2707     {
2708         for (i = a_start; i < dd->nat_tot; i++)
2709         {
2710             ga2la_del(dd->ga2la, dd->gatindex[i]);
2711         }
2712     }
2713
2714     bLocalCG = dd->comm->bLocalCG;
2715     if (bLocalCG)
2716     {
2717         for (i = cg_start; i < dd->ncg_tot; i++)
2718         {
2719             bLocalCG[dd->index_gl[i]] = FALSE;
2720         }
2721     }
2722
2723     dd_clear_local_vsite_indices(dd);
2724
2725     if (dd->constraints)
2726     {
2727         dd_clear_local_constraint_indices(dd);
2728     }
2729 }
2730
2731 /* This function should be used for moving the domain boudaries during DLB,
2732  * for obtaining the minimum cell size. It checks the initially set limit
2733  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2734  * and, possibly, a longer cut-off limit set for PME load balancing.
2735  */
2736 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2737 {
2738     real cellsize_min;
2739
2740     cellsize_min = comm->cellsize_min[dim];
2741
2742     if (!comm->bVacDLBNoLimit)
2743     {
2744         /* The cut-off might have changed, e.g. by PME load balacning,
2745          * from the value used to set comm->cellsize_min, so check it.
2746          */
2747         cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2748
2749         if (comm->bPMELoadBalDLBLimits)
2750         {
2751             /* Check for the cut-off limit set by the PME load balancing */
2752             cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2753         }
2754     }
2755
2756     return cellsize_min;
2757 }
2758
2759 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2760                             int dim_ind)
2761 {
2762     real grid_jump_limit;
2763
2764     /* The distance between the boundaries of cells at distance
2765      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2766      * and by the fact that cells should not be shifted by more than
2767      * half their size, such that cg's only shift by one cell
2768      * at redecomposition.
2769      */
2770     grid_jump_limit = comm->cellsize_limit;
2771     if (!comm->bVacDLBNoLimit)
2772     {
2773         if (comm->bPMELoadBalDLBLimits)
2774         {
2775             cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
2776         }
2777         grid_jump_limit = max(grid_jump_limit,
2778                               cutoff/comm->cd[dim_ind].np);
2779     }
2780
2781     return grid_jump_limit;
2782 }
2783
2784 static gmx_bool check_grid_jump(gmx_int64_t     step,
2785                                 gmx_domdec_t   *dd,
2786                                 real            cutoff,
2787                                 gmx_ddbox_t    *ddbox,
2788                                 gmx_bool        bFatal)
2789 {
2790     gmx_domdec_comm_t *comm;
2791     int                d, dim;
2792     real               limit, bfac;
2793     gmx_bool           bInvalid;
2794
2795     bInvalid = FALSE;
2796
2797     comm = dd->comm;
2798
2799     for (d = 1; d < dd->ndim; d++)
2800     {
2801         dim   = dd->dim[d];
2802         limit = grid_jump_limit(comm, cutoff, d);
2803         bfac  = ddbox->box_size[dim];
2804         if (ddbox->tric_dir[dim])
2805         {
2806             bfac *= ddbox->skew_fac[dim];
2807         }
2808         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2809                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2810         {
2811             bInvalid = TRUE;
2812
2813             if (bFatal)
2814             {
2815                 char buf[22];
2816
2817                 /* This error should never be triggered under normal
2818                  * circumstances, but you never know ...
2819                  */
2820                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2821                           gmx_step_str(step, buf),
2822                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2823             }
2824         }
2825     }
2826
2827     return bInvalid;
2828 }
2829
2830 static int dd_load_count(gmx_domdec_comm_t *comm)
2831 {
2832     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2833 }
2834
2835 static float dd_force_load(gmx_domdec_comm_t *comm)
2836 {
2837     float load;
2838
2839     if (comm->eFlop)
2840     {
2841         load = comm->flop;
2842         if (comm->eFlop > 1)
2843         {
2844             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2845         }
2846     }
2847     else
2848     {
2849         load = comm->cycl[ddCyclF];
2850         if (comm->cycl_n[ddCyclF] > 1)
2851         {
2852             /* Subtract the maximum of the last n cycle counts
2853              * to get rid of possible high counts due to other sources,
2854              * for instance system activity, that would otherwise
2855              * affect the dynamic load balancing.
2856              */
2857             load -= comm->cycl_max[ddCyclF];
2858         }
2859
2860 #ifdef GMX_MPI
2861         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2862         {
2863             float gpu_wait, gpu_wait_sum;
2864
2865             gpu_wait = comm->cycl[ddCyclWaitGPU];
2866             if (comm->cycl_n[ddCyclF] > 1)
2867             {
2868                 /* We should remove the WaitGPU time of the same MD step
2869                  * as the one with the maximum F time, since the F time
2870                  * and the wait time are not independent.
2871                  * Furthermore, the step for the max F time should be chosen
2872                  * the same on all ranks that share the same GPU.
2873                  * But to keep the code simple, we remove the average instead.
2874                  * The main reason for artificially long times at some steps
2875                  * is spurious CPU activity or MPI time, so we don't expect
2876                  * that changes in the GPU wait time matter a lot here.
2877                  */
2878                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2879             }
2880             /* Sum the wait times over the ranks that share the same GPU */
2881             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2882                           comm->mpi_comm_gpu_shared);
2883             /* Replace the wait time by the average over the ranks */
2884             load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2885         }
2886 #endif
2887     }
2888
2889     return load;
2890 }
2891
2892 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2893 {
2894     gmx_domdec_comm_t *comm;
2895     int                i;
2896
2897     comm = dd->comm;
2898
2899     snew(*dim_f, dd->nc[dim]+1);
2900     (*dim_f)[0] = 0;
2901     for (i = 1; i < dd->nc[dim]; i++)
2902     {
2903         if (comm->slb_frac[dim])
2904         {
2905             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2906         }
2907         else
2908         {
2909             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2910         }
2911     }
2912     (*dim_f)[dd->nc[dim]] = 1;
2913 }
2914
2915 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2916 {
2917     int  pmeindex, slab, nso, i;
2918     ivec xyz;
2919
2920     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2921     {
2922         ddpme->dim = YY;
2923     }
2924     else
2925     {
2926         ddpme->dim = dimind;
2927     }
2928     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2929
2930     ddpme->nslab = (ddpme->dim == 0 ?
2931                     dd->comm->npmenodes_x :
2932                     dd->comm->npmenodes_y);
2933
2934     if (ddpme->nslab <= 1)
2935     {
2936         return;
2937     }
2938
2939     nso = dd->comm->npmenodes/ddpme->nslab;
2940     /* Determine for each PME slab the PP location range for dimension dim */
2941     snew(ddpme->pp_min, ddpme->nslab);
2942     snew(ddpme->pp_max, ddpme->nslab);
2943     for (slab = 0; slab < ddpme->nslab; slab++)
2944     {
2945         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2946         ddpme->pp_max[slab] = 0;
2947     }
2948     for (i = 0; i < dd->nnodes; i++)
2949     {
2950         ddindex2xyz(dd->nc, i, xyz);
2951         /* For y only use our y/z slab.
2952          * This assumes that the PME x grid size matches the DD grid size.
2953          */
2954         if (dimind == 0 || xyz[XX] == dd->ci[XX])
2955         {
2956             pmeindex = ddindex2pmeindex(dd, i);
2957             if (dimind == 0)
2958             {
2959                 slab = pmeindex/nso;
2960             }
2961             else
2962             {
2963                 slab = pmeindex % ddpme->nslab;
2964             }
2965             ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
2966             ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
2967         }
2968     }
2969
2970     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2971 }
2972
2973 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2974 {
2975     if (dd->comm->ddpme[0].dim == XX)
2976     {
2977         return dd->comm->ddpme[0].maxshift;
2978     }
2979     else
2980     {
2981         return 0;
2982     }
2983 }
2984
2985 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2986 {
2987     if (dd->comm->ddpme[0].dim == YY)
2988     {
2989         return dd->comm->ddpme[0].maxshift;
2990     }
2991     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2992     {
2993         return dd->comm->ddpme[1].maxshift;
2994     }
2995     else
2996     {
2997         return 0;
2998     }
2999 }
3000
3001 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
3002                              gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
3003 {
3004     gmx_domdec_comm_t *comm;
3005     int                nc, ns, s;
3006     int               *xmin, *xmax;
3007     real               range, pme_boundary;
3008     int                sh;
3009
3010     comm = dd->comm;
3011     nc   = dd->nc[ddpme->dim];
3012     ns   = ddpme->nslab;
3013
3014     if (!ddpme->dim_match)
3015     {
3016         /* PP decomposition is not along dim: the worst situation */
3017         sh = ns/2;
3018     }
3019     else if (ns <= 3 || (bUniform && ns == nc))
3020     {
3021         /* The optimal situation */
3022         sh = 1;
3023     }
3024     else
3025     {
3026         /* We need to check for all pme nodes which nodes they
3027          * could possibly need to communicate with.
3028          */
3029         xmin = ddpme->pp_min;
3030         xmax = ddpme->pp_max;
3031         /* Allow for atoms to be maximally 2/3 times the cut-off
3032          * out of their DD cell. This is a reasonable balance between
3033          * between performance and support for most charge-group/cut-off
3034          * combinations.
3035          */
3036         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
3037         /* Avoid extra communication when we are exactly at a boundary */
3038         range *= 0.999;
3039
3040         sh = 1;
3041         for (s = 0; s < ns; s++)
3042         {
3043             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
3044             pme_boundary = (real)s/ns;
3045             while (sh+1 < ns &&
3046                    ((s-(sh+1) >= 0 &&
3047                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
3048                     (s-(sh+1) <  0 &&
3049                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
3050             {
3051                 sh++;
3052             }
3053             pme_boundary = (real)(s+1)/ns;
3054             while (sh+1 < ns &&
3055                    ((s+(sh+1) <  ns &&
3056                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
3057                     (s+(sh+1) >= ns &&
3058                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
3059             {
3060                 sh++;
3061             }
3062         }
3063     }
3064
3065     ddpme->maxshift = sh;
3066
3067     if (debug)
3068     {
3069         fprintf(debug, "PME slab communication range for dim %d is %d\n",
3070                 ddpme->dim, ddpme->maxshift);
3071     }
3072 }
3073
3074 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3075 {
3076     int d, dim;
3077
3078     for (d = 0; d < dd->ndim; d++)
3079     {
3080         dim = dd->dim[d];
3081         if (dim < ddbox->nboundeddim &&
3082             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3083             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3084         {
3085             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3086                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3087                       dd->nc[dim], dd->comm->cellsize_limit);
3088         }
3089     }
3090 }
3091
3092 enum {
3093     setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
3094 };
3095
3096 /* Set the domain boundaries. Use for static (or no) load balancing,
3097  * and also for the starting state for dynamic load balancing.
3098  * setmode determine if and where the boundaries are stored, use enum above.
3099  * Returns the number communication pulses in npulse.
3100  */
3101 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
3102                                   int setmode, ivec npulse)
3103 {
3104     gmx_domdec_comm_t *comm;
3105     int                d, j;
3106     rvec               cellsize_min;
3107     real              *cell_x, cell_dx, cellsize;
3108
3109     comm = dd->comm;
3110
3111     for (d = 0; d < DIM; d++)
3112     {
3113         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3114         npulse[d]       = 1;
3115         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3116         {
3117             /* Uniform grid */
3118             cell_dx = ddbox->box_size[d]/dd->nc[d];
3119             switch (setmode)
3120             {
3121                 case setcellsizeslbMASTER:
3122                     for (j = 0; j < dd->nc[d]+1; j++)
3123                     {
3124                         dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3125                     }
3126                     break;
3127                 case setcellsizeslbLOCAL:
3128                     comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3129                     comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3130                     break;
3131                 default:
3132                     break;
3133             }
3134             cellsize = cell_dx*ddbox->skew_fac[d];
3135             while (cellsize*npulse[d] < comm->cutoff)
3136             {
3137                 npulse[d]++;
3138             }
3139             cellsize_min[d] = cellsize;
3140         }
3141         else
3142         {
3143             /* Statically load balanced grid */
3144             /* Also when we are not doing a master distribution we determine
3145              * all cell borders in a loop to obtain identical values
3146              * to the master distribution case and to determine npulse.
3147              */
3148             if (setmode == setcellsizeslbMASTER)
3149             {
3150                 cell_x = dd->ma->cell_x[d];
3151             }
3152             else
3153             {
3154                 snew(cell_x, dd->nc[d]+1);
3155             }
3156             cell_x[0] = ddbox->box0[d];
3157             for (j = 0; j < dd->nc[d]; j++)
3158             {
3159                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
3160                 cell_x[j+1] = cell_x[j] + cell_dx;
3161                 cellsize    = cell_dx*ddbox->skew_fac[d];
3162                 while (cellsize*npulse[d] < comm->cutoff &&
3163                        npulse[d] < dd->nc[d]-1)
3164                 {
3165                     npulse[d]++;
3166                 }
3167                 cellsize_min[d] = min(cellsize_min[d], cellsize);
3168             }
3169             if (setmode == setcellsizeslbLOCAL)
3170             {
3171                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3172                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3173             }
3174             if (setmode != setcellsizeslbMASTER)
3175             {
3176                 sfree(cell_x);
3177             }
3178         }
3179         /* The following limitation is to avoid that a cell would receive
3180          * some of its own home charge groups back over the periodic boundary.
3181          * Double charge groups cause trouble with the global indices.
3182          */
3183         if (d < ddbox->npbcdim &&
3184             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3185         {
3186             char error_string[STRLEN];
3187
3188             sprintf(error_string,
3189                     "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3190                     dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
3191                     comm->cutoff,
3192                     dd->nc[d], dd->nc[d],
3193                     dd->nnodes > dd->nc[d] ? "cells" : "ranks");
3194
3195             if (setmode == setcellsizeslbLOCAL)
3196             {
3197                 gmx_fatal_collective(FARGS, NULL, dd, error_string);
3198             }
3199             else
3200             {
3201                 gmx_fatal(FARGS, error_string);
3202             }
3203         }
3204     }
3205
3206     if (!comm->bDynLoadBal)
3207     {
3208         copy_rvec(cellsize_min, comm->cellsize_min);
3209     }
3210
3211     for (d = 0; d < comm->npmedecompdim; d++)
3212     {
3213         set_pme_maxshift(dd, &comm->ddpme[d],
3214                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
3215                          comm->ddpme[d].slb_dim_f);
3216     }
3217 }
3218
3219
3220 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3221                                                   int d, int dim, gmx_domdec_root_t *root,
3222                                                   gmx_ddbox_t *ddbox,
3223                                                   gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
3224 {
3225     gmx_domdec_comm_t *comm;
3226     int                ncd, i, j, nmin, nmin_old;
3227     gmx_bool           bLimLo, bLimHi;
3228     real              *cell_size;
3229     real               fac, halfway, cellsize_limit_f_i, region_size;
3230     gmx_bool           bPBC, bLastHi = FALSE;
3231     int                nrange[] = {range[0], range[1]};
3232
3233     region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
3234
3235     comm = dd->comm;
3236
3237     ncd = dd->nc[dim];
3238
3239     bPBC = (dim < ddbox->npbcdim);
3240
3241     cell_size = root->buf_ncd;
3242
3243     if (debug)
3244     {
3245         fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
3246     }
3247
3248     /* First we need to check if the scaling does not make cells
3249      * smaller than the smallest allowed size.
3250      * We need to do this iteratively, since if a cell is too small,
3251      * it needs to be enlarged, which makes all the other cells smaller,
3252      * which could in turn make another cell smaller than allowed.
3253      */
3254     for (i = range[0]; i < range[1]; i++)
3255     {
3256         root->bCellMin[i] = FALSE;
3257     }
3258     nmin = 0;
3259     do
3260     {
3261         nmin_old = nmin;
3262         /* We need the total for normalization */
3263         fac = 0;
3264         for (i = range[0]; i < range[1]; i++)
3265         {
3266             if (root->bCellMin[i] == FALSE)
3267             {
3268                 fac += cell_size[i];
3269             }
3270         }
3271         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3272         /* Determine the cell boundaries */
3273         for (i = range[0]; i < range[1]; i++)
3274         {
3275             if (root->bCellMin[i] == FALSE)
3276             {
3277                 cell_size[i] *= fac;
3278                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3279                 {
3280                     cellsize_limit_f_i = 0;
3281                 }
3282                 else
3283                 {
3284                     cellsize_limit_f_i = cellsize_limit_f;
3285                 }
3286                 if (cell_size[i] < cellsize_limit_f_i)
3287                 {
3288                     root->bCellMin[i] = TRUE;
3289                     cell_size[i]      = cellsize_limit_f_i;
3290                     nmin++;
3291                 }
3292             }
3293             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3294         }
3295     }
3296     while (nmin > nmin_old);
3297
3298     i            = range[1]-1;
3299     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3300     /* For this check we should not use DD_CELL_MARGIN,
3301      * but a slightly smaller factor,
3302      * since rounding could get use below the limit.
3303      */
3304     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3305     {
3306         char buf[22];
3307         gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3308                   gmx_step_str(step, buf),
3309                   dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3310                   ncd, comm->cellsize_min[dim]);
3311     }
3312
3313     root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
3314
3315     if (!bUniform)
3316     {
3317         /* Check if the boundary did not displace more than halfway
3318          * each of the cells it bounds, as this could cause problems,
3319          * especially when the differences between cell sizes are large.
3320          * If changes are applied, they will not make cells smaller
3321          * than the cut-off, as we check all the boundaries which
3322          * might be affected by a change and if the old state was ok,
3323          * the cells will at most be shrunk back to their old size.
3324          */
3325         for (i = range[0]+1; i < range[1]; i++)
3326         {
3327             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3328             if (root->cell_f[i] < halfway)
3329             {
3330                 root->cell_f[i] = halfway;
3331                 /* Check if the change also causes shifts of the next boundaries */
3332                 for (j = i+1; j < range[1]; j++)
3333                 {
3334                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3335                     {
3336                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3337                     }
3338                 }
3339             }
3340             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3341             if (root->cell_f[i] > halfway)
3342             {
3343                 root->cell_f[i] = halfway;
3344                 /* Check if the change also causes shifts of the next boundaries */
3345                 for (j = i-1; j >= range[0]+1; j--)
3346                 {
3347                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3348                     {
3349                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3350                     }
3351                 }
3352             }
3353         }
3354     }
3355
3356     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3357     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3358      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3359      * for a and b nrange is used */
3360     if (d > 0)
3361     {
3362         /* Take care of the staggering of the cell boundaries */
3363         if (bUniform)
3364         {
3365             for (i = range[0]; i < range[1]; i++)
3366             {
3367                 root->cell_f_max0[i] = root->cell_f[i];
3368                 root->cell_f_min1[i] = root->cell_f[i+1];
3369             }
3370         }
3371         else
3372         {
3373             for (i = range[0]+1; i < range[1]; i++)
3374             {
3375                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3376                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3377                 if (bLimLo && bLimHi)
3378                 {
3379                     /* Both limits violated, try the best we can */
3380                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3381                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3382                     nrange[0]       = range[0];
3383                     nrange[1]       = i;
3384                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3385
3386                     nrange[0] = i;
3387                     nrange[1] = range[1];
3388                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3389
3390                     return;
3391                 }
3392                 else if (bLimLo)
3393                 {
3394                     /* root->cell_f[i] = root->bound_min[i]; */
3395                     nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3396                     bLastHi   = FALSE;
3397                 }
3398                 else if (bLimHi && !bLastHi)
3399                 {
3400                     bLastHi = TRUE;
3401                     if (nrange[1] < range[1])   /* found a LimLo before */
3402                     {
3403                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3404                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3405                         nrange[0] = nrange[1];
3406                     }
3407                     root->cell_f[i] = root->bound_max[i];
3408                     nrange[1]       = i;
3409                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3410                     nrange[0] = i;
3411                     nrange[1] = range[1];
3412                 }
3413             }
3414             if (nrange[1] < range[1])   /* found last a LimLo */
3415             {
3416                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3417                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3418                 nrange[0] = nrange[1];
3419                 nrange[1] = range[1];
3420                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3421             }
3422             else if (nrange[0] > range[0]) /* found at least one LimHi */
3423             {
3424                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3425             }
3426         }
3427     }
3428 }
3429
3430
3431 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3432                                        int d, int dim, gmx_domdec_root_t *root,
3433                                        gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3434                                        gmx_bool bUniform, gmx_int64_t step)
3435 {
3436     gmx_domdec_comm_t *comm;
3437     int                ncd, d1, i, j, pos;
3438     real              *cell_size;
3439     real               load_aver, load_i, imbalance, change, change_max, sc;
3440     real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3441     real               change_limit;
3442     real               relax = 0.5;
3443     gmx_bool           bPBC;
3444     int                range[] = { 0, 0 };
3445
3446     comm = dd->comm;
3447
3448     /* Convert the maximum change from the input percentage to a fraction */
3449     change_limit = comm->dlb_scale_lim*0.01;
3450
3451     ncd = dd->nc[dim];
3452
3453     bPBC = (dim < ddbox->npbcdim);
3454
3455     cell_size = root->buf_ncd;
3456
3457     /* Store the original boundaries */
3458     for (i = 0; i < ncd+1; i++)
3459     {
3460         root->old_cell_f[i] = root->cell_f[i];
3461     }
3462     if (bUniform)
3463     {
3464         for (i = 0; i < ncd; i++)
3465         {
3466             cell_size[i] = 1.0/ncd;
3467         }
3468     }
3469     else if (dd_load_count(comm))
3470     {
3471         load_aver  = comm->load[d].sum_m/ncd;
3472         change_max = 0;
3473         for (i = 0; i < ncd; i++)
3474         {
3475             /* Determine the relative imbalance of cell i */
3476             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3477             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3478             /* Determine the change of the cell size using underrelaxation */
3479             change     = -relax*imbalance;
3480             change_max = max(change_max, max(change, -change));
3481         }
3482         /* Limit the amount of scaling.
3483          * We need to use the same rescaling for all cells in one row,
3484          * otherwise the load balancing might not converge.
3485          */
3486         sc = relax;
3487         if (change_max > change_limit)
3488         {
3489             sc *= change_limit/change_max;
3490         }
3491         for (i = 0; i < ncd; i++)
3492         {
3493             /* Determine the relative imbalance of cell i */
3494             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3495             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3496             /* Determine the change of the cell size using underrelaxation */
3497             change       = -sc*imbalance;
3498             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3499         }
3500     }
3501
3502     cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3503     cellsize_limit_f *= DD_CELL_MARGIN;
3504     dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3505     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3506     if (ddbox->tric_dir[dim])
3507     {
3508         cellsize_limit_f /= ddbox->skew_fac[dim];
3509         dist_min_f       /= ddbox->skew_fac[dim];
3510     }
3511     if (bDynamicBox && d > 0)
3512     {
3513         dist_min_f *= DD_PRES_SCALE_MARGIN;
3514     }
3515     if (d > 0 && !bUniform)
3516     {
3517         /* Make sure that the grid is not shifted too much */
3518         for (i = 1; i < ncd; i++)
3519         {
3520             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3521             {
3522                 gmx_incons("Inconsistent DD boundary staggering limits!");
3523             }
3524             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3525             space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3526             if (space > 0)
3527             {
3528                 root->bound_min[i] += 0.5*space;
3529             }
3530             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3531             space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3532             if (space < 0)
3533             {
3534                 root->bound_max[i] += 0.5*space;
3535             }
3536             if (debug)
3537             {
3538                 fprintf(debug,
3539                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3540                         d, i,
3541                         root->cell_f_max0[i-1] + dist_min_f,
3542                         root->bound_min[i], root->cell_f[i], root->bound_max[i],
3543                         root->cell_f_min1[i] - dist_min_f);
3544             }
3545         }
3546     }
3547     range[1]          = ncd;
3548     root->cell_f[0]   = 0;
3549     root->cell_f[ncd] = 1;
3550     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3551
3552
3553     /* After the checks above, the cells should obey the cut-off
3554      * restrictions, but it does not hurt to check.
3555      */
3556     for (i = 0; i < ncd; i++)
3557     {
3558         if (debug)
3559         {
3560             fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
3561                     dim, i, root->cell_f[i], root->cell_f[i+1]);
3562         }
3563
3564         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3565             root->cell_f[i+1] - root->cell_f[i] <
3566             cellsize_limit_f/DD_CELL_MARGIN)
3567         {
3568             char buf[22];
3569             fprintf(stderr,
3570                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3571                     gmx_step_str(step, buf), dim2char(dim), i,
3572                     (root->cell_f[i+1] - root->cell_f[i])
3573                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3574         }
3575     }
3576
3577     pos = ncd + 1;
3578     /* Store the cell boundaries of the lower dimensions at the end */
3579     for (d1 = 0; d1 < d; d1++)
3580     {
3581         root->cell_f[pos++] = comm->cell_f0[d1];
3582         root->cell_f[pos++] = comm->cell_f1[d1];
3583     }
3584
3585     if (d < comm->npmedecompdim)
3586     {
3587         /* The master determines the maximum shift for
3588          * the coordinate communication between separate PME nodes.
3589          */
3590         set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3591     }
3592     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3593     if (d >= 1)
3594     {
3595         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3596     }
3597 }
3598
3599 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3600                                              gmx_ddbox_t *ddbox, int dimind)
3601 {
3602     gmx_domdec_comm_t *comm;
3603     int                dim;
3604
3605     comm = dd->comm;
3606
3607     /* Set the cell dimensions */
3608     dim                = dd->dim[dimind];
3609     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3610     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3611     if (dim >= ddbox->nboundeddim)
3612     {
3613         comm->cell_x0[dim] += ddbox->box0[dim];
3614         comm->cell_x1[dim] += ddbox->box0[dim];
3615     }
3616 }
3617
3618 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3619                                          int d, int dim, real *cell_f_row,
3620                                          gmx_ddbox_t *ddbox)
3621 {
3622     gmx_domdec_comm_t *comm;
3623     int                d1, dim1, pos;
3624
3625     comm = dd->comm;
3626
3627 #ifdef GMX_MPI
3628     /* Each node would only need to know two fractions,
3629      * but it is probably cheaper to broadcast the whole array.
3630      */
3631     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3632               0, comm->mpi_comm_load[d]);
3633 #endif
3634     /* Copy the fractions for this dimension from the buffer */
3635     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3636     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3637     /* The whole array was communicated, so set the buffer position */
3638     pos = dd->nc[dim] + 1;
3639     for (d1 = 0; d1 <= d; d1++)
3640     {
3641         if (d1 < d)
3642         {
3643             /* Copy the cell fractions of the lower dimensions */
3644             comm->cell_f0[d1] = cell_f_row[pos++];
3645             comm->cell_f1[d1] = cell_f_row[pos++];
3646         }
3647         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3648     }
3649     /* Convert the communicated shift from float to int */
3650     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3651     if (d >= 1)
3652     {
3653         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3654     }
3655 }
3656
3657 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3658                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3659                                          gmx_bool bUniform, gmx_int64_t step)
3660 {
3661     gmx_domdec_comm_t *comm;
3662     int                d, dim, d1;
3663     gmx_bool           bRowMember, bRowRoot;
3664     real              *cell_f_row;
3665
3666     comm = dd->comm;
3667
3668     for (d = 0; d < dd->ndim; d++)
3669     {
3670         dim        = dd->dim[d];
3671         bRowMember = TRUE;
3672         bRowRoot   = TRUE;
3673         for (d1 = d; d1 < dd->ndim; d1++)
3674         {
3675             if (dd->ci[dd->dim[d1]] > 0)
3676             {
3677                 if (d1 != d)
3678                 {
3679                     bRowMember = FALSE;
3680                 }
3681                 bRowRoot = FALSE;
3682             }
3683         }
3684         if (bRowMember)
3685         {
3686             if (bRowRoot)
3687             {
3688                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3689                                            ddbox, bDynamicBox, bUniform, step);
3690                 cell_f_row = comm->root[d]->cell_f;
3691             }
3692             else
3693             {
3694                 cell_f_row = comm->cell_f_row;
3695             }
3696             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3697         }
3698     }
3699 }
3700
3701 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3702 {
3703     int d;
3704
3705     /* This function assumes the box is static and should therefore
3706      * not be called when the box has changed since the last
3707      * call to dd_partition_system.
3708      */
3709     for (d = 0; d < dd->ndim; d++)
3710     {
3711         relative_to_absolute_cell_bounds(dd, ddbox, d);
3712     }
3713 }
3714
3715
3716
3717 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3718                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3719                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3720                                   gmx_wallcycle_t wcycle)
3721 {
3722     gmx_domdec_comm_t *comm;
3723     int                dim;
3724
3725     comm = dd->comm;
3726
3727     if (bDoDLB)
3728     {
3729         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3730         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3731         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3732     }
3733     else if (bDynamicBox)
3734     {
3735         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3736     }
3737
3738     /* Set the dimensions for which no DD is used */
3739     for (dim = 0; dim < DIM; dim++)
3740     {
3741         if (dd->nc[dim] == 1)
3742         {
3743             comm->cell_x0[dim] = 0;
3744             comm->cell_x1[dim] = ddbox->box_size[dim];
3745             if (dim >= ddbox->nboundeddim)
3746             {
3747                 comm->cell_x0[dim] += ddbox->box0[dim];
3748                 comm->cell_x1[dim] += ddbox->box0[dim];
3749             }
3750         }
3751     }
3752 }
3753
3754 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3755 {
3756     int                    d, np, i;
3757     gmx_domdec_comm_dim_t *cd;
3758
3759     for (d = 0; d < dd->ndim; d++)
3760     {
3761         cd = &dd->comm->cd[d];
3762         np = npulse[dd->dim[d]];
3763         if (np > cd->np_nalloc)
3764         {
3765             if (debug)
3766             {
3767                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3768                         dim2char(dd->dim[d]), np);
3769             }
3770             if (DDMASTER(dd) && cd->np_nalloc > 0)
3771             {
3772                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3773             }
3774             srenew(cd->ind, np);
3775             for (i = cd->np_nalloc; i < np; i++)
3776             {
3777                 cd->ind[i].index  = NULL;
3778                 cd->ind[i].nalloc = 0;
3779             }
3780             cd->np_nalloc = np;
3781         }
3782         cd->np = np;
3783     }
3784 }
3785
3786
3787 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3788                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3789                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3790                               gmx_wallcycle_t wcycle)
3791 {
3792     gmx_domdec_comm_t *comm;
3793     int                d;
3794     ivec               npulse;
3795
3796     comm = dd->comm;
3797
3798     /* Copy the old cell boundaries for the cg displacement check */
3799     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3800     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3801
3802     if (comm->bDynLoadBal)
3803     {
3804         if (DDMASTER(dd))
3805         {
3806             check_box_size(dd, ddbox);
3807         }
3808         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3809     }
3810     else
3811     {
3812         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3813         realloc_comm_ind(dd, npulse);
3814     }
3815
3816     if (debug)
3817     {
3818         for (d = 0; d < DIM; d++)
3819         {
3820             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3821                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3822         }
3823     }
3824 }
3825
3826 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3827                                   gmx_ddbox_t *ddbox,
3828                                   rvec cell_ns_x0, rvec cell_ns_x1,
3829                                   gmx_int64_t step)
3830 {
3831     gmx_domdec_comm_t *comm;
3832     int                dim_ind, dim;
3833
3834     comm = dd->comm;
3835
3836     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3837     {
3838         dim = dd->dim[dim_ind];
3839
3840         /* Without PBC we don't have restrictions on the outer cells */
3841         if (!(dim >= ddbox->npbcdim &&
3842               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3843             comm->bDynLoadBal &&
3844             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3845             comm->cellsize_min[dim])
3846         {
3847             char buf[22];
3848             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3849                       gmx_step_str(step, buf), dim2char(dim),
3850                       comm->cell_x1[dim] - comm->cell_x0[dim],
3851                       ddbox->skew_fac[dim],
3852                       dd->comm->cellsize_min[dim],
3853                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3854         }
3855     }
3856
3857     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3858     {
3859         /* Communicate the boundaries and update cell_ns_x0/1 */
3860         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3861         if (dd->bGridJump && dd->ndim > 1)
3862         {
3863             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3864         }
3865     }
3866 }
3867
3868 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3869 {
3870     if (YY < npbcdim)
3871     {
3872         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3873     }
3874     else
3875     {
3876         tcm[YY][XX] = 0;
3877     }
3878     if (ZZ < npbcdim)
3879     {
3880         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3881         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3882     }
3883     else
3884     {
3885         tcm[ZZ][XX] = 0;
3886         tcm[ZZ][YY] = 0;
3887     }
3888 }
3889
3890 static void check_screw_box(matrix box)
3891 {
3892     /* Mathematical limitation */
3893     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3894     {
3895         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3896     }
3897
3898     /* Limitation due to the asymmetry of the eighth shell method */
3899     if (box[ZZ][YY] != 0)
3900     {
3901         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3902     }
3903 }
3904
3905 static void distribute_cg(FILE *fplog, gmx_int64_t step,
3906                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3907                           gmx_domdec_t *dd)
3908 {
3909     gmx_domdec_master_t *ma;
3910     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3911     int                  i, icg, j, k, k0, k1, d, npbcdim;
3912     matrix               tcm;
3913     rvec                 box_size, cg_cm;
3914     ivec                 ind;
3915     real                 nrcg, inv_ncg, pos_d;
3916     atom_id             *cgindex;
3917     gmx_bool             bUnbounded, bScrew;
3918
3919     ma = dd->ma;
3920
3921     if (tmp_ind == NULL)
3922     {
3923         snew(tmp_nalloc, dd->nnodes);
3924         snew(tmp_ind, dd->nnodes);
3925         for (i = 0; i < dd->nnodes; i++)
3926         {
3927             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3928             snew(tmp_ind[i], tmp_nalloc[i]);
3929         }
3930     }
3931
3932     /* Clear the count */
3933     for (i = 0; i < dd->nnodes; i++)
3934     {
3935         ma->ncg[i] = 0;
3936         ma->nat[i] = 0;
3937     }
3938
3939     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3940
3941     cgindex = cgs->index;
3942
3943     /* Compute the center of geometry for all charge groups */
3944     for (icg = 0; icg < cgs->nr; icg++)
3945     {
3946         k0      = cgindex[icg];
3947         k1      = cgindex[icg+1];
3948         nrcg    = k1 - k0;
3949         if (nrcg == 1)
3950         {
3951             copy_rvec(pos[k0], cg_cm);
3952         }
3953         else
3954         {
3955             inv_ncg = 1.0/nrcg;
3956
3957             clear_rvec(cg_cm);
3958             for (k = k0; (k < k1); k++)
3959             {
3960                 rvec_inc(cg_cm, pos[k]);
3961             }
3962             for (d = 0; (d < DIM); d++)
3963             {
3964                 cg_cm[d] *= inv_ncg;
3965             }
3966         }
3967         /* Put the charge group in the box and determine the cell index */
3968         for (d = DIM-1; d >= 0; d--)
3969         {
3970             pos_d = cg_cm[d];
3971             if (d < dd->npbcdim)
3972             {
3973                 bScrew = (dd->bScrewPBC && d == XX);
3974                 if (tric_dir[d] && dd->nc[d] > 1)
3975                 {
3976                     /* Use triclinic coordintates for this dimension */
3977                     for (j = d+1; j < DIM; j++)
3978                     {
3979                         pos_d += cg_cm[j]*tcm[j][d];
3980                     }
3981                 }
3982                 while (pos_d >= box[d][d])
3983                 {
3984                     pos_d -= box[d][d];
3985                     rvec_dec(cg_cm, box[d]);
3986                     if (bScrew)
3987                     {
3988                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3989                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3990                     }
3991                     for (k = k0; (k < k1); k++)
3992                     {
3993                         rvec_dec(pos[k], box[d]);
3994                         if (bScrew)
3995                         {
3996                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3997                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3998                         }
3999                     }
4000                 }
4001                 while (pos_d < 0)
4002                 {
4003                     pos_d += box[d][d];
4004                     rvec_inc(cg_cm, box[d]);
4005                     if (bScrew)
4006                     {
4007                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
4008                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
4009                     }
4010                     for (k = k0; (k < k1); k++)
4011                     {
4012                         rvec_inc(pos[k], box[d]);
4013                         if (bScrew)
4014                         {
4015                             pos[k][YY] = box[YY][YY] - pos[k][YY];
4016                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
4017                         }
4018                     }
4019                 }
4020             }
4021             /* This could be done more efficiently */
4022             ind[d] = 0;
4023             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
4024             {
4025                 ind[d]++;
4026             }
4027         }
4028         i = dd_index(dd->nc, ind);
4029         if (ma->ncg[i] == tmp_nalloc[i])
4030         {
4031             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
4032             srenew(tmp_ind[i], tmp_nalloc[i]);
4033         }
4034         tmp_ind[i][ma->ncg[i]] = icg;
4035         ma->ncg[i]++;
4036         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
4037     }
4038
4039     k1 = 0;
4040     for (i = 0; i < dd->nnodes; i++)
4041     {
4042         ma->index[i] = k1;
4043         for (k = 0; k < ma->ncg[i]; k++)
4044         {
4045             ma->cg[k1++] = tmp_ind[i][k];
4046         }
4047     }
4048     ma->index[dd->nnodes] = k1;
4049
4050     for (i = 0; i < dd->nnodes; i++)
4051     {
4052         sfree(tmp_ind[i]);
4053     }
4054     sfree(tmp_ind);
4055     sfree(tmp_nalloc);
4056
4057     if (fplog)
4058     {
4059         char buf[22];
4060         fprintf(fplog, "Charge group distribution at step %s:",
4061                 gmx_step_str(step, buf));
4062         for (i = 0; i < dd->nnodes; i++)
4063         {
4064             fprintf(fplog, " %d", ma->ncg[i]);
4065         }
4066         fprintf(fplog, "\n");
4067     }
4068 }
4069
4070 static void get_cg_distribution(FILE *fplog, gmx_int64_t step, gmx_domdec_t *dd,
4071                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
4072                                 rvec pos[])
4073 {
4074     gmx_domdec_master_t *ma = NULL;
4075     ivec                 npulse;
4076     int                  i, cg_gl;
4077     int                 *ibuf, buf2[2] = { 0, 0 };
4078     gmx_bool             bMaster = DDMASTER(dd);
4079
4080     if (bMaster)
4081     {
4082         ma = dd->ma;
4083
4084         if (dd->bScrewPBC)
4085         {
4086             check_screw_box(box);
4087         }
4088
4089         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
4090
4091         distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
4092         for (i = 0; i < dd->nnodes; i++)
4093         {
4094             ma->ibuf[2*i]   = ma->ncg[i];
4095             ma->ibuf[2*i+1] = ma->nat[i];
4096         }
4097         ibuf = ma->ibuf;
4098     }
4099     else
4100     {
4101         ibuf = NULL;
4102     }
4103     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
4104
4105     dd->ncg_home = buf2[0];
4106     dd->nat_home = buf2[1];
4107     dd->ncg_tot  = dd->ncg_home;
4108     dd->nat_tot  = dd->nat_home;
4109     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4110     {
4111         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4112         srenew(dd->index_gl, dd->cg_nalloc);
4113         srenew(dd->cgindex, dd->cg_nalloc+1);
4114     }
4115     if (bMaster)
4116     {
4117         for (i = 0; i < dd->nnodes; i++)
4118         {
4119             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
4120             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4121         }
4122     }
4123
4124     dd_scatterv(dd,
4125                 DDMASTER(dd) ? ma->ibuf : NULL,
4126                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
4127                 DDMASTER(dd) ? ma->cg : NULL,
4128                 dd->ncg_home*sizeof(int), dd->index_gl);
4129
4130     /* Determine the home charge group sizes */
4131     dd->cgindex[0] = 0;
4132     for (i = 0; i < dd->ncg_home; i++)
4133     {
4134         cg_gl            = dd->index_gl[i];
4135         dd->cgindex[i+1] =
4136             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4137     }
4138
4139     if (debug)
4140     {
4141         fprintf(debug, "Home charge groups:\n");
4142         for (i = 0; i < dd->ncg_home; i++)
4143         {
4144             fprintf(debug, " %d", dd->index_gl[i]);
4145             if (i % 10 == 9)
4146             {
4147                 fprintf(debug, "\n");
4148             }
4149         }
4150         fprintf(debug, "\n");
4151     }
4152 }
4153
4154 static int compact_and_copy_vec_at(int ncg, int *move,
4155                                    int *cgindex,
4156                                    int nvec, int vec,
4157                                    rvec *src, gmx_domdec_comm_t *comm,
4158                                    gmx_bool bCompact)
4159 {
4160     int m, icg, i, i0, i1, nrcg;
4161     int home_pos;
4162     int pos_vec[DIM*2];
4163
4164     home_pos = 0;
4165
4166     for (m = 0; m < DIM*2; m++)
4167     {
4168         pos_vec[m] = 0;
4169     }
4170
4171     i0 = 0;
4172     for (icg = 0; icg < ncg; icg++)
4173     {
4174         i1 = cgindex[icg+1];
4175         m  = move[icg];
4176         if (m == -1)
4177         {
4178             if (bCompact)
4179             {
4180                 /* Compact the home array in place */
4181                 for (i = i0; i < i1; i++)
4182                 {
4183                     copy_rvec(src[i], src[home_pos++]);
4184                 }
4185             }
4186         }
4187         else
4188         {
4189             /* Copy to the communication buffer */
4190             nrcg        = i1 - i0;
4191             pos_vec[m] += 1 + vec*nrcg;
4192             for (i = i0; i < i1; i++)
4193             {
4194                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
4195             }
4196             pos_vec[m] += (nvec - vec - 1)*nrcg;
4197         }
4198         if (!bCompact)
4199         {
4200             home_pos += i1 - i0;
4201         }
4202         i0 = i1;
4203     }
4204
4205     return home_pos;
4206 }
4207
4208 static int compact_and_copy_vec_cg(int ncg, int *move,
4209                                    int *cgindex,
4210                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
4211                                    gmx_bool bCompact)
4212 {
4213     int m, icg, i0, i1, nrcg;
4214     int home_pos;
4215     int pos_vec[DIM*2];
4216
4217     home_pos = 0;
4218
4219     for (m = 0; m < DIM*2; m++)
4220     {
4221         pos_vec[m] = 0;
4222     }
4223
4224     i0 = 0;
4225     for (icg = 0; icg < ncg; icg++)
4226     {
4227         i1 = cgindex[icg+1];
4228         m  = move[icg];
4229         if (m == -1)
4230         {
4231             if (bCompact)
4232             {
4233                 /* Compact the home array in place */
4234                 copy_rvec(src[icg], src[home_pos++]);
4235             }
4236         }
4237         else
4238         {
4239             nrcg = i1 - i0;
4240             /* Copy to the communication buffer */
4241             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
4242             pos_vec[m] += 1 + nrcg*nvec;
4243         }
4244         i0 = i1;
4245     }
4246     if (!bCompact)
4247     {
4248         home_pos = ncg;
4249     }
4250
4251     return home_pos;
4252 }
4253
4254 static int compact_ind(int ncg, int *move,
4255                        int *index_gl, int *cgindex,
4256                        int *gatindex,
4257                        gmx_ga2la_t ga2la, char *bLocalCG,
4258                        int *cginfo)
4259 {
4260     int cg, nat, a0, a1, a, a_gl;
4261     int home_pos;
4262
4263     home_pos = 0;
4264     nat      = 0;
4265     for (cg = 0; cg < ncg; cg++)
4266     {
4267         a0 = cgindex[cg];
4268         a1 = cgindex[cg+1];
4269         if (move[cg] == -1)
4270         {
4271             /* Compact the home arrays in place.
4272              * Anything that can be done here avoids access to global arrays.
4273              */
4274             cgindex[home_pos] = nat;
4275             for (a = a0; a < a1; a++)
4276             {
4277                 a_gl          = gatindex[a];
4278                 gatindex[nat] = a_gl;
4279                 /* The cell number stays 0, so we don't need to set it */
4280                 ga2la_change_la(ga2la, a_gl, nat);
4281                 nat++;
4282             }
4283             index_gl[home_pos] = index_gl[cg];
4284             cginfo[home_pos]   = cginfo[cg];
4285             /* The charge group remains local, so bLocalCG does not change */
4286             home_pos++;
4287         }
4288         else
4289         {
4290             /* Clear the global indices */
4291             for (a = a0; a < a1; a++)
4292             {
4293                 ga2la_del(ga2la, gatindex[a]);
4294             }
4295             if (bLocalCG)
4296             {
4297                 bLocalCG[index_gl[cg]] = FALSE;
4298             }
4299         }
4300     }
4301     cgindex[home_pos] = nat;
4302
4303     return home_pos;
4304 }
4305
4306 static void clear_and_mark_ind(int ncg, int *move,
4307                                int *index_gl, int *cgindex, int *gatindex,
4308                                gmx_ga2la_t ga2la, char *bLocalCG,
4309                                int *cell_index)
4310 {
4311     int cg, a0, a1, a;
4312
4313     for (cg = 0; cg < ncg; cg++)
4314     {
4315         if (move[cg] >= 0)
4316         {
4317             a0 = cgindex[cg];
4318             a1 = cgindex[cg+1];
4319             /* Clear the global indices */
4320             for (a = a0; a < a1; a++)
4321             {
4322                 ga2la_del(ga2la, gatindex[a]);
4323             }
4324             if (bLocalCG)
4325             {
4326                 bLocalCG[index_gl[cg]] = FALSE;
4327             }
4328             /* Signal that this cg has moved using the ns cell index.
4329              * Here we set it to -1. fill_grid will change it
4330              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4331              */
4332             cell_index[cg] = -1;
4333         }
4334     }
4335 }
4336
4337 static void print_cg_move(FILE *fplog,
4338                           gmx_domdec_t *dd,
4339                           gmx_int64_t step, int cg, int dim, int dir,
4340                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4341                           rvec cm_old, rvec cm_new, real pos_d)
4342 {
4343     gmx_domdec_comm_t *comm;
4344     char               buf[22];
4345
4346     comm = dd->comm;
4347
4348     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4349     if (bHaveLimitdAndCMOld)
4350     {
4351         fprintf(fplog, "The charge group starting at atom %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4352                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4353     }
4354     else
4355     {
4356         fprintf(fplog, "The charge group starting at atom %d moved than the distance allowed by the domain decomposition in direction %c\n",
4357                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4358     }
4359     fprintf(fplog, "distance out of cell %f\n",
4360             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4361     if (bHaveLimitdAndCMOld)
4362     {
4363         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4364                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4365     }
4366     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4367             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4368     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4369             dim2char(dim),
4370             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4371     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4372             dim2char(dim),
4373             comm->cell_x0[dim], comm->cell_x1[dim]);
4374 }
4375
4376 static void cg_move_error(FILE *fplog,
4377                           gmx_domdec_t *dd,
4378                           gmx_int64_t step, int cg, int dim, int dir,
4379                           gmx_bool bHaveLimitdAndCMOld, real limitd,
4380                           rvec cm_old, rvec cm_new, real pos_d)
4381 {
4382     if (fplog)
4383     {
4384         print_cg_move(fplog, dd, step, cg, dim, dir,
4385                       bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4386     }
4387     print_cg_move(stderr, dd, step, cg, dim, dir,
4388                   bHaveLimitdAndCMOld, limitd, cm_old, cm_new, pos_d);
4389     gmx_fatal(FARGS,
4390               "A charge group moved too far between two domain decomposition steps\n"
4391               "This usually means that your system is not well equilibrated");
4392 }
4393
4394 static void rotate_state_atom(t_state *state, int a)
4395 {
4396     int est;
4397
4398     for (est = 0; est < estNR; est++)
4399     {
4400         if (EST_DISTR(est) && (state->flags & (1<<est)))
4401         {
4402             switch (est)
4403             {
4404                 case estX:
4405                     /* Rotate the complete state; for a rectangular box only */
4406                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4407                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4408                     break;
4409                 case estV:
4410                     state->v[a][YY] = -state->v[a][YY];
4411                     state->v[a][ZZ] = -state->v[a][ZZ];
4412                     break;
4413                 case estSDX:
4414                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4415                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4416                     break;
4417                 case estCGP:
4418                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4419                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4420                     break;
4421                 case estDISRE_INITF:
4422                 case estDISRE_RM3TAV:
4423                 case estORIRE_INITF:
4424                 case estORIRE_DTAV:
4425                     /* These are distances, so not affected by rotation */
4426                     break;
4427                 default:
4428                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4429             }
4430         }
4431     }
4432 }
4433
4434 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4435 {
4436     if (natoms > comm->moved_nalloc)
4437     {
4438         /* Contents should be preserved here */
4439         comm->moved_nalloc = over_alloc_dd(natoms);
4440         srenew(comm->moved, comm->moved_nalloc);
4441     }
4442
4443     return comm->moved;
4444 }
4445
4446 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4447                          gmx_domdec_t *dd,
4448                          t_state *state,
4449                          ivec tric_dir, matrix tcm,
4450                          rvec cell_x0, rvec cell_x1,
4451                          rvec limitd, rvec limit0, rvec limit1,
4452                          const int *cgindex,
4453                          int cg_start, int cg_end,
4454                          rvec *cg_cm,
4455                          int *move)
4456 {
4457     int      npbcdim;
4458     int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4459     int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4460     int      flag;
4461     gmx_bool bScrew;
4462     ivec     dev;
4463     real     inv_ncg, pos_d;
4464     rvec     cm_new;
4465
4466     npbcdim = dd->npbcdim;
4467
4468     for (cg = cg_start; cg < cg_end; cg++)
4469     {
4470         k0   = cgindex[cg];
4471         k1   = cgindex[cg+1];
4472         nrcg = k1 - k0;
4473         if (nrcg == 1)
4474         {
4475             copy_rvec(state->x[k0], cm_new);
4476         }
4477         else
4478         {
4479             inv_ncg = 1.0/nrcg;
4480
4481             clear_rvec(cm_new);
4482             for (k = k0; (k < k1); k++)
4483             {
4484                 rvec_inc(cm_new, state->x[k]);
4485             }
4486             for (d = 0; (d < DIM); d++)
4487             {
4488                 cm_new[d] = inv_ncg*cm_new[d];
4489             }
4490         }
4491
4492         clear_ivec(dev);
4493         /* Do pbc and check DD cell boundary crossings */
4494         for (d = DIM-1; d >= 0; d--)
4495         {
4496             if (dd->nc[d] > 1)
4497             {
4498                 bScrew = (dd->bScrewPBC && d == XX);
4499                 /* Determine the location of this cg in lattice coordinates */
4500                 pos_d = cm_new[d];
4501                 if (tric_dir[d])
4502                 {
4503                     for (d2 = d+1; d2 < DIM; d2++)
4504                     {
4505                         pos_d += cm_new[d2]*tcm[d2][d];
4506                     }
4507                 }
4508                 /* Put the charge group in the triclinic unit-cell */
4509                 if (pos_d >= cell_x1[d])
4510                 {
4511                     if (pos_d >= limit1[d])
4512                     {
4513                         cg_move_error(fplog, dd, step, cg, d, 1, TRUE, limitd[d],
4514                                       cg_cm[cg], cm_new, pos_d);
4515                     }
4516                     dev[d] = 1;
4517                     if (dd->ci[d] == dd->nc[d] - 1)
4518                     {
4519                         rvec_dec(cm_new, state->box[d]);
4520                         if (bScrew)
4521                         {
4522                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4523                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4524                         }
4525                         for (k = k0; (k < k1); k++)
4526                         {
4527                             rvec_dec(state->x[k], state->box[d]);
4528                             if (bScrew)
4529                             {
4530                                 rotate_state_atom(state, k);
4531                             }
4532                         }
4533                     }
4534                 }
4535                 else if (pos_d < cell_x0[d])
4536                 {
4537                     if (pos_d < limit0[d])
4538                     {
4539                         cg_move_error(fplog, dd, step, cg, d, -1, TRUE, limitd[d],
4540                                       cg_cm[cg], cm_new, pos_d);
4541                     }
4542                     dev[d] = -1;
4543                     if (dd->ci[d] == 0)
4544                     {
4545                         rvec_inc(cm_new, state->box[d]);
4546                         if (bScrew)
4547                         {
4548                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4549                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4550                         }
4551                         for (k = k0; (k < k1); k++)
4552                         {
4553                             rvec_inc(state->x[k], state->box[d]);
4554                             if (bScrew)
4555                             {
4556                                 rotate_state_atom(state, k);
4557                             }
4558                         }
4559                     }
4560                 }
4561             }
4562             else if (d < npbcdim)
4563             {
4564                 /* Put the charge group in the rectangular unit-cell */
4565                 while (cm_new[d] >= state->box[d][d])
4566                 {
4567                     rvec_dec(cm_new, state->box[d]);
4568                     for (k = k0; (k < k1); k++)
4569                     {
4570                         rvec_dec(state->x[k], state->box[d]);
4571                     }
4572                 }
4573                 while (cm_new[d] < 0)
4574                 {
4575                     rvec_inc(cm_new, state->box[d]);
4576                     for (k = k0; (k < k1); k++)
4577                     {
4578                         rvec_inc(state->x[k], state->box[d]);
4579                     }
4580                 }
4581             }
4582         }
4583
4584         copy_rvec(cm_new, cg_cm[cg]);
4585
4586         /* Determine where this cg should go */
4587         flag = 0;
4588         mc   = -1;
4589         for (d = 0; d < dd->ndim; d++)
4590         {
4591             dim = dd->dim[d];
4592             if (dev[dim] == 1)
4593             {
4594                 flag |= DD_FLAG_FW(d);
4595                 if (mc == -1)
4596                 {
4597                     mc = d*2;
4598                 }
4599             }
4600             else if (dev[dim] == -1)
4601             {
4602                 flag |= DD_FLAG_BW(d);
4603                 if (mc == -1)
4604                 {
4605                     if (dd->nc[dim] > 2)
4606                     {
4607                         mc = d*2 + 1;
4608                     }
4609                     else
4610                     {
4611                         mc = d*2;
4612                     }
4613                 }
4614             }
4615         }
4616         /* Temporarily store the flag in move */
4617         move[cg] = mc + flag;
4618     }
4619 }
4620
4621 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4622                                gmx_domdec_t *dd, ivec tric_dir,
4623                                t_state *state, rvec **f,
4624                                t_forcerec *fr,
4625                                gmx_bool bCompact,
4626                                t_nrnb *nrnb,
4627                                int *ncg_stay_home,
4628                                int *ncg_moved)
4629 {
4630     int               *move;
4631     int                npbcdim;
4632     int                ncg[DIM*2], nat[DIM*2];
4633     int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4634     int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4635     int                sbuf[2], rbuf[2];
4636     int                home_pos_cg, home_pos_at, buf_pos;
4637     int                flag;
4638     gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4639     gmx_bool           bScrew;
4640     ivec               dev;
4641     real               inv_ncg, pos_d;
4642     matrix             tcm;
4643     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
4644     atom_id           *cgindex;
4645     cginfo_mb_t       *cginfo_mb;
4646     gmx_domdec_comm_t *comm;
4647     int               *moved;
4648     int                nthread, thread;
4649
4650     if (dd->bScrewPBC)
4651     {
4652         check_screw_box(state->box);
4653     }
4654
4655     comm  = dd->comm;
4656     if (fr->cutoff_scheme == ecutsGROUP)
4657     {
4658         cg_cm = fr->cg_cm;
4659     }
4660
4661     for (i = 0; i < estNR; i++)
4662     {
4663         if (EST_DISTR(i))
4664         {
4665             switch (i)
4666             {
4667                 case estX: /* Always present */ break;
4668                 case estV:   bV   = (state->flags & (1<<i)); break;
4669                 case estSDX: bSDX = (state->flags & (1<<i)); break;
4670                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4671                 case estLD_RNG:
4672                 case estLD_RNGI:
4673                 case estDISRE_INITF:
4674                 case estDISRE_RM3TAV:
4675                 case estORIRE_INITF:
4676                 case estORIRE_DTAV:
4677                     /* No processing required */
4678                     break;
4679                 default:
4680                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4681             }
4682         }
4683     }
4684
4685     if (dd->ncg_tot > comm->nalloc_int)
4686     {
4687         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4688         srenew(comm->buf_int, comm->nalloc_int);
4689     }
4690     move = comm->buf_int;
4691
4692     /* Clear the count */
4693     for (c = 0; c < dd->ndim*2; c++)
4694     {
4695         ncg[c] = 0;
4696         nat[c] = 0;
4697     }
4698
4699     npbcdim = dd->npbcdim;
4700
4701     for (d = 0; (d < DIM); d++)
4702     {
4703         limitd[d] = dd->comm->cellsize_min[d];
4704         if (d >= npbcdim && dd->ci[d] == 0)
4705         {
4706             cell_x0[d] = -GMX_FLOAT_MAX;
4707         }
4708         else
4709         {
4710             cell_x0[d] = comm->cell_x0[d];
4711         }
4712         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4713         {
4714             cell_x1[d] = GMX_FLOAT_MAX;
4715         }
4716         else
4717         {
4718             cell_x1[d] = comm->cell_x1[d];
4719         }
4720         if (d < npbcdim)
4721         {
4722             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4723             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4724         }
4725         else
4726         {
4727             /* We check after communication if a charge group moved
4728              * more than one cell. Set the pre-comm check limit to float_max.
4729              */
4730             limit0[d] = -GMX_FLOAT_MAX;
4731             limit1[d] =  GMX_FLOAT_MAX;
4732         }
4733     }
4734
4735     make_tric_corr_matrix(npbcdim, state->box, tcm);
4736
4737     cgindex = dd->cgindex;
4738
4739     nthread = gmx_omp_nthreads_get(emntDomdec);
4740
4741     /* Compute the center of geometry for all home charge groups
4742      * and put them in the box and determine where they should go.
4743      */
4744 #pragma omp parallel for num_threads(nthread) schedule(static)
4745     for (thread = 0; thread < nthread; thread++)
4746     {
4747         calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4748                      cell_x0, cell_x1, limitd, limit0, limit1,
4749                      cgindex,
4750                      ( thread   *dd->ncg_home)/nthread,
4751                      ((thread+1)*dd->ncg_home)/nthread,
4752                      fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4753                      move);
4754     }
4755
4756     for (cg = 0; cg < dd->ncg_home; cg++)
4757     {
4758         if (move[cg] >= 0)
4759         {
4760             mc       = move[cg];
4761             flag     = mc & ~DD_FLAG_NRCG;
4762             mc       = mc & DD_FLAG_NRCG;
4763             move[cg] = mc;
4764
4765             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4766             {
4767                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4768                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4769             }
4770             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4771             /* We store the cg size in the lower 16 bits
4772              * and the place where the charge group should go
4773              * in the next 6 bits. This saves some communication volume.
4774              */
4775             nrcg = cgindex[cg+1] - cgindex[cg];
4776             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4777             ncg[mc] += 1;
4778             nat[mc] += nrcg;
4779         }
4780     }
4781
4782     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4783     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4784
4785     *ncg_moved = 0;
4786     for (i = 0; i < dd->ndim*2; i++)
4787     {
4788         *ncg_moved += ncg[i];
4789     }
4790
4791     nvec = 1;
4792     if (bV)
4793     {
4794         nvec++;
4795     }
4796     if (bSDX)
4797     {
4798         nvec++;
4799     }
4800     if (bCGP)
4801     {
4802         nvec++;
4803     }
4804
4805     /* Make sure the communication buffers are large enough */
4806     for (mc = 0; mc < dd->ndim*2; mc++)
4807     {
4808         nvr = ncg[mc] + nat[mc]*nvec;
4809         if (nvr > comm->cgcm_state_nalloc[mc])
4810         {
4811             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4812             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4813         }
4814     }
4815
4816     switch (fr->cutoff_scheme)
4817     {
4818         case ecutsGROUP:
4819             /* Recalculating cg_cm might be cheaper than communicating,
4820              * but that could give rise to rounding issues.
4821              */
4822             home_pos_cg =
4823                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4824                                         nvec, cg_cm, comm, bCompact);
4825             break;
4826         case ecutsVERLET:
4827             /* Without charge groups we send the moved atom coordinates
4828              * over twice. This is so the code below can be used without
4829              * many conditionals for both for with and without charge groups.
4830              */
4831             home_pos_cg =
4832                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4833                                         nvec, state->x, comm, FALSE);
4834             if (bCompact)
4835             {
4836                 home_pos_cg -= *ncg_moved;
4837             }
4838             break;
4839         default:
4840             gmx_incons("unimplemented");
4841             home_pos_cg = 0;
4842     }
4843
4844     vec         = 0;
4845     home_pos_at =
4846         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4847                                 nvec, vec++, state->x, comm, bCompact);
4848     if (bV)
4849     {
4850         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4851                                 nvec, vec++, state->v, comm, bCompact);
4852     }
4853     if (bSDX)
4854     {
4855         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4856                                 nvec, vec++, state->sd_X, comm, bCompact);
4857     }
4858     if (bCGP)
4859     {
4860         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4861                                 nvec, vec++, state->cg_p, comm, bCompact);
4862     }
4863
4864     if (bCompact)
4865     {
4866         compact_ind(dd->ncg_home, move,
4867                     dd->index_gl, dd->cgindex, dd->gatindex,
4868                     dd->ga2la, comm->bLocalCG,
4869                     fr->cginfo);
4870     }
4871     else
4872     {
4873         if (fr->cutoff_scheme == ecutsVERLET)
4874         {
4875             moved = get_moved(comm, dd->ncg_home);
4876
4877             for (k = 0; k < dd->ncg_home; k++)
4878             {
4879                 moved[k] = 0;
4880             }
4881         }
4882         else
4883         {
4884             moved = fr->ns.grid->cell_index;
4885         }
4886
4887         clear_and_mark_ind(dd->ncg_home, move,
4888                            dd->index_gl, dd->cgindex, dd->gatindex,
4889                            dd->ga2la, comm->bLocalCG,
4890                            moved);
4891     }
4892
4893     cginfo_mb = fr->cginfo_mb;
4894
4895     *ncg_stay_home = home_pos_cg;
4896     for (d = 0; d < dd->ndim; d++)
4897     {
4898         dim      = dd->dim[d];
4899         ncg_recv = 0;
4900         nat_recv = 0;
4901         nvr      = 0;
4902         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4903         {
4904             cdd = d*2 + dir;
4905             /* Communicate the cg and atom counts */
4906             sbuf[0] = ncg[cdd];
4907             sbuf[1] = nat[cdd];
4908             if (debug)
4909             {
4910                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4911                         d, dir, sbuf[0], sbuf[1]);
4912             }
4913             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4914
4915             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4916             {
4917                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4918                 srenew(comm->buf_int, comm->nalloc_int);
4919             }
4920
4921             /* Communicate the charge group indices, sizes and flags */
4922             dd_sendrecv_int(dd, d, dir,
4923                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4924                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4925
4926             nvs = ncg[cdd] + nat[cdd]*nvec;
4927             i   = rbuf[0]  + rbuf[1] *nvec;
4928             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4929
4930             /* Communicate cgcm and state */
4931             dd_sendrecv_rvec(dd, d, dir,
4932                              comm->cgcm_state[cdd], nvs,
4933                              comm->vbuf.v+nvr, i);
4934             ncg_recv += rbuf[0];
4935             nat_recv += rbuf[1];
4936             nvr      += i;
4937         }
4938
4939         /* Process the received charge groups */
4940         buf_pos = 0;
4941         for (cg = 0; cg < ncg_recv; cg++)
4942         {
4943             flag = comm->buf_int[cg*DD_CGIBS+1];
4944
4945             if (dim >= npbcdim && dd->nc[dim] > 2)
4946             {
4947                 /* No pbc in this dim and more than one domain boundary.
4948                  * We do a separate check if a charge group didn't move too far.
4949                  */
4950                 if (((flag & DD_FLAG_FW(d)) &&
4951                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4952                     ((flag & DD_FLAG_BW(d)) &&
4953                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4954                 {
4955                     cg_move_error(fplog, dd, step, cg, dim,
4956                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4957                                   FALSE, 0,
4958                                   comm->vbuf.v[buf_pos],
4959                                   comm->vbuf.v[buf_pos],
4960                                   comm->vbuf.v[buf_pos][dim]);
4961                 }
4962             }
4963
4964             mc = -1;
4965             if (d < dd->ndim-1)
4966             {
4967                 /* Check which direction this cg should go */
4968                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4969                 {
4970                     if (dd->bGridJump)
4971                     {
4972                         /* The cell boundaries for dimension d2 are not equal
4973                          * for each cell row of the lower dimension(s),
4974                          * therefore we might need to redetermine where
4975                          * this cg should go.
4976                          */
4977                         dim2 = dd->dim[d2];
4978                         /* If this cg crosses the box boundary in dimension d2
4979                          * we can use the communicated flag, so we do not
4980                          * have to worry about pbc.
4981                          */
4982                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4983                                (flag & DD_FLAG_FW(d2))) ||
4984                               (dd->ci[dim2] == 0 &&
4985                                (flag & DD_FLAG_BW(d2)))))
4986                         {
4987                             /* Clear the two flags for this dimension */
4988                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4989                             /* Determine the location of this cg
4990                              * in lattice coordinates
4991                              */
4992                             pos_d = comm->vbuf.v[buf_pos][dim2];
4993                             if (tric_dir[dim2])
4994                             {
4995                                 for (d3 = dim2+1; d3 < DIM; d3++)
4996                                 {
4997                                     pos_d +=
4998                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
4999                                 }
5000                             }
5001                             /* Check of we are not at the box edge.
5002                              * pbc is only handled in the first step above,
5003                              * but this check could move over pbc while
5004                              * the first step did not due to different rounding.
5005                              */
5006                             if (pos_d >= cell_x1[dim2] &&
5007                                 dd->ci[dim2] != dd->nc[dim2]-1)
5008                             {
5009                                 flag |= DD_FLAG_FW(d2);
5010                             }
5011                             else if (pos_d < cell_x0[dim2] &&
5012                                      dd->ci[dim2] != 0)
5013                             {
5014                                 flag |= DD_FLAG_BW(d2);
5015                             }
5016                             comm->buf_int[cg*DD_CGIBS+1] = flag;
5017                         }
5018                     }
5019                     /* Set to which neighboring cell this cg should go */
5020                     if (flag & DD_FLAG_FW(d2))
5021                     {
5022                         mc = d2*2;
5023                     }
5024                     else if (flag & DD_FLAG_BW(d2))
5025                     {
5026                         if (dd->nc[dd->dim[d2]] > 2)
5027                         {
5028                             mc = d2*2+1;
5029                         }
5030                         else
5031                         {
5032                             mc = d2*2;
5033                         }
5034                     }
5035                 }
5036             }
5037
5038             nrcg = flag & DD_FLAG_NRCG;
5039             if (mc == -1)
5040             {
5041                 if (home_pos_cg+1 > dd->cg_nalloc)
5042                 {
5043                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
5044                     srenew(dd->index_gl, dd->cg_nalloc);
5045                     srenew(dd->cgindex, dd->cg_nalloc+1);
5046                 }
5047                 /* Set the global charge group index and size */
5048                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
5049                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
5050                 /* Copy the state from the buffer */
5051                 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
5052                 if (fr->cutoff_scheme == ecutsGROUP)
5053                 {
5054                     cg_cm = fr->cg_cm;
5055                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
5056                 }
5057                 buf_pos++;
5058
5059                 /* Set the cginfo */
5060                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
5061                                                    dd->index_gl[home_pos_cg]);
5062                 if (comm->bLocalCG)
5063                 {
5064                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
5065                 }
5066
5067                 if (home_pos_at+nrcg > state->nalloc)
5068                 {
5069                     dd_realloc_state(state, f, home_pos_at+nrcg);
5070                 }
5071                 for (i = 0; i < nrcg; i++)
5072                 {
5073                     copy_rvec(comm->vbuf.v[buf_pos++],
5074                               state->x[home_pos_at+i]);
5075                 }
5076                 if (bV)
5077                 {
5078                     for (i = 0; i < nrcg; i++)
5079                     {
5080                         copy_rvec(comm->vbuf.v[buf_pos++],
5081                                   state->v[home_pos_at+i]);
5082                     }
5083                 }
5084                 if (bSDX)
5085                 {
5086                     for (i = 0; i < nrcg; i++)
5087                     {
5088                         copy_rvec(comm->vbuf.v[buf_pos++],
5089                                   state->sd_X[home_pos_at+i]);
5090                     }
5091                 }
5092                 if (bCGP)
5093                 {
5094                     for (i = 0; i < nrcg; i++)
5095                     {
5096                         copy_rvec(comm->vbuf.v[buf_pos++],
5097                                   state->cg_p[home_pos_at+i]);
5098                     }
5099                 }
5100                 home_pos_cg += 1;
5101                 home_pos_at += nrcg;
5102             }
5103             else
5104             {
5105                 /* Reallocate the buffers if necessary  */
5106                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
5107                 {
5108                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
5109                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
5110                 }
5111                 nvr = ncg[mc] + nat[mc]*nvec;
5112                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
5113                 {
5114                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
5115                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
5116                 }
5117                 /* Copy from the receive to the send buffers */
5118                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
5119                        comm->buf_int + cg*DD_CGIBS,
5120                        DD_CGIBS*sizeof(int));
5121                 memcpy(comm->cgcm_state[mc][nvr],
5122                        comm->vbuf.v[buf_pos],
5123                        (1+nrcg*nvec)*sizeof(rvec));
5124                 buf_pos += 1 + nrcg*nvec;
5125                 ncg[mc] += 1;
5126                 nat[mc] += nrcg;
5127             }
5128         }
5129     }
5130
5131     /* With sorting (!bCompact) the indices are now only partially up to date
5132      * and ncg_home and nat_home are not the real count, since there are
5133      * "holes" in the arrays for the charge groups that moved to neighbors.
5134      */
5135     if (fr->cutoff_scheme == ecutsVERLET)
5136     {
5137         moved = get_moved(comm, home_pos_cg);
5138
5139         for (i = dd->ncg_home; i < home_pos_cg; i++)
5140         {
5141             moved[i] = 0;
5142         }
5143     }
5144     dd->ncg_home = home_pos_cg;
5145     dd->nat_home = home_pos_at;
5146
5147     if (debug)
5148     {
5149         fprintf(debug,
5150                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5151                 *ncg_moved, dd->ncg_home-*ncg_moved);
5152
5153     }
5154 }
5155
5156 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
5157 {
5158     dd->comm->cycl[ddCycl] += cycles;
5159     dd->comm->cycl_n[ddCycl]++;
5160     if (cycles > dd->comm->cycl_max[ddCycl])
5161     {
5162         dd->comm->cycl_max[ddCycl] = cycles;
5163     }
5164 }
5165
5166 static double force_flop_count(t_nrnb *nrnb)
5167 {
5168     int         i;
5169     double      sum;
5170     const char *name;
5171
5172     sum = 0;
5173     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
5174     {
5175         /* To get closer to the real timings, we half the count
5176          * for the normal loops and again half it for water loops.
5177          */
5178         name = nrnb_str(i);
5179         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5180         {
5181             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5182         }
5183         else
5184         {
5185             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5186         }
5187     }
5188     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
5189     {
5190         name = nrnb_str(i);
5191         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5192         {
5193             sum += nrnb->n[i]*cost_nrnb(i);
5194         }
5195     }
5196     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
5197     {
5198         sum += nrnb->n[i]*cost_nrnb(i);
5199     }
5200
5201     return sum;
5202 }
5203
5204 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
5205 {
5206     if (dd->comm->eFlop)
5207     {
5208         dd->comm->flop -= force_flop_count(nrnb);
5209     }
5210 }
5211 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
5212 {
5213     if (dd->comm->eFlop)
5214     {
5215         dd->comm->flop += force_flop_count(nrnb);
5216         dd->comm->flop_n++;
5217     }
5218 }
5219
5220 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5221 {
5222     int i;
5223
5224     for (i = 0; i < ddCyclNr; i++)
5225     {
5226         dd->comm->cycl[i]     = 0;
5227         dd->comm->cycl_n[i]   = 0;
5228         dd->comm->cycl_max[i] = 0;
5229     }
5230     dd->comm->flop   = 0;
5231     dd->comm->flop_n = 0;
5232 }
5233
5234 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
5235 {
5236     gmx_domdec_comm_t *comm;
5237     gmx_domdec_load_t *load;
5238     gmx_domdec_root_t *root = NULL;
5239     int                d, dim, cid, i, pos;
5240     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
5241     gmx_bool           bSepPME;
5242
5243     if (debug)
5244     {
5245         fprintf(debug, "get_load_distribution start\n");
5246     }
5247
5248     wallcycle_start(wcycle, ewcDDCOMMLOAD);
5249
5250     comm = dd->comm;
5251
5252     bSepPME = (dd->pme_nodeid >= 0);
5253
5254     for (d = dd->ndim-1; d >= 0; d--)
5255     {
5256         dim = dd->dim[d];
5257         /* Check if we participate in the communication in this dimension */
5258         if (d == dd->ndim-1 ||
5259             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5260         {
5261             load = &comm->load[d];
5262             if (dd->bGridJump)
5263             {
5264                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5265             }
5266             pos = 0;
5267             if (d == dd->ndim-1)
5268             {
5269                 sbuf[pos++] = dd_force_load(comm);
5270                 sbuf[pos++] = sbuf[0];
5271                 if (dd->bGridJump)
5272                 {
5273                     sbuf[pos++] = sbuf[0];
5274                     sbuf[pos++] = cell_frac;
5275                     if (d > 0)
5276                     {
5277                         sbuf[pos++] = comm->cell_f_max0[d];
5278                         sbuf[pos++] = comm->cell_f_min1[d];
5279                     }
5280                 }
5281                 if (bSepPME)
5282                 {
5283                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5284                     sbuf[pos++] = comm->cycl[ddCyclPME];
5285                 }
5286             }
5287             else
5288             {
5289                 sbuf[pos++] = comm->load[d+1].sum;
5290                 sbuf[pos++] = comm->load[d+1].max;
5291                 if (dd->bGridJump)
5292                 {
5293                     sbuf[pos++] = comm->load[d+1].sum_m;
5294                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5295                     sbuf[pos++] = comm->load[d+1].flags;
5296                     if (d > 0)
5297                     {
5298                         sbuf[pos++] = comm->cell_f_max0[d];
5299                         sbuf[pos++] = comm->cell_f_min1[d];
5300                     }
5301                 }
5302                 if (bSepPME)
5303                 {
5304                     sbuf[pos++] = comm->load[d+1].mdf;
5305                     sbuf[pos++] = comm->load[d+1].pme;
5306                 }
5307             }
5308             load->nload = pos;
5309             /* Communicate a row in DD direction d.
5310              * The communicators are setup such that the root always has rank 0.
5311              */
5312 #ifdef GMX_MPI
5313             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5314                        load->load, load->nload*sizeof(float), MPI_BYTE,
5315                        0, comm->mpi_comm_load[d]);
5316 #endif
5317             if (dd->ci[dim] == dd->master_ci[dim])
5318             {
5319                 /* We are the root, process this row */
5320                 if (comm->bDynLoadBal)
5321                 {
5322                     root = comm->root[d];
5323                 }
5324                 load->sum      = 0;
5325                 load->max      = 0;
5326                 load->sum_m    = 0;
5327                 load->cvol_min = 1;
5328                 load->flags    = 0;
5329                 load->mdf      = 0;
5330                 load->pme      = 0;
5331                 pos            = 0;
5332                 for (i = 0; i < dd->nc[dim]; i++)
5333                 {
5334                     load->sum += load->load[pos++];
5335                     load->max  = max(load->max, load->load[pos]);
5336                     pos++;
5337                     if (dd->bGridJump)
5338                     {
5339                         if (root->bLimited)
5340                         {
5341                             /* This direction could not be load balanced properly,
5342                              * therefore we need to use the maximum iso the average load.
5343                              */
5344                             load->sum_m = max(load->sum_m, load->load[pos]);
5345                         }
5346                         else
5347                         {
5348                             load->sum_m += load->load[pos];
5349                         }
5350                         pos++;
5351                         load->cvol_min = min(load->cvol_min, load->load[pos]);
5352                         pos++;
5353                         if (d < dd->ndim-1)
5354                         {
5355                             load->flags = (int)(load->load[pos++] + 0.5);
5356                         }
5357                         if (d > 0)
5358                         {
5359                             root->cell_f_max0[i] = load->load[pos++];
5360                             root->cell_f_min1[i] = load->load[pos++];
5361                         }
5362                     }
5363                     if (bSepPME)
5364                     {
5365                         load->mdf = max(load->mdf, load->load[pos]);
5366                         pos++;
5367                         load->pme = max(load->pme, load->load[pos]);
5368                         pos++;
5369                     }
5370                 }
5371                 if (comm->bDynLoadBal && root->bLimited)
5372                 {
5373                     load->sum_m *= dd->nc[dim];
5374                     load->flags |= (1<<d);
5375                 }
5376             }
5377         }
5378     }
5379
5380     if (DDMASTER(dd))
5381     {
5382         comm->nload      += dd_load_count(comm);
5383         comm->load_step  += comm->cycl[ddCyclStep];
5384         comm->load_sum   += comm->load[0].sum;
5385         comm->load_max   += comm->load[0].max;
5386         if (comm->bDynLoadBal)
5387         {
5388             for (d = 0; d < dd->ndim; d++)
5389             {
5390                 if (comm->load[0].flags & (1<<d))
5391                 {
5392                     comm->load_lim[d]++;
5393                 }
5394             }
5395         }
5396         if (bSepPME)
5397         {
5398             comm->load_mdf += comm->load[0].mdf;
5399             comm->load_pme += comm->load[0].pme;
5400         }
5401     }
5402
5403     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5404
5405     if (debug)
5406     {
5407         fprintf(debug, "get_load_distribution finished\n");
5408     }
5409 }
5410
5411 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5412 {
5413     /* Return the relative performance loss on the total run time
5414      * due to the force calculation load imbalance.
5415      */
5416     if (dd->comm->nload > 0)
5417     {
5418         return
5419             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5420             (dd->comm->load_step*dd->nnodes);
5421     }
5422     else
5423     {
5424         return 0;
5425     }
5426 }
5427
5428 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5429 {
5430     char               buf[STRLEN];
5431     int                npp, npme, nnodes, d, limp;
5432     float              imbal, pme_f_ratio, lossf, lossp = 0;
5433     gmx_bool           bLim;
5434     gmx_domdec_comm_t *comm;
5435
5436     comm = dd->comm;
5437     if (DDMASTER(dd) && comm->nload > 0)
5438     {
5439         npp    = dd->nnodes;
5440         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5441         nnodes = npp + npme;
5442         imbal  = comm->load_max*npp/comm->load_sum - 1;
5443         lossf  = dd_force_imb_perf_loss(dd);
5444         sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5445         fprintf(fplog, "%s", buf);
5446         fprintf(stderr, "\n");
5447         fprintf(stderr, "%s", buf);
5448         sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5449         fprintf(fplog, "%s", buf);
5450         fprintf(stderr, "%s", buf);
5451         bLim = FALSE;
5452         if (comm->bDynLoadBal)
5453         {
5454             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5455             for (d = 0; d < dd->ndim; d++)
5456             {
5457                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5458                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5459                 if (limp >= 50)
5460                 {
5461                     bLim = TRUE;
5462                 }
5463             }
5464             sprintf(buf+strlen(buf), "\n");
5465             fprintf(fplog, "%s", buf);
5466             fprintf(stderr, "%s", buf);
5467         }
5468         if (npme > 0)
5469         {
5470             pme_f_ratio = comm->load_pme/comm->load_mdf;
5471             lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
5472             if (lossp <= 0)
5473             {
5474                 lossp *= (float)npme/(float)nnodes;
5475             }
5476             else
5477             {
5478                 lossp *= (float)npp/(float)nnodes;
5479             }
5480             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5481             fprintf(fplog, "%s", buf);
5482             fprintf(stderr, "%s", buf);
5483             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5484             fprintf(fplog, "%s", buf);
5485             fprintf(stderr, "%s", buf);
5486         }
5487         fprintf(fplog, "\n");
5488         fprintf(stderr, "\n");
5489
5490         if (lossf >= DD_PERF_LOSS_WARN)
5491         {
5492             sprintf(buf,
5493                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5494                     "      in the domain decomposition.\n", lossf*100);
5495             if (!comm->bDynLoadBal)
5496             {
5497                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5498             }
5499             else if (bLim)
5500             {
5501                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5502             }
5503             fprintf(fplog, "%s\n", buf);
5504             fprintf(stderr, "%s\n", buf);
5505         }
5506         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS_WARN)
5507         {
5508             sprintf(buf,
5509                     "NOTE: %.1f %% performance was lost because the PME ranks\n"
5510                     "      had %s work to do than the PP ranks.\n"
5511                     "      You might want to %s the number of PME ranks\n"
5512                     "      or %s the cut-off and the grid spacing.\n",
5513                     fabs(lossp*100),
5514                     (lossp < 0) ? "less"     : "more",
5515                     (lossp < 0) ? "decrease" : "increase",
5516                     (lossp < 0) ? "decrease" : "increase");
5517             fprintf(fplog, "%s\n", buf);
5518             fprintf(stderr, "%s\n", buf);
5519         }
5520     }
5521 }
5522
5523 static float dd_vol_min(gmx_domdec_t *dd)
5524 {
5525     return dd->comm->load[0].cvol_min*dd->nnodes;
5526 }
5527
5528 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5529 {
5530     return dd->comm->load[0].flags;
5531 }
5532
5533 static float dd_f_imbal(gmx_domdec_t *dd)
5534 {
5535     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5536 }
5537
5538 float dd_pme_f_ratio(gmx_domdec_t *dd)
5539 {
5540     if (dd->comm->cycl_n[ddCyclPME] > 0)
5541     {
5542         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5543     }
5544     else
5545     {
5546         return -1.0;
5547     }
5548 }
5549
5550 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5551 {
5552     int  flags, d;
5553     char buf[22];
5554
5555     flags = dd_load_flags(dd);
5556     if (flags)
5557     {
5558         fprintf(fplog,
5559                 "DD  load balancing is limited by minimum cell size in dimension");
5560         for (d = 0; d < dd->ndim; d++)
5561         {
5562             if (flags & (1<<d))
5563             {
5564                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5565             }
5566         }
5567         fprintf(fplog, "\n");
5568     }
5569     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5570     if (dd->comm->bDynLoadBal)
5571     {
5572         fprintf(fplog, "  vol min/aver %5.3f%c",
5573                 dd_vol_min(dd), flags ? '!' : ' ');
5574     }
5575     fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5576     if (dd->comm->cycl_n[ddCyclPME])
5577     {
5578         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5579     }
5580     fprintf(fplog, "\n\n");
5581 }
5582
5583 static void dd_print_load_verbose(gmx_domdec_t *dd)
5584 {
5585     if (dd->comm->bDynLoadBal)
5586     {
5587         fprintf(stderr, "vol %4.2f%c ",
5588                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5589     }
5590     fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5591     if (dd->comm->cycl_n[ddCyclPME])
5592     {
5593         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5594     }
5595 }
5596
5597 #ifdef GMX_MPI
5598 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5599 {
5600     MPI_Comm           c_row;
5601     int                dim, i, rank;
5602     ivec               loc_c;
5603     gmx_domdec_root_t *root;
5604     gmx_bool           bPartOfGroup = FALSE;
5605
5606     dim = dd->dim[dim_ind];
5607     copy_ivec(loc, loc_c);
5608     for (i = 0; i < dd->nc[dim]; i++)
5609     {
5610         loc_c[dim] = i;
5611         rank       = dd_index(dd->nc, loc_c);
5612         if (rank == dd->rank)
5613         {
5614             /* This process is part of the group */
5615             bPartOfGroup = TRUE;
5616         }
5617     }
5618     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5619                    &c_row);
5620     if (bPartOfGroup)
5621     {
5622         dd->comm->mpi_comm_load[dim_ind] = c_row;
5623         if (dd->comm->eDLB != edlbNO)
5624         {
5625             if (dd->ci[dim] == dd->master_ci[dim])
5626             {
5627                 /* This is the root process of this row */
5628                 snew(dd->comm->root[dim_ind], 1);
5629                 root = dd->comm->root[dim_ind];
5630                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5631                 snew(root->old_cell_f, dd->nc[dim]+1);
5632                 snew(root->bCellMin, dd->nc[dim]);
5633                 if (dim_ind > 0)
5634                 {
5635                     snew(root->cell_f_max0, dd->nc[dim]);
5636                     snew(root->cell_f_min1, dd->nc[dim]);
5637                     snew(root->bound_min, dd->nc[dim]);
5638                     snew(root->bound_max, dd->nc[dim]);
5639                 }
5640                 snew(root->buf_ncd, dd->nc[dim]);
5641             }
5642             else
5643             {
5644                 /* This is not a root process, we only need to receive cell_f */
5645                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5646             }
5647         }
5648         if (dd->ci[dim] == dd->master_ci[dim])
5649         {
5650             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5651         }
5652     }
5653 }
5654 #endif
5655
5656 void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
5657                                    const gmx_hw_info_t gmx_unused *hwinfo,
5658                                    const gmx_hw_opt_t  gmx_unused *hw_opt)
5659 {
5660 #ifdef GMX_MPI
5661     int           physicalnode_id_hash;
5662     int           gpu_id;
5663     gmx_domdec_t *dd;
5664     MPI_Comm      mpi_comm_pp_physicalnode;
5665
5666     if (!(cr->duty & DUTY_PP) ||
5667         hw_opt->gpu_opt.ncuda_dev_use == 0)
5668     {
5669         /* Only PP nodes (currently) use GPUs.
5670          * If we don't have GPUs, there are no resources to share.
5671          */
5672         return;
5673     }
5674
5675     physicalnode_id_hash = gmx_physicalnode_id_hash();
5676
5677     gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5678
5679     dd = cr->dd;
5680
5681     if (debug)
5682     {
5683         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5684         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5685                 dd->rank, physicalnode_id_hash, gpu_id);
5686     }
5687     /* Split the PP communicator over the physical nodes */
5688     /* TODO: See if we should store this (before), as it's also used for
5689      * for the nodecomm summution.
5690      */
5691     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5692                    &mpi_comm_pp_physicalnode);
5693     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5694                    &dd->comm->mpi_comm_gpu_shared);
5695     MPI_Comm_free(&mpi_comm_pp_physicalnode);
5696     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5697
5698     if (debug)
5699     {
5700         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5701     }
5702
5703     /* Note that some ranks could share a GPU, while others don't */
5704
5705     if (dd->comm->nrank_gpu_shared == 1)
5706     {
5707         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5708     }
5709 #endif
5710 }
5711
5712 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5713 {
5714 #ifdef GMX_MPI
5715     int  dim0, dim1, i, j;
5716     ivec loc;
5717
5718     if (debug)
5719     {
5720         fprintf(debug, "Making load communicators\n");
5721     }
5722
5723     snew(dd->comm->load, dd->ndim);
5724     snew(dd->comm->mpi_comm_load, dd->ndim);
5725
5726     clear_ivec(loc);
5727     make_load_communicator(dd, 0, loc);
5728     if (dd->ndim > 1)
5729     {
5730         dim0 = dd->dim[0];
5731         for (i = 0; i < dd->nc[dim0]; i++)
5732         {
5733             loc[dim0] = i;
5734             make_load_communicator(dd, 1, loc);
5735         }
5736     }
5737     if (dd->ndim > 2)
5738     {
5739         dim0 = dd->dim[0];
5740         for (i = 0; i < dd->nc[dim0]; i++)
5741         {
5742             loc[dim0] = i;
5743             dim1      = dd->dim[1];
5744             for (j = 0; j < dd->nc[dim1]; j++)
5745             {
5746                 loc[dim1] = j;
5747                 make_load_communicator(dd, 2, loc);
5748             }
5749         }
5750     }
5751
5752     if (debug)
5753     {
5754         fprintf(debug, "Finished making load communicators\n");
5755     }
5756 #endif
5757 }
5758
5759 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5760 {
5761     gmx_bool                bZYX;
5762     int                     d, dim, i, j, m;
5763     ivec                    tmp, s;
5764     int                     nzone, nzonep;
5765     ivec                    dd_zp[DD_MAXIZONE];
5766     gmx_domdec_zones_t     *zones;
5767     gmx_domdec_ns_ranges_t *izone;
5768
5769     for (d = 0; d < dd->ndim; d++)
5770     {
5771         dim = dd->dim[d];
5772         copy_ivec(dd->ci, tmp);
5773         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5774         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5775         copy_ivec(dd->ci, tmp);
5776         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5777         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5778         if (debug)
5779         {
5780             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5781                     dd->rank, dim,
5782                     dd->neighbor[d][0],
5783                     dd->neighbor[d][1]);
5784         }
5785     }
5786
5787     if (fplog)
5788     {
5789         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5790                 dd->ndim,
5791                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5792                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5793     }
5794     switch (dd->ndim)
5795     {
5796         case 3:
5797             nzone  = dd_z3n;
5798             nzonep = dd_zp3n;
5799             for (i = 0; i < nzonep; i++)
5800             {
5801                 copy_ivec(dd_zp3[i], dd_zp[i]);
5802             }
5803             break;
5804         case 2:
5805             nzone  = dd_z2n;
5806             nzonep = dd_zp2n;
5807             for (i = 0; i < nzonep; i++)
5808             {
5809                 copy_ivec(dd_zp2[i], dd_zp[i]);
5810             }
5811             break;
5812         case 1:
5813             nzone  = dd_z1n;
5814             nzonep = dd_zp1n;
5815             for (i = 0; i < nzonep; i++)
5816             {
5817                 copy_ivec(dd_zp1[i], dd_zp[i]);
5818             }
5819             break;
5820         default:
5821             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5822             nzone  = 0;
5823             nzonep = 0;
5824     }
5825
5826     zones = &dd->comm->zones;
5827
5828     for (i = 0; i < nzone; i++)
5829     {
5830         m = 0;
5831         clear_ivec(zones->shift[i]);
5832         for (d = 0; d < dd->ndim; d++)
5833         {
5834             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5835         }
5836     }
5837
5838     zones->n = nzone;
5839     for (i = 0; i < nzone; i++)
5840     {
5841         for (d = 0; d < DIM; d++)
5842         {
5843             s[d] = dd->ci[d] - zones->shift[i][d];
5844             if (s[d] < 0)
5845             {
5846                 s[d] += dd->nc[d];
5847             }
5848             else if (s[d] >= dd->nc[d])
5849             {
5850                 s[d] -= dd->nc[d];
5851             }
5852         }
5853     }
5854     zones->nizone = nzonep;
5855     for (i = 0; i < zones->nizone; i++)
5856     {
5857         if (dd_zp[i][0] != i)
5858         {
5859             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5860         }
5861         izone     = &zones->izone[i];
5862         izone->j0 = dd_zp[i][1];
5863         izone->j1 = dd_zp[i][2];
5864         for (dim = 0; dim < DIM; dim++)
5865         {
5866             if (dd->nc[dim] == 1)
5867             {
5868                 /* All shifts should be allowed */
5869                 izone->shift0[dim] = -1;
5870                 izone->shift1[dim] = 1;
5871             }
5872             else
5873             {
5874                 /*
5875                    izone->shift0[d] = 0;
5876                    izone->shift1[d] = 0;
5877                    for(j=izone->j0; j<izone->j1; j++) {
5878                    if (dd->shift[j][d] > dd->shift[i][d])
5879                    izone->shift0[d] = -1;
5880                    if (dd->shift[j][d] < dd->shift[i][d])
5881                    izone->shift1[d] = 1;
5882                    }
5883                  */
5884
5885                 int shift_diff;
5886
5887                 /* Assume the shift are not more than 1 cell */
5888                 izone->shift0[dim] = 1;
5889                 izone->shift1[dim] = -1;
5890                 for (j = izone->j0; j < izone->j1; j++)
5891                 {
5892                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5893                     if (shift_diff < izone->shift0[dim])
5894                     {
5895                         izone->shift0[dim] = shift_diff;
5896                     }
5897                     if (shift_diff > izone->shift1[dim])
5898                     {
5899                         izone->shift1[dim] = shift_diff;
5900                     }
5901                 }
5902             }
5903         }
5904     }
5905
5906     if (dd->comm->eDLB != edlbNO)
5907     {
5908         snew(dd->comm->root, dd->ndim);
5909     }
5910
5911     if (dd->comm->bRecordLoad)
5912     {
5913         make_load_communicators(dd);
5914     }
5915 }
5916
5917 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
5918 {
5919     gmx_domdec_t      *dd;
5920     gmx_domdec_comm_t *comm;
5921     int                i, rank, *buf;
5922     ivec               periods;
5923 #ifdef GMX_MPI
5924     MPI_Comm           comm_cart;
5925 #endif
5926
5927     dd   = cr->dd;
5928     comm = dd->comm;
5929
5930 #ifdef GMX_MPI
5931     if (comm->bCartesianPP)
5932     {
5933         /* Set up cartesian communication for the particle-particle part */
5934         if (fplog)
5935         {
5936             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5937                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5938         }
5939
5940         for (i = 0; i < DIM; i++)
5941         {
5942             periods[i] = TRUE;
5943         }
5944         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5945                         &comm_cart);
5946         /* We overwrite the old communicator with the new cartesian one */
5947         cr->mpi_comm_mygroup = comm_cart;
5948     }
5949
5950     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5951     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5952
5953     if (comm->bCartesianPP_PME)
5954     {
5955         /* Since we want to use the original cartesian setup for sim,
5956          * and not the one after split, we need to make an index.
5957          */
5958         snew(comm->ddindex2ddnodeid, dd->nnodes);
5959         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5960         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5961         /* Get the rank of the DD master,
5962          * above we made sure that the master node is a PP node.
5963          */
5964         if (MASTER(cr))
5965         {
5966             rank = dd->rank;
5967         }
5968         else
5969         {
5970             rank = 0;
5971         }
5972         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5973     }
5974     else if (comm->bCartesianPP)
5975     {
5976         if (cr->npmenodes == 0)
5977         {
5978             /* The PP communicator is also
5979              * the communicator for this simulation
5980              */
5981             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5982         }
5983         cr->nodeid = dd->rank;
5984
5985         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5986
5987         /* We need to make an index to go from the coordinates
5988          * to the nodeid of this simulation.
5989          */
5990         snew(comm->ddindex2simnodeid, dd->nnodes);
5991         snew(buf, dd->nnodes);
5992         if (cr->duty & DUTY_PP)
5993         {
5994             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5995         }
5996         /* Communicate the ddindex to simulation nodeid index */
5997         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
5998                       cr->mpi_comm_mysim);
5999         sfree(buf);
6000
6001         /* Determine the master coordinates and rank.
6002          * The DD master should be the same node as the master of this sim.
6003          */
6004         for (i = 0; i < dd->nnodes; i++)
6005         {
6006             if (comm->ddindex2simnodeid[i] == 0)
6007             {
6008                 ddindex2xyz(dd->nc, i, dd->master_ci);
6009                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
6010             }
6011         }
6012         if (debug)
6013         {
6014             fprintf(debug, "The master rank is %d\n", dd->masterrank);
6015         }
6016     }
6017     else
6018     {
6019         /* No Cartesian communicators */
6020         /* We use the rank in dd->comm->all as DD index */
6021         ddindex2xyz(dd->nc, dd->rank, dd->ci);
6022         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
6023         dd->masterrank = 0;
6024         clear_ivec(dd->master_ci);
6025     }
6026 #endif
6027
6028     if (fplog)
6029     {
6030         fprintf(fplog,
6031                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
6032                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6033     }
6034     if (debug)
6035     {
6036         fprintf(debug,
6037                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
6038                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6039     }
6040 }
6041
6042 static void receive_ddindex2simnodeid(t_commrec *cr)
6043 {
6044     gmx_domdec_t      *dd;
6045
6046     gmx_domdec_comm_t *comm;
6047     int               *buf;
6048
6049     dd   = cr->dd;
6050     comm = dd->comm;
6051
6052 #ifdef GMX_MPI
6053     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
6054     {
6055         snew(comm->ddindex2simnodeid, dd->nnodes);
6056         snew(buf, dd->nnodes);
6057         if (cr->duty & DUTY_PP)
6058         {
6059             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
6060         }
6061 #ifdef GMX_MPI
6062         /* Communicate the ddindex to simulation nodeid index */
6063         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
6064                       cr->mpi_comm_mysim);
6065 #endif
6066         sfree(buf);
6067     }
6068 #endif
6069 }
6070
6071 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
6072                                                      int ncg, int natoms)
6073 {
6074     gmx_domdec_master_t *ma;
6075     int                  i;
6076
6077     snew(ma, 1);
6078
6079     snew(ma->ncg, dd->nnodes);
6080     snew(ma->index, dd->nnodes+1);
6081     snew(ma->cg, ncg);
6082     snew(ma->nat, dd->nnodes);
6083     snew(ma->ibuf, dd->nnodes*2);
6084     snew(ma->cell_x, DIM);
6085     for (i = 0; i < DIM; i++)
6086     {
6087         snew(ma->cell_x[i], dd->nc[i]+1);
6088     }
6089
6090     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
6091     {
6092         ma->vbuf = NULL;
6093     }
6094     else
6095     {
6096         snew(ma->vbuf, natoms);
6097     }
6098
6099     return ma;
6100 }
6101
6102 static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
6103                                int gmx_unused reorder)
6104 {
6105     gmx_domdec_t      *dd;
6106     gmx_domdec_comm_t *comm;
6107     int                i, rank;
6108     gmx_bool           bDiv[DIM];
6109     ivec               periods;
6110 #ifdef GMX_MPI
6111     MPI_Comm           comm_cart;
6112 #endif
6113
6114     dd   = cr->dd;
6115     comm = dd->comm;
6116
6117     if (comm->bCartesianPP)
6118     {
6119         for (i = 1; i < DIM; i++)
6120         {
6121             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
6122         }
6123         if (bDiv[YY] || bDiv[ZZ])
6124         {
6125             comm->bCartesianPP_PME = TRUE;
6126             /* If we have 2D PME decomposition, which is always in x+y,
6127              * we stack the PME only nodes in z.
6128              * Otherwise we choose the direction that provides the thinnest slab
6129              * of PME only nodes as this will have the least effect
6130              * on the PP communication.
6131              * But for the PME communication the opposite might be better.
6132              */
6133             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
6134                              !bDiv[YY] ||
6135                              dd->nc[YY] > dd->nc[ZZ]))
6136             {
6137                 comm->cartpmedim = ZZ;
6138             }
6139             else
6140             {
6141                 comm->cartpmedim = YY;
6142             }
6143             comm->ntot[comm->cartpmedim]
6144                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
6145         }
6146         else if (fplog)
6147         {
6148             fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
6149             fprintf(fplog,
6150                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
6151         }
6152     }
6153
6154 #ifdef GMX_MPI
6155     if (comm->bCartesianPP_PME)
6156     {
6157         if (fplog)
6158         {
6159             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
6160         }
6161
6162         for (i = 0; i < DIM; i++)
6163         {
6164             periods[i] = TRUE;
6165         }
6166         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
6167                         &comm_cart);
6168
6169         MPI_Comm_rank(comm_cart, &rank);
6170         if (MASTERNODE(cr) && rank != 0)
6171         {
6172             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
6173         }
6174
6175         /* With this assigment we loose the link to the original communicator
6176          * which will usually be MPI_COMM_WORLD, unless have multisim.
6177          */
6178         cr->mpi_comm_mysim = comm_cart;
6179         cr->sim_nodeid     = rank;
6180
6181         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
6182
6183         if (fplog)
6184         {
6185             fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
6186                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6187         }
6188
6189         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
6190         {
6191             cr->duty = DUTY_PP;
6192         }
6193         if (cr->npmenodes == 0 ||
6194             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
6195         {
6196             cr->duty = DUTY_PME;
6197         }
6198
6199         /* Split the sim communicator into PP and PME only nodes */
6200         MPI_Comm_split(cr->mpi_comm_mysim,
6201                        cr->duty,
6202                        dd_index(comm->ntot, dd->ci),
6203                        &cr->mpi_comm_mygroup);
6204     }
6205     else
6206     {
6207         switch (dd_node_order)
6208         {
6209             case ddnoPP_PME:
6210                 if (fplog)
6211                 {
6212                     fprintf(fplog, "Order of the ranks: PP first, PME last\n");
6213                 }
6214                 break;
6215             case ddnoINTERLEAVE:
6216                 /* Interleave the PP-only and PME-only nodes,
6217                  * as on clusters with dual-core machines this will double
6218                  * the communication bandwidth of the PME processes
6219                  * and thus speed up the PP <-> PME and inter PME communication.
6220                  */
6221                 if (fplog)
6222                 {
6223                     fprintf(fplog, "Interleaving PP and PME ranks\n");
6224                 }
6225                 comm->pmenodes = dd_pmenodes(cr);
6226                 break;
6227             case ddnoCARTESIAN:
6228                 break;
6229             default:
6230                 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
6231         }
6232
6233         if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6234         {
6235             cr->duty = DUTY_PME;
6236         }
6237         else
6238         {
6239             cr->duty = DUTY_PP;
6240         }
6241
6242         /* Split the sim communicator into PP and PME only nodes */
6243         MPI_Comm_split(cr->mpi_comm_mysim,
6244                        cr->duty,
6245                        cr->nodeid,
6246                        &cr->mpi_comm_mygroup);
6247         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6248     }
6249 #endif
6250
6251     if (fplog)
6252     {
6253         fprintf(fplog, "This rank does only %s work.\n\n",
6254                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6255     }
6256 }
6257
6258 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6259 {
6260     gmx_domdec_t      *dd;
6261     gmx_domdec_comm_t *comm;
6262     int                CartReorder;
6263
6264     dd   = cr->dd;
6265     comm = dd->comm;
6266
6267     copy_ivec(dd->nc, comm->ntot);
6268
6269     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6270     comm->bCartesianPP_PME = FALSE;
6271
6272     /* Reorder the nodes by default. This might change the MPI ranks.
6273      * Real reordering is only supported on very few architectures,
6274      * Blue Gene is one of them.
6275      */
6276     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6277
6278     if (cr->npmenodes > 0)
6279     {
6280         /* Split the communicator into a PP and PME part */
6281         split_communicator(fplog, cr, dd_node_order, CartReorder);
6282         if (comm->bCartesianPP_PME)
6283         {
6284             /* We (possibly) reordered the nodes in split_communicator,
6285              * so it is no longer required in make_pp_communicator.
6286              */
6287             CartReorder = FALSE;
6288         }
6289     }
6290     else
6291     {
6292         /* All nodes do PP and PME */
6293 #ifdef GMX_MPI
6294         /* We do not require separate communicators */
6295         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6296 #endif
6297     }
6298
6299     if (cr->duty & DUTY_PP)
6300     {
6301         /* Copy or make a new PP communicator */
6302         make_pp_communicator(fplog, cr, CartReorder);
6303     }
6304     else
6305     {
6306         receive_ddindex2simnodeid(cr);
6307     }
6308
6309     if (!(cr->duty & DUTY_PME))
6310     {
6311         /* Set up the commnuication to our PME node */
6312         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6313         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6314         if (debug)
6315         {
6316             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6317                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6318         }
6319     }
6320     else
6321     {
6322         dd->pme_nodeid = -1;
6323     }
6324
6325     if (DDMASTER(dd))
6326     {
6327         dd->ma = init_gmx_domdec_master_t(dd,
6328                                           comm->cgs_gl.nr,
6329                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6330     }
6331 }
6332
6333 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6334 {
6335     real  *slb_frac, tot;
6336     int    i, n;
6337     double dbl;
6338
6339     slb_frac = NULL;
6340     if (nc > 1 && size_string != NULL)
6341     {
6342         if (fplog)
6343         {
6344             fprintf(fplog, "Using static load balancing for the %s direction\n",
6345                     dir);
6346         }
6347         snew(slb_frac, nc);
6348         tot = 0;
6349         for (i = 0; i < nc; i++)
6350         {
6351             dbl = 0;
6352             sscanf(size_string, "%lf%n", &dbl, &n);
6353             if (dbl == 0)
6354             {
6355                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6356             }
6357             slb_frac[i]  = dbl;
6358             size_string += n;
6359             tot         += slb_frac[i];
6360         }
6361         /* Normalize */
6362         if (fplog)
6363         {
6364             fprintf(fplog, "Relative cell sizes:");
6365         }
6366         for (i = 0; i < nc; i++)
6367         {
6368             slb_frac[i] /= tot;
6369             if (fplog)
6370             {
6371                 fprintf(fplog, " %5.3f", slb_frac[i]);
6372             }
6373         }
6374         if (fplog)
6375         {
6376             fprintf(fplog, "\n");
6377         }
6378     }
6379
6380     return slb_frac;
6381 }
6382
6383 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6384 {
6385     int                  n, nmol, ftype;
6386     gmx_mtop_ilistloop_t iloop;
6387     t_ilist             *il;
6388
6389     n     = 0;
6390     iloop = gmx_mtop_ilistloop_init(mtop);
6391     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6392     {
6393         for (ftype = 0; ftype < F_NRE; ftype++)
6394         {
6395             if ((interaction_function[ftype].flags & IF_BOND) &&
6396                 NRAL(ftype) >  2)
6397             {
6398                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6399             }
6400         }
6401     }
6402
6403     return n;
6404 }
6405
6406 static int dd_getenv(FILE *fplog, const char *env_var, int def)
6407 {
6408     char *val;
6409     int   nst;
6410
6411     nst = def;
6412     val = getenv(env_var);
6413     if (val)
6414     {
6415         if (sscanf(val, "%d", &nst) <= 0)
6416         {
6417             nst = 1;
6418         }
6419         if (fplog)
6420         {
6421             fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6422                     env_var, val, nst);
6423         }
6424     }
6425
6426     return nst;
6427 }
6428
6429 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6430 {
6431     if (MASTER(cr))
6432     {
6433         fprintf(stderr, "\n%s\n", warn_string);
6434     }
6435     if (fplog)
6436     {
6437         fprintf(fplog, "\n%s\n", warn_string);
6438     }
6439 }
6440
6441 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6442                                   t_inputrec *ir, FILE *fplog)
6443 {
6444     if (ir->ePBC == epbcSCREW &&
6445         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6446     {
6447         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6448     }
6449
6450     if (ir->ns_type == ensSIMPLE)
6451     {
6452         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6453     }
6454
6455     if (ir->nstlist == 0)
6456     {
6457         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6458     }
6459
6460     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6461     {
6462         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6463     }
6464 }
6465
6466 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6467 {
6468     int  di, d;
6469     real r;
6470
6471     r = ddbox->box_size[XX];
6472     for (di = 0; di < dd->ndim; di++)
6473     {
6474         d = dd->dim[di];
6475         /* Check using the initial average cell size */
6476         r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6477     }
6478
6479     return r;
6480 }
6481
6482 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6483                              const char *dlb_opt, gmx_bool bRecordLoad,
6484                              unsigned long Flags, t_inputrec *ir)
6485 {
6486     gmx_domdec_t *dd;
6487     int           eDLB = -1;
6488     char          buf[STRLEN];
6489
6490     switch (dlb_opt[0])
6491     {
6492         case 'a': eDLB = edlbAUTO; break;
6493         case 'n': eDLB = edlbNO;   break;
6494         case 'y': eDLB = edlbYES;  break;
6495         default: gmx_incons("Unknown dlb_opt");
6496     }
6497
6498     if (Flags & MD_RERUN)
6499     {
6500         return edlbNO;
6501     }
6502
6503     if (!EI_DYNAMICS(ir->eI))
6504     {
6505         if (eDLB == edlbYES)
6506         {
6507             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6508             dd_warning(cr, fplog, buf);
6509         }
6510
6511         return edlbNO;
6512     }
6513
6514     if (!bRecordLoad)
6515     {
6516         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6517
6518         return edlbNO;
6519     }
6520
6521     if (Flags & MD_REPRODUCIBLE)
6522     {
6523         switch (eDLB)
6524         {
6525             case edlbNO:
6526                 break;
6527             case edlbAUTO:
6528                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6529                 eDLB = edlbNO;
6530                 break;
6531             case edlbYES:
6532                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6533                 break;
6534             default:
6535                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
6536                 break;
6537         }
6538     }
6539
6540     return eDLB;
6541 }
6542
6543 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6544 {
6545     int dim;
6546
6547     dd->ndim = 0;
6548     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6549     {
6550         /* Decomposition order z,y,x */
6551         if (fplog)
6552         {
6553             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6554         }
6555         for (dim = DIM-1; dim >= 0; dim--)
6556         {
6557             if (dd->nc[dim] > 1)
6558             {
6559                 dd->dim[dd->ndim++] = dim;
6560             }
6561         }
6562     }
6563     else
6564     {
6565         /* Decomposition order x,y,z */
6566         for (dim = 0; dim < DIM; dim++)
6567         {
6568             if (dd->nc[dim] > 1)
6569             {
6570                 dd->dim[dd->ndim++] = dim;
6571             }
6572         }
6573     }
6574 }
6575
6576 static gmx_domdec_comm_t *init_dd_comm()
6577 {
6578     gmx_domdec_comm_t *comm;
6579     int                i;
6580
6581     snew(comm, 1);
6582     snew(comm->cggl_flag, DIM*2);
6583     snew(comm->cgcm_state, DIM*2);
6584     for (i = 0; i < DIM*2; i++)
6585     {
6586         comm->cggl_flag_nalloc[i]  = 0;
6587         comm->cgcm_state_nalloc[i] = 0;
6588     }
6589
6590     comm->nalloc_int = 0;
6591     comm->buf_int    = NULL;
6592
6593     vec_rvec_init(&comm->vbuf);
6594
6595     comm->n_load_have    = 0;
6596     comm->n_load_collect = 0;
6597
6598     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6599     {
6600         comm->sum_nat[i] = 0;
6601     }
6602     comm->ndecomp   = 0;
6603     comm->nload     = 0;
6604     comm->load_step = 0;
6605     comm->load_sum  = 0;
6606     comm->load_max  = 0;
6607     clear_ivec(comm->load_lim);
6608     comm->load_mdf  = 0;
6609     comm->load_pme  = 0;
6610
6611     return comm;
6612 }
6613
6614 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6615                                         unsigned long Flags,
6616                                         ivec nc,
6617                                         real comm_distance_min, real rconstr,
6618                                         const char *dlb_opt, real dlb_scale,
6619                                         const char *sizex, const char *sizey, const char *sizez,
6620                                         gmx_mtop_t *mtop, t_inputrec *ir,
6621                                         matrix box, rvec *x,
6622                                         gmx_ddbox_t *ddbox,
6623                                         int *npme_x, int *npme_y)
6624 {
6625     gmx_domdec_t      *dd;
6626     gmx_domdec_comm_t *comm;
6627     int                recload;
6628     int                d, i, j;
6629     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6630     gmx_bool           bC;
6631     char               buf[STRLEN];
6632
6633     if (fplog)
6634     {
6635         fprintf(fplog,
6636                 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
6637     }
6638
6639     snew(dd, 1);
6640
6641     dd->comm = init_dd_comm();
6642     comm     = dd->comm;
6643     snew(comm->cggl_flag, DIM*2);
6644     snew(comm->cgcm_state, DIM*2);
6645
6646     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6647     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6648
6649     dd->bSendRecv2      = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
6650     comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
6651     comm->eFlop         = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
6652     recload             = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
6653     comm->nstSortCG     = dd_getenv(fplog, "GMX_DD_NST_SORT_CHARGE_GROUPS", 1);
6654     comm->nstDDDump     = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
6655     comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
6656     comm->DD_debug      = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
6657
6658     dd->pme_recv_f_alloc = 0;
6659     dd->pme_recv_f_buf   = NULL;
6660
6661     if (dd->bSendRecv2 && fplog)
6662     {
6663         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6664     }
6665     if (comm->eFlop)
6666     {
6667         if (fplog)
6668         {
6669             fprintf(fplog, "Will load balance based on FLOP count\n");
6670         }
6671         if (comm->eFlop > 1)
6672         {
6673             srand(1+cr->nodeid);
6674         }
6675         comm->bRecordLoad = TRUE;
6676     }
6677     else
6678     {
6679         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6680
6681     }
6682
6683     /* Initialize to GPU share count to 0, might change later */
6684     comm->nrank_gpu_shared = 0;
6685
6686     comm->eDLB = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6687
6688     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6689     if (fplog)
6690     {
6691         fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
6692     }
6693     dd->bGridJump              = comm->bDynLoadBal;
6694     comm->bPMELoadBalDLBLimits = FALSE;
6695
6696     if (comm->nstSortCG)
6697     {
6698         if (fplog)
6699         {
6700             if (comm->nstSortCG == 1)
6701             {
6702                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6703             }
6704             else
6705             {
6706                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6707                         comm->nstSortCG);
6708             }
6709         }
6710         snew(comm->sort, 1);
6711     }
6712     else
6713     {
6714         if (fplog)
6715         {
6716             fprintf(fplog, "Will not sort the charge groups\n");
6717         }
6718     }
6719
6720     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6721
6722     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6723     if (comm->bInterCGBondeds)
6724     {
6725         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6726     }
6727     else
6728     {
6729         comm->bInterCGMultiBody = FALSE;
6730     }
6731
6732     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6733     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6734
6735     if (ir->rlistlong == 0)
6736     {
6737         /* Set the cut-off to some very large value,
6738          * so we don't need if statements everywhere in the code.
6739          * We use sqrt, since the cut-off is squared in some places.
6740          */
6741         comm->cutoff   = GMX_CUTOFF_INF;
6742     }
6743     else
6744     {
6745         comm->cutoff   = ir->rlistlong;
6746     }
6747     comm->cutoff_mbody = 0;
6748
6749     comm->cellsize_limit = 0;
6750     comm->bBondComm      = FALSE;
6751
6752     if (comm->bInterCGBondeds)
6753     {
6754         if (comm_distance_min > 0)
6755         {
6756             comm->cutoff_mbody = comm_distance_min;
6757             if (Flags & MD_DDBONDCOMM)
6758             {
6759                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6760             }
6761             else
6762             {
6763                 comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
6764             }
6765             r_bonded_limit = comm->cutoff_mbody;
6766         }
6767         else if (ir->bPeriodicMols)
6768         {
6769             /* Can not easily determine the required cut-off */
6770             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6771             comm->cutoff_mbody = comm->cutoff/2;
6772             r_bonded_limit     = comm->cutoff_mbody;
6773         }
6774         else
6775         {
6776             if (MASTER(cr))
6777             {
6778                 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6779                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6780             }
6781             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6782             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6783
6784             /* We use an initial margin of 10% for the minimum cell size,
6785              * except when we are just below the non-bonded cut-off.
6786              */
6787             if (Flags & MD_DDBONDCOMM)
6788             {
6789                 if (max(r_2b, r_mb) > comm->cutoff)
6790                 {
6791                     r_bonded        = max(r_2b, r_mb);
6792                     r_bonded_limit  = 1.1*r_bonded;
6793                     comm->bBondComm = TRUE;
6794                 }
6795                 else
6796                 {
6797                     r_bonded       = r_mb;
6798                     r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
6799                 }
6800                 /* We determine cutoff_mbody later */
6801             }
6802             else
6803             {
6804                 /* No special bonded communication,
6805                  * simply increase the DD cut-off.
6806                  */
6807                 r_bonded_limit     = 1.1*max(r_2b, r_mb);
6808                 comm->cutoff_mbody = r_bonded_limit;
6809                 comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
6810             }
6811         }
6812         comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
6813         if (fplog)
6814         {
6815             fprintf(fplog,
6816                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6817                     comm->cellsize_limit);
6818         }
6819     }
6820
6821     if (dd->bInterCGcons && rconstr <= 0)
6822     {
6823         /* There is a cell size limit due to the constraints (P-LINCS) */
6824         rconstr = constr_r_max(fplog, mtop, ir);
6825         if (fplog)
6826         {
6827             fprintf(fplog,
6828                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6829                     rconstr);
6830             if (rconstr > comm->cellsize_limit)
6831             {
6832                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6833             }
6834         }
6835     }
6836     else if (rconstr > 0 && fplog)
6837     {
6838         /* Here we do not check for dd->bInterCGcons,
6839          * because one can also set a cell size limit for virtual sites only
6840          * and at this point we don't know yet if there are intercg v-sites.
6841          */
6842         fprintf(fplog,
6843                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6844                 rconstr);
6845     }
6846     comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
6847
6848     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6849
6850     if (nc[XX] > 0)
6851     {
6852         copy_ivec(nc, dd->nc);
6853         set_dd_dim(fplog, dd);
6854         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6855
6856         if (cr->npmenodes == -1)
6857         {
6858             cr->npmenodes = 0;
6859         }
6860         acs = average_cellsize_min(dd, ddbox);
6861         if (acs < comm->cellsize_limit)
6862         {
6863             if (fplog)
6864             {
6865                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6866             }
6867             gmx_fatal_collective(FARGS, cr, NULL,
6868                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6869                                  acs, comm->cellsize_limit);
6870         }
6871     }
6872     else
6873     {
6874         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6875
6876         /* We need to choose the optimal DD grid and possibly PME nodes */
6877         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6878                                comm->eDLB != edlbNO, dlb_scale,
6879                                comm->cellsize_limit, comm->cutoff,
6880                                comm->bInterCGBondeds);
6881
6882         if (dd->nc[XX] == 0)
6883         {
6884             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6885             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6886                     !bC ? "-rdd" : "-rcon",
6887                     comm->eDLB != edlbNO ? " or -dds" : "",
6888                     bC ? " or your LINCS settings" : "");
6889
6890             gmx_fatal_collective(FARGS, cr, NULL,
6891                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6892                                  "%s\n"
6893                                  "Look in the log file for details on the domain decomposition",
6894                                  cr->nnodes-cr->npmenodes, limit, buf);
6895         }
6896         set_dd_dim(fplog, dd);
6897     }
6898
6899     if (fplog)
6900     {
6901         fprintf(fplog,
6902                 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6903                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6904     }
6905
6906     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6907     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6908     {
6909         gmx_fatal_collective(FARGS, cr, NULL,
6910                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6911                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6912     }
6913     if (cr->npmenodes > dd->nnodes)
6914     {
6915         gmx_fatal_collective(FARGS, cr, NULL,
6916                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6917     }
6918     if (cr->npmenodes > 0)
6919     {
6920         comm->npmenodes = cr->npmenodes;
6921     }
6922     else
6923     {
6924         comm->npmenodes = dd->nnodes;
6925     }
6926
6927     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6928     {
6929         /* The following choices should match those
6930          * in comm_cost_est in domdec_setup.c.
6931          * Note that here the checks have to take into account
6932          * that the decomposition might occur in a different order than xyz
6933          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6934          * in which case they will not match those in comm_cost_est,
6935          * but since that is mainly for testing purposes that's fine.
6936          */
6937         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6938             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6939             getenv("GMX_PMEONEDD") == NULL)
6940         {
6941             comm->npmedecompdim = 2;
6942             comm->npmenodes_x   = dd->nc[XX];
6943             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6944         }
6945         else
6946         {
6947             /* In case nc is 1 in both x and y we could still choose to
6948              * decompose pme in y instead of x, but we use x for simplicity.
6949              */
6950             comm->npmedecompdim = 1;
6951             if (dd->dim[0] == YY)
6952             {
6953                 comm->npmenodes_x = 1;
6954                 comm->npmenodes_y = comm->npmenodes;
6955             }
6956             else
6957             {
6958                 comm->npmenodes_x = comm->npmenodes;
6959                 comm->npmenodes_y = 1;
6960             }
6961         }
6962         if (fplog)
6963         {
6964             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6965                     comm->npmenodes_x, comm->npmenodes_y, 1);
6966         }
6967     }
6968     else
6969     {
6970         comm->npmedecompdim = 0;
6971         comm->npmenodes_x   = 0;
6972         comm->npmenodes_y   = 0;
6973     }
6974
6975     /* Technically we don't need both of these,
6976      * but it simplifies code not having to recalculate it.
6977      */
6978     *npme_x = comm->npmenodes_x;
6979     *npme_y = comm->npmenodes_y;
6980
6981     snew(comm->slb_frac, DIM);
6982     if (comm->eDLB == edlbNO)
6983     {
6984         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6985         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6986         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6987     }
6988
6989     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
6990     {
6991         if (comm->bBondComm || comm->eDLB != edlbNO)
6992         {
6993             /* Set the bonded communication distance to halfway
6994              * the minimum and the maximum,
6995              * since the extra communication cost is nearly zero.
6996              */
6997             acs                = average_cellsize_min(dd, ddbox);
6998             comm->cutoff_mbody = 0.5*(r_bonded + acs);
6999             if (comm->eDLB != edlbNO)
7000             {
7001                 /* Check if this does not limit the scaling */
7002                 comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
7003             }
7004             if (!comm->bBondComm)
7005             {
7006                 /* Without bBondComm do not go beyond the n.b. cut-off */
7007                 comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
7008                 if (comm->cellsize_limit >= comm->cutoff)
7009                 {
7010                     /* We don't loose a lot of efficieny
7011                      * when increasing it to the n.b. cut-off.
7012                      * It can even be slightly faster, because we need
7013                      * less checks for the communication setup.
7014                      */
7015                     comm->cutoff_mbody = comm->cutoff;
7016                 }
7017             }
7018             /* Check if we did not end up below our original limit */
7019             comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
7020
7021             if (comm->cutoff_mbody > comm->cellsize_limit)
7022             {
7023                 comm->cellsize_limit = comm->cutoff_mbody;
7024             }
7025         }
7026         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
7027     }
7028
7029     if (debug)
7030     {
7031         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
7032                 "cellsize limit %f\n",
7033                 comm->bBondComm, comm->cellsize_limit);
7034     }
7035
7036     if (MASTER(cr))
7037     {
7038         check_dd_restrictions(cr, dd, ir, fplog);
7039     }
7040
7041     comm->partition_step = INT_MIN;
7042     dd->ddp_count        = 0;
7043
7044     clear_dd_cycle_counts(dd);
7045
7046     return dd;
7047 }
7048
7049 static void set_dlb_limits(gmx_domdec_t *dd)
7050
7051 {
7052     int d;
7053
7054     for (d = 0; d < dd->ndim; d++)
7055     {
7056         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
7057         dd->comm->cellsize_min[dd->dim[d]] =
7058             dd->comm->cellsize_min_dlb[dd->dim[d]];
7059     }
7060 }
7061
7062
7063 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
7064 {
7065     gmx_domdec_t      *dd;
7066     gmx_domdec_comm_t *comm;
7067     real               cellsize_min;
7068     int                d, nc, i;
7069     char               buf[STRLEN];
7070
7071     dd   = cr->dd;
7072     comm = dd->comm;
7073
7074     if (fplog)
7075     {
7076         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
7077     }
7078
7079     cellsize_min = comm->cellsize_min[dd->dim[0]];
7080     for (d = 1; d < dd->ndim; d++)
7081     {
7082         cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
7083     }
7084
7085     if (cellsize_min < comm->cellsize_limit*1.05)
7086     {
7087         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
7088
7089         /* Change DLB from "auto" to "no". */
7090         comm->eDLB = edlbNO;
7091
7092         return;
7093     }
7094
7095     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
7096     comm->bDynLoadBal = TRUE;
7097     dd->bGridJump     = TRUE;
7098
7099     set_dlb_limits(dd);
7100
7101     /* We can set the required cell size info here,
7102      * so we do not need to communicate this.
7103      * The grid is completely uniform.
7104      */
7105     for (d = 0; d < dd->ndim; d++)
7106     {
7107         if (comm->root[d])
7108         {
7109             comm->load[d].sum_m = comm->load[d].sum;
7110
7111             nc = dd->nc[dd->dim[d]];
7112             for (i = 0; i < nc; i++)
7113             {
7114                 comm->root[d]->cell_f[i]    = i/(real)nc;
7115                 if (d > 0)
7116                 {
7117                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
7118                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
7119                 }
7120             }
7121             comm->root[d]->cell_f[nc] = 1.0;
7122         }
7123     }
7124 }
7125
7126 static char *init_bLocalCG(gmx_mtop_t *mtop)
7127 {
7128     int   ncg, cg;
7129     char *bLocalCG;
7130
7131     ncg = ncg_mtop(mtop);
7132     snew(bLocalCG, ncg);
7133     for (cg = 0; cg < ncg; cg++)
7134     {
7135         bLocalCG[cg] = FALSE;
7136     }
7137
7138     return bLocalCG;
7139 }
7140
7141 void dd_init_bondeds(FILE *fplog,
7142                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
7143                      gmx_vsite_t *vsite,
7144                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
7145 {
7146     gmx_domdec_comm_t *comm;
7147     gmx_bool           bBondComm;
7148     int                d;
7149
7150     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
7151
7152     comm = dd->comm;
7153
7154     if (comm->bBondComm)
7155     {
7156         /* Communicate atoms beyond the cut-off for bonded interactions */
7157         comm = dd->comm;
7158
7159         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
7160
7161         comm->bLocalCG = init_bLocalCG(mtop);
7162     }
7163     else
7164     {
7165         /* Only communicate atoms based on cut-off */
7166         comm->cglink   = NULL;
7167         comm->bLocalCG = NULL;
7168     }
7169 }
7170
7171 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
7172                               t_inputrec *ir,
7173                               gmx_bool bDynLoadBal, real dlb_scale,
7174                               gmx_ddbox_t *ddbox)
7175 {
7176     gmx_domdec_comm_t *comm;
7177     int                d;
7178     ivec               np;
7179     real               limit, shrink;
7180     char               buf[64];
7181
7182     if (fplog == NULL)
7183     {
7184         return;
7185     }
7186
7187     comm = dd->comm;
7188
7189     if (bDynLoadBal)
7190     {
7191         fprintf(fplog, "The maximum number of communication pulses is:");
7192         for (d = 0; d < dd->ndim; d++)
7193         {
7194             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
7195         }
7196         fprintf(fplog, "\n");
7197         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
7198         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
7199         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
7200         for (d = 0; d < DIM; d++)
7201         {
7202             if (dd->nc[d] > 1)
7203             {
7204                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7205                 {
7206                     shrink = 0;
7207                 }
7208                 else
7209                 {
7210                     shrink =
7211                         comm->cellsize_min_dlb[d]/
7212                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7213                 }
7214                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
7215             }
7216         }
7217         fprintf(fplog, "\n");
7218     }
7219     else
7220     {
7221         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
7222         fprintf(fplog, "The initial number of communication pulses is:");
7223         for (d = 0; d < dd->ndim; d++)
7224         {
7225             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
7226         }
7227         fprintf(fplog, "\n");
7228         fprintf(fplog, "The initial domain decomposition cell size is:");
7229         for (d = 0; d < DIM; d++)
7230         {
7231             if (dd->nc[d] > 1)
7232             {
7233                 fprintf(fplog, " %c %.2f nm",
7234                         dim2char(d), dd->comm->cellsize_min[d]);
7235             }
7236         }
7237         fprintf(fplog, "\n\n");
7238     }
7239
7240     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7241     {
7242         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7243         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7244                 "non-bonded interactions", "", comm->cutoff);
7245
7246         if (bDynLoadBal)
7247         {
7248             limit = dd->comm->cellsize_limit;
7249         }
7250         else
7251         {
7252             if (dynamic_dd_box(ddbox, ir))
7253             {
7254                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7255             }
7256             limit = dd->comm->cellsize_min[XX];
7257             for (d = 1; d < DIM; d++)
7258             {
7259                 limit = min(limit, dd->comm->cellsize_min[d]);
7260             }
7261         }
7262
7263         if (comm->bInterCGBondeds)
7264         {
7265             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7266                     "two-body bonded interactions", "(-rdd)",
7267                     max(comm->cutoff, comm->cutoff_mbody));
7268             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7269                     "multi-body bonded interactions", "(-rdd)",
7270                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
7271         }
7272         if (dd->vsite_comm)
7273         {
7274             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7275                     "virtual site constructions", "(-rcon)", limit);
7276         }
7277         if (dd->constraint_comm)
7278         {
7279             sprintf(buf, "atoms separated by up to %d constraints",
7280                     1+ir->nProjOrder);
7281             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7282                     buf, "(-rcon)", limit);
7283         }
7284         fprintf(fplog, "\n");
7285     }
7286
7287     fflush(fplog);
7288 }
7289
7290 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7291                                 real               dlb_scale,
7292                                 const t_inputrec  *ir,
7293                                 const gmx_ddbox_t *ddbox)
7294 {
7295     gmx_domdec_comm_t *comm;
7296     int                d, dim, npulse, npulse_d_max, npulse_d;
7297     gmx_bool           bNoCutOff;
7298
7299     comm = dd->comm;
7300
7301     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7302
7303     /* Determine the maximum number of comm. pulses in one dimension */
7304
7305     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7306
7307     /* Determine the maximum required number of grid pulses */
7308     if (comm->cellsize_limit >= comm->cutoff)
7309     {
7310         /* Only a single pulse is required */
7311         npulse = 1;
7312     }
7313     else if (!bNoCutOff && comm->cellsize_limit > 0)
7314     {
7315         /* We round down slightly here to avoid overhead due to the latency
7316          * of extra communication calls when the cut-off
7317          * would be only slightly longer than the cell size.
7318          * Later cellsize_limit is redetermined,
7319          * so we can not miss interactions due to this rounding.
7320          */
7321         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7322     }
7323     else
7324     {
7325         /* There is no cell size limit */
7326         npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7327     }
7328
7329     if (!bNoCutOff && npulse > 1)
7330     {
7331         /* See if we can do with less pulses, based on dlb_scale */
7332         npulse_d_max = 0;
7333         for (d = 0; d < dd->ndim; d++)
7334         {
7335             dim      = dd->dim[d];
7336             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7337                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7338             npulse_d_max = max(npulse_d_max, npulse_d);
7339         }
7340         npulse = min(npulse, npulse_d_max);
7341     }
7342
7343     /* This env var can override npulse */
7344     d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7345     if (d > 0)
7346     {
7347         npulse = d;
7348     }
7349
7350     comm->maxpulse       = 1;
7351     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7352     for (d = 0; d < dd->ndim; d++)
7353     {
7354         comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
7355         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7356         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7357         comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
7358         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7359         {
7360             comm->bVacDLBNoLimit = FALSE;
7361         }
7362     }
7363
7364     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7365     if (!comm->bVacDLBNoLimit)
7366     {
7367         comm->cellsize_limit = max(comm->cellsize_limit,
7368                                    comm->cutoff/comm->maxpulse);
7369     }
7370     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7371     /* Set the minimum cell size for each DD dimension */
7372     for (d = 0; d < dd->ndim; d++)
7373     {
7374         if (comm->bVacDLBNoLimit ||
7375             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7376         {
7377             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7378         }
7379         else
7380         {
7381             comm->cellsize_min_dlb[dd->dim[d]] =
7382                 comm->cutoff/comm->cd[d].np_dlb;
7383         }
7384     }
7385     if (comm->cutoff_mbody <= 0)
7386     {
7387         comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
7388     }
7389     if (comm->bDynLoadBal)
7390     {
7391         set_dlb_limits(dd);
7392     }
7393 }
7394
7395 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7396 {
7397     /* If each molecule is a single charge group
7398      * or we use domain decomposition for each periodic dimension,
7399      * we do not need to take pbc into account for the bonded interactions.
7400      */
7401     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7402             !(dd->nc[XX] > 1 &&
7403               dd->nc[YY] > 1 &&
7404               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7405 }
7406
7407 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7408                        t_inputrec *ir, gmx_ddbox_t *ddbox)
7409 {
7410     gmx_domdec_comm_t *comm;
7411     int                natoms_tot;
7412     real               vol_frac;
7413
7414     comm = dd->comm;
7415
7416     /* Initialize the thread data.
7417      * This can not be done in init_domain_decomposition,
7418      * as the numbers of threads is determined later.
7419      */
7420     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7421     if (comm->nth > 1)
7422     {
7423         snew(comm->dth, comm->nth);
7424     }
7425
7426     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7427     {
7428         init_ddpme(dd, &comm->ddpme[0], 0);
7429         if (comm->npmedecompdim >= 2)
7430         {
7431             init_ddpme(dd, &comm->ddpme[1], 1);
7432         }
7433     }
7434     else
7435     {
7436         comm->npmenodes = 0;
7437         if (dd->pme_nodeid >= 0)
7438         {
7439             gmx_fatal_collective(FARGS, NULL, dd,
7440                                  "Can not have separate PME ranks without PME electrostatics");
7441         }
7442     }
7443
7444     if (debug)
7445     {
7446         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7447     }
7448     if (comm->eDLB != edlbNO)
7449     {
7450         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7451     }
7452
7453     print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
7454     if (comm->eDLB == edlbAUTO)
7455     {
7456         if (fplog)
7457         {
7458             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7459         }
7460         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7461     }
7462
7463     if (ir->ePBC == epbcNONE)
7464     {
7465         vol_frac = 1 - 1/(double)dd->nnodes;
7466     }
7467     else
7468     {
7469         vol_frac =
7470             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7471     }
7472     if (debug)
7473     {
7474         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7475     }
7476     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7477
7478     dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
7479 }
7480
7481 static gmx_bool test_dd_cutoff(t_commrec *cr,
7482                                t_state *state, t_inputrec *ir,
7483                                real cutoff_req)
7484 {
7485     gmx_domdec_t *dd;
7486     gmx_ddbox_t   ddbox;
7487     int           d, dim, np;
7488     real          inv_cell_size;
7489     int           LocallyLimited;
7490
7491     dd = cr->dd;
7492
7493     set_ddbox(dd, FALSE, cr, ir, state->box,
7494               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7495
7496     LocallyLimited = 0;
7497
7498     for (d = 0; d < dd->ndim; d++)
7499     {
7500         dim = dd->dim[d];
7501
7502         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7503         if (dynamic_dd_box(&ddbox, ir))
7504         {
7505             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7506         }
7507
7508         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7509
7510         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7511             dd->comm->cd[d].np_dlb > 0)
7512         {
7513             if (np > dd->comm->cd[d].np_dlb)
7514             {
7515                 return FALSE;
7516             }
7517
7518             /* If a current local cell size is smaller than the requested
7519              * cut-off, we could still fix it, but this gets very complicated.
7520              * Without fixing here, we might actually need more checks.
7521              */
7522             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7523             {
7524                 LocallyLimited = 1;
7525             }
7526         }
7527     }
7528
7529     if (dd->comm->eDLB != edlbNO)
7530     {
7531         /* If DLB is not active yet, we don't need to check the grid jumps.
7532          * Actually we shouldn't, because then the grid jump data is not set.
7533          */
7534         if (dd->comm->bDynLoadBal &&
7535             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7536         {
7537             LocallyLimited = 1;
7538         }
7539
7540         gmx_sumi(1, &LocallyLimited, cr);
7541
7542         if (LocallyLimited > 0)
7543         {
7544             return FALSE;
7545         }
7546     }
7547
7548     return TRUE;
7549 }
7550
7551 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7552                           real cutoff_req)
7553 {
7554     gmx_bool bCutoffAllowed;
7555
7556     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7557
7558     if (bCutoffAllowed)
7559     {
7560         cr->dd->comm->cutoff = cutoff_req;
7561     }
7562
7563     return bCutoffAllowed;
7564 }
7565
7566 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7567 {
7568     gmx_domdec_comm_t *comm;
7569
7570     comm = cr->dd->comm;
7571
7572     /* Turn on the DLB limiting (might have been on already) */
7573     comm->bPMELoadBalDLBLimits = TRUE;
7574
7575     /* Change the cut-off limit */
7576     comm->PMELoadBal_max_cutoff = comm->cutoff;
7577 }
7578
7579 static void merge_cg_buffers(int ncell,
7580                              gmx_domdec_comm_dim_t *cd, int pulse,
7581                              int  *ncg_cell,
7582                              int  *index_gl, int  *recv_i,
7583                              rvec *cg_cm,    rvec *recv_vr,
7584                              int *cgindex,
7585                              cginfo_mb_t *cginfo_mb, int *cginfo)
7586 {
7587     gmx_domdec_ind_t *ind, *ind_p;
7588     int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
7589     int               shift, shift_at;
7590
7591     ind = &cd->ind[pulse];
7592
7593     /* First correct the already stored data */
7594     shift = ind->nrecv[ncell];
7595     for (cell = ncell-1; cell >= 0; cell--)
7596     {
7597         shift -= ind->nrecv[cell];
7598         if (shift > 0)
7599         {
7600             /* Move the cg's present from previous grid pulses */
7601             cg0                = ncg_cell[ncell+cell];
7602             cg1                = ncg_cell[ncell+cell+1];
7603             cgindex[cg1+shift] = cgindex[cg1];
7604             for (cg = cg1-1; cg >= cg0; cg--)
7605             {
7606                 index_gl[cg+shift] = index_gl[cg];
7607                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7608                 cgindex[cg+shift] = cgindex[cg];
7609                 cginfo[cg+shift]  = cginfo[cg];
7610             }
7611             /* Correct the already stored send indices for the shift */
7612             for (p = 1; p <= pulse; p++)
7613             {
7614                 ind_p = &cd->ind[p];
7615                 cg0   = 0;
7616                 for (c = 0; c < cell; c++)
7617                 {
7618                     cg0 += ind_p->nsend[c];
7619                 }
7620                 cg1 = cg0 + ind_p->nsend[cell];
7621                 for (cg = cg0; cg < cg1; cg++)
7622                 {
7623                     ind_p->index[cg] += shift;
7624                 }
7625             }
7626         }
7627     }
7628
7629     /* Merge in the communicated buffers */
7630     shift    = 0;
7631     shift_at = 0;
7632     cg0      = 0;
7633     for (cell = 0; cell < ncell; cell++)
7634     {
7635         cg1 = ncg_cell[ncell+cell+1] + shift;
7636         if (shift_at > 0)
7637         {
7638             /* Correct the old cg indices */
7639             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7640             {
7641                 cgindex[cg+1] += shift_at;
7642             }
7643         }
7644         for (cg = 0; cg < ind->nrecv[cell]; cg++)
7645         {
7646             /* Copy this charge group from the buffer */
7647             index_gl[cg1] = recv_i[cg0];
7648             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7649             /* Add it to the cgindex */
7650             cg_gl          = index_gl[cg1];
7651             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
7652             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
7653             cgindex[cg1+1] = cgindex[cg1] + nat;
7654             cg0++;
7655             cg1++;
7656             shift_at += nat;
7657         }
7658         shift                 += ind->nrecv[cell];
7659         ncg_cell[ncell+cell+1] = cg1;
7660     }
7661 }
7662
7663 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7664                                int nzone, int cg0, const int *cgindex)
7665 {
7666     int cg, zone, p;
7667
7668     /* Store the atom block boundaries for easy copying of communication buffers
7669      */
7670     cg = cg0;
7671     for (zone = 0; zone < nzone; zone++)
7672     {
7673         for (p = 0; p < cd->np; p++)
7674         {
7675             cd->ind[p].cell2at0[zone] = cgindex[cg];
7676             cg += cd->ind[p].nrecv[zone];
7677             cd->ind[p].cell2at1[zone] = cgindex[cg];
7678         }
7679     }
7680 }
7681
7682 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7683 {
7684     int      i;
7685     gmx_bool bMiss;
7686
7687     bMiss = FALSE;
7688     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7689     {
7690         if (!bLocalCG[link->a[i]])
7691         {
7692             bMiss = TRUE;
7693         }
7694     }
7695
7696     return bMiss;
7697 }
7698
7699 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7700 typedef struct {
7701     real c[DIM][4]; /* the corners for the non-bonded communication */
7702     real cr0;       /* corner for rounding */
7703     real cr1[4];    /* corners for rounding */
7704     real bc[DIM];   /* corners for bounded communication */
7705     real bcr1;      /* corner for rounding for bonded communication */
7706 } dd_corners_t;
7707
7708 /* Determine the corners of the domain(s) we are communicating with */
7709 static void
7710 set_dd_corners(const gmx_domdec_t *dd,
7711                int dim0, int dim1, int dim2,
7712                gmx_bool bDistMB,
7713                dd_corners_t *c)
7714 {
7715     const gmx_domdec_comm_t  *comm;
7716     const gmx_domdec_zones_t *zones;
7717     int i, j;
7718
7719     comm = dd->comm;
7720
7721     zones = &comm->zones;
7722
7723     /* Keep the compiler happy */
7724     c->cr0  = 0;
7725     c->bcr1 = 0;
7726
7727     /* The first dimension is equal for all cells */
7728     c->c[0][0] = comm->cell_x0[dim0];
7729     if (bDistMB)
7730     {
7731         c->bc[0] = c->c[0][0];
7732     }
7733     if (dd->ndim >= 2)
7734     {
7735         dim1 = dd->dim[1];
7736         /* This cell row is only seen from the first row */
7737         c->c[1][0] = comm->cell_x0[dim1];
7738         /* All rows can see this row */
7739         c->c[1][1] = comm->cell_x0[dim1];
7740         if (dd->bGridJump)
7741         {
7742             c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7743             if (bDistMB)
7744             {
7745                 /* For the multi-body distance we need the maximum */
7746                 c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7747             }
7748         }
7749         /* Set the upper-right corner for rounding */
7750         c->cr0 = comm->cell_x1[dim0];
7751
7752         if (dd->ndim >= 3)
7753         {
7754             dim2 = dd->dim[2];
7755             for (j = 0; j < 4; j++)
7756             {
7757                 c->c[2][j] = comm->cell_x0[dim2];
7758             }
7759             if (dd->bGridJump)
7760             {
7761                 /* Use the maximum of the i-cells that see a j-cell */
7762                 for (i = 0; i < zones->nizone; i++)
7763                 {
7764                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7765                     {
7766                         if (j >= 4)
7767                         {
7768                             c->c[2][j-4] =
7769                                 max(c->c[2][j-4],
7770                                     comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7771                         }
7772                     }
7773                 }
7774                 if (bDistMB)
7775                 {
7776                     /* For the multi-body distance we need the maximum */
7777                     c->bc[2] = comm->cell_x0[dim2];
7778                     for (i = 0; i < 2; i++)
7779                     {
7780                         for (j = 0; j < 2; j++)
7781                         {
7782                             c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
7783                         }
7784                     }
7785                 }
7786             }
7787
7788             /* Set the upper-right corner for rounding */
7789             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7790              * Only cell (0,0,0) can see cell 7 (1,1,1)
7791              */
7792             c->cr1[0] = comm->cell_x1[dim1];
7793             c->cr1[3] = comm->cell_x1[dim1];
7794             if (dd->bGridJump)
7795             {
7796                 c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7797                 if (bDistMB)
7798                 {
7799                     /* For the multi-body distance we need the maximum */
7800                     c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7801                 }
7802             }
7803         }
7804     }
7805 }
7806
7807 /* Determine which cg's we need to send in this pulse from this zone */
7808 static void
7809 get_zone_pulse_cgs(gmx_domdec_t *dd,
7810                    int zonei, int zone,
7811                    int cg0, int cg1,
7812                    const int *index_gl,
7813                    const int *cgindex,
7814                    int dim, int dim_ind,
7815                    int dim0, int dim1, int dim2,
7816                    real r_comm2, real r_bcomm2,
7817                    matrix box,
7818                    ivec tric_dist,
7819                    rvec *normal,
7820                    real skew_fac2_d, real skew_fac_01,
7821                    rvec *v_d, rvec *v_0, rvec *v_1,
7822                    const dd_corners_t *c,
7823                    rvec sf2_round,
7824                    gmx_bool bDistBonded,
7825                    gmx_bool bBondComm,
7826                    gmx_bool bDist2B,
7827                    gmx_bool bDistMB,
7828                    rvec *cg_cm,
7829                    int *cginfo,
7830                    gmx_domdec_ind_t *ind,
7831                    int **ibuf, int *ibuf_nalloc,
7832                    vec_rvec_t *vbuf,
7833                    int *nsend_ptr,
7834                    int *nat_ptr,
7835                    int *nsend_z_ptr)
7836 {
7837     gmx_domdec_comm_t *comm;
7838     gmx_bool           bScrew;
7839     gmx_bool           bDistMB_pulse;
7840     int                cg, i;
7841     real               r2, rb2, r, tric_sh;
7842     rvec               rn, rb;
7843     int                dimd;
7844     int                nsend_z, nsend, nat;
7845
7846     comm = dd->comm;
7847
7848     bScrew = (dd->bScrewPBC && dim == XX);
7849
7850     bDistMB_pulse = (bDistMB && bDistBonded);
7851
7852     nsend_z = 0;
7853     nsend   = *nsend_ptr;
7854     nat     = *nat_ptr;
7855
7856     for (cg = cg0; cg < cg1; cg++)
7857     {
7858         r2  = 0;
7859         rb2 = 0;
7860         if (tric_dist[dim_ind] == 0)
7861         {
7862             /* Rectangular direction, easy */
7863             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7864             if (r > 0)
7865             {
7866                 r2 += r*r;
7867             }
7868             if (bDistMB_pulse)
7869             {
7870                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7871                 if (r > 0)
7872                 {
7873                     rb2 += r*r;
7874                 }
7875             }
7876             /* Rounding gives at most a 16% reduction
7877              * in communicated atoms
7878              */
7879             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7880             {
7881                 r = cg_cm[cg][dim0] - c->cr0;
7882                 /* This is the first dimension, so always r >= 0 */
7883                 r2 += r*r;
7884                 if (bDistMB_pulse)
7885                 {
7886                     rb2 += r*r;
7887                 }
7888             }
7889             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7890             {
7891                 r = cg_cm[cg][dim1] - c->cr1[zone];
7892                 if (r > 0)
7893                 {
7894                     r2 += r*r;
7895                 }
7896                 if (bDistMB_pulse)
7897                 {
7898                     r = cg_cm[cg][dim1] - c->bcr1;
7899                     if (r > 0)
7900                     {
7901                         rb2 += r*r;
7902                     }
7903                 }
7904             }
7905         }
7906         else
7907         {
7908             /* Triclinic direction, more complicated */
7909             clear_rvec(rn);
7910             clear_rvec(rb);
7911             /* Rounding, conservative as the skew_fac multiplication
7912              * will slightly underestimate the distance.
7913              */
7914             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7915             {
7916                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7917                 for (i = dim0+1; i < DIM; i++)
7918                 {
7919                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7920                 }
7921                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7922                 if (bDistMB_pulse)
7923                 {
7924                     rb[dim0] = rn[dim0];
7925                     rb2      = r2;
7926                 }
7927                 /* Take care that the cell planes along dim0 might not
7928                  * be orthogonal to those along dim1 and dim2.
7929                  */
7930                 for (i = 1; i <= dim_ind; i++)
7931                 {
7932                     dimd = dd->dim[i];
7933                     if (normal[dim0][dimd] > 0)
7934                     {
7935                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7936                         if (bDistMB_pulse)
7937                         {
7938                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7939                         }
7940                     }
7941                 }
7942             }
7943             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7944             {
7945                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7946                 tric_sh   = 0;
7947                 for (i = dim1+1; i < DIM; i++)
7948                 {
7949                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7950                 }
7951                 rn[dim1] += tric_sh;
7952                 if (rn[dim1] > 0)
7953                 {
7954                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7955                     /* Take care of coupling of the distances
7956                      * to the planes along dim0 and dim1 through dim2.
7957                      */
7958                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7959                     /* Take care that the cell planes along dim1
7960                      * might not be orthogonal to that along dim2.
7961                      */
7962                     if (normal[dim1][dim2] > 0)
7963                     {
7964                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7965                     }
7966                 }
7967                 if (bDistMB_pulse)
7968                 {
7969                     rb[dim1] +=
7970                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7971                     if (rb[dim1] > 0)
7972                     {
7973                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
7974                         /* Take care of coupling of the distances
7975                          * to the planes along dim0 and dim1 through dim2.
7976                          */
7977                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
7978                         /* Take care that the cell planes along dim1
7979                          * might not be orthogonal to that along dim2.
7980                          */
7981                         if (normal[dim1][dim2] > 0)
7982                         {
7983                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
7984                         }
7985                     }
7986                 }
7987             }
7988             /* The distance along the communication direction */
7989             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
7990             tric_sh  = 0;
7991             for (i = dim+1; i < DIM; i++)
7992             {
7993                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
7994             }
7995             rn[dim] += tric_sh;
7996             if (rn[dim] > 0)
7997             {
7998                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
7999                 /* Take care of coupling of the distances
8000                  * to the planes along dim0 and dim1 through dim2.
8001                  */
8002                 if (dim_ind == 1 && zonei == 1)
8003                 {
8004                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
8005                 }
8006             }
8007             if (bDistMB_pulse)
8008             {
8009                 clear_rvec(rb);
8010                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
8011                 if (rb[dim] > 0)
8012                 {
8013                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
8014                     /* Take care of coupling of the distances
8015                      * to the planes along dim0 and dim1 through dim2.
8016                      */
8017                     if (dim_ind == 1 && zonei == 1)
8018                     {
8019                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
8020                     }
8021                 }
8022             }
8023         }
8024
8025         if (r2 < r_comm2 ||
8026             (bDistBonded &&
8027              ((bDistMB && rb2 < r_bcomm2) ||
8028               (bDist2B && r2  < r_bcomm2)) &&
8029              (!bBondComm ||
8030               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
8031                missing_link(comm->cglink, index_gl[cg],
8032                             comm->bLocalCG)))))
8033         {
8034             /* Make an index to the local charge groups */
8035             if (nsend+1 > ind->nalloc)
8036             {
8037                 ind->nalloc = over_alloc_large(nsend+1);
8038                 srenew(ind->index, ind->nalloc);
8039             }
8040             if (nsend+1 > *ibuf_nalloc)
8041             {
8042                 *ibuf_nalloc = over_alloc_large(nsend+1);
8043                 srenew(*ibuf, *ibuf_nalloc);
8044             }
8045             ind->index[nsend] = cg;
8046             (*ibuf)[nsend]    = index_gl[cg];
8047             nsend_z++;
8048             vec_rvec_check_alloc(vbuf, nsend+1);
8049
8050             if (dd->ci[dim] == 0)
8051             {
8052                 /* Correct cg_cm for pbc */
8053                 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
8054                 if (bScrew)
8055                 {
8056                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
8057                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
8058                 }
8059             }
8060             else
8061             {
8062                 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
8063             }
8064             nsend++;
8065             nat += cgindex[cg+1] - cgindex[cg];
8066         }
8067     }
8068
8069     *nsend_ptr   = nsend;
8070     *nat_ptr     = nat;
8071     *nsend_z_ptr = nsend_z;
8072 }
8073
8074 static void setup_dd_communication(gmx_domdec_t *dd,
8075                                    matrix box, gmx_ddbox_t *ddbox,
8076                                    t_forcerec *fr, t_state *state, rvec **f)
8077 {
8078     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
8079     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
8080     int                    c, i, j, cg, cg_gl, nrcg;
8081     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
8082     gmx_domdec_comm_t     *comm;
8083     gmx_domdec_zones_t    *zones;
8084     gmx_domdec_comm_dim_t *cd;
8085     gmx_domdec_ind_t      *ind;
8086     cginfo_mb_t           *cginfo_mb;
8087     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
8088     real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
8089     dd_corners_t           corners;
8090     ivec                   tric_dist;
8091     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
8092     real                   skew_fac2_d, skew_fac_01;
8093     rvec                   sf2_round;
8094     int                    nsend, nat;
8095     int                    th;
8096
8097     if (debug)
8098     {
8099         fprintf(debug, "Setting up DD communication\n");
8100     }
8101
8102     comm  = dd->comm;
8103
8104     switch (fr->cutoff_scheme)
8105     {
8106         case ecutsGROUP:
8107             cg_cm = fr->cg_cm;
8108             break;
8109         case ecutsVERLET:
8110             cg_cm = state->x;
8111             break;
8112         default:
8113             gmx_incons("unimplemented");
8114             cg_cm = NULL;
8115     }
8116
8117     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8118     {
8119         dim = dd->dim[dim_ind];
8120
8121         /* Check if we need to use triclinic distances */
8122         tric_dist[dim_ind] = 0;
8123         for (i = 0; i <= dim_ind; i++)
8124         {
8125             if (ddbox->tric_dir[dd->dim[i]])
8126             {
8127                 tric_dist[dim_ind] = 1;
8128             }
8129         }
8130     }
8131
8132     bBondComm = comm->bBondComm;
8133
8134     /* Do we need to determine extra distances for multi-body bondeds? */
8135     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8136
8137     /* Do we need to determine extra distances for only two-body bondeds? */
8138     bDist2B = (bBondComm && !bDistMB);
8139
8140     r_comm2  = sqr(comm->cutoff);
8141     r_bcomm2 = sqr(comm->cutoff_mbody);
8142
8143     if (debug)
8144     {
8145         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
8146     }
8147
8148     zones = &comm->zones;
8149
8150     dim0 = dd->dim[0];
8151     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8152     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8153
8154     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8155
8156     /* Triclinic stuff */
8157     normal      = ddbox->normal;
8158     skew_fac_01 = 0;
8159     if (dd->ndim >= 2)
8160     {
8161         v_0 = ddbox->v[dim0];
8162         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8163         {
8164             /* Determine the coupling coefficient for the distances
8165              * to the cell planes along dim0 and dim1 through dim2.
8166              * This is required for correct rounding.
8167              */
8168             skew_fac_01 =
8169                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8170             if (debug)
8171             {
8172                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8173             }
8174         }
8175     }
8176     if (dd->ndim >= 3)
8177     {
8178         v_1 = ddbox->v[dim1];
8179     }
8180
8181     zone_cg_range = zones->cg_range;
8182     index_gl      = dd->index_gl;
8183     cgindex       = dd->cgindex;
8184     cginfo_mb     = fr->cginfo_mb;
8185
8186     zone_cg_range[0]   = 0;
8187     zone_cg_range[1]   = dd->ncg_home;
8188     comm->zone_ncg1[0] = dd->ncg_home;
8189     pos_cg             = dd->ncg_home;
8190
8191     nat_tot = dd->nat_home;
8192     nzone   = 1;
8193     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8194     {
8195         dim = dd->dim[dim_ind];
8196         cd  = &comm->cd[dim_ind];
8197
8198         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8199         {
8200             /* No pbc in this dimension, the first node should not comm. */
8201             nzone_send = 0;
8202         }
8203         else
8204         {
8205             nzone_send = nzone;
8206         }
8207
8208         v_d         = ddbox->v[dim];
8209         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8210
8211         cd->bInPlace = TRUE;
8212         for (p = 0; p < cd->np; p++)
8213         {
8214             /* Only atoms communicated in the first pulse are used
8215              * for multi-body bonded interactions or for bBondComm.
8216              */
8217             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8218
8219             ind   = &cd->ind[p];
8220             nsend = 0;
8221             nat   = 0;
8222             for (zone = 0; zone < nzone_send; zone++)
8223             {
8224                 if (tric_dist[dim_ind] && dim_ind > 0)
8225                 {
8226                     /* Determine slightly more optimized skew_fac's
8227                      * for rounding.
8228                      * This reduces the number of communicated atoms
8229                      * by about 10% for 3D DD of rhombic dodecahedra.
8230                      */
8231                     for (dimd = 0; dimd < dim; dimd++)
8232                     {
8233                         sf2_round[dimd] = 1;
8234                         if (ddbox->tric_dir[dimd])
8235                         {
8236                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8237                             {
8238                                 /* If we are shifted in dimension i
8239                                  * and the cell plane is tilted forward
8240                                  * in dimension i, skip this coupling.
8241                                  */
8242                                 if (!(zones->shift[nzone+zone][i] &&
8243                                       ddbox->v[dimd][i][dimd] >= 0))
8244                                 {
8245                                     sf2_round[dimd] +=
8246                                         sqr(ddbox->v[dimd][i][dimd]);
8247                                 }
8248                             }
8249                             sf2_round[dimd] = 1/sf2_round[dimd];
8250                         }
8251                     }
8252                 }
8253
8254                 zonei = zone_perm[dim_ind][zone];
8255                 if (p == 0)
8256                 {
8257                     /* Here we permutate the zones to obtain a convenient order
8258                      * for neighbor searching
8259                      */
8260                     cg0 = zone_cg_range[zonei];
8261                     cg1 = zone_cg_range[zonei+1];
8262                 }
8263                 else
8264                 {
8265                     /* Look only at the cg's received in the previous grid pulse
8266                      */
8267                     cg1 = zone_cg_range[nzone+zone+1];
8268                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8269                 }
8270
8271 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8272                 for (th = 0; th < comm->nth; th++)
8273                 {
8274                     gmx_domdec_ind_t *ind_p;
8275                     int             **ibuf_p, *ibuf_nalloc_p;
8276                     vec_rvec_t       *vbuf_p;
8277                     int              *nsend_p, *nat_p;
8278                     int              *nsend_zone_p;
8279                     int               cg0_th, cg1_th;
8280
8281                     if (th == 0)
8282                     {
8283                         /* Thread 0 writes in the comm buffers */
8284                         ind_p         = ind;
8285                         ibuf_p        = &comm->buf_int;
8286                         ibuf_nalloc_p = &comm->nalloc_int;
8287                         vbuf_p        = &comm->vbuf;
8288                         nsend_p       = &nsend;
8289                         nat_p         = &nat;
8290                         nsend_zone_p  = &ind->nsend[zone];
8291                     }
8292                     else
8293                     {
8294                         /* Other threads write into temp buffers */
8295                         ind_p         = &comm->dth[th].ind;
8296                         ibuf_p        = &comm->dth[th].ibuf;
8297                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8298                         vbuf_p        = &comm->dth[th].vbuf;
8299                         nsend_p       = &comm->dth[th].nsend;
8300                         nat_p         = &comm->dth[th].nat;
8301                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8302
8303                         comm->dth[th].nsend      = 0;
8304                         comm->dth[th].nat        = 0;
8305                         comm->dth[th].nsend_zone = 0;
8306                     }
8307
8308                     if (comm->nth == 1)
8309                     {
8310                         cg0_th = cg0;
8311                         cg1_th = cg1;
8312                     }
8313                     else
8314                     {
8315                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8316                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8317                     }
8318
8319                     /* Get the cg's for this pulse in this zone */
8320                     get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8321                                        index_gl, cgindex,
8322                                        dim, dim_ind, dim0, dim1, dim2,
8323                                        r_comm2, r_bcomm2,
8324                                        box, tric_dist,
8325                                        normal, skew_fac2_d, skew_fac_01,
8326                                        v_d, v_0, v_1, &corners, sf2_round,
8327                                        bDistBonded, bBondComm,
8328                                        bDist2B, bDistMB,
8329                                        cg_cm, fr->cginfo,
8330                                        ind_p,
8331                                        ibuf_p, ibuf_nalloc_p,
8332                                        vbuf_p,
8333                                        nsend_p, nat_p,
8334                                        nsend_zone_p);
8335                 }
8336
8337                 /* Append data of threads>=1 to the communication buffers */
8338                 for (th = 1; th < comm->nth; th++)
8339                 {
8340                     dd_comm_setup_work_t *dth;
8341                     int                   i, ns1;
8342
8343                     dth = &comm->dth[th];
8344
8345                     ns1 = nsend + dth->nsend_zone;
8346                     if (ns1 > ind->nalloc)
8347                     {
8348                         ind->nalloc = over_alloc_dd(ns1);
8349                         srenew(ind->index, ind->nalloc);
8350                     }
8351                     if (ns1 > comm->nalloc_int)
8352                     {
8353                         comm->nalloc_int = over_alloc_dd(ns1);
8354                         srenew(comm->buf_int, comm->nalloc_int);
8355                     }
8356                     if (ns1 > comm->vbuf.nalloc)
8357                     {
8358                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8359                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8360                     }
8361
8362                     for (i = 0; i < dth->nsend_zone; i++)
8363                     {
8364                         ind->index[nsend]    = dth->ind.index[i];
8365                         comm->buf_int[nsend] = dth->ibuf[i];
8366                         copy_rvec(dth->vbuf.v[i],
8367                                   comm->vbuf.v[nsend]);
8368                         nsend++;
8369                     }
8370                     nat              += dth->nat;
8371                     ind->nsend[zone] += dth->nsend_zone;
8372                 }
8373             }
8374             /* Clear the counts in case we do not have pbc */
8375             for (zone = nzone_send; zone < nzone; zone++)
8376             {
8377                 ind->nsend[zone] = 0;
8378             }
8379             ind->nsend[nzone]   = nsend;
8380             ind->nsend[nzone+1] = nat;
8381             /* Communicate the number of cg's and atoms to receive */
8382             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8383                             ind->nsend, nzone+2,
8384                             ind->nrecv, nzone+2);
8385
8386             /* The rvec buffer is also required for atom buffers of size nsend
8387              * in dd_move_x and dd_move_f.
8388              */
8389             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8390
8391             if (p > 0)
8392             {
8393                 /* We can receive in place if only the last zone is not empty */
8394                 for (zone = 0; zone < nzone-1; zone++)
8395                 {
8396                     if (ind->nrecv[zone] > 0)
8397                     {
8398                         cd->bInPlace = FALSE;
8399                     }
8400                 }
8401                 if (!cd->bInPlace)
8402                 {
8403                     /* The int buffer is only required here for the cg indices */
8404                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8405                     {
8406                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8407                         srenew(comm->buf_int2, comm->nalloc_int2);
8408                     }
8409                     /* The rvec buffer is also required for atom buffers
8410                      * of size nrecv in dd_move_x and dd_move_f.
8411                      */
8412                     i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8413                     vec_rvec_check_alloc(&comm->vbuf2, i);
8414                 }
8415             }
8416
8417             /* Make space for the global cg indices */
8418             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8419                 || dd->cg_nalloc == 0)
8420             {
8421                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8422                 srenew(index_gl, dd->cg_nalloc);
8423                 srenew(cgindex, dd->cg_nalloc+1);
8424             }
8425             /* Communicate the global cg indices */
8426             if (cd->bInPlace)
8427             {
8428                 recv_i = index_gl + pos_cg;
8429             }
8430             else
8431             {
8432                 recv_i = comm->buf_int2;
8433             }
8434             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8435                             comm->buf_int, nsend,
8436                             recv_i,        ind->nrecv[nzone]);
8437
8438             /* Make space for cg_cm */
8439             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8440             if (fr->cutoff_scheme == ecutsGROUP)
8441             {
8442                 cg_cm = fr->cg_cm;
8443             }
8444             else
8445             {
8446                 cg_cm = state->x;
8447             }
8448             /* Communicate cg_cm */
8449             if (cd->bInPlace)
8450             {
8451                 recv_vr = cg_cm + pos_cg;
8452             }
8453             else
8454             {
8455                 recv_vr = comm->vbuf2.v;
8456             }
8457             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8458                              comm->vbuf.v, nsend,
8459                              recv_vr,      ind->nrecv[nzone]);
8460
8461             /* Make the charge group index */
8462             if (cd->bInPlace)
8463             {
8464                 zone = (p == 0 ? 0 : nzone - 1);
8465                 while (zone < nzone)
8466                 {
8467                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8468                     {
8469                         cg_gl              = index_gl[pos_cg];
8470                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8471                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8472                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8473                         if (bBondComm)
8474                         {
8475                             /* Update the charge group presence,
8476                              * so we can use it in the next pass of the loop.
8477                              */
8478                             comm->bLocalCG[cg_gl] = TRUE;
8479                         }
8480                         pos_cg++;
8481                     }
8482                     if (p == 0)
8483                     {
8484                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8485                     }
8486                     zone++;
8487                     zone_cg_range[nzone+zone] = pos_cg;
8488                 }
8489             }
8490             else
8491             {
8492                 /* This part of the code is never executed with bBondComm. */
8493                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8494                                  index_gl, recv_i, cg_cm, recv_vr,
8495                                  cgindex, fr->cginfo_mb, fr->cginfo);
8496                 pos_cg += ind->nrecv[nzone];
8497             }
8498             nat_tot += ind->nrecv[nzone+1];
8499         }
8500         if (!cd->bInPlace)
8501         {
8502             /* Store the atom block for easy copying of communication buffers */
8503             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8504         }
8505         nzone += nzone;
8506     }
8507     dd->index_gl = index_gl;
8508     dd->cgindex  = cgindex;
8509
8510     dd->ncg_tot          = zone_cg_range[zones->n];
8511     dd->nat_tot          = nat_tot;
8512     comm->nat[ddnatHOME] = dd->nat_home;
8513     for (i = ddnatZONE; i < ddnatNR; i++)
8514     {
8515         comm->nat[i] = dd->nat_tot;
8516     }
8517
8518     if (!bBondComm)
8519     {
8520         /* We don't need to update cginfo, since that was alrady done above.
8521          * So we pass NULL for the forcerec.
8522          */
8523         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8524                       NULL, comm->bLocalCG);
8525     }
8526
8527     if (debug)
8528     {
8529         fprintf(debug, "Finished setting up DD communication, zones:");
8530         for (c = 0; c < zones->n; c++)
8531         {
8532             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8533         }
8534         fprintf(debug, "\n");
8535     }
8536 }
8537
8538 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8539 {
8540     int c;
8541
8542     for (c = 0; c < zones->nizone; c++)
8543     {
8544         zones->izone[c].cg1  = zones->cg_range[c+1];
8545         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8546         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8547     }
8548 }
8549
8550 static void set_zones_size(gmx_domdec_t *dd,
8551                            matrix box, const gmx_ddbox_t *ddbox,
8552                            int zone_start, int zone_end)
8553 {
8554     gmx_domdec_comm_t  *comm;
8555     gmx_domdec_zones_t *zones;
8556     gmx_bool            bDistMB;
8557     int                 z, zi, zj0, zj1, d, dim;
8558     real                rcs, rcmbs;
8559     int                 i, j;
8560     real                size_j, add_tric;
8561     real                vol;
8562
8563     comm = dd->comm;
8564
8565     zones = &comm->zones;
8566
8567     /* Do we need to determine extra distances for multi-body bondeds? */
8568     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8569
8570     for (z = zone_start; z < zone_end; z++)
8571     {
8572         /* Copy cell limits to zone limits.
8573          * Valid for non-DD dims and non-shifted dims.
8574          */
8575         copy_rvec(comm->cell_x0, zones->size[z].x0);
8576         copy_rvec(comm->cell_x1, zones->size[z].x1);
8577     }
8578
8579     for (d = 0; d < dd->ndim; d++)
8580     {
8581         dim = dd->dim[d];
8582
8583         for (z = 0; z < zones->n; z++)
8584         {
8585             /* With a staggered grid we have different sizes
8586              * for non-shifted dimensions.
8587              */
8588             if (dd->bGridJump && zones->shift[z][dim] == 0)
8589             {
8590                 if (d == 1)
8591                 {
8592                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8593                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8594                 }
8595                 else if (d == 2)
8596                 {
8597                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8598                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8599                 }
8600             }
8601         }
8602
8603         rcs   = comm->cutoff;
8604         rcmbs = comm->cutoff_mbody;
8605         if (ddbox->tric_dir[dim])
8606         {
8607             rcs   /= ddbox->skew_fac[dim];
8608             rcmbs /= ddbox->skew_fac[dim];
8609         }
8610
8611         /* Set the lower limit for the shifted zone dimensions */
8612         for (z = zone_start; z < zone_end; z++)
8613         {
8614             if (zones->shift[z][dim] > 0)
8615             {
8616                 dim = dd->dim[d];
8617                 if (!dd->bGridJump || d == 0)
8618                 {
8619                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8620                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8621                 }
8622                 else
8623                 {
8624                     /* Here we take the lower limit of the zone from
8625                      * the lowest domain of the zone below.
8626                      */
8627                     if (z < 4)
8628                     {
8629                         zones->size[z].x0[dim] =
8630                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8631                     }
8632                     else
8633                     {
8634                         if (d == 1)
8635                         {
8636                             zones->size[z].x0[dim] =
8637                                 zones->size[zone_perm[2][z-4]].x0[dim];
8638                         }
8639                         else
8640                         {
8641                             zones->size[z].x0[dim] =
8642                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8643                         }
8644                     }
8645                     /* A temporary limit, is updated below */
8646                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8647
8648                     if (bDistMB)
8649                     {
8650                         for (zi = 0; zi < zones->nizone; zi++)
8651                         {
8652                             if (zones->shift[zi][dim] == 0)
8653                             {
8654                                 /* This takes the whole zone into account.
8655                                  * With multiple pulses this will lead
8656                                  * to a larger zone then strictly necessary.
8657                                  */
8658                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8659                                                              zones->size[zi].x1[dim]+rcmbs);
8660                             }
8661                         }
8662                     }
8663                 }
8664             }
8665         }
8666
8667         /* Loop over the i-zones to set the upper limit of each
8668          * j-zone they see.
8669          */
8670         for (zi = 0; zi < zones->nizone; zi++)
8671         {
8672             if (zones->shift[zi][dim] == 0)
8673             {
8674                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8675                 {
8676                     if (zones->shift[z][dim] > 0)
8677                     {
8678                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8679                                                      zones->size[zi].x1[dim]+rcs);
8680                     }
8681                 }
8682             }
8683         }
8684     }
8685
8686     for (z = zone_start; z < zone_end; z++)
8687     {
8688         /* Initialization only required to keep the compiler happy */
8689         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8690         int  nc, c;
8691
8692         /* To determine the bounding box for a zone we need to find
8693          * the extreme corners of 4, 2 or 1 corners.
8694          */
8695         nc = 1 << (ddbox->npbcdim - 1);
8696
8697         for (c = 0; c < nc; c++)
8698         {
8699             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8700             corner[XX] = 0;
8701             if ((c & 1) == 0)
8702             {
8703                 corner[YY] = zones->size[z].x0[YY];
8704             }
8705             else
8706             {
8707                 corner[YY] = zones->size[z].x1[YY];
8708             }
8709             if ((c & 2) == 0)
8710             {
8711                 corner[ZZ] = zones->size[z].x0[ZZ];
8712             }
8713             else
8714             {
8715                 corner[ZZ] = zones->size[z].x1[ZZ];
8716             }
8717             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8718             {
8719                 /* With 1D domain decomposition the cg's are not in
8720                  * the triclinic box, but triclinic x-y and rectangular y-z.
8721                  * Shift y back, so it will later end up at 0.
8722                  */
8723                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8724             }
8725             /* Apply the triclinic couplings */
8726             assert(ddbox->npbcdim <= DIM);
8727             for (i = YY; i < ddbox->npbcdim; i++)
8728             {
8729                 for (j = XX; j < i; j++)
8730                 {
8731                     corner[j] += corner[i]*box[i][j]/box[i][i];
8732                 }
8733             }
8734             if (c == 0)
8735             {
8736                 copy_rvec(corner, corner_min);
8737                 copy_rvec(corner, corner_max);
8738             }
8739             else
8740             {
8741                 for (i = 0; i < DIM; i++)
8742                 {
8743                     corner_min[i] = min(corner_min[i], corner[i]);
8744                     corner_max[i] = max(corner_max[i], corner[i]);
8745                 }
8746             }
8747         }
8748         /* Copy the extreme cornes without offset along x */
8749         for (i = 0; i < DIM; i++)
8750         {
8751             zones->size[z].bb_x0[i] = corner_min[i];
8752             zones->size[z].bb_x1[i] = corner_max[i];
8753         }
8754         /* Add the offset along x */
8755         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8756         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8757     }
8758
8759     if (zone_start == 0)
8760     {
8761         vol = 1;
8762         for (dim = 0; dim < DIM; dim++)
8763         {
8764             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8765         }
8766         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8767     }
8768
8769     if (debug)
8770     {
8771         for (z = zone_start; z < zone_end; z++)
8772         {
8773             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8774                     z,
8775                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8776                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8777                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8778             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8779                     z,
8780                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8781                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8782                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8783         }
8784     }
8785 }
8786
8787 static int comp_cgsort(const void *a, const void *b)
8788 {
8789     int           comp;
8790
8791     gmx_cgsort_t *cga, *cgb;
8792     cga = (gmx_cgsort_t *)a;
8793     cgb = (gmx_cgsort_t *)b;
8794
8795     comp = cga->nsc - cgb->nsc;
8796     if (comp == 0)
8797     {
8798         comp = cga->ind_gl - cgb->ind_gl;
8799     }
8800
8801     return comp;
8802 }
8803
8804 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8805                          int *a, int *buf)
8806 {
8807     int i;
8808
8809     /* Order the data */
8810     for (i = 0; i < n; i++)
8811     {
8812         buf[i] = a[sort[i].ind];
8813     }
8814
8815     /* Copy back to the original array */
8816     for (i = 0; i < n; i++)
8817     {
8818         a[i] = buf[i];
8819     }
8820 }
8821
8822 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8823                          rvec *v, rvec *buf)
8824 {
8825     int i;
8826
8827     /* Order the data */
8828     for (i = 0; i < n; i++)
8829     {
8830         copy_rvec(v[sort[i].ind], buf[i]);
8831     }
8832
8833     /* Copy back to the original array */
8834     for (i = 0; i < n; i++)
8835     {
8836         copy_rvec(buf[i], v[i]);
8837     }
8838 }
8839
8840 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8841                            rvec *v, rvec *buf)
8842 {
8843     int a, atot, cg, cg0, cg1, i;
8844
8845     if (cgindex == NULL)
8846     {
8847         /* Avoid the useless loop of the atoms within a cg */
8848         order_vec_cg(ncg, sort, v, buf);
8849
8850         return;
8851     }
8852
8853     /* Order the data */
8854     a = 0;
8855     for (cg = 0; cg < ncg; cg++)
8856     {
8857         cg0 = cgindex[sort[cg].ind];
8858         cg1 = cgindex[sort[cg].ind+1];
8859         for (i = cg0; i < cg1; i++)
8860         {
8861             copy_rvec(v[i], buf[a]);
8862             a++;
8863         }
8864     }
8865     atot = a;
8866
8867     /* Copy back to the original array */
8868     for (a = 0; a < atot; a++)
8869     {
8870         copy_rvec(buf[a], v[a]);
8871     }
8872 }
8873
8874 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8875                          int nsort_new, gmx_cgsort_t *sort_new,
8876                          gmx_cgsort_t *sort1)
8877 {
8878     int i1, i2, i_new;
8879
8880     /* The new indices are not very ordered, so we qsort them */
8881     gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8882
8883     /* sort2 is already ordered, so now we can merge the two arrays */
8884     i1    = 0;
8885     i2    = 0;
8886     i_new = 0;
8887     while (i2 < nsort2 || i_new < nsort_new)
8888     {
8889         if (i2 == nsort2)
8890         {
8891             sort1[i1++] = sort_new[i_new++];
8892         }
8893         else if (i_new == nsort_new)
8894         {
8895             sort1[i1++] = sort2[i2++];
8896         }
8897         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8898                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8899                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8900         {
8901             sort1[i1++] = sort2[i2++];
8902         }
8903         else
8904         {
8905             sort1[i1++] = sort_new[i_new++];
8906         }
8907     }
8908 }
8909
8910 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8911 {
8912     gmx_domdec_sort_t *sort;
8913     gmx_cgsort_t      *cgsort, *sort_i;
8914     int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
8915     int                sort_last, sort_skip;
8916
8917     sort = dd->comm->sort;
8918
8919     a = fr->ns.grid->cell_index;
8920
8921     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8922
8923     if (ncg_home_old >= 0)
8924     {
8925         /* The charge groups that remained in the same ns grid cell
8926          * are completely ordered. So we can sort efficiently by sorting
8927          * the charge groups that did move into the stationary list.
8928          */
8929         ncg_new   = 0;
8930         nsort2    = 0;
8931         nsort_new = 0;
8932         for (i = 0; i < dd->ncg_home; i++)
8933         {
8934             /* Check if this cg did not move to another node */
8935             if (a[i] < moved)
8936             {
8937                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8938                 {
8939                     /* This cg is new on this node or moved ns grid cell */
8940                     if (nsort_new >= sort->sort_new_nalloc)
8941                     {
8942                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8943                         srenew(sort->sort_new, sort->sort_new_nalloc);
8944                     }
8945                     sort_i = &(sort->sort_new[nsort_new++]);
8946                 }
8947                 else
8948                 {
8949                     /* This cg did not move */
8950                     sort_i = &(sort->sort2[nsort2++]);
8951                 }
8952                 /* Sort on the ns grid cell indices
8953                  * and the global topology index.
8954                  * index_gl is irrelevant with cell ns,
8955                  * but we set it here anyhow to avoid a conditional.
8956                  */
8957                 sort_i->nsc    = a[i];
8958                 sort_i->ind_gl = dd->index_gl[i];
8959                 sort_i->ind    = i;
8960                 ncg_new++;
8961             }
8962         }
8963         if (debug)
8964         {
8965             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8966                     nsort2, nsort_new);
8967         }
8968         /* Sort efficiently */
8969         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8970                      sort->sort);
8971     }
8972     else
8973     {
8974         cgsort  = sort->sort;
8975         ncg_new = 0;
8976         for (i = 0; i < dd->ncg_home; i++)
8977         {
8978             /* Sort on the ns grid cell indices
8979              * and the global topology index
8980              */
8981             cgsort[i].nsc    = a[i];
8982             cgsort[i].ind_gl = dd->index_gl[i];
8983             cgsort[i].ind    = i;
8984             if (cgsort[i].nsc < moved)
8985             {
8986                 ncg_new++;
8987             }
8988         }
8989         if (debug)
8990         {
8991             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
8992         }
8993         /* Determine the order of the charge groups using qsort */
8994         gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
8995     }
8996
8997     return ncg_new;
8998 }
8999
9000 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
9001 {
9002     gmx_cgsort_t *sort;
9003     int           ncg_new, i, *a, na;
9004
9005     sort = dd->comm->sort->sort;
9006
9007     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
9008
9009     ncg_new = 0;
9010     for (i = 0; i < na; i++)
9011     {
9012         if (a[i] >= 0)
9013         {
9014             sort[ncg_new].ind = a[i];
9015             ncg_new++;
9016         }
9017     }
9018
9019     return ncg_new;
9020 }
9021
9022 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
9023                           int ncg_home_old)
9024 {
9025     gmx_domdec_sort_t *sort;
9026     gmx_cgsort_t      *cgsort, *sort_i;
9027     int               *cgindex;
9028     int                ncg_new, i, *ibuf, cgsize;
9029     rvec              *vbuf;
9030
9031     sort = dd->comm->sort;
9032
9033     if (dd->ncg_home > sort->sort_nalloc)
9034     {
9035         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
9036         srenew(sort->sort, sort->sort_nalloc);
9037         srenew(sort->sort2, sort->sort_nalloc);
9038     }
9039     cgsort = sort->sort;
9040
9041     switch (fr->cutoff_scheme)
9042     {
9043         case ecutsGROUP:
9044             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
9045             break;
9046         case ecutsVERLET:
9047             ncg_new = dd_sort_order_nbnxn(dd, fr);
9048             break;
9049         default:
9050             gmx_incons("unimplemented");
9051             ncg_new = 0;
9052     }
9053
9054     /* We alloc with the old size, since cgindex is still old */
9055     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
9056     vbuf = dd->comm->vbuf.v;
9057
9058     if (dd->comm->bCGs)
9059     {
9060         cgindex = dd->cgindex;
9061     }
9062     else
9063     {
9064         cgindex = NULL;
9065     }
9066
9067     /* Remove the charge groups which are no longer at home here */
9068     dd->ncg_home = ncg_new;
9069     if (debug)
9070     {
9071         fprintf(debug, "Set the new home charge group count to %d\n",
9072                 dd->ncg_home);
9073     }
9074
9075     /* Reorder the state */
9076     for (i = 0; i < estNR; i++)
9077     {
9078         if (EST_DISTR(i) && (state->flags & (1<<i)))
9079         {
9080             switch (i)
9081             {
9082                 case estX:
9083                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
9084                     break;
9085                 case estV:
9086                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
9087                     break;
9088                 case estSDX:
9089                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
9090                     break;
9091                 case estCGP:
9092                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
9093                     break;
9094                 case estLD_RNG:
9095                 case estLD_RNGI:
9096                 case estDISRE_INITF:
9097                 case estDISRE_RM3TAV:
9098                 case estORIRE_INITF:
9099                 case estORIRE_DTAV:
9100                     /* No ordering required */
9101                     break;
9102                 default:
9103                     gmx_incons("Unknown state entry encountered in dd_sort_state");
9104                     break;
9105             }
9106         }
9107     }
9108     if (fr->cutoff_scheme == ecutsGROUP)
9109     {
9110         /* Reorder cgcm */
9111         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9112     }
9113
9114     if (dd->ncg_home+1 > sort->ibuf_nalloc)
9115     {
9116         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9117         srenew(sort->ibuf, sort->ibuf_nalloc);
9118     }
9119     ibuf = sort->ibuf;
9120     /* Reorder the global cg index */
9121     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9122     /* Reorder the cginfo */
9123     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9124     /* Rebuild the local cg index */
9125     if (dd->comm->bCGs)
9126     {
9127         ibuf[0] = 0;
9128         for (i = 0; i < dd->ncg_home; i++)
9129         {
9130             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9131             ibuf[i+1] = ibuf[i] + cgsize;
9132         }
9133         for (i = 0; i < dd->ncg_home+1; i++)
9134         {
9135             dd->cgindex[i] = ibuf[i];
9136         }
9137     }
9138     else
9139     {
9140         for (i = 0; i < dd->ncg_home+1; i++)
9141         {
9142             dd->cgindex[i] = i;
9143         }
9144     }
9145     /* Set the home atom number */
9146     dd->nat_home = dd->cgindex[dd->ncg_home];
9147
9148     if (fr->cutoff_scheme == ecutsVERLET)
9149     {
9150         /* The atoms are now exactly in grid order, update the grid order */
9151         nbnxn_set_atomorder(fr->nbv->nbs);
9152     }
9153     else
9154     {
9155         /* Copy the sorted ns cell indices back to the ns grid struct */
9156         for (i = 0; i < dd->ncg_home; i++)
9157         {
9158             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9159         }
9160         fr->ns.grid->nr = dd->ncg_home;
9161     }
9162 }
9163
9164 static void add_dd_statistics(gmx_domdec_t *dd)
9165 {
9166     gmx_domdec_comm_t *comm;
9167     int                ddnat;
9168
9169     comm = dd->comm;
9170
9171     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9172     {
9173         comm->sum_nat[ddnat-ddnatZONE] +=
9174             comm->nat[ddnat] - comm->nat[ddnat-1];
9175     }
9176     comm->ndecomp++;
9177 }
9178
9179 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9180 {
9181     gmx_domdec_comm_t *comm;
9182     int                ddnat;
9183
9184     comm = dd->comm;
9185
9186     /* Reset all the statistics and counters for total run counting */
9187     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9188     {
9189         comm->sum_nat[ddnat-ddnatZONE] = 0;
9190     }
9191     comm->ndecomp   = 0;
9192     comm->nload     = 0;
9193     comm->load_step = 0;
9194     comm->load_sum  = 0;
9195     comm->load_max  = 0;
9196     clear_ivec(comm->load_lim);
9197     comm->load_mdf = 0;
9198     comm->load_pme = 0;
9199 }
9200
9201 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9202 {
9203     gmx_domdec_comm_t *comm;
9204     int                ddnat;
9205     double             av;
9206
9207     comm = cr->dd->comm;
9208
9209     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9210
9211     if (fplog == NULL)
9212     {
9213         return;
9214     }
9215
9216     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9217
9218     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9219     {
9220         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9221         switch (ddnat)
9222         {
9223             case ddnatZONE:
9224                 fprintf(fplog,
9225                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9226                         2, av);
9227                 break;
9228             case ddnatVSITE:
9229                 if (cr->dd->vsite_comm)
9230                 {
9231                     fprintf(fplog,
9232                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9233                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9234                             av);
9235                 }
9236                 break;
9237             case ddnatCON:
9238                 if (cr->dd->constraint_comm)
9239                 {
9240                     fprintf(fplog,
9241                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9242                             1 + ir->nLincsIter, av);
9243                 }
9244                 break;
9245             default:
9246                 gmx_incons(" Unknown type for DD statistics");
9247         }
9248     }
9249     fprintf(fplog, "\n");
9250
9251     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9252     {
9253         print_dd_load_av(fplog, cr->dd);
9254     }
9255 }
9256
9257 void dd_partition_system(FILE                *fplog,
9258                          gmx_int64_t          step,
9259                          t_commrec           *cr,
9260                          gmx_bool             bMasterState,
9261                          int                  nstglobalcomm,
9262                          t_state             *state_global,
9263                          gmx_mtop_t          *top_global,
9264                          t_inputrec          *ir,
9265                          t_state             *state_local,
9266                          rvec               **f,
9267                          t_mdatoms           *mdatoms,
9268                          gmx_localtop_t      *top_local,
9269                          t_forcerec          *fr,
9270                          gmx_vsite_t         *vsite,
9271                          gmx_shellfc_t        shellfc,
9272                          gmx_constr_t         constr,
9273                          t_nrnb              *nrnb,
9274                          gmx_wallcycle_t      wcycle,
9275                          gmx_bool             bVerbose)
9276 {
9277     gmx_domdec_t      *dd;
9278     gmx_domdec_comm_t *comm;
9279     gmx_ddbox_t        ddbox = {0};
9280     t_block           *cgs_gl;
9281     gmx_int64_t        step_pcoupl;
9282     rvec               cell_ns_x0, cell_ns_x1;
9283     int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9284     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
9285     gmx_bool           bRedist, bSortCG, bResortAll;
9286     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9287     real               grid_density;
9288     char               sbuf[22];
9289
9290     dd   = cr->dd;
9291     comm = dd->comm;
9292
9293     bBoxChanged = (bMasterState || DEFORM(*ir));
9294     if (ir->epc != epcNO)
9295     {
9296         /* With nstpcouple > 1 pressure coupling happens.
9297          * one step after calculating the pressure.
9298          * Box scaling happens at the end of the MD step,
9299          * after the DD partitioning.
9300          * We therefore have to do DLB in the first partitioning
9301          * after an MD step where P-coupling occured.
9302          * We need to determine the last step in which p-coupling occurred.
9303          * MRS -- need to validate this for vv?
9304          */
9305         n = ir->nstpcouple;
9306         if (n == 1)
9307         {
9308             step_pcoupl = step - 1;
9309         }
9310         else
9311         {
9312             step_pcoupl = ((step - 1)/n)*n + 1;
9313         }
9314         if (step_pcoupl >= comm->partition_step)
9315         {
9316             bBoxChanged = TRUE;
9317         }
9318     }
9319
9320     bNStGlobalComm = (step % nstglobalcomm == 0);
9321
9322     if (!comm->bDynLoadBal)
9323     {
9324         bDoDLB = FALSE;
9325     }
9326     else
9327     {
9328         /* Should we do dynamic load balacing this step?
9329          * Since it requires (possibly expensive) global communication,
9330          * we might want to do DLB less frequently.
9331          */
9332         if (bBoxChanged || ir->epc != epcNO)
9333         {
9334             bDoDLB = bBoxChanged;
9335         }
9336         else
9337         {
9338             bDoDLB = bNStGlobalComm;
9339         }
9340     }
9341
9342     /* Check if we have recorded loads on the nodes */
9343     if (comm->bRecordLoad && dd_load_count(comm))
9344     {
9345         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal)
9346         {
9347             /* Check if we should use DLB at the second partitioning
9348              * and every 100 partitionings,
9349              * so the extra communication cost is negligible.
9350              */
9351             n         = max(100, nstglobalcomm);
9352             bCheckDLB = (comm->n_load_collect == 0 ||
9353                          comm->n_load_have % n == n-1);
9354         }
9355         else
9356         {
9357             bCheckDLB = FALSE;
9358         }
9359
9360         /* Print load every nstlog, first and last step to the log file */
9361         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9362                     comm->n_load_collect == 0 ||
9363                     (ir->nsteps >= 0 &&
9364                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9365
9366         /* Avoid extra communication due to verbose screen output
9367          * when nstglobalcomm is set.
9368          */
9369         if (bDoDLB || bLogLoad || bCheckDLB ||
9370             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9371         {
9372             get_load_distribution(dd, wcycle);
9373             if (DDMASTER(dd))
9374             {
9375                 if (bLogLoad)
9376                 {
9377                     dd_print_load(fplog, dd, step-1);
9378                 }
9379                 if (bVerbose)
9380                 {
9381                     dd_print_load_verbose(dd);
9382                 }
9383             }
9384             comm->n_load_collect++;
9385
9386             if (bCheckDLB)
9387             {
9388                 /* Since the timings are node dependent, the master decides */
9389                 if (DDMASTER(dd))
9390                 {
9391                     bTurnOnDLB =
9392                         (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9393                     if (debug)
9394                     {
9395                         fprintf(debug, "step %s, imb loss %f\n",
9396                                 gmx_step_str(step, sbuf),
9397                                 dd_force_imb_perf_loss(dd));
9398                     }
9399                 }
9400                 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
9401                 if (bTurnOnDLB)
9402                 {
9403                     turn_on_dlb(fplog, cr, step);
9404                     bDoDLB = TRUE;
9405                 }
9406             }
9407         }
9408         comm->n_load_have++;
9409     }
9410
9411     cgs_gl = &comm->cgs_gl;
9412
9413     bRedist = FALSE;
9414     if (bMasterState)
9415     {
9416         /* Clear the old state */
9417         clear_dd_indices(dd, 0, 0);
9418         ncgindex_set = 0;
9419
9420         set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9421                   TRUE, cgs_gl, state_global->x, &ddbox);
9422
9423         get_cg_distribution(fplog, step, dd, cgs_gl,
9424                             state_global->box, &ddbox, state_global->x);
9425
9426         dd_distribute_state(dd, cgs_gl,
9427                             state_global, state_local, f);
9428
9429         dd_make_local_cgs(dd, &top_local->cgs);
9430
9431         /* Ensure that we have space for the new distribution */
9432         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9433
9434         if (fr->cutoff_scheme == ecutsGROUP)
9435         {
9436             calc_cgcm(fplog, 0, dd->ncg_home,
9437                       &top_local->cgs, state_local->x, fr->cg_cm);
9438         }
9439
9440         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9441
9442         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9443     }
9444     else if (state_local->ddp_count != dd->ddp_count)
9445     {
9446         if (state_local->ddp_count > dd->ddp_count)
9447         {
9448             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9449         }
9450
9451         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9452         {
9453             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9454         }
9455
9456         /* Clear the old state */
9457         clear_dd_indices(dd, 0, 0);
9458
9459         /* Build the new indices */
9460         rebuild_cgindex(dd, cgs_gl->index, state_local);
9461         make_dd_indices(dd, cgs_gl->index, 0);
9462         ncgindex_set = dd->ncg_home;
9463
9464         if (fr->cutoff_scheme == ecutsGROUP)
9465         {
9466             /* Redetermine the cg COMs */
9467             calc_cgcm(fplog, 0, dd->ncg_home,
9468                       &top_local->cgs, state_local->x, fr->cg_cm);
9469         }
9470
9471         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9472
9473         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9474
9475         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9476                   TRUE, &top_local->cgs, state_local->x, &ddbox);
9477
9478         bRedist = comm->bDynLoadBal;
9479     }
9480     else
9481     {
9482         /* We have the full state, only redistribute the cgs */
9483
9484         /* Clear the non-home indices */
9485         clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9486         ncgindex_set = 0;
9487
9488         /* Avoid global communication for dim's without pbc and -gcom */
9489         if (!bNStGlobalComm)
9490         {
9491             copy_rvec(comm->box0, ddbox.box0    );
9492             copy_rvec(comm->box_size, ddbox.box_size);
9493         }
9494         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9495                   bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
9496
9497         bBoxChanged = TRUE;
9498         bRedist     = TRUE;
9499     }
9500     /* For dim's without pbc and -gcom */
9501     copy_rvec(ddbox.box0, comm->box0    );
9502     copy_rvec(ddbox.box_size, comm->box_size);
9503
9504     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9505                       step, wcycle);
9506
9507     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9508     {
9509         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9510     }
9511
9512     /* Check if we should sort the charge groups */
9513     if (comm->nstSortCG > 0)
9514     {
9515         bSortCG = (bMasterState ||
9516                    (bRedist && (step % comm->nstSortCG == 0)));
9517     }
9518     else
9519     {
9520         bSortCG = FALSE;
9521     }
9522
9523     ncg_home_old = dd->ncg_home;
9524
9525     ncg_moved = 0;
9526     if (bRedist)
9527     {
9528         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9529
9530         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9531                            state_local, f, fr,
9532                            !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9533
9534         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9535     }
9536
9537     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9538                           dd, &ddbox,
9539                           &comm->cell_x0, &comm->cell_x1,
9540                           dd->ncg_home, fr->cg_cm,
9541                           cell_ns_x0, cell_ns_x1, &grid_density);
9542
9543     if (bBoxChanged)
9544     {
9545         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9546     }
9547
9548     switch (fr->cutoff_scheme)
9549     {
9550         case ecutsGROUP:
9551             copy_ivec(fr->ns.grid->n, ncells_old);
9552             grid_first(fplog, fr->ns.grid, dd, &ddbox,
9553                        state_local->box, cell_ns_x0, cell_ns_x1,
9554                        fr->rlistlong, grid_density);
9555             break;
9556         case ecutsVERLET:
9557             nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9558             break;
9559         default:
9560             gmx_incons("unimplemented");
9561     }
9562     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9563     copy_ivec(ddbox.tric_dir, comm->tric_dir);
9564
9565     if (bSortCG)
9566     {
9567         wallcycle_sub_start(wcycle, ewcsDD_GRID);
9568
9569         /* Sort the state on charge group position.
9570          * This enables exact restarts from this step.
9571          * It also improves performance by about 15% with larger numbers
9572          * of atoms per node.
9573          */
9574
9575         /* Fill the ns grid with the home cell,
9576          * so we can sort with the indices.
9577          */
9578         set_zones_ncg_home(dd);
9579
9580         switch (fr->cutoff_scheme)
9581         {
9582             case ecutsVERLET:
9583                 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9584
9585                 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9586                                   0,
9587                                   comm->zones.size[0].bb_x0,
9588                                   comm->zones.size[0].bb_x1,
9589                                   0, dd->ncg_home,
9590                                   comm->zones.dens_zone0,
9591                                   fr->cginfo,
9592                                   state_local->x,
9593                                   ncg_moved, bRedist ? comm->moved : NULL,
9594                                   fr->nbv->grp[eintLocal].kernel_type,
9595                                   fr->nbv->grp[eintLocal].nbat);
9596
9597                 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9598                 break;
9599             case ecutsGROUP:
9600                 fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
9601                           0, dd->ncg_home, fr->cg_cm);
9602
9603                 copy_ivec(fr->ns.grid->n, ncells_new);
9604                 break;
9605             default:
9606                 gmx_incons("unimplemented");
9607         }
9608
9609         bResortAll = bMasterState;
9610
9611         /* Check if we can user the old order and ns grid cell indices
9612          * of the charge groups to sort the charge groups efficiently.
9613          */
9614         if (ncells_new[XX] != ncells_old[XX] ||
9615             ncells_new[YY] != ncells_old[YY] ||
9616             ncells_new[ZZ] != ncells_old[ZZ])
9617         {
9618             bResortAll = TRUE;
9619         }
9620
9621         if (debug)
9622         {
9623             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9624                     gmx_step_str(step, sbuf), dd->ncg_home);
9625         }
9626         dd_sort_state(dd, fr->cg_cm, fr, state_local,
9627                       bResortAll ? -1 : ncg_home_old);
9628         /* Rebuild all the indices */
9629         ga2la_clear(dd->ga2la);
9630         ncgindex_set = 0;
9631
9632         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9633     }
9634
9635     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9636
9637     /* Setup up the communication and communicate the coordinates */
9638     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9639
9640     /* Set the indices */
9641     make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9642
9643     /* Set the charge group boundaries for neighbor searching */
9644     set_cg_boundaries(&comm->zones);
9645
9646     if (fr->cutoff_scheme == ecutsVERLET)
9647     {
9648         set_zones_size(dd, state_local->box, &ddbox,
9649                        bSortCG ? 1 : 0, comm->zones.n);
9650     }
9651
9652     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9653
9654     /*
9655        write_dd_pdb("dd_home",step,"dump",top_global,cr,
9656                  -1,state_local->x,state_local->box);
9657      */
9658
9659     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9660
9661     /* Extract a local topology from the global topology */
9662     for (i = 0; i < dd->ndim; i++)
9663     {
9664         np[dd->dim[i]] = comm->cd[i].np;
9665     }
9666     dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9667                       comm->cellsize_min, np,
9668                       fr,
9669                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
9670                       vsite, top_global, top_local);
9671
9672     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9673
9674     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9675
9676     /* Set up the special atom communication */
9677     n = comm->nat[ddnatZONE];
9678     for (i = ddnatZONE+1; i < ddnatNR; i++)
9679     {
9680         switch (i)
9681         {
9682             case ddnatVSITE:
9683                 if (vsite && vsite->n_intercg_vsite)
9684                 {
9685                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
9686                 }
9687                 break;
9688             case ddnatCON:
9689                 if (dd->bInterCGcons || dd->bInterCGsettles)
9690                 {
9691                     /* Only for inter-cg constraints we need special code */
9692                     n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9693                                                   constr, ir->nProjOrder,
9694                                                   top_local->idef.il);
9695                 }
9696                 break;
9697             default:
9698                 gmx_incons("Unknown special atom type setup");
9699         }
9700         comm->nat[i] = n;
9701     }
9702
9703     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9704
9705     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9706
9707     /* Make space for the extra coordinates for virtual site
9708      * or constraint communication.
9709      */
9710     state_local->natoms = comm->nat[ddnatNR-1];
9711     if (state_local->natoms > state_local->nalloc)
9712     {
9713         dd_realloc_state(state_local, f, state_local->natoms);
9714     }
9715
9716     if (fr->bF_NoVirSum)
9717     {
9718         if (vsite && vsite->n_intercg_vsite)
9719         {
9720             nat_f_novirsum = comm->nat[ddnatVSITE];
9721         }
9722         else
9723         {
9724             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9725             {
9726                 nat_f_novirsum = dd->nat_tot;
9727             }
9728             else
9729             {
9730                 nat_f_novirsum = dd->nat_home;
9731             }
9732         }
9733     }
9734     else
9735     {
9736         nat_f_novirsum = 0;
9737     }
9738
9739     /* Set the number of atoms required for the force calculation.
9740      * Forces need to be constrained when using a twin-range setup
9741      * or with energy minimization. For simple simulations we could
9742      * avoid some allocation, zeroing and copying, but this is
9743      * probably not worth the complications ande checking.
9744      */
9745     forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9746                         dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9747
9748     /* We make the all mdatoms up to nat_tot_con.
9749      * We could save some work by only setting invmass
9750      * between nat_tot and nat_tot_con.
9751      */
9752     /* This call also sets the new number of home particles to dd->nat_home */
9753     atoms2md(top_global, ir,
9754              comm->nat[ddnatCON], dd->gatindex, dd->nat_home, mdatoms);
9755
9756     /* Now we have the charges we can sort the FE interactions */
9757     dd_sort_local_top(dd, mdatoms, top_local);
9758
9759     if (vsite != NULL)
9760     {
9761         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9762         split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
9763                                   mdatoms, FALSE, vsite);
9764     }
9765
9766     if (shellfc)
9767     {
9768         /* Make the local shell stuff, currently no communication is done */
9769         make_local_shells(cr, mdatoms, shellfc);
9770     }
9771
9772     if (ir->implicit_solvent)
9773     {
9774         make_local_gb(cr, fr->born, ir->gb_algorithm);
9775     }
9776
9777     setup_bonded_threading(fr, &top_local->idef);
9778
9779     if (!(cr->duty & DUTY_PME))
9780     {
9781         /* Send the charges and/or c6/sigmas to our PME only node */
9782         gmx_pme_send_parameters(cr,
9783                                 fr->ic,
9784                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
9785                                 mdatoms->chargeA, mdatoms->chargeB,
9786                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
9787                                 mdatoms->sigmaA, mdatoms->sigmaB,
9788                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9789     }
9790
9791     if (constr)
9792     {
9793         set_constraints(constr, top_local, ir, mdatoms, cr);
9794     }
9795
9796     if (ir->ePull != epullNO)
9797     {
9798         /* Update the local pull groups */
9799         dd_make_local_pull_groups(dd, ir->pull, mdatoms);
9800     }
9801
9802     if (ir->bRot)
9803     {
9804         /* Update the local rotation groups */
9805         dd_make_local_rotation_groups(dd, ir->rot);
9806     }
9807
9808     if (ir->eSwapCoords != eswapNO)
9809     {
9810         /* Update the local groups needed for ion swapping */
9811         dd_make_local_swap_groups(dd, ir->swap);
9812     }
9813
9814     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
9815     dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
9816
9817     add_dd_statistics(dd);
9818
9819     /* Make sure we only count the cycles for this DD partitioning */
9820     clear_dd_cycle_counts(dd);
9821
9822     /* Because the order of the atoms might have changed since
9823      * the last vsite construction, we need to communicate the constructing
9824      * atom coordinates again (for spreading the forces this MD step).
9825      */
9826     dd_move_x_vsites(dd, state_local->box, state_local->x);
9827
9828     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9829
9830     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9831     {
9832         dd_move_x(dd, state_local->box, state_local->x);
9833         write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9834                      -1, state_local->x, state_local->box);
9835     }
9836
9837     /* Store the partitioning step */
9838     comm->partition_step = step;
9839
9840     /* Increase the DD partitioning counter */
9841     dd->ddp_count++;
9842     /* The state currently matches this DD partitioning count, store it */
9843     state_local->ddp_count = dd->ddp_count;
9844     if (bMasterState)
9845     {
9846         /* The DD master node knows the complete cg distribution,
9847          * store the count so we can possibly skip the cg info communication.
9848          */
9849         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9850     }
9851
9852     if (comm->DD_debug > 0)
9853     {
9854         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9855         check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9856                                 "after partitioning");
9857     }
9858 }