src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "domdec.h"
  39
  40 #include "config.h"
  41
  42 #include <assert.h>
  43 #include <limits.h>
  44 #include <math.h>
  45 #include <stdio.h>
  46 #include <stdlib.h>
  47 #include <string.h>
  48
  49 #include <algorithm>
  50
  51 #include "gromacs/domdec/domdec_network.h"
  52 #include "gromacs/ewald/pme.h"
  53 #include "gromacs/fileio/gmxfio.h"
  54 #include "gromacs/fileio/pdbio.h"
  55 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
  56 #include "gromacs/imd/imd.h"
  57 #include "gromacs/legacyheaders/chargegroup.h"
  58 #include "gromacs/legacyheaders/constr.h"
  59 #include "gromacs/legacyheaders/force.h"
  60 #include "gromacs/legacyheaders/genborn.h"
  61 #include "gromacs/legacyheaders/gmx_ga2la.h"
  62 #include "gromacs/legacyheaders/gmx_omp_nthreads.h"
  63 #include "gromacs/legacyheaders/mdatoms.h"
  64 #include "gromacs/legacyheaders/mdrun.h"
  65 #include "gromacs/legacyheaders/names.h"
  66 #include "gromacs/legacyheaders/network.h"
  67 #include "gromacs/legacyheaders/nrnb.h"
  68 #include "gromacs/legacyheaders/nsgrid.h"
  69 #include "gromacs/legacyheaders/shellfc.h"
  70 #include "gromacs/legacyheaders/typedefs.h"
  71 #include "gromacs/legacyheaders/vsite.h"
  72 #include "gromacs/legacyheaders/types/commrec.h"
  73 #include "gromacs/legacyheaders/types/constr.h"
  74 #include "gromacs/legacyheaders/types/enums.h"
  75 #include "gromacs/legacyheaders/types/forcerec.h"
  76 #include "gromacs/legacyheaders/types/hw_info.h"
  77 #include "gromacs/legacyheaders/types/ifunc.h"
  78 #include "gromacs/legacyheaders/types/inputrec.h"
  79 #include "gromacs/legacyheaders/types/mdatom.h"
  80 #include "gromacs/legacyheaders/types/nrnb.h"
  81 #include "gromacs/legacyheaders/types/ns.h"
  82 #include "gromacs/legacyheaders/types/nsgrid.h"
  83 #include "gromacs/legacyheaders/types/shellfc.h"
  84 #include "gromacs/legacyheaders/types/simple.h"
  85 #include "gromacs/legacyheaders/types/state.h"
  86 #include "gromacs/listed-forces/manage-threading.h"
  87 #include "gromacs/math/vec.h"
  88 #include "gromacs/math/vectypes.h"
  89 #include "gromacs/mdlib/nb_verlet.h"
  90 #include "gromacs/mdlib/nbnxn_search.h"
  91 #include "gromacs/pbcutil/ishift.h"
  92 #include "gromacs/pbcutil/pbc.h"
  93 #include "gromacs/pulling/pull.h"
  94 #include "gromacs/pulling/pull_rotation.h"
  95 #include "gromacs/swap/swapcoords.h"
  96 #include "gromacs/timing/wallcycle.h"
  97 #include "gromacs/topology/block.h"
  98 #include "gromacs/topology/idef.h"
  99 #include "gromacs/topology/mtop_util.h"
 100 #include "gromacs/topology/topology.h"
 101 #include "gromacs/utility/basedefinitions.h"
 102 #include "gromacs/utility/basenetwork.h"
 103 #include "gromacs/utility/cstringutil.h"
 104 #include "gromacs/utility/fatalerror.h"
 105 #include "gromacs/utility/gmxmpi.h"
 106 #include "gromacs/utility/qsort_threadsafe.h"
 107 #include "gromacs/utility/real.h"
 108 #include "gromacs/utility/smalloc.h"
 109
 110 #include "domdec_constraints.h"
 111 #include "domdec_internal.h"
 112 #include "domdec_vsite.h"
 113
 114 #define DDRANK(dd, rank)    (rank)
 115 #define DDMASTERRANK(dd)   (dd->masterrank)
 116
 117 typedef struct gmx_domdec_master
 118 {
 119     /* The cell boundaries */
 120     real **cell_x;
 121     /* The global charge group division */
 122     int   *ncg;    /* Number of home charge groups for each node */
 123     int   *index;  /* Index of nnodes+1 into cg */
 124     int   *cg;     /* Global charge group index */
 125     int   *nat;    /* Number of home atoms for each node. */
 126     int   *ibuf;   /* Buffer for communication */
 127     rvec  *vbuf;   /* Buffer for state scattering and gathering */
 128 } gmx_domdec_master_t;
 129
 130 typedef struct
 131 {
 132     /* The numbers of charge groups to send and receive for each cell
 133      * that requires communication, the last entry contains the total
 134      * number of atoms that needs to be communicated.
 135      */
 136     int  nsend[DD_MAXIZONE+2];
 137     int  nrecv[DD_MAXIZONE+2];
 138     /* The charge groups to send */
 139     int *index;
 140     int  nalloc;
 141     /* The atom range for non-in-place communication */
 142     int  cell2at0[DD_MAXIZONE];
 143     int  cell2at1[DD_MAXIZONE];
 144 } gmx_domdec_ind_t;
 145
 146 typedef struct
 147 {
 148     int               np;       /* Number of grid pulses in this dimension */
 149     int               np_dlb;   /* For dlb, for use with edlbAUTO          */
 150     gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
 151     int               np_nalloc;
 152     gmx_bool          bInPlace; /* Can we communicate in place?            */
 153 } gmx_domdec_comm_dim_t;
 154
 155 typedef struct
 156 {
 157     gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
 158     real     *cell_f;      /* State var.: cell boundaries, box relative      */
 159     real     *old_cell_f;  /* Temp. var.: old cell size                      */
 160     real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
 161     real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
 162     real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
 163     real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
 164     gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
 165     real     *buf_ncd;     /* Temp. var.                                     */
 166 } gmx_domdec_root_t;
 167
 168 #define DD_NLOAD_MAX 9
 169
 170 /* Here floats are accurate enough, since these variables
 171  * only influence the load balancing, not the actual MD results.
 172  */
 173 typedef struct
 174 {
 175     int    nload;
 176     float *load;
 177     float  sum;
 178     float  max;
 179     float  sum_m;
 180     float  cvol_min;
 181     float  mdf;
 182     float  pme;
 183     int    flags;
 184 } gmx_domdec_load_t;
 185
 186 typedef struct
 187 {
 188     int  nsc;
 189     int  ind_gl;
 190     int  ind;
 191 } gmx_cgsort_t;
 192
 193 typedef struct
 194 {
 195     gmx_cgsort_t *sort;
 196     gmx_cgsort_t *sort2;
 197     int           sort_nalloc;
 198     gmx_cgsort_t *sort_new;
 199     int           sort_new_nalloc;
 200     int          *ibuf;
 201     int           ibuf_nalloc;
 202 } gmx_domdec_sort_t;
 203
 204 typedef struct
 205 {
 206     rvec *v;
 207     int   nalloc;
 208 } vec_rvec_t;
 209
 210 /* This enum determines the order of the coordinates.
 211  * ddnatHOME and ddnatZONE should be first and second,
 212  * the others can be ordered as wanted.
 213  */
 214 enum {
 215     ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
 216 };
 217
 218 enum {
 219     edlbAUTO, edlbNO, edlbYES, edlbNR
 220 };
 221 const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 222
 223 typedef struct
 224 {
 225     int      dim;       /* The dimension                                          */
 226     gmx_bool dim_match; /* Tells if DD and PME dims match                         */
 227     int      nslab;     /* The number of PME slabs in this dimension              */
 228     real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB    */
 229     int     *pp_min;    /* The minimum pp node location, size nslab               */
 230     int     *pp_max;    /* The maximum pp node location,size nslab                */
 231     int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
 232 } gmx_ddpme_t;
 233
 234 typedef struct
 235 {
 236     real min0;    /* The minimum bottom of this zone                        */
 237     real max1;    /* The maximum top of this zone                           */
 238     real min1;    /* The minimum top of this zone                           */
 239     real mch0;    /* The maximum bottom communicaton height for this zone   */
 240     real mch1;    /* The maximum top communicaton height for this zone      */
 241     real p1_0;    /* The bottom value of the first cell in this zone        */
 242     real p1_1;    /* The top value of the first cell in this zone           */
 243 } gmx_ddzone_t;
 244
 245 typedef struct
 246 {
 247     gmx_domdec_ind_t ind;
 248     int             *ibuf;
 249     int              ibuf_nalloc;
 250     vec_rvec_t       vbuf;
 251     int              nsend;
 252     int              nat;
 253     int              nsend_zone;
 254 } dd_comm_setup_work_t;
 255
 256 typedef struct gmx_domdec_comm
 257 {
 258     /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
 259      * unless stated otherwise.
 260      */
 261
 262     /* The number of decomposition dimensions for PME, 0: no PME */
 263     int         npmedecompdim;
 264     /* The number of nodes doing PME (PP/PME or only PME) */
 265     int         npmenodes;
 266     int         npmenodes_x;
 267     int         npmenodes_y;
 268     /* The communication setup including the PME only nodes */
 269     gmx_bool    bCartesianPP_PME;
 270     ivec        ntot;
 271     int         cartpmedim;
 272     int        *pmenodes;          /* size npmenodes                         */
 273     int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
 274                                     * but with bCartesianPP_PME              */
 275     gmx_ddpme_t ddpme[2];
 276
 277     /* The DD particle-particle nodes only */
 278     gmx_bool bCartesianPP;
 279     int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */
 280
 281     /* The global charge groups */
 282     t_block cgs_gl;
 283
 284     /* Should we sort the cgs */
 285     int                nstSortCG;
 286     gmx_domdec_sort_t *sort;
 287
 288     /* Are there charge groups? */
 289     gmx_bool bCGs;
 290
 291     /* Are there bonded and multi-body interactions between charge groups? */
 292     gmx_bool bInterCGBondeds;
 293     gmx_bool bInterCGMultiBody;
 294
 295     /* Data for the optional bonded interaction atom communication range */
 296     gmx_bool  bBondComm;
 297     t_blocka *cglink;
 298     char     *bLocalCG;
 299
 300     /* The DLB option */
 301     int      eDLB;
 302     /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
 303     gmx_bool bDLB_locked;
 304     /* Are we actually using DLB? */
 305     gmx_bool bDynLoadBal;
 306
 307     /* Cell sizes for static load balancing, first index cartesian */
 308     real **slb_frac;
 309
 310     /* The width of the communicated boundaries */
 311     real     cutoff_mbody;
 312     real     cutoff;
 313     /* The minimum cell size (including triclinic correction) */
 314     rvec     cellsize_min;
 315     /* For dlb, for use with edlbAUTO */
 316     rvec     cellsize_min_dlb;
 317     /* The lower limit for the DD cell size with DLB */
 318     real     cellsize_limit;
 319     /* Effectively no NB cut-off limit with DLB for systems without PBC? */
 320     gmx_bool bVacDLBNoLimit;
 321
 322     /* With PME load balancing we set limits on DLB */
 323     gmx_bool bPMELoadBalDLBLimits;
 324     /* DLB needs to take into account that we want to allow this maximum
 325      * cut-off (for PME load balancing), this could limit cell boundaries.
 326      */
 327     real PMELoadBal_max_cutoff;
 328
 329     /* tric_dir is only stored here because dd_get_ns_ranges needs it */
 330     ivec tric_dir;
 331     /* box0 and box_size are required with dim's without pbc and -gcom */
 332     rvec box0;
 333     rvec box_size;
 334
 335     /* The cell boundaries */
 336     rvec cell_x0;
 337     rvec cell_x1;
 338
 339     /* The old location of the cell boundaries, to check cg displacements */
 340     rvec old_cell_x0;
 341     rvec old_cell_x1;
 342
 343     /* The communication setup and charge group boundaries for the zones */
 344     gmx_domdec_zones_t zones;
 345
 346     /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
 347      * cell boundaries of neighboring cells for dynamic load balancing.
 348      */
 349     gmx_ddzone_t zone_d1[2];
 350     gmx_ddzone_t zone_d2[2][2];
 351
 352     /* The coordinate/force communication setup and indices */
 353     gmx_domdec_comm_dim_t cd[DIM];
 354     /* The maximum number of cells to communicate with in one dimension */
 355     int                   maxpulse;
 356
 357     /* Which cg distribution is stored on the master node */
 358     int master_cg_ddp_count;
 359
 360     /* The number of cg's received from the direct neighbors */
 361     int  zone_ncg1[DD_MAXZONE];
 362
 363     /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
 364     int  nat[ddnatNR];
 365
 366     /* Array for signalling if atoms have moved to another domain */
 367     int  *moved;
 368     int   moved_nalloc;
 369
 370     /* Communication buffer for general use */
 371     int  *buf_int;
 372     int   nalloc_int;
 373
 374     /* Communication buffer for general use */
 375     vec_rvec_t vbuf;
 376
 377     /* Temporary storage for thread parallel communication setup */
 378     int                   nth;
 379     dd_comm_setup_work_t *dth;
 380
 381     /* Communication buffers only used with multiple grid pulses */
 382     int       *buf_int2;
 383     int        nalloc_int2;
 384     vec_rvec_t vbuf2;
 385
 386     /* Communication buffers for local redistribution */
 387     int  **cggl_flag;
 388     int    cggl_flag_nalloc[DIM*2];
 389     rvec **cgcm_state;
 390     int    cgcm_state_nalloc[DIM*2];
 391
 392     /* Cell sizes for dynamic load balancing */
 393     gmx_domdec_root_t **root;
 394     real               *cell_f_row;
 395     real                cell_f0[DIM];
 396     real                cell_f1[DIM];
 397     real                cell_f_max0[DIM];
 398     real                cell_f_min1[DIM];
 399
 400     /* Stuff for load communication */
 401     gmx_bool           bRecordLoad;
 402     gmx_domdec_load_t *load;
 403     int                nrank_gpu_shared;
 404 #ifdef GMX_MPI
 405     MPI_Comm          *mpi_comm_load;
 406     MPI_Comm           mpi_comm_gpu_shared;
 407 #endif
 408
 409     /* Maximum DLB scaling per load balancing step in percent */
 410     int dlb_scale_lim;
 411
 412     /* Cycle counters */
 413     float  cycl[ddCyclNr];
 414     int    cycl_n[ddCyclNr];
 415     float  cycl_max[ddCyclNr];
 416     /* Flop counter (0=no,1=yes,2=with (eFlop-1)*5% noise */
 417     int    eFlop;
 418     double flop;
 419     int    flop_n;
 420     /* How many times have did we have load measurements */
 421     int    n_load_have;
 422     /* How many times have we collected the load measurements */
 423     int    n_load_collect;
 424
 425     /* Statistics */
 426     double sum_nat[ddnatNR-ddnatZONE];
 427     int    ndecomp;
 428     int    nload;
 429     double load_step;
 430     double load_sum;
 431     double load_max;
 432     ivec   load_lim;
 433     double load_mdf;
 434     double load_pme;
 435
 436     /* The last partition step */
 437     gmx_int64_t partition_step;
 438
 439     /* Debugging */
 440     int  nstDDDump;
 441     int  nstDDDumpGrid;
 442     int  DD_debug;
 443 } gmx_domdec_comm_t;
 444
 445 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 446 #define DD_CGIBS 2
 447
 448 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 449 #define DD_FLAG_NRCG  65535
 450 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 451 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 452
 453 /* Zone permutation required to obtain consecutive charge groups
 454  * for neighbor searching.
 455  */
 456 static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };
 457
 458 /* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 459  * components see only j zones with that component 0.
 460  */
 461
 462 /* The DD zone order */
 463 static const ivec dd_zo[DD_MAXZONE] =
 464 {{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};
 465
 466 /* The 3D setup */
 467 #define dd_z3n  8
 468 #define dd_zp3n 4
 469 static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};
 470
 471 /* The 2D setup */
 472 #define dd_z2n  4
 473 #define dd_zp2n 2
 474 static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};
 475
 476 /* The 1D setup */
 477 #define dd_z1n  2
 478 #define dd_zp1n 1
 479 static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 480
 481 /* Factors used to avoid problems due to rounding issues */
 482 #define DD_CELL_MARGIN       1.0001
 483 #define DD_CELL_MARGIN2      1.00005
 484 /* Factor to account for pressure scaling during nstlist steps */
 485 #define DD_PRES_SCALE_MARGIN 1.02
 486
 487 /* Turn on DLB when the load imbalance causes this amount of total loss.
 488  * There is a bit of overhead with DLB and it's difficult to achieve
 489  * a load imbalance of less than 2% with DLB.
 490  */
 491 #define DD_PERF_LOSS_DLB_ON  0.02
 492
 493 /* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
 494 #define DD_PERF_LOSS_WARN    0.05
 495
 496 #define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))
 497
 498 /* Use separate MPI send and receive commands
 499  * when nnodes <= GMX_DD_NNODES_SENDRECV.
 500  * This saves memory (and some copying for small nnodes).
 501  * For high parallelization scatter and gather calls are used.
 502  */
 503 #define GMX_DD_NNODES_SENDRECV 4
 504
 505
 506 /*
 507    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 508
 509    static void index2xyz(ivec nc,int ind,ivec xyz)
 510    {
 511    xyz[XX] = ind % nc[XX];
 512    xyz[YY] = (ind / nc[XX]) % nc[YY];
 513    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 514    }
 515  */
 516
 517 /* This order is required to minimize the coordinate communication in PME
 518  * which uses decomposition in the x direction.
 519  */
 520 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 521
 522 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 523 {
 524     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 525     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 526     xyz[ZZ] = ind % nc[ZZ];
 527 }
 528
 529 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 530 {
 531     int ddindex;
 532     int ddnodeid = -1;
 533
 534     ddindex = dd_index(dd->nc, c);
 535     if (dd->comm->bCartesianPP_PME)
 536     {
 537         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 538     }
 539     else if (dd->comm->bCartesianPP)
 540     {
 541 #ifdef GMX_MPI
 542         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 543 #endif
 544     }
 545     else
 546     {
 547         ddnodeid = ddindex;
 548     }
 549
 550     return ddnodeid;
 551 }
 552
 553 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 554 {
 555     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 556 }
 557
 558 int ddglatnr(gmx_domdec_t *dd, int i)
 559 {
 560     int atnr;
 561
 562     if (dd == NULL)
 563     {
 564         atnr = i + 1;
 565     }
 566     else
 567     {
 568         if (i >= dd->comm->nat[ddnatNR-1])
 569         {
 570             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 571         }
 572         atnr = dd->gatindex[i] + 1;
 573     }
 574
 575     return atnr;
 576 }
 577
 578 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 579 {
 580     return &dd->comm->cgs_gl;
 581 }
 582
 583 static void vec_rvec_init(vec_rvec_t *v)
 584 {
 585     v->nalloc = 0;
 586     v->v      = NULL;
 587 }
 588
 589 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 590 {
 591     if (n > v->nalloc)
 592     {
 593         v->nalloc = over_alloc_dd(n);
 594         srenew(v->v, v->nalloc);
 595     }
 596 }
 597
 598 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 599 {
 600     int i;
 601
 602     if (state->ddp_count != dd->ddp_count)
 603     {
 604         gmx_incons("The state does not the domain decomposition state");
 605     }
 606
 607     state->ncg_gl = dd->ncg_home;
 608     if (state->ncg_gl > state->cg_gl_nalloc)
 609     {
 610         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 611         srenew(state->cg_gl, state->cg_gl_nalloc);
 612     }
 613     for (i = 0; i < state->ncg_gl; i++)
 614     {
 615         state->cg_gl[i] = dd->index_gl[i];
 616     }
 617
 618     state->ddp_count_cg_gl = dd->ddp_count;
 619 }
 620
 621 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 622 {
 623     return &dd->comm->zones;
 624 }
 625
 626 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 627                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 628 {
 629     gmx_domdec_zones_t *zones;
 630     int                 izone, d, dim;
 631
 632     zones = &dd->comm->zones;
 633
 634     izone = 0;
 635     while (icg >= zones->izone[izone].cg1)
 636     {
 637         izone++;
 638     }
 639
 640     if (izone == 0)
 641     {
 642         *jcg0 = icg;
 643     }
 644     else if (izone < zones->nizone)
 645     {
 646         *jcg0 = zones->izone[izone].jcg0;
 647     }
 648     else
 649     {
 650         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 651                   icg, izone, zones->nizone);
 652     }
 653
 654     *jcg1 = zones->izone[izone].jcg1;
 655
 656     for (d = 0; d < dd->ndim; d++)
 657     {
 658         dim         = dd->dim[d];
 659         shift0[dim] = zones->izone[izone].shift0[dim];
 660         shift1[dim] = zones->izone[izone].shift1[dim];
 661         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 662         {
 663             /* A conservative approach, this can be optimized */
 664             shift0[dim] -= 1;
 665             shift1[dim] += 1;
 666         }
 667     }
 668 }
 669
 670 int dd_natoms_vsite(gmx_domdec_t *dd)
 671 {
 672     return dd->comm->nat[ddnatVSITE];
 673 }
 674
 675 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 676 {
 677     *at_start = dd->comm->nat[ddnatCON-1];
 678     *at_end   = dd->comm->nat[ddnatCON];
 679 }
 680
 681 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 682 {
 683     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 684     int                   *index, *cgindex;
 685     gmx_domdec_comm_t     *comm;
 686     gmx_domdec_comm_dim_t *cd;
 687     gmx_domdec_ind_t      *ind;
 688     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 689     gmx_bool               bPBC, bScrew;
 690
 691     comm = dd->comm;
 692
 693     cgindex = dd->cgindex;
 694
 695     buf = comm->vbuf.v;
 696
 697     nzone   = 1;
 698     nat_tot = dd->nat_home;
 699     for (d = 0; d < dd->ndim; d++)
 700     {
 701         bPBC   = (dd->ci[dd->dim[d]] == 0);
 702         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 703         if (bPBC)
 704         {
 705             copy_rvec(box[dd->dim[d]], shift);
 706         }
 707         cd = &comm->cd[d];
 708         for (p = 0; p < cd->np; p++)
 709         {
 710             ind   = &cd->ind[p];
 711             index = ind->index;
 712             n     = 0;
 713             if (!bPBC)
 714             {
 715                 for (i = 0; i < ind->nsend[nzone]; i++)
 716                 {
 717                     at0 = cgindex[index[i]];
 718                     at1 = cgindex[index[i]+1];
 719                     for (j = at0; j < at1; j++)
 720                     {
 721                         copy_rvec(x[j], buf[n]);
 722                         n++;
 723                     }
 724                 }
 725             }
 726             else if (!bScrew)
 727             {
 728                 for (i = 0; i < ind->nsend[nzone]; i++)
 729                 {
 730                     at0 = cgindex[index[i]];
 731                     at1 = cgindex[index[i]+1];
 732                     for (j = at0; j < at1; j++)
 733                     {
 734                         /* We need to shift the coordinates */
 735                         rvec_add(x[j], shift, buf[n]);
 736                         n++;
 737                     }
 738                 }
 739             }
 740             else
 741             {
 742                 for (i = 0; i < ind->nsend[nzone]; i++)
 743                 {
 744                     at0 = cgindex[index[i]];
 745                     at1 = cgindex[index[i]+1];
 746                     for (j = at0; j < at1; j++)
 747                     {
 748                         /* Shift x */
 749                         buf[n][XX] = x[j][XX] + shift[XX];
 750                         /* Rotate y and z.
 751                          * This operation requires a special shift force
 752                          * treatment, which is performed in calc_vir.
 753                          */
 754                         buf[n][YY] = box[YY][YY] - x[j][YY];
 755                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 756                         n++;
 757                     }
 758                 }
 759             }
 760
 761             if (cd->bInPlace)
 762             {
 763                 rbuf = x + nat_tot;
 764             }
 765             else
 766             {
 767                 rbuf = comm->vbuf2.v;
 768             }
 769             /* Send and receive the coordinates */
 770             dd_sendrecv_rvec(dd, d, dddirBackward,
 771                              buf,  ind->nsend[nzone+1],
 772                              rbuf, ind->nrecv[nzone+1]);
 773             if (!cd->bInPlace)
 774             {
 775                 j = 0;
 776                 for (zone = 0; zone < nzone; zone++)
 777                 {
 778                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 779                     {
 780                         copy_rvec(rbuf[j], x[i]);
 781                         j++;
 782                     }
 783                 }
 784             }
 785             nat_tot += ind->nrecv[nzone+1];
 786         }
 787         nzone += nzone;
 788     }
 789 }
 790
 791 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 792 {
 793     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 794     int                   *index, *cgindex;
 795     gmx_domdec_comm_t     *comm;
 796     gmx_domdec_comm_dim_t *cd;
 797     gmx_domdec_ind_t      *ind;
 798     rvec                  *buf, *sbuf;
 799     ivec                   vis;
 800     int                    is;
 801     gmx_bool               bShiftForcesNeedPbc, bScrew;
 802
 803     comm = dd->comm;
 804
 805     cgindex = dd->cgindex;
 806
 807     buf = comm->vbuf.v;
 808
 809     nzone   = comm->zones.n/2;
 810     nat_tot = dd->nat_tot;
 811     for (d = dd->ndim-1; d >= 0; d--)
 812     {
 813         /* Only forces in domains near the PBC boundaries need to
 814            consider PBC in the treatment of fshift */
 815         bShiftForcesNeedPbc   = (dd->ci[dd->dim[d]] == 0);
 816         bScrew                = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
 817         if (fshift == NULL && !bScrew)
 818         {
 819             bShiftForcesNeedPbc = FALSE;
 820         }
 821         /* Determine which shift vector we need */
 822         clear_ivec(vis);
 823         vis[dd->dim[d]] = 1;
 824         is              = IVEC2IS(vis);
 825
 826         cd = &comm->cd[d];
 827         for (p = cd->np-1; p >= 0; p--)
 828         {
 829             ind      = &cd->ind[p];
 830             nat_tot -= ind->nrecv[nzone+1];
 831             if (cd->bInPlace)
 832             {
 833                 sbuf = f + nat_tot;
 834             }
 835             else
 836             {
 837                 sbuf = comm->vbuf2.v;
 838                 j    = 0;
 839                 for (zone = 0; zone < nzone; zone++)
 840                 {
 841                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 842                     {
 843                         copy_rvec(f[i], sbuf[j]);
 844                         j++;
 845                     }
 846                 }
 847             }
 848             /* Communicate the forces */
 849             dd_sendrecv_rvec(dd, d, dddirForward,
 850                              sbuf, ind->nrecv[nzone+1],
 851                              buf,  ind->nsend[nzone+1]);
 852             index = ind->index;
 853             /* Add the received forces */
 854             n = 0;
 855             if (!bShiftForcesNeedPbc)
 856             {
 857                 for (i = 0; i < ind->nsend[nzone]; i++)
 858                 {
 859                     at0 = cgindex[index[i]];
 860                     at1 = cgindex[index[i]+1];
 861                     for (j = at0; j < at1; j++)
 862                     {
 863                         rvec_inc(f[j], buf[n]);
 864                         n++;
 865                     }
 866                 }
 867             }
 868             else if (!bScrew)
 869             {
 870                 /* fshift should always be defined if this function is
 871                  * called when bShiftForcesNeedPbc is true */
 872                 assert(NULL != fshift);
 873                 for (i = 0; i < ind->nsend[nzone]; i++)
 874                 {
 875                     at0 = cgindex[index[i]];
 876                     at1 = cgindex[index[i]+1];
 877                     for (j = at0; j < at1; j++)
 878                     {
 879                         rvec_inc(f[j], buf[n]);
 880                         /* Add this force to the shift force */
 881                         rvec_inc(fshift[is], buf[n]);
 882                         n++;
 883                     }
 884                 }
 885             }
 886             else
 887             {
 888                 for (i = 0; i < ind->nsend[nzone]; i++)
 889                 {
 890                     at0 = cgindex[index[i]];
 891                     at1 = cgindex[index[i]+1];
 892                     for (j = at0; j < at1; j++)
 893                     {
 894                         /* Rotate the force */
 895                         f[j][XX] += buf[n][XX];
 896                         f[j][YY] -= buf[n][YY];
 897                         f[j][ZZ] -= buf[n][ZZ];
 898                         if (fshift)
 899                         {
 900                             /* Add this force to the shift force */
 901                             rvec_inc(fshift[is], buf[n]);
 902                         }
 903                         n++;
 904                     }
 905                 }
 906             }
 907         }
 908         nzone /= 2;
 909     }
 910 }
 911
 912 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 913 {
 914     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 915     int                   *index, *cgindex;
 916     gmx_domdec_comm_t     *comm;
 917     gmx_domdec_comm_dim_t *cd;
 918     gmx_domdec_ind_t      *ind;
 919     real                  *buf, *rbuf;
 920
 921     comm = dd->comm;
 922
 923     cgindex = dd->cgindex;
 924
 925     buf = &comm->vbuf.v[0][0];
 926
 927     nzone   = 1;
 928     nat_tot = dd->nat_home;
 929     for (d = 0; d < dd->ndim; d++)
 930     {
 931         cd = &comm->cd[d];
 932         for (p = 0; p < cd->np; p++)
 933         {
 934             ind   = &cd->ind[p];
 935             index = ind->index;
 936             n     = 0;
 937             for (i = 0; i < ind->nsend[nzone]; i++)
 938             {
 939                 at0 = cgindex[index[i]];
 940                 at1 = cgindex[index[i]+1];
 941                 for (j = at0; j < at1; j++)
 942                 {
 943                     buf[n] = v[j];
 944                     n++;
 945                 }
 946             }
 947
 948             if (cd->bInPlace)
 949             {
 950                 rbuf = v + nat_tot;
 951             }
 952             else
 953             {
 954                 rbuf = &comm->vbuf2.v[0][0];
 955             }
 956             /* Send and receive the coordinates */
 957             dd_sendrecv_real(dd, d, dddirBackward,
 958                              buf,  ind->nsend[nzone+1],
 959                              rbuf, ind->nrecv[nzone+1]);
 960             if (!cd->bInPlace)
 961             {
 962                 j = 0;
 963                 for (zone = 0; zone < nzone; zone++)
 964                 {
 965                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 966                     {
 967                         v[i] = rbuf[j];
 968                         j++;
 969                     }
 970                 }
 971             }
 972             nat_tot += ind->nrecv[nzone+1];
 973         }
 974         nzone += nzone;
 975     }
 976 }
 977
 978 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 979 {
 980     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 981     int                   *index, *cgindex;
 982     gmx_domdec_comm_t     *comm;
 983     gmx_domdec_comm_dim_t *cd;
 984     gmx_domdec_ind_t      *ind;
 985     real                  *buf, *sbuf;
 986
 987     comm = dd->comm;
 988
 989     cgindex = dd->cgindex;
 990
 991     buf = &comm->vbuf.v[0][0];
 992
 993     nzone   = comm->zones.n/2;
 994     nat_tot = dd->nat_tot;
 995     for (d = dd->ndim-1; d >= 0; d--)
 996     {
 997         cd = &comm->cd[d];
 998         for (p = cd->np-1; p >= 0; p--)
 999         {
1000             ind      = &cd->ind[p];
1001             nat_tot -= ind->nrecv[nzone+1];
1002             if (cd->bInPlace)
1003             {
1004                 sbuf = v + nat_tot;
1005             }
1006             else
1007             {
1008                 sbuf = &comm->vbuf2.v[0][0];
1009                 j    = 0;
1010                 for (zone = 0; zone < nzone; zone++)
1011                 {
1012                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
1013                     {
1014                         sbuf[j] = v[i];
1015                         j++;
1016                     }
1017                 }
1018             }
1019             /* Communicate the forces */
1020             dd_sendrecv_real(dd, d, dddirForward,
1021                              sbuf, ind->nrecv[nzone+1],
1022                              buf,  ind->nsend[nzone+1]);
1023             index = ind->index;
1024             /* Add the received forces */
1025             n = 0;
1026             for (i = 0; i < ind->nsend[nzone]; i++)
1027             {
1028                 at0 = cgindex[index[i]];
1029                 at1 = cgindex[index[i]+1];
1030                 for (j = at0; j < at1; j++)
1031                 {
1032                     v[j] += buf[n];
1033                     n++;
1034                 }
1035             }
1036         }
1037         nzone /= 2;
1038     }
1039 }
1040
1041 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
1042 {
1043     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
1044             d, i, j,
1045             zone->min0, zone->max1,
1046             zone->mch0, zone->mch0,
1047             zone->p1_0, zone->p1_1);
1048 }
1049
1050
1051 #define DDZONECOMM_MAXZONE  5
1052 #define DDZONECOMM_BUFSIZE  3
1053
1054 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
1055                                int ddimind, int direction,
1056                                gmx_ddzone_t *buf_s, int n_s,
1057                                gmx_ddzone_t *buf_r, int n_r)
1058 {
1059 #define ZBS  DDZONECOMM_BUFSIZE
1060     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1061     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1062     int  i;
1063
1064     for (i = 0; i < n_s; i++)
1065     {
1066         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1067         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1068         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1069         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1070         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1071         vbuf_s[i*ZBS+1][2] = 0;
1072         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1073         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1074         vbuf_s[i*ZBS+2][2] = 0;
1075     }
1076
1077     dd_sendrecv_rvec(dd, ddimind, direction,
1078                      vbuf_s, n_s*ZBS,
1079                      vbuf_r, n_r*ZBS);
1080
1081     for (i = 0; i < n_r; i++)
1082     {
1083         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1084         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1085         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1086         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1087         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1088         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1089         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1090     }
1091
1092 #undef ZBS
1093 }
1094
1095 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
1096                           rvec cell_ns_x0, rvec cell_ns_x1)
1097 {
1098     int                d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
1099     gmx_ddzone_t      *zp;
1100     gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
1101     gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
1102     gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
1103     rvec               extr_s[2], extr_r[2];
1104     rvec               dh;
1105     real               dist_d, c = 0, det;
1106     gmx_domdec_comm_t *comm;
1107     gmx_bool           bPBC, bUse;
1108
1109     comm = dd->comm;
1110
1111     for (d = 1; d < dd->ndim; d++)
1112     {
1113         dim      = dd->dim[d];
1114         zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1115         zp->min0 = cell_ns_x0[dim];
1116         zp->max1 = cell_ns_x1[dim];
1117         zp->min1 = cell_ns_x1[dim];
1118         zp->mch0 = cell_ns_x0[dim];
1119         zp->mch1 = cell_ns_x1[dim];
1120         zp->p1_0 = cell_ns_x0[dim];
1121         zp->p1_1 = cell_ns_x1[dim];
1122     }
1123
1124     for (d = dd->ndim-2; d >= 0; d--)
1125     {
1126         dim  = dd->dim[d];
1127         bPBC = (dim < ddbox->npbcdim);
1128
1129         /* Use an rvec to store two reals */
1130         extr_s[d][0] = comm->cell_f0[d+1];
1131         extr_s[d][1] = comm->cell_f1[d+1];
1132         extr_s[d][2] = comm->cell_f1[d+1];
1133
1134         pos = 0;
1135         /* Store the extremes in the backward sending buffer,
1136          * so the get updated separately from the forward communication.
1137          */
1138         for (d1 = d; d1 < dd->ndim-1; d1++)
1139         {
1140             /* We invert the order to be able to use the same loop for buf_e */
1141             buf_s[pos].min0 = extr_s[d1][1];
1142             buf_s[pos].max1 = extr_s[d1][0];
1143             buf_s[pos].min1 = extr_s[d1][2];
1144             buf_s[pos].mch0 = 0;
1145             buf_s[pos].mch1 = 0;
1146             /* Store the cell corner of the dimension we communicate along */
1147             buf_s[pos].p1_0 = comm->cell_x0[dim];
1148             buf_s[pos].p1_1 = 0;
1149             pos++;
1150         }
1151
1152         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1153         pos++;
1154
1155         if (dd->ndim == 3 && d == 0)
1156         {
1157             buf_s[pos] = comm->zone_d2[0][1];
1158             pos++;
1159             buf_s[pos] = comm->zone_d1[0];
1160             pos++;
1161         }
1162
1163         /* We only need to communicate the extremes
1164          * in the forward direction
1165          */
1166         npulse = comm->cd[d].np;
1167         if (bPBC)
1168         {
1169             /* Take the minimum to avoid double communication */
1170             npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
1171         }
1172         else
1173         {
1174             /* Without PBC we should really not communicate over
1175              * the boundaries, but implementing that complicates
1176              * the communication setup and therefore we simply
1177              * do all communication, but ignore some data.
1178              */
1179             npulse_min = npulse;
1180         }
1181         for (p = 0; p < npulse_min; p++)
1182         {
1183             /* Communicate the extremes forward */
1184             bUse = (bPBC || dd->ci[dim] > 0);
1185
1186             dd_sendrecv_rvec(dd, d, dddirForward,
1187                              extr_s+d, dd->ndim-d-1,
1188                              extr_r+d, dd->ndim-d-1);
1189
1190             if (bUse)
1191             {
1192                 for (d1 = d; d1 < dd->ndim-1; d1++)
1193                 {
1194                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
1195                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
1196                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
1197                 }
1198             }
1199         }
1200
1201         buf_size = pos;
1202         for (p = 0; p < npulse; p++)
1203         {
1204             /* Communicate all the zone information backward */
1205             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1206
1207             dd_sendrecv_ddzone(dd, d, dddirBackward,
1208                                buf_s, buf_size,
1209                                buf_r, buf_size);
1210
1211             clear_rvec(dh);
1212             if (p > 0)
1213             {
1214                 for (d1 = d+1; d1 < dd->ndim; d1++)
1215                 {
1216                     /* Determine the decrease of maximum required
1217                      * communication height along d1 due to the distance along d,
1218                      * this avoids a lot of useless atom communication.
1219                      */
1220                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1221
1222                     if (ddbox->tric_dir[dim])
1223                     {
1224                         /* c is the off-diagonal coupling between the cell planes
1225                          * along directions d and d1.
1226                          */
1227                         c = ddbox->v[dim][dd->dim[d1]][dim];
1228                     }
1229                     else
1230                     {
1231                         c = 0;
1232                     }
1233                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1234                     if (det > 0)
1235                     {
1236                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1237                     }
1238                     else
1239                     {
1240                         /* A negative value signals out of range */
1241                         dh[d1] = -1;
1242                     }
1243                 }
1244             }
1245
1246             /* Accumulate the extremes over all pulses */
1247             for (i = 0; i < buf_size; i++)
1248             {
1249                 if (p == 0)
1250                 {
1251                     buf_e[i] = buf_r[i];
1252                 }
1253                 else
1254                 {
1255                     if (bUse)
1256                     {
1257                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
1258                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
1259                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
1260                     }
1261
1262                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1263                     {
1264                         d1 = 1;
1265                     }
1266                     else
1267                     {
1268                         d1 = d + 1;
1269                     }
1270                     if (bUse && dh[d1] >= 0)
1271                     {
1272                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
1273                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
1274                     }
1275                 }
1276                 /* Copy the received buffer to the send buffer,
1277                  * to pass the data through with the next pulse.
1278                  */
1279                 buf_s[i] = buf_r[i];
1280             }
1281             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1282                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1283             {
1284                 /* Store the extremes */
1285                 pos = 0;
1286
1287                 for (d1 = d; d1 < dd->ndim-1; d1++)
1288                 {
1289                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
1290                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
1291                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
1292                     pos++;
1293                 }
1294
1295                 if (d == 1 || (d == 0 && dd->ndim == 3))
1296                 {
1297                     for (i = d; i < 2; i++)
1298                     {
1299                         comm->zone_d2[1-d][i] = buf_e[pos];
1300                         pos++;
1301                     }
1302                 }
1303                 if (d == 0)
1304                 {
1305                     comm->zone_d1[1] = buf_e[pos];
1306                     pos++;
1307                 }
1308             }
1309         }
1310     }
1311
1312     if (dd->ndim >= 2)
1313     {
1314         dim = dd->dim[1];
1315         for (i = 0; i < 2; i++)
1316         {
1317             if (debug)
1318             {
1319                 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1320             }
1321             cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1322             cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1323         }
1324     }
1325     if (dd->ndim >= 3)
1326     {
1327         dim = dd->dim[2];
1328         for (i = 0; i < 2; i++)
1329         {
1330             for (j = 0; j < 2; j++)
1331             {
1332                 if (debug)
1333                 {
1334                     print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1335                 }
1336                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1337                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1338             }
1339         }
1340     }
1341     for (d = 1; d < dd->ndim; d++)
1342     {
1343         comm->cell_f_max0[d] = extr_s[d-1][0];
1344         comm->cell_f_min1[d] = extr_s[d-1][1];
1345         if (debug)
1346         {
1347             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1348                     d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1349         }
1350     }
1351 }
1352
1353 static void dd_collect_cg(gmx_domdec_t *dd,
1354                           t_state      *state_local)
1355 {
1356     gmx_domdec_master_t *ma = NULL;
1357     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1358
1359     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1360     {
1361         /* The master has the correct distribution */
1362         return;
1363     }
1364
1365     if (state_local->ddp_count == dd->ddp_count)
1366     {
1367         /* The local state and DD are in sync, use the DD indices */
1368         ncg_home = dd->ncg_home;
1369         cg       = dd->index_gl;
1370         nat_home = dd->nat_home;
1371     }
1372     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1373     {
1374         /* The DD is out of sync with the local state, but we have stored
1375          * the cg indices with the local state, so we can use those.
1376          */
1377         t_block *cgs_gl;
1378
1379         cgs_gl = &dd->comm->cgs_gl;
1380
1381         ncg_home = state_local->ncg_gl;
1382         cg       = state_local->cg_gl;
1383         nat_home = 0;
1384         for (i = 0; i < ncg_home; i++)
1385         {
1386             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1387         }
1388     }
1389     else
1390     {
1391         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1392     }
1393
1394     buf2[0] = ncg_home;
1395     buf2[1] = nat_home;
1396     if (DDMASTER(dd))
1397     {
1398         ma   = dd->ma;
1399         ibuf = ma->ibuf;
1400     }
1401     else
1402     {
1403         ibuf = NULL;
1404     }
1405     /* Collect the charge group and atom counts on the master */
1406     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1407
1408     if (DDMASTER(dd))
1409     {
1410         ma->index[0] = 0;
1411         for (i = 0; i < dd->nnodes; i++)
1412         {
1413             ma->ncg[i]     = ma->ibuf[2*i];
1414             ma->nat[i]     = ma->ibuf[2*i+1];
1415             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1416
1417         }
1418         /* Make byte counts and indices */
1419         for (i = 0; i < dd->nnodes; i++)
1420         {
1421             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1422             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1423         }
1424         if (debug)
1425         {
1426             fprintf(debug, "Initial charge group distribution: ");
1427             for (i = 0; i < dd->nnodes; i++)
1428             {
1429                 fprintf(debug, " %d", ma->ncg[i]);
1430             }
1431             fprintf(debug, "\n");
1432         }
1433     }
1434
1435     /* Collect the charge group indices on the master */
1436     dd_gatherv(dd,
1437                ncg_home*sizeof(int), cg,
1438                DDMASTER(dd) ? ma->ibuf : NULL,
1439                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1440                DDMASTER(dd) ? ma->cg : NULL);
1441
1442     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1443 }
1444
1445 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1446                                     rvec *lv, rvec *v)
1447 {
1448     gmx_domdec_master_t *ma;
1449     int                  n, i, c, a, nalloc = 0;
1450     rvec                *buf = NULL;
1451     t_block             *cgs_gl;
1452
1453     ma = dd->ma;
1454
1455     if (!DDMASTER(dd))
1456     {
1457 #ifdef GMX_MPI
1458         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1459                  dd->rank, dd->mpi_comm_all);
1460 #endif
1461     }
1462     else
1463     {
1464         /* Copy the master coordinates to the global array */
1465         cgs_gl = &dd->comm->cgs_gl;
1466
1467         n = DDMASTERRANK(dd);
1468         a = 0;
1469         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1470         {
1471             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1472             {
1473                 copy_rvec(lv[a++], v[c]);
1474             }
1475         }
1476
1477         for (n = 0; n < dd->nnodes; n++)
1478         {
1479             if (n != dd->rank)
1480             {
1481                 if (ma->nat[n] > nalloc)
1482                 {
1483                     nalloc = over_alloc_dd(ma->nat[n]);
1484                     srenew(buf, nalloc);
1485                 }
1486 #ifdef GMX_MPI
1487                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1488                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1489 #endif
1490                 a = 0;
1491                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1492                 {
1493                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1494                     {
1495                         copy_rvec(buf[a++], v[c]);
1496                     }
1497                 }
1498             }
1499         }
1500         sfree(buf);
1501     }
1502 }
1503
1504 static void get_commbuffer_counts(gmx_domdec_t *dd,
1505                                   int **counts, int **disps)
1506 {
1507     gmx_domdec_master_t *ma;
1508     int                  n;
1509
1510     ma = dd->ma;
1511
1512     /* Make the rvec count and displacment arrays */
1513     *counts  = ma->ibuf;
1514     *disps   = ma->ibuf + dd->nnodes;
1515     for (n = 0; n < dd->nnodes; n++)
1516     {
1517         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1518         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1519     }
1520 }
1521
1522 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1523                                    rvec *lv, rvec *v)
1524 {
1525     gmx_domdec_master_t *ma;
1526     int                 *rcounts = NULL, *disps = NULL;
1527     int                  n, i, c, a;
1528     rvec                *buf = NULL;
1529     t_block             *cgs_gl;
1530
1531     ma = dd->ma;
1532
1533     if (DDMASTER(dd))
1534     {
1535         get_commbuffer_counts(dd, &rcounts, &disps);
1536
1537         buf = ma->vbuf;
1538     }
1539
1540     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1541
1542     if (DDMASTER(dd))
1543     {
1544         cgs_gl = &dd->comm->cgs_gl;
1545
1546         a = 0;
1547         for (n = 0; n < dd->nnodes; n++)
1548         {
1549             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1550             {
1551                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1552                 {
1553                     copy_rvec(buf[a++], v[c]);
1554                 }
1555             }
1556         }
1557     }
1558 }
1559
1560 void dd_collect_vec(gmx_domdec_t *dd,
1561                     t_state *state_local, rvec *lv, rvec *v)
1562 {
1563     dd_collect_cg(dd, state_local);
1564
1565     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1566     {
1567         dd_collect_vec_sendrecv(dd, lv, v);
1568     }
1569     else
1570     {
1571         dd_collect_vec_gatherv(dd, lv, v);
1572     }
1573 }
1574
1575
1576 void dd_collect_state(gmx_domdec_t *dd,
1577                       t_state *state_local, t_state *state)
1578 {
1579     int est, i, j, nh;
1580
1581     nh = state->nhchainlength;
1582
1583     if (DDMASTER(dd))
1584     {
1585         for (i = 0; i < efptNR; i++)
1586         {
1587             state->lambda[i] = state_local->lambda[i];
1588         }
1589         state->fep_state = state_local->fep_state;
1590         state->veta      = state_local->veta;
1591         state->vol0      = state_local->vol0;
1592         copy_mat(state_local->box, state->box);
1593         copy_mat(state_local->boxv, state->boxv);
1594         copy_mat(state_local->svir_prev, state->svir_prev);
1595         copy_mat(state_local->fvir_prev, state->fvir_prev);
1596         copy_mat(state_local->pres_prev, state->pres_prev);
1597
1598         for (i = 0; i < state_local->ngtc; i++)
1599         {
1600             for (j = 0; j < nh; j++)
1601             {
1602                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1603                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1604             }
1605             state->therm_integral[i] = state_local->therm_integral[i];
1606         }
1607         for (i = 0; i < state_local->nnhpres; i++)
1608         {
1609             for (j = 0; j < nh; j++)
1610             {
1611                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1612                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1613             }
1614         }
1615     }
1616     for (est = 0; est < estNR; est++)
1617     {
1618         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1619         {
1620             switch (est)
1621             {
1622                 case estX:
1623                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1624                     break;
1625                 case estV:
1626                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1627                     break;
1628                 case estSDX:
1629                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1630                     break;
1631                 case estCGP:
1632                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1633                     break;
1634                 case estDISRE_INITF:
1635                 case estDISRE_RM3TAV:
1636                 case estORIRE_INITF:
1637                 case estORIRE_DTAV:
1638                     break;
1639                 default:
1640                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1641             }
1642         }
1643     }
1644 }
1645
1646 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1647 {
1648     int est;
1649
1650     if (debug)
1651     {
1652         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1653     }
1654
1655     state->nalloc = over_alloc_dd(nalloc);
1656
1657     for (est = 0; est < estNR; est++)
1658     {
1659         if (EST_DISTR(est) && (state->flags & (1<<est)))
1660         {
1661             switch (est)
1662             {
1663                 case estX:
1664                     srenew(state->x, state->nalloc);
1665                     break;
1666                 case estV:
1667                     srenew(state->v, state->nalloc);
1668                     break;
1669                 case estSDX:
1670                     srenew(state->sd_X, state->nalloc);
1671                     break;
1672                 case estCGP:
1673                     srenew(state->cg_p, state->nalloc);
1674                     break;
1675                 case estDISRE_INITF:
1676                 case estDISRE_RM3TAV:
1677                 case estORIRE_INITF:
1678                 case estORIRE_DTAV:
1679                     /* No reallocation required */
1680                     break;
1681                 default:
1682                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1683             }
1684         }
1685     }
1686
1687     if (f != NULL)
1688     {
1689         srenew(*f, state->nalloc);
1690     }
1691 }
1692
1693 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1694                                int nalloc)
1695 {
1696     if (nalloc > fr->cg_nalloc)
1697     {
1698         if (debug)
1699         {
1700             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1701         }
1702         fr->cg_nalloc = over_alloc_dd(nalloc);
1703         srenew(fr->cginfo, fr->cg_nalloc);
1704         if (fr->cutoff_scheme == ecutsGROUP)
1705         {
1706             srenew(fr->cg_cm, fr->cg_nalloc);
1707         }
1708     }
1709     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1710     {
1711         /* We don't use charge groups, we use x in state to set up
1712          * the atom communication.
1713          */
1714         dd_realloc_state(state, f, nalloc);
1715     }
1716 }
1717
1718 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1719                                        rvec *v, rvec *lv)
1720 {
1721     gmx_domdec_master_t *ma;
1722     int                  n, i, c, a, nalloc = 0;
1723     rvec                *buf = NULL;
1724
1725     if (DDMASTER(dd))
1726     {
1727         ma  = dd->ma;
1728
1729         for (n = 0; n < dd->nnodes; n++)
1730         {
1731             if (n != dd->rank)
1732             {
1733                 if (ma->nat[n] > nalloc)
1734                 {
1735                     nalloc = over_alloc_dd(ma->nat[n]);
1736                     srenew(buf, nalloc);
1737                 }
1738                 /* Use lv as a temporary buffer */
1739                 a = 0;
1740                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1741                 {
1742                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1743                     {
1744                         copy_rvec(v[c], buf[a++]);
1745                     }
1746                 }
1747                 if (a != ma->nat[n])
1748                 {
1749                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1750                               a, ma->nat[n]);
1751                 }
1752
1753 #ifdef GMX_MPI
1754                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1755                          DDRANK(dd, n), n, dd->mpi_comm_all);
1756 #endif
1757             }
1758         }
1759         sfree(buf);
1760         n = DDMASTERRANK(dd);
1761         a = 0;
1762         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1763         {
1764             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1765             {
1766                 copy_rvec(v[c], lv[a++]);
1767             }
1768         }
1769     }
1770     else
1771     {
1772 #ifdef GMX_MPI
1773         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1774                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1775 #endif
1776     }
1777 }
1778
1779 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1780                                        rvec *v, rvec *lv)
1781 {
1782     gmx_domdec_master_t *ma;
1783     int                 *scounts = NULL, *disps = NULL;
1784     int                  n, i, c, a;
1785     rvec                *buf = NULL;
1786
1787     if (DDMASTER(dd))
1788     {
1789         ma  = dd->ma;
1790
1791         get_commbuffer_counts(dd, &scounts, &disps);
1792
1793         buf = ma->vbuf;
1794         a   = 0;
1795         for (n = 0; n < dd->nnodes; n++)
1796         {
1797             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1798             {
1799                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1800                 {
1801                     copy_rvec(v[c], buf[a++]);
1802                 }
1803             }
1804         }
1805     }
1806
1807     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1808 }
1809
1810 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1811 {
1812     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1813     {
1814         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1815     }
1816     else
1817     {
1818         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1819     }
1820 }
1821
1822 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1823 {
1824     int i;
1825     dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1826     dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1827     dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1828
1829     if (dfhist->nlambda > 0)
1830     {
1831         int nlam = dfhist->nlambda;
1832         dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1833         dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1834         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1835         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1836         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1837         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1838
1839         for (i = 0; i < nlam; i++)
1840         {
1841             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1842             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1843             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1844             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1845             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1846             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
1847         }
1848     }
1849 }
1850
1851 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1852                                 t_state *state, t_state *state_local,
1853                                 rvec **f)
1854 {
1855     int  i, j, nh;
1856
1857     nh = state->nhchainlength;
1858
1859     if (DDMASTER(dd))
1860     {
1861         for (i = 0; i < efptNR; i++)
1862         {
1863             state_local->lambda[i] = state->lambda[i];
1864         }
1865         state_local->fep_state = state->fep_state;
1866         state_local->veta      = state->veta;
1867         state_local->vol0      = state->vol0;
1868         copy_mat(state->box, state_local->box);
1869         copy_mat(state->box_rel, state_local->box_rel);
1870         copy_mat(state->boxv, state_local->boxv);
1871         copy_mat(state->svir_prev, state_local->svir_prev);
1872         copy_mat(state->fvir_prev, state_local->fvir_prev);
1873         copy_df_history(&state_local->dfhist, &state->dfhist);
1874         for (i = 0; i < state_local->ngtc; i++)
1875         {
1876             for (j = 0; j < nh; j++)
1877             {
1878                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1879                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1880             }
1881             state_local->therm_integral[i] = state->therm_integral[i];
1882         }
1883         for (i = 0; i < state_local->nnhpres; i++)
1884         {
1885             for (j = 0; j < nh; j++)
1886             {
1887                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1888                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1889             }
1890         }
1891     }
1892     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1893     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1894     dd_bcast(dd, sizeof(real), &state_local->veta);
1895     dd_bcast(dd, sizeof(real), &state_local->vol0);
1896     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1897     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1898     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1899     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1900     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1901     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1902     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1903     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1904     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1905     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1906
1907     /* communicate df_history -- required for restarting from checkpoint */
1908     dd_distribute_dfhist(dd, &state_local->dfhist);
1909
1910     if (dd->nat_home > state_local->nalloc)
1911     {
1912         dd_realloc_state(state_local, f, dd->nat_home);
1913     }
1914     for (i = 0; i < estNR; i++)
1915     {
1916         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1917         {
1918             switch (i)
1919             {
1920                 case estX:
1921                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1922                     break;
1923                 case estV:
1924                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1925                     break;
1926                 case estSDX:
1927                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1928                     break;
1929                 case estCGP:
1930                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1931                     break;
1932                 case estDISRE_INITF:
1933                 case estDISRE_RM3TAV:
1934                 case estORIRE_INITF:
1935                 case estORIRE_DTAV:
1936                     /* Not implemented yet */
1937                     break;
1938                 default:
1939                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1940             }
1941         }
1942     }
1943 }
1944
1945 static char dim2char(int dim)
1946 {
1947     char c = '?';
1948
1949     switch (dim)
1950     {
1951         case XX: c = 'X'; break;
1952         case YY: c = 'Y'; break;
1953         case ZZ: c = 'Z'; break;
1954         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1955     }
1956
1957     return c;
1958 }
1959
1960 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1961                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1962 {
1963     rvec   grid_s[2], *grid_r = NULL, cx, r;
1964     char   fname[STRLEN], buf[22];
1965     FILE  *out;
1966     int    a, i, d, z, y, x;
1967     matrix tric;
1968     real   vol;
1969
1970     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1971     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1972
1973     if (DDMASTER(dd))
1974     {
1975         snew(grid_r, 2*dd->nnodes);
1976     }
1977
1978     dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : NULL);
1979
1980     if (DDMASTER(dd))
1981     {
1982         for (d = 0; d < DIM; d++)
1983         {
1984             for (i = 0; i < DIM; i++)
1985             {
1986                 if (d == i)
1987                 {
1988                     tric[d][i] = 1;
1989                 }
1990                 else
1991                 {
1992                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1993                     {
1994                         tric[d][i] = box[i][d]/box[i][i];
1995                     }
1996                     else
1997                     {
1998                         tric[d][i] = 0;
1999                     }
2000                 }
2001             }
2002         }
2003         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
2004         out = gmx_fio_fopen(fname, "w");
2005         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2006         a = 1;
2007         for (i = 0; i < dd->nnodes; i++)
2008         {
2009             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
2010             for (d = 0; d < DIM; d++)
2011             {
2012                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
2013             }
2014             for (z = 0; z < 2; z++)
2015             {
2016                 for (y = 0; y < 2; y++)
2017                 {
2018                     for (x = 0; x < 2; x++)
2019                     {
2020                         cx[XX] = grid_r[i*2+x][XX];
2021                         cx[YY] = grid_r[i*2+y][YY];
2022                         cx[ZZ] = grid_r[i*2+z][ZZ];
2023                         mvmul(tric, cx, r);
2024                         gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
2025                                                  10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
2026                     }
2027                 }
2028             }
2029             for (d = 0; d < DIM; d++)
2030             {
2031                 for (x = 0; x < 4; x++)
2032                 {
2033                     switch (d)
2034                     {
2035                         case 0: y = 1 + i*8 + 2*x; break;
2036                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
2037                         case 2: y = 1 + i*8 + x; break;
2038                     }
2039                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
2040                 }
2041             }
2042         }
2043         gmx_fio_fclose(out);
2044         sfree(grid_r);
2045     }
2046 }
2047
2048 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
2049                   gmx_mtop_t *mtop, t_commrec *cr,
2050                   int natoms, rvec x[], matrix box)
2051 {
2052     char          fname[STRLEN], buf[22];
2053     FILE         *out;
2054     int           i, ii, resnr, c;
2055     char         *atomname, *resname;
2056     real          b;
2057     gmx_domdec_t *dd;
2058
2059     dd = cr->dd;
2060     if (natoms == -1)
2061     {
2062         natoms = dd->comm->nat[ddnatVSITE];
2063     }
2064
2065     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
2066
2067     out = gmx_fio_fopen(fname, "w");
2068
2069     fprintf(out, "TITLE     %s\n", title);
2070     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2071     for (i = 0; i < natoms; i++)
2072     {
2073         ii = dd->gatindex[i];
2074         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
2075         if (i < dd->comm->nat[ddnatZONE])
2076         {
2077             c = 0;
2078             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2079             {
2080                 c++;
2081             }
2082             b = c;
2083         }
2084         else if (i < dd->comm->nat[ddnatVSITE])
2085         {
2086             b = dd->comm->zones.n;
2087         }
2088         else
2089         {
2090             b = dd->comm->zones.n + 1;
2091         }
2092         gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
2093                                  10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
2094     }
2095     fprintf(out, "TER\n");
2096
2097     gmx_fio_fclose(out);
2098 }
2099
2100 real dd_cutoff_multibody(const gmx_domdec_t *dd)
2101 {
2102     gmx_domdec_comm_t *comm;
2103     int                di;
2104     real               r;
2105
2106     comm = dd->comm;
2107
2108     r = -1;
2109     if (comm->bInterCGBondeds)
2110     {
2111         if (comm->cutoff_mbody > 0)
2112         {
2113             r = comm->cutoff_mbody;
2114         }
2115         else
2116         {
2117             /* cutoff_mbody=0 means we do not have DLB */
2118             r = comm->cellsize_min[dd->dim[0]];
2119             for (di = 1; di < dd->ndim; di++)
2120             {
2121                 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
2122             }
2123             if (comm->bBondComm)
2124             {
2125                 r = std::max(r, comm->cutoff_mbody);
2126             }
2127             else
2128             {
2129                 r = std::min(r, comm->cutoff);
2130             }
2131         }
2132     }
2133
2134     return r;
2135 }
2136
2137 real dd_cutoff_twobody(const gmx_domdec_t *dd)
2138 {
2139     real r_mb;
2140
2141     r_mb = dd_cutoff_multibody(dd);
2142
2143     return std::max(dd->comm->cutoff, r_mb);
2144 }
2145
2146
2147 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
2148 {
2149     int nc, ntot;
2150
2151     nc   = dd->nc[dd->comm->cartpmedim];
2152     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2153     copy_ivec(coord, coord_pme);
2154     coord_pme[dd->comm->cartpmedim] =
2155         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2156 }
2157
2158 static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
2159 {
2160     /* Here we assign a PME node to communicate with this DD node
2161      * by assuming that the major index of both is x.
2162      * We add cr->npmenodes/2 to obtain an even distribution.
2163      */
2164     return (ddindex*npme + npme/2)/ndd;
2165 }
2166
2167 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
2168 {
2169     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
2170 }
2171
2172 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
2173 {
2174     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
2175 }
2176
2177 static int *dd_pmenodes(t_commrec *cr)
2178 {
2179     int *pmenodes;
2180     int  n, i, p0, p1;
2181
2182     snew(pmenodes, cr->npmenodes);
2183     n = 0;
2184     for (i = 0; i < cr->dd->nnodes; i++)
2185     {
2186         p0 = cr_ddindex2pmeindex(cr, i);
2187         p1 = cr_ddindex2pmeindex(cr, i+1);
2188         if (i+1 == cr->dd->nnodes || p1 > p0)
2189         {
2190             if (debug)
2191             {
2192                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
2193             }
2194             pmenodes[n] = i + 1 + n;
2195             n++;
2196         }
2197     }
2198
2199     return pmenodes;
2200 }
2201
2202 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
2203 {
2204     gmx_domdec_t *dd;
2205     ivec          coords;
2206     int           slab;
2207
2208     dd = cr->dd;
2209     /*
2210        if (dd->comm->bCartesian) {
2211        gmx_ddindex2xyz(dd->nc,ddindex,coords);
2212        dd_coords2pmecoords(dd,coords,coords_pme);
2213        copy_ivec(dd->ntot,nc);
2214        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2215        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2216
2217        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2218        } else {
2219        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2220        }
2221      */
2222     coords[XX] = x;
2223     coords[YY] = y;
2224     coords[ZZ] = z;
2225     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
2226
2227     return slab;
2228 }
2229
2230 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
2231 {
2232     gmx_domdec_comm_t *comm;
2233     ivec               coords;
2234     int                ddindex, nodeid = -1;
2235
2236     comm = cr->dd->comm;
2237
2238     coords[XX] = x;
2239     coords[YY] = y;
2240     coords[ZZ] = z;
2241     if (comm->bCartesianPP_PME)
2242     {
2243 #ifdef GMX_MPI
2244         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
2245 #endif
2246     }
2247     else
2248     {
2249         ddindex = dd_index(cr->dd->nc, coords);
2250         if (comm->bCartesianPP)
2251         {
2252             nodeid = comm->ddindex2simnodeid[ddindex];
2253         }
2254         else
2255         {
2256             if (comm->pmenodes)
2257             {
2258                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
2259             }
2260             else
2261             {
2262                 nodeid = ddindex;
2263             }
2264         }
2265     }
2266
2267     return nodeid;
2268 }
2269
2270 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
2271 {
2272     gmx_domdec_t      *dd;
2273     gmx_domdec_comm_t *comm;
2274     int                i;
2275     int                pmenode = -1;
2276
2277     dd   = cr->dd;
2278     comm = dd->comm;
2279
2280     /* This assumes a uniform x domain decomposition grid cell size */
2281     if (comm->bCartesianPP_PME)
2282     {
2283 #ifdef GMX_MPI
2284         ivec coord, coord_pme;
2285         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2286         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2287         {
2288             /* This is a PP node */
2289             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2290             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2291         }
2292 #endif
2293     }
2294     else if (comm->bCartesianPP)
2295     {
2296         if (sim_nodeid < dd->nnodes)
2297         {
2298             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2299         }
2300     }
2301     else
2302     {
2303         /* This assumes DD cells with identical x coordinates
2304          * are numbered sequentially.
2305          */
2306         if (dd->comm->pmenodes == NULL)
2307         {
2308             if (sim_nodeid < dd->nnodes)
2309             {
2310                 /* The DD index equals the nodeid */
2311                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2312             }
2313         }
2314         else
2315         {
2316             i = 0;
2317             while (sim_nodeid > dd->comm->pmenodes[i])
2318             {
2319                 i++;
2320             }
2321             if (sim_nodeid < dd->comm->pmenodes[i])
2322             {
2323                 pmenode = dd->comm->pmenodes[i];
2324             }
2325         }
2326     }
2327
2328     return pmenode;
2329 }
2330
2331 void get_pme_nnodes(const gmx_domdec_t *dd,
2332                     int *npmenodes_x, int *npmenodes_y)
2333 {
2334     if (dd != NULL)
2335     {
2336         *npmenodes_x = dd->comm->npmenodes_x;
2337         *npmenodes_y = dd->comm->npmenodes_y;
2338     }
2339     else
2340     {
2341         *npmenodes_x = 1;
2342         *npmenodes_y = 1;
2343     }
2344 }
2345
2346 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2347                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2348 {
2349     gmx_domdec_t *dd;
2350     int           x, y, z;
2351     ivec          coord, coord_pme;
2352
2353     dd = cr->dd;
2354
2355     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2356
2357     *nmy_ddnodes = 0;
2358     for (x = 0; x < dd->nc[XX]; x++)
2359     {
2360         for (y = 0; y < dd->nc[YY]; y++)
2361         {
2362             for (z = 0; z < dd->nc[ZZ]; z++)
2363             {
2364                 if (dd->comm->bCartesianPP_PME)
2365                 {
2366                     coord[XX] = x;
2367                     coord[YY] = y;
2368                     coord[ZZ] = z;
2369                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2370                     if (dd->ci[XX] == coord_pme[XX] &&
2371                         dd->ci[YY] == coord_pme[YY] &&
2372                         dd->ci[ZZ] == coord_pme[ZZ])
2373                     {
2374                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2375                     }
2376                 }
2377                 else
2378                 {
2379                     /* The slab corresponds to the nodeid in the PME group */
2380                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2381                     {
2382                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2383                     }
2384                 }
2385             }
2386         }
2387     }
2388
2389     /* The last PP-only node is the peer node */
2390     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2391
2392     if (debug)
2393     {
2394         fprintf(debug, "Receive coordinates from PP ranks:");
2395         for (x = 0; x < *nmy_ddnodes; x++)
2396         {
2397             fprintf(debug, " %d", (*my_ddnodes)[x]);
2398         }
2399         fprintf(debug, "\n");
2400     }
2401 }
2402
2403 static gmx_bool receive_vir_ener(t_commrec *cr)
2404 {
2405     gmx_domdec_comm_t *comm;
2406     int                pmenode;
2407     gmx_bool           bReceive;
2408
2409     bReceive = TRUE;
2410     if (cr->npmenodes < cr->dd->nnodes)
2411     {
2412         comm = cr->dd->comm;
2413         if (comm->bCartesianPP_PME)
2414         {
2415             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2416 #ifdef GMX_MPI
2417             ivec coords;
2418             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2419             coords[comm->cartpmedim]++;
2420             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2421             {
2422                 int rank;
2423                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2424                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2425                 {
2426                     /* This is not the last PP node for pmenode */
2427                     bReceive = FALSE;
2428                 }
2429             }
2430 #endif
2431         }
2432         else
2433         {
2434             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2435             if (cr->sim_nodeid+1 < cr->nnodes &&
2436                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2437             {
2438                 /* This is not the last PP node for pmenode */
2439                 bReceive = FALSE;
2440             }
2441         }
2442     }
2443
2444     return bReceive;
2445 }
2446
2447 static void set_zones_ncg_home(gmx_domdec_t *dd)
2448 {
2449     gmx_domdec_zones_t *zones;
2450     int                 i;
2451
2452     zones = &dd->comm->zones;
2453
2454     zones->cg_range[0] = 0;
2455     for (i = 1; i < zones->n+1; i++)
2456     {
2457         zones->cg_range[i] = dd->ncg_home;
2458     }
2459     /* zone_ncg1[0] should always be equal to ncg_home */
2460     dd->comm->zone_ncg1[0] = dd->ncg_home;
2461 }
2462
2463 static void rebuild_cgindex(gmx_domdec_t *dd,
2464                             const int *gcgs_index, t_state *state)
2465 {
2466     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2467
2468     ind        = state->cg_gl;
2469     dd_cg_gl   = dd->index_gl;
2470     cgindex    = dd->cgindex;
2471     nat        = 0;
2472     cgindex[0] = nat;
2473     for (i = 0; i < state->ncg_gl; i++)
2474     {
2475         cgindex[i]  = nat;
2476         cg_gl       = ind[i];
2477         dd_cg_gl[i] = cg_gl;
2478         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2479     }
2480     cgindex[i] = nat;
2481
2482     dd->ncg_home = state->ncg_gl;
2483     dd->nat_home = nat;
2484
2485     set_zones_ncg_home(dd);
2486 }
2487
2488 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2489 {
2490     while (cg >= cginfo_mb->cg_end)
2491     {
2492         cginfo_mb++;
2493     }
2494
2495     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2496 }
2497
2498 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2499                           t_forcerec *fr, char *bLocalCG)
2500 {
2501     cginfo_mb_t *cginfo_mb;
2502     int         *cginfo;
2503     int          cg;
2504
2505     if (fr != NULL)
2506     {
2507         cginfo_mb = fr->cginfo_mb;
2508         cginfo    = fr->cginfo;
2509
2510         for (cg = cg0; cg < cg1; cg++)
2511         {
2512             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2513         }
2514     }
2515
2516     if (bLocalCG != NULL)
2517     {
2518         for (cg = cg0; cg < cg1; cg++)
2519         {
2520             bLocalCG[index_gl[cg]] = TRUE;
2521         }
2522     }
2523 }
2524
2525 static void make_dd_indices(gmx_domdec_t *dd,
2526                             const int *gcgs_index, int cg_start)
2527 {
2528     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2529     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2530     gmx_bool     bCGs;
2531
2532     if (dd->nat_tot > dd->gatindex_nalloc)
2533     {
2534         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2535         srenew(dd->gatindex, dd->gatindex_nalloc);
2536     }
2537
2538     nzone      = dd->comm->zones.n;
2539     zone2cg    = dd->comm->zones.cg_range;
2540     zone_ncg1  = dd->comm->zone_ncg1;
2541     index_gl   = dd->index_gl;
2542     gatindex   = dd->gatindex;
2543     bCGs       = dd->comm->bCGs;
2544
2545     if (zone2cg[1] != dd->ncg_home)
2546     {
2547         gmx_incons("dd->ncg_zone is not up to date");
2548     }
2549
2550     /* Make the local to global and global to local atom index */
2551     a = dd->cgindex[cg_start];
2552     for (zone = 0; zone < nzone; zone++)
2553     {
2554         if (zone == 0)
2555         {
2556             cg0 = cg_start;
2557         }
2558         else
2559         {
2560             cg0 = zone2cg[zone];
2561         }
2562         cg1    = zone2cg[zone+1];
2563         cg1_p1 = cg0 + zone_ncg1[zone];
2564
2565         for (cg = cg0; cg < cg1; cg++)
2566         {
2567             zone1 = zone;
2568             if (cg >= cg1_p1)
2569             {
2570                 /* Signal that this cg is from more than one pulse away */
2571                 zone1 += nzone;
2572             }
2573             cg_gl = index_gl[cg];
2574             if (bCGs)
2575             {
2576                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2577                 {
2578                     gatindex[a] = a_gl;
2579                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2580                     a++;
2581                 }
2582             }
2583             else
2584             {
2585                 gatindex[a] = cg_gl;
2586                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2587                 a++;
2588             }
2589         }
2590     }
2591 }
2592
2593 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2594                           const char *where)
2595 {
2596     int i, ngl, nerr;
2597
2598     nerr = 0;
2599     if (bLocalCG == NULL)
2600     {
2601         return nerr;
2602     }
2603     for (i = 0; i < dd->ncg_tot; i++)
2604     {
2605         if (!bLocalCG[dd->index_gl[i]])
2606         {
2607             fprintf(stderr,
2608                     "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2609             nerr++;
2610         }
2611     }
2612     ngl = 0;
2613     for (i = 0; i < ncg_sys; i++)
2614     {
2615         if (bLocalCG[i])
2616         {
2617             ngl++;
2618         }
2619     }
2620     if (ngl != dd->ncg_tot)
2621     {
2622         fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2623         nerr++;
2624     }
2625
2626     return nerr;
2627 }
2628
2629 static void check_index_consistency(gmx_domdec_t *dd,
2630                                     int natoms_sys, int ncg_sys,
2631                                     const char *where)
2632 {
2633     int   nerr, ngl, i, a, cell;
2634     int  *have;
2635
2636     nerr = 0;
2637
2638     if (dd->comm->DD_debug > 1)
2639     {
2640         snew(have, natoms_sys);
2641         for (a = 0; a < dd->nat_tot; a++)
2642         {
2643             if (have[dd->gatindex[a]] > 0)
2644             {
2645                 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2646             }
2647             else
2648             {
2649                 have[dd->gatindex[a]] = a + 1;
2650             }
2651         }
2652         sfree(have);
2653     }
2654
2655     snew(have, dd->nat_tot);
2656
2657     ngl  = 0;
2658     for (i = 0; i < natoms_sys; i++)
2659     {
2660         if (ga2la_get(dd->ga2la, i, &a, &cell))
2661         {
2662             if (a >= dd->nat_tot)
2663             {
2664                 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2665                 nerr++;
2666             }
2667             else
2668             {
2669                 have[a] = 1;
2670                 if (dd->gatindex[a] != i)
2671                 {
2672                     fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2673                     nerr++;
2674                 }
2675             }
2676             ngl++;
2677         }
2678     }
2679     if (ngl != dd->nat_tot)
2680     {
2681         fprintf(stderr,
2682                 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2683                 dd->rank, where, ngl, dd->nat_tot);
2684     }
2685     for (a = 0; a < dd->nat_tot; a++)
2686     {
2687         if (have[a] == 0)
2688         {
2689             fprintf(stderr,
2690                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
2691                     dd->rank, where, a+1, dd->gatindex[a]+1);
2692         }
2693     }
2694     sfree(have);
2695
2696     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2697
2698     if (nerr > 0)
2699     {
2700         gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2701                   dd->rank, where, nerr);
2702     }
2703 }
2704
2705 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2706 {
2707     int   i;
2708     char *bLocalCG;
2709
2710     if (a_start == 0)
2711     {
2712         /* Clear the whole list without searching */
2713         ga2la_clear(dd->ga2la);
2714     }
2715     else
2716     {
2717         for (i = a_start; i < dd->nat_tot; i++)
2718         {
2719             ga2la_del(dd->ga2la, dd->gatindex[i]);
2720         }
2721     }
2722
2723     bLocalCG = dd->comm->bLocalCG;
2724     if (bLocalCG)
2725     {
2726         for (i = cg_start; i < dd->ncg_tot; i++)
2727         {
2728             bLocalCG[dd->index_gl[i]] = FALSE;
2729         }
2730     }
2731
2732     dd_clear_local_vsite_indices(dd);
2733
2734     if (dd->constraints)
2735     {
2736         dd_clear_local_constraint_indices(dd);
2737     }
2738 }
2739
2740 /* This function should be used for moving the domain boudaries during DLB,
2741  * for obtaining the minimum cell size. It checks the initially set limit
2742  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2743  * and, possibly, a longer cut-off limit set for PME load balancing.
2744  */
2745 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2746 {
2747     real cellsize_min;
2748
2749     cellsize_min = comm->cellsize_min[dim];
2750
2751     if (!comm->bVacDLBNoLimit)
2752     {
2753         /* The cut-off might have changed, e.g. by PME load balacning,
2754          * from the value used to set comm->cellsize_min, so check it.
2755          */
2756         cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2757
2758         if (comm->bPMELoadBalDLBLimits)
2759         {
2760             /* Check for the cut-off limit set by the PME load balancing */
2761             cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2762         }
2763     }
2764
2765     return cellsize_min;
2766 }
2767
2768 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2769                             int dim_ind)
2770 {
2771     real grid_jump_limit;
2772
2773     /* The distance between the boundaries of cells at distance
2774      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2775      * and by the fact that cells should not be shifted by more than
2776      * half their size, such that cg's only shift by one cell
2777      * at redecomposition.
2778      */
2779     grid_jump_limit = comm->cellsize_limit;
2780     if (!comm->bVacDLBNoLimit)
2781     {
2782         if (comm->bPMELoadBalDLBLimits)
2783         {
2784             cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
2785         }
2786         grid_jump_limit = std::max(grid_jump_limit,
2787                                    cutoff/comm->cd[dim_ind].np);
2788     }
2789
2790     return grid_jump_limit;
2791 }
2792
2793 static gmx_bool check_grid_jump(gmx_int64_t     step,
2794                                 gmx_domdec_t   *dd,
2795                                 real            cutoff,
2796                                 gmx_ddbox_t    *ddbox,
2797                                 gmx_bool        bFatal)
2798 {
2799     gmx_domdec_comm_t *comm;
2800     int                d, dim;
2801     real               limit, bfac;
2802     gmx_bool           bInvalid;
2803
2804     bInvalid = FALSE;
2805
2806     comm = dd->comm;
2807
2808     for (d = 1; d < dd->ndim; d++)
2809     {
2810         dim   = dd->dim[d];
2811         limit = grid_jump_limit(comm, cutoff, d);
2812         bfac  = ddbox->box_size[dim];
2813         if (ddbox->tric_dir[dim])
2814         {
2815             bfac *= ddbox->skew_fac[dim];
2816         }
2817         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2818                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2819         {
2820             bInvalid = TRUE;
2821
2822             if (bFatal)
2823             {
2824                 char buf[22];
2825
2826                 /* This error should never be triggered under normal
2827                  * circumstances, but you never know ...
2828                  */
2829                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2830                           gmx_step_str(step, buf),
2831                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2832             }
2833         }
2834     }
2835
2836     return bInvalid;
2837 }
2838
2839 static int dd_load_count(gmx_domdec_comm_t *comm)
2840 {
2841     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2842 }
2843
2844 static float dd_force_load(gmx_domdec_comm_t *comm)
2845 {
2846     float load;
2847
2848     if (comm->eFlop)
2849     {
2850         load = comm->flop;
2851         if (comm->eFlop > 1)
2852         {
2853             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2854         }
2855     }
2856     else
2857     {
2858         load = comm->cycl[ddCyclF];
2859         if (comm->cycl_n[ddCyclF] > 1)
2860         {
2861             /* Subtract the maximum of the last n cycle counts
2862              * to get rid of possible high counts due to other sources,
2863              * for instance system activity, that would otherwise
2864              * affect the dynamic load balancing.
2865              */
2866             load -= comm->cycl_max[ddCyclF];
2867         }
2868
2869 #ifdef GMX_MPI
2870         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2871         {
2872             float gpu_wait, gpu_wait_sum;
2873
2874             gpu_wait = comm->cycl[ddCyclWaitGPU];
2875             if (comm->cycl_n[ddCyclF] > 1)
2876             {
2877                 /* We should remove the WaitGPU time of the same MD step
2878                  * as the one with the maximum F time, since the F time
2879                  * and the wait time are not independent.
2880                  * Furthermore, the step for the max F time should be chosen
2881                  * the same on all ranks that share the same GPU.
2882                  * But to keep the code simple, we remove the average instead.
2883                  * The main reason for artificially long times at some steps
2884                  * is spurious CPU activity or MPI time, so we don't expect
2885                  * that changes in the GPU wait time matter a lot here.
2886                  */
2887                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2888             }
2889             /* Sum the wait times over the ranks that share the same GPU */
2890             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2891                           comm->mpi_comm_gpu_shared);
2892             /* Replace the wait time by the average over the ranks */
2893             load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2894         }
2895 #endif
2896     }
2897
2898     return load;
2899 }
2900
2901 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2902 {
2903     gmx_domdec_comm_t *comm;
2904     int                i;
2905
2906     comm = dd->comm;
2907
2908     snew(*dim_f, dd->nc[dim]+1);
2909     (*dim_f)[0] = 0;
2910     for (i = 1; i < dd->nc[dim]; i++)
2911     {
2912         if (comm->slb_frac[dim])
2913         {
2914             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2915         }
2916         else
2917         {
2918             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2919         }
2920     }
2921     (*dim_f)[dd->nc[dim]] = 1;
2922 }
2923
2924 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2925 {
2926     int  pmeindex, slab, nso, i;
2927     ivec xyz;
2928
2929     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2930     {
2931         ddpme->dim = YY;
2932     }
2933     else
2934     {
2935         ddpme->dim = dimind;
2936     }
2937     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2938
2939     ddpme->nslab = (ddpme->dim == 0 ?
2940                     dd->comm->npmenodes_x :
2941                     dd->comm->npmenodes_y);
2942
2943     if (ddpme->nslab <= 1)
2944     {
2945         return;
2946     }
2947
2948     nso = dd->comm->npmenodes/ddpme->nslab;
2949     /* Determine for each PME slab the PP location range for dimension dim */
2950     snew(ddpme->pp_min, ddpme->nslab);
2951     snew(ddpme->pp_max, ddpme->nslab);
2952     for (slab = 0; slab < ddpme->nslab; slab++)
2953     {
2954         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2955         ddpme->pp_max[slab] = 0;
2956     }
2957     for (i = 0; i < dd->nnodes; i++)
2958     {
2959         ddindex2xyz(dd->nc, i, xyz);
2960         /* For y only use our y/z slab.
2961          * This assumes that the PME x grid size matches the DD grid size.
2962          */
2963         if (dimind == 0 || xyz[XX] == dd->ci[XX])
2964         {
2965             pmeindex = ddindex2pmeindex(dd, i);
2966             if (dimind == 0)
2967             {
2968                 slab = pmeindex/nso;
2969             }
2970             else
2971             {
2972                 slab = pmeindex % ddpme->nslab;
2973             }
2974             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
2975             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
2976         }
2977     }
2978
2979     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2980 }
2981
2982 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2983 {
2984     if (dd->comm->ddpme[0].dim == XX)
2985     {
2986         return dd->comm->ddpme[0].maxshift;
2987     }
2988     else
2989     {
2990         return 0;
2991     }
2992 }
2993
2994 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2995 {
2996     if (dd->comm->ddpme[0].dim == YY)
2997     {
2998         return dd->comm->ddpme[0].maxshift;
2999     }
3000     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
3001     {
3002         return dd->comm->ddpme[1].maxshift;
3003     }
3004     else
3005     {
3006         return 0;
3007     }
3008 }
3009
3010 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
3011                              gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
3012 {
3013     gmx_domdec_comm_t *comm;
3014     int                nc, ns, s;
3015     int               *xmin, *xmax;
3016     real               range, pme_boundary;
3017     int                sh;
3018
3019     comm = dd->comm;
3020     nc   = dd->nc[ddpme->dim];
3021     ns   = ddpme->nslab;
3022
3023     if (!ddpme->dim_match)
3024     {
3025         /* PP decomposition is not along dim: the worst situation */
3026         sh = ns/2;
3027     }
3028     else if (ns <= 3 || (bUniform && ns == nc))
3029     {
3030         /* The optimal situation */
3031         sh = 1;
3032     }
3033     else
3034     {
3035         /* We need to check for all pme nodes which nodes they
3036          * could possibly need to communicate with.
3037          */
3038         xmin = ddpme->pp_min;
3039         xmax = ddpme->pp_max;
3040         /* Allow for atoms to be maximally 2/3 times the cut-off
3041          * out of their DD cell. This is a reasonable balance between
3042          * between performance and support for most charge-group/cut-off
3043          * combinations.
3044          */
3045         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
3046         /* Avoid extra communication when we are exactly at a boundary */
3047         range *= 0.999;
3048
3049         sh = 1;
3050         for (s = 0; s < ns; s++)
3051         {
3052             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
3053             pme_boundary = (real)s/ns;
3054             while (sh+1 < ns &&
3055                    ((s-(sh+1) >= 0 &&
3056                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
3057                     (s-(sh+1) <  0 &&
3058                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
3059             {
3060                 sh++;
3061             }
3062             pme_boundary = (real)(s+1)/ns;
3063             while (sh+1 < ns &&
3064                    ((s+(sh+1) <  ns &&
3065                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
3066                     (s+(sh+1) >= ns &&
3067                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
3068             {
3069                 sh++;
3070             }
3071         }
3072     }
3073
3074     ddpme->maxshift = sh;
3075
3076     if (debug)
3077     {
3078         fprintf(debug, "PME slab communication range for dim %d is %d\n",
3079                 ddpme->dim, ddpme->maxshift);
3080     }
3081 }
3082
3083 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3084 {
3085     int d, dim;
3086
3087     for (d = 0; d < dd->ndim; d++)
3088     {
3089         dim = dd->dim[d];
3090         if (dim < ddbox->nboundeddim &&
3091             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3092             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3093         {
3094             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3095                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3096                       dd->nc[dim], dd->comm->cellsize_limit);
3097         }
3098     }
3099 }
3100
3101 enum {
3102     setcellsizeslbLOCAL, setcellsizeslbMASTER, setcellsizeslbPULSE_ONLY
3103 };
3104
3105 /* Set the domain boundaries. Use for static (or no) load balancing,
3106  * and also for the starting state for dynamic load balancing.
3107  * setmode determine if and where the boundaries are stored, use enum above.
3108  * Returns the number communication pulses in npulse.
3109  */
3110 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
3111                                   int setmode, ivec npulse)
3112 {
3113     gmx_domdec_comm_t *comm;
3114     int                d, j;
3115     rvec               cellsize_min;
3116     real              *cell_x, cell_dx, cellsize;
3117
3118     comm = dd->comm;
3119
3120     for (d = 0; d < DIM; d++)
3121     {
3122         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3123         npulse[d]       = 1;
3124         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3125         {
3126             /* Uniform grid */
3127             cell_dx = ddbox->box_size[d]/dd->nc[d];
3128             switch (setmode)
3129             {
3130                 case setcellsizeslbMASTER:
3131                     for (j = 0; j < dd->nc[d]+1; j++)
3132                     {
3133                         dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3134                     }
3135                     break;
3136                 case setcellsizeslbLOCAL:
3137                     comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3138                     comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3139                     break;
3140                 default:
3141                     break;
3142             }
3143             cellsize = cell_dx*ddbox->skew_fac[d];
3144             while (cellsize*npulse[d] < comm->cutoff)
3145             {
3146                 npulse[d]++;
3147             }
3148             cellsize_min[d] = cellsize;
3149         }
3150         else
3151         {
3152             /* Statically load balanced grid */
3153             /* Also when we are not doing a master distribution we determine
3154              * all cell borders in a loop to obtain identical values
3155              * to the master distribution case and to determine npulse.
3156              */
3157             if (setmode == setcellsizeslbMASTER)
3158             {
3159                 cell_x = dd->ma->cell_x[d];
3160             }
3161             else
3162             {
3163                 snew(cell_x, dd->nc[d]+1);
3164             }
3165             cell_x[0] = ddbox->box0[d];
3166             for (j = 0; j < dd->nc[d]; j++)
3167             {
3168                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
3169                 cell_x[j+1] = cell_x[j] + cell_dx;
3170                 cellsize    = cell_dx*ddbox->skew_fac[d];
3171                 while (cellsize*npulse[d] < comm->cutoff &&
3172                        npulse[d] < dd->nc[d]-1)
3173                 {
3174                     npulse[d]++;
3175                 }
3176                 cellsize_min[d] = std::min(cellsize_min[d], cellsize);
3177             }
3178             if (setmode == setcellsizeslbLOCAL)
3179             {
3180                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3181                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3182             }
3183             if (setmode != setcellsizeslbMASTER)
3184             {
3185                 sfree(cell_x);
3186             }
3187         }
3188         /* The following limitation is to avoid that a cell would receive
3189          * some of its own home charge groups back over the periodic boundary.
3190          * Double charge groups cause trouble with the global indices.
3191          */
3192         if (d < ddbox->npbcdim &&
3193             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3194         {
3195             char error_string[STRLEN];
3196
3197             sprintf(error_string,
3198                     "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3199                     dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
3200                     comm->cutoff,
3201                     dd->nc[d], dd->nc[d],
3202                     dd->nnodes > dd->nc[d] ? "cells" : "ranks");
3203
3204             if (setmode == setcellsizeslbLOCAL)
3205             {
3206                 gmx_fatal_collective(FARGS, NULL, dd, error_string);
3207             }
3208             else
3209             {
3210                 gmx_fatal(FARGS, error_string);
3211             }
3212         }
3213     }
3214
3215     if (!comm->bDynLoadBal)
3216     {
3217         copy_rvec(cellsize_min, comm->cellsize_min);
3218     }
3219
3220     for (d = 0; d < comm->npmedecompdim; d++)
3221     {
3222         set_pme_maxshift(dd, &comm->ddpme[d],
3223                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
3224                          comm->ddpme[d].slb_dim_f);
3225     }
3226 }
3227
3228
3229 static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
3230                                                   int d, int dim, gmx_domdec_root_t *root,
3231                                                   gmx_ddbox_t *ddbox,
3232                                                   gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
3233 {
3234     gmx_domdec_comm_t *comm;
3235     int                ncd, i, j, nmin, nmin_old;
3236     gmx_bool           bLimLo, bLimHi;
3237     real              *cell_size;
3238     real               fac, halfway, cellsize_limit_f_i, region_size;
3239     gmx_bool           bPBC, bLastHi = FALSE;
3240     int                nrange[] = {range[0], range[1]};
3241
3242     region_size = root->cell_f[range[1]]-root->cell_f[range[0]];
3243
3244     comm = dd->comm;
3245
3246     ncd = dd->nc[dim];
3247
3248     bPBC = (dim < ddbox->npbcdim);
3249
3250     cell_size = root->buf_ncd;
3251
3252     if (debug)
3253     {
3254         fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
3255     }
3256
3257     /* First we need to check if the scaling does not make cells
3258      * smaller than the smallest allowed size.
3259      * We need to do this iteratively, since if a cell is too small,
3260      * it needs to be enlarged, which makes all the other cells smaller,
3261      * which could in turn make another cell smaller than allowed.
3262      */
3263     for (i = range[0]; i < range[1]; i++)
3264     {
3265         root->bCellMin[i] = FALSE;
3266     }
3267     nmin = 0;
3268     do
3269     {
3270         nmin_old = nmin;
3271         /* We need the total for normalization */
3272         fac = 0;
3273         for (i = range[0]; i < range[1]; i++)
3274         {
3275             if (root->bCellMin[i] == FALSE)
3276             {
3277                 fac += cell_size[i];
3278             }
3279         }
3280         fac = ( region_size - nmin*cellsize_limit_f)/fac; /* substracting cells already set to cellsize_limit_f */
3281         /* Determine the cell boundaries */
3282         for (i = range[0]; i < range[1]; i++)
3283         {
3284             if (root->bCellMin[i] == FALSE)
3285             {
3286                 cell_size[i] *= fac;
3287                 if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
3288                 {
3289                     cellsize_limit_f_i = 0;
3290                 }
3291                 else
3292                 {
3293                     cellsize_limit_f_i = cellsize_limit_f;
3294                 }
3295                 if (cell_size[i] < cellsize_limit_f_i)
3296                 {
3297                     root->bCellMin[i] = TRUE;
3298                     cell_size[i]      = cellsize_limit_f_i;
3299                     nmin++;
3300                 }
3301             }
3302             root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
3303         }
3304     }
3305     while (nmin > nmin_old);
3306
3307     i            = range[1]-1;
3308     cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
3309     /* For this check we should not use DD_CELL_MARGIN,
3310      * but a slightly smaller factor,
3311      * since rounding could get use below the limit.
3312      */
3313     if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
3314     {
3315         char buf[22];
3316         gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
3317                   gmx_step_str(step, buf),
3318                   dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3319                   ncd, comm->cellsize_min[dim]);
3320     }
3321
3322     root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);
3323
3324     if (!bUniform)
3325     {
3326         /* Check if the boundary did not displace more than halfway
3327          * each of the cells it bounds, as this could cause problems,
3328          * especially when the differences between cell sizes are large.
3329          * If changes are applied, they will not make cells smaller
3330          * than the cut-off, as we check all the boundaries which
3331          * might be affected by a change and if the old state was ok,
3332          * the cells will at most be shrunk back to their old size.
3333          */
3334         for (i = range[0]+1; i < range[1]; i++)
3335         {
3336             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
3337             if (root->cell_f[i] < halfway)
3338             {
3339                 root->cell_f[i] = halfway;
3340                 /* Check if the change also causes shifts of the next boundaries */
3341                 for (j = i+1; j < range[1]; j++)
3342                 {
3343                     if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
3344                     {
3345                         root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
3346                     }
3347                 }
3348             }
3349             halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
3350             if (root->cell_f[i] > halfway)
3351             {
3352                 root->cell_f[i] = halfway;
3353                 /* Check if the change also causes shifts of the next boundaries */
3354                 for (j = i-1; j >= range[0]+1; j--)
3355                 {
3356                     if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
3357                     {
3358                         root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
3359                     }
3360                 }
3361             }
3362         }
3363     }
3364
3365     /* nrange is defined as [lower, upper) range for new call to enforce_limits */
3366     /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
3367      * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
3368      * for a and b nrange is used */
3369     if (d > 0)
3370     {
3371         /* Take care of the staggering of the cell boundaries */
3372         if (bUniform)
3373         {
3374             for (i = range[0]; i < range[1]; i++)
3375             {
3376                 root->cell_f_max0[i] = root->cell_f[i];
3377                 root->cell_f_min1[i] = root->cell_f[i+1];
3378             }
3379         }
3380         else
3381         {
3382             for (i = range[0]+1; i < range[1]; i++)
3383             {
3384                 bLimLo = (root->cell_f[i] < root->bound_min[i]);
3385                 bLimHi = (root->cell_f[i] > root->bound_max[i]);
3386                 if (bLimLo && bLimHi)
3387                 {
3388                     /* Both limits violated, try the best we can */
3389                     /* For this case we split the original range (range) in two parts and care about the other limitiations in the next iteration. */
3390                     root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
3391                     nrange[0]       = range[0];
3392                     nrange[1]       = i;
3393                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3394
3395                     nrange[0] = i;
3396                     nrange[1] = range[1];
3397                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3398
3399                     return;
3400                 }
3401                 else if (bLimLo)
3402                 {
3403                     /* root->cell_f[i] = root->bound_min[i]; */
3404                     nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with an higher index */
3405                     bLastHi   = FALSE;
3406                 }
3407                 else if (bLimHi && !bLastHi)
3408                 {
3409                     bLastHi = TRUE;
3410                     if (nrange[1] < range[1])   /* found a LimLo before */
3411                     {
3412                         root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3413                         dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3414                         nrange[0] = nrange[1];
3415                     }
3416                     root->cell_f[i] = root->bound_max[i];
3417                     nrange[1]       = i;
3418                     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3419                     nrange[0] = i;
3420                     nrange[1] = range[1];
3421                 }
3422             }
3423             if (nrange[1] < range[1])   /* found last a LimLo */
3424             {
3425                 root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
3426                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3427                 nrange[0] = nrange[1];
3428                 nrange[1] = range[1];
3429                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3430             }
3431             else if (nrange[0] > range[0]) /* found at least one LimHi */
3432             {
3433                 dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
3434             }
3435         }
3436     }
3437 }
3438
3439
3440 static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
3441                                        int d, int dim, gmx_domdec_root_t *root,
3442                                        gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3443                                        gmx_bool bUniform, gmx_int64_t step)
3444 {
3445     gmx_domdec_comm_t *comm;
3446     int                ncd, d1, i, pos;
3447     real              *cell_size;
3448     real               load_aver, load_i, imbalance, change, change_max, sc;
3449     real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
3450     real               change_limit;
3451     real               relax = 0.5;
3452     gmx_bool           bPBC;
3453     int                range[] = { 0, 0 };
3454
3455     comm = dd->comm;
3456
3457     /* Convert the maximum change from the input percentage to a fraction */
3458     change_limit = comm->dlb_scale_lim*0.01;
3459
3460     ncd = dd->nc[dim];
3461
3462     bPBC = (dim < ddbox->npbcdim);
3463
3464     cell_size = root->buf_ncd;
3465
3466     /* Store the original boundaries */
3467     for (i = 0; i < ncd+1; i++)
3468     {
3469         root->old_cell_f[i] = root->cell_f[i];
3470     }
3471     if (bUniform)
3472     {
3473         for (i = 0; i < ncd; i++)
3474         {
3475             cell_size[i] = 1.0/ncd;
3476         }
3477     }
3478     else if (dd_load_count(comm) > 0)
3479     {
3480         load_aver  = comm->load[d].sum_m/ncd;
3481         change_max = 0;
3482         for (i = 0; i < ncd; i++)
3483         {
3484             /* Determine the relative imbalance of cell i */
3485             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3486             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3487             /* Determine the change of the cell size using underrelaxation */
3488             change     = -relax*imbalance;
3489             change_max = std::max(change_max, std::max(change, -change));
3490         }
3491         /* Limit the amount of scaling.
3492          * We need to use the same rescaling for all cells in one row,
3493          * otherwise the load balancing might not converge.
3494          */
3495         sc = relax;
3496         if (change_max > change_limit)
3497         {
3498             sc *= change_limit/change_max;
3499         }
3500         for (i = 0; i < ncd; i++)
3501         {
3502             /* Determine the relative imbalance of cell i */
3503             load_i    = comm->load[d].load[i*comm->load[d].nload+2];
3504             imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
3505             /* Determine the change of the cell size using underrelaxation */
3506             change       = -sc*imbalance;
3507             cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
3508         }
3509     }
3510
3511     cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
3512     cellsize_limit_f *= DD_CELL_MARGIN;
3513     dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
3514     dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
3515     if (ddbox->tric_dir[dim])
3516     {
3517         cellsize_limit_f /= ddbox->skew_fac[dim];
3518         dist_min_f       /= ddbox->skew_fac[dim];
3519     }
3520     if (bDynamicBox && d > 0)
3521     {
3522         dist_min_f *= DD_PRES_SCALE_MARGIN;
3523     }
3524     if (d > 0 && !bUniform)
3525     {
3526         /* Make sure that the grid is not shifted too much */
3527         for (i = 1; i < ncd; i++)
3528         {
3529             if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
3530             {
3531                 gmx_incons("Inconsistent DD boundary staggering limits!");
3532             }
3533             root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
3534             space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
3535             if (space > 0)
3536             {
3537                 root->bound_min[i] += 0.5*space;
3538             }
3539             root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
3540             space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
3541             if (space < 0)
3542             {
3543                 root->bound_max[i] += 0.5*space;
3544             }
3545             if (debug)
3546             {
3547                 fprintf(debug,
3548                         "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
3549                         d, i,
3550                         root->cell_f_max0[i-1] + dist_min_f,
3551                         root->bound_min[i], root->cell_f[i], root->bound_max[i],
3552                         root->cell_f_min1[i] - dist_min_f);
3553             }
3554         }
3555     }
3556     range[1]          = ncd;
3557     root->cell_f[0]   = 0;
3558     root->cell_f[ncd] = 1;
3559     dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);
3560
3561
3562     /* After the checks above, the cells should obey the cut-off
3563      * restrictions, but it does not hurt to check.
3564      */
3565     for (i = 0; i < ncd; i++)
3566     {
3567         if (debug)
3568         {
3569             fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
3570                     dim, i, root->cell_f[i], root->cell_f[i+1]);
3571         }
3572
3573         if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
3574             root->cell_f[i+1] - root->cell_f[i] <
3575             cellsize_limit_f/DD_CELL_MARGIN)
3576         {
3577             char buf[22];
3578             fprintf(stderr,
3579                     "\nWARNING step %s: direction %c, cell %d too small: %f\n",
3580                     gmx_step_str(step, buf), dim2char(dim), i,
3581                     (root->cell_f[i+1] - root->cell_f[i])
3582                     *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
3583         }
3584     }
3585
3586     pos = ncd + 1;
3587     /* Store the cell boundaries of the lower dimensions at the end */
3588     for (d1 = 0; d1 < d; d1++)
3589     {
3590         root->cell_f[pos++] = comm->cell_f0[d1];
3591         root->cell_f[pos++] = comm->cell_f1[d1];
3592     }
3593
3594     if (d < comm->npmedecompdim)
3595     {
3596         /* The master determines the maximum shift for
3597          * the coordinate communication between separate PME nodes.
3598          */
3599         set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
3600     }
3601     root->cell_f[pos++] = comm->ddpme[0].maxshift;
3602     if (d >= 1)
3603     {
3604         root->cell_f[pos++] = comm->ddpme[1].maxshift;
3605     }
3606 }
3607
3608 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3609                                              gmx_ddbox_t *ddbox, int dimind)
3610 {
3611     gmx_domdec_comm_t *comm;
3612     int                dim;
3613
3614     comm = dd->comm;
3615
3616     /* Set the cell dimensions */
3617     dim                = dd->dim[dimind];
3618     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3619     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3620     if (dim >= ddbox->nboundeddim)
3621     {
3622         comm->cell_x0[dim] += ddbox->box0[dim];
3623         comm->cell_x1[dim] += ddbox->box0[dim];
3624     }
3625 }
3626
3627 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3628                                          int d, int dim, real *cell_f_row,
3629                                          gmx_ddbox_t *ddbox)
3630 {
3631     gmx_domdec_comm_t *comm;
3632     int                d1, pos;
3633
3634     comm = dd->comm;
3635
3636 #ifdef GMX_MPI
3637     /* Each node would only need to know two fractions,
3638      * but it is probably cheaper to broadcast the whole array.
3639      */
3640     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3641               0, comm->mpi_comm_load[d]);
3642 #endif
3643     /* Copy the fractions for this dimension from the buffer */
3644     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3645     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3646     /* The whole array was communicated, so set the buffer position */
3647     pos = dd->nc[dim] + 1;
3648     for (d1 = 0; d1 <= d; d1++)
3649     {
3650         if (d1 < d)
3651         {
3652             /* Copy the cell fractions of the lower dimensions */
3653             comm->cell_f0[d1] = cell_f_row[pos++];
3654             comm->cell_f1[d1] = cell_f_row[pos++];
3655         }
3656         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3657     }
3658     /* Convert the communicated shift from float to int */
3659     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3660     if (d >= 1)
3661     {
3662         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3663     }
3664 }
3665
3666 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3667                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3668                                          gmx_bool bUniform, gmx_int64_t step)
3669 {
3670     gmx_domdec_comm_t *comm;
3671     int                d, dim, d1;
3672     gmx_bool           bRowMember, bRowRoot;
3673     real              *cell_f_row;
3674
3675     comm = dd->comm;
3676
3677     for (d = 0; d < dd->ndim; d++)
3678     {
3679         dim        = dd->dim[d];
3680         bRowMember = TRUE;
3681         bRowRoot   = TRUE;
3682         for (d1 = d; d1 < dd->ndim; d1++)
3683         {
3684             if (dd->ci[dd->dim[d1]] > 0)
3685             {
3686                 if (d1 != d)
3687                 {
3688                     bRowMember = FALSE;
3689                 }
3690                 bRowRoot = FALSE;
3691             }
3692         }
3693         if (bRowMember)
3694         {
3695             if (bRowRoot)
3696             {
3697                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3698                                            ddbox, bDynamicBox, bUniform, step);
3699                 cell_f_row = comm->root[d]->cell_f;
3700             }
3701             else
3702             {
3703                 cell_f_row = comm->cell_f_row;
3704             }
3705             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3706         }
3707     }
3708 }
3709
3710 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3711 {
3712     int d;
3713
3714     /* This function assumes the box is static and should therefore
3715      * not be called when the box has changed since the last
3716      * call to dd_partition_system.
3717      */
3718     for (d = 0; d < dd->ndim; d++)
3719     {
3720         relative_to_absolute_cell_bounds(dd, ddbox, d);
3721     }
3722 }
3723
3724
3725
3726 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3727                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3728                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3729                                   gmx_wallcycle_t wcycle)
3730 {
3731     gmx_domdec_comm_t *comm;
3732     int                dim;
3733
3734     comm = dd->comm;
3735
3736     if (bDoDLB)
3737     {
3738         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3739         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3740         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3741     }
3742     else if (bDynamicBox)
3743     {
3744         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3745     }
3746
3747     /* Set the dimensions for which no DD is used */
3748     for (dim = 0; dim < DIM; dim++)
3749     {
3750         if (dd->nc[dim] == 1)
3751         {
3752             comm->cell_x0[dim] = 0;
3753             comm->cell_x1[dim] = ddbox->box_size[dim];
3754             if (dim >= ddbox->nboundeddim)
3755             {
3756                 comm->cell_x0[dim] += ddbox->box0[dim];
3757                 comm->cell_x1[dim] += ddbox->box0[dim];
3758             }
3759         }
3760     }
3761 }
3762
3763 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3764 {
3765     int                    d, np, i;
3766     gmx_domdec_comm_dim_t *cd;
3767
3768     for (d = 0; d < dd->ndim; d++)
3769     {
3770         cd = &dd->comm->cd[d];
3771         np = npulse[dd->dim[d]];
3772         if (np > cd->np_nalloc)
3773         {
3774             if (debug)
3775             {
3776                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3777                         dim2char(dd->dim[d]), np);
3778             }
3779             if (DDMASTER(dd) && cd->np_nalloc > 0)
3780             {
3781                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3782             }
3783             srenew(cd->ind, np);
3784             for (i = cd->np_nalloc; i < np; i++)
3785             {
3786                 cd->ind[i].index  = NULL;
3787                 cd->ind[i].nalloc = 0;
3788             }
3789             cd->np_nalloc = np;
3790         }
3791         cd->np = np;
3792     }
3793 }
3794
3795
3796 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3797                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3798                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3799                               gmx_wallcycle_t wcycle)
3800 {
3801     gmx_domdec_comm_t *comm;
3802     int                d;
3803     ivec               npulse;
3804
3805     comm = dd->comm;
3806
3807     /* Copy the old cell boundaries for the cg displacement check */
3808     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3809     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3810
3811     if (comm->bDynLoadBal)
3812     {
3813         if (DDMASTER(dd))
3814         {
3815             check_box_size(dd, ddbox);
3816         }
3817         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3818     }
3819     else
3820     {
3821         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3822         realloc_comm_ind(dd, npulse);
3823     }
3824
3825     if (debug)
3826     {
3827         for (d = 0; d < DIM; d++)
3828         {
3829             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3830                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3831         }
3832     }
3833 }
3834
3835 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3836                                   gmx_ddbox_t *ddbox,
3837                                   rvec cell_ns_x0, rvec cell_ns_x1,
3838                                   gmx_int64_t step)
3839 {
3840     gmx_domdec_comm_t *comm;
3841     int                dim_ind, dim;
3842
3843     comm = dd->comm;
3844
3845     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3846     {
3847         dim = dd->dim[dim_ind];
3848
3849         /* Without PBC we don't have restrictions on the outer cells */
3850         if (!(dim >= ddbox->npbcdim &&
3851               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3852             comm->bDynLoadBal &&
3853             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3854             comm->cellsize_min[dim])
3855         {
3856             char buf[22];
3857             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3858                       gmx_step_str(step, buf), dim2char(dim),
3859                       comm->cell_x1[dim] - comm->cell_x0[dim],
3860                       ddbox->skew_fac[dim],
3861                       dd->comm->cellsize_min[dim],
3862                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3863         }
3864     }
3865
3866     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3867     {
3868         /* Communicate the boundaries and update cell_ns_x0/1 */
3869         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3870         if (dd->bGridJump && dd->ndim > 1)
3871         {
3872             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3873         }
3874     }
3875 }
3876
3877 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3878 {
3879     if (YY < npbcdim)
3880     {
3881         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3882     }
3883     else
3884     {
3885         tcm[YY][XX] = 0;
3886     }
3887     if (ZZ < npbcdim)
3888     {
3889         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3890         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3891     }
3892     else
3893     {
3894         tcm[ZZ][XX] = 0;
3895         tcm[ZZ][YY] = 0;
3896     }
3897 }
3898
3899 static void check_screw_box(matrix box)
3900 {
3901     /* Mathematical limitation */
3902     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3903     {
3904         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3905     }
3906
3907     /* Limitation due to the asymmetry of the eighth shell method */
3908     if (box[ZZ][YY] != 0)
3909     {
3910         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3911     }
3912 }
3913
3914 static void distribute_cg(FILE *fplog,
3915                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3916                           gmx_domdec_t *dd)
3917 {
3918     gmx_domdec_master_t *ma;
3919     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3920     int                  i, icg, j, k, k0, k1, d;
3921     matrix               tcm;
3922     rvec                 cg_cm;
3923     ivec                 ind;
3924     real                 nrcg, inv_ncg, pos_d;
3925     atom_id             *cgindex;
3926     gmx_bool             bScrew;
3927
3928     ma = dd->ma;
3929
3930     if (tmp_ind == NULL)
3931     {
3932         snew(tmp_nalloc, dd->nnodes);
3933         snew(tmp_ind, dd->nnodes);
3934         for (i = 0; i < dd->nnodes; i++)
3935         {
3936             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3937             snew(tmp_ind[i], tmp_nalloc[i]);
3938         }
3939     }
3940
3941     /* Clear the count */
3942     for (i = 0; i < dd->nnodes; i++)
3943     {
3944         ma->ncg[i] = 0;
3945         ma->nat[i] = 0;
3946     }
3947
3948     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3949
3950     cgindex = cgs->index;
3951
3952     /* Compute the center of geometry for all charge groups */
3953     for (icg = 0; icg < cgs->nr; icg++)
3954     {
3955         k0      = cgindex[icg];
3956         k1      = cgindex[icg+1];
3957         nrcg    = k1 - k0;
3958         if (nrcg == 1)
3959         {
3960             copy_rvec(pos[k0], cg_cm);
3961         }
3962         else
3963         {
3964             inv_ncg = 1.0/nrcg;
3965
3966             clear_rvec(cg_cm);
3967             for (k = k0; (k < k1); k++)
3968             {
3969                 rvec_inc(cg_cm, pos[k]);
3970             }
3971             for (d = 0; (d < DIM); d++)
3972             {
3973                 cg_cm[d] *= inv_ncg;
3974             }
3975         }
3976         /* Put the charge group in the box and determine the cell index */
3977         for (d = DIM-1; d >= 0; d--)
3978         {
3979             pos_d = cg_cm[d];
3980             if (d < dd->npbcdim)
3981             {
3982                 bScrew = (dd->bScrewPBC && d == XX);
3983                 if (tric_dir[d] && dd->nc[d] > 1)
3984                 {
3985                     /* Use triclinic coordintates for this dimension */
3986                     for (j = d+1; j < DIM; j++)
3987                     {
3988                         pos_d += cg_cm[j]*tcm[j][d];
3989                     }
3990                 }
3991                 while (pos_d >= box[d][d])
3992                 {
3993                     pos_d -= box[d][d];
3994                     rvec_dec(cg_cm, box[d]);
3995                     if (bScrew)
3996                     {
3997                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3998                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3999                     }
4000                     for (k = k0; (k < k1); k++)
4001                     {
4002                         rvec_dec(pos[k], box[d]);
4003                         if (bScrew)
4004                         {
4005                             pos[k][YY] = box[YY][YY] - pos[k][YY];
4006                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
4007                         }
4008                     }
4009                 }
4010                 while (pos_d < 0)
4011                 {
4012                     pos_d += box[d][d];
4013                     rvec_inc(cg_cm, box[d]);
4014                     if (bScrew)
4015                     {
4016                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
4017                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
4018                     }
4019                     for (k = k0; (k < k1); k++)
4020                     {
4021                         rvec_inc(pos[k], box[d]);
4022                         if (bScrew)
4023                         {
4024                             pos[k][YY] = box[YY][YY] - pos[k][YY];
4025                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
4026                         }
4027                     }
4028                 }
4029             }
4030             /* This could be done more efficiently */
4031             ind[d] = 0;
4032             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
4033             {
4034                 ind[d]++;
4035             }
4036         }
4037         i = dd_index(dd->nc, ind);
4038         if (ma->ncg[i] == tmp_nalloc[i])
4039         {
4040             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
4041             srenew(tmp_ind[i], tmp_nalloc[i]);
4042         }
4043         tmp_ind[i][ma->ncg[i]] = icg;
4044         ma->ncg[i]++;
4045         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
4046     }
4047
4048     k1 = 0;
4049     for (i = 0; i < dd->nnodes; i++)
4050     {
4051         ma->index[i] = k1;
4052         for (k = 0; k < ma->ncg[i]; k++)
4053         {
4054             ma->cg[k1++] = tmp_ind[i][k];
4055         }
4056     }
4057     ma->index[dd->nnodes] = k1;
4058
4059     for (i = 0; i < dd->nnodes; i++)
4060     {
4061         sfree(tmp_ind[i]);
4062     }
4063     sfree(tmp_ind);
4064     sfree(tmp_nalloc);
4065
4066     if (fplog)
4067     {
4068         /* Here we avoid int overflows due to #atoms^2: use double, dsqr */
4069         int    nat_sum, nat_min, nat_max;
4070         double nat2_sum;
4071
4072         nat_sum  = 0;
4073         nat2_sum = 0;
4074         nat_min  = ma->nat[0];
4075         nat_max  = ma->nat[0];
4076         for (i = 0; i < dd->nnodes; i++)
4077         {
4078             nat_sum  += ma->nat[i];
4079             nat2_sum += dsqr(ma->nat[i]);
4080             nat_min   = std::min(nat_min, ma->nat[i]);
4081             nat_max   = std::max(nat_max, ma->nat[i]);
4082         }
4083         nat_sum  /= dd->nnodes;
4084         nat2_sum /= dd->nnodes;
4085
4086         fprintf(fplog, "Atom distribution over %d domains: av %d stddev %d min %d max %d\n",
4087                 dd->nnodes,
4088                 nat_sum,
4089                 static_cast<int>(sqrt(nat2_sum - dsqr(nat_sum) + 0.5)),
4090                 nat_min, nat_max);
4091     }
4092 }
4093
4094 static void get_cg_distribution(FILE *fplog, gmx_domdec_t *dd,
4095                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
4096                                 rvec pos[])
4097 {
4098     gmx_domdec_master_t *ma = NULL;
4099     ivec                 npulse;
4100     int                  i, cg_gl;
4101     int                 *ibuf, buf2[2] = { 0, 0 };
4102     gmx_bool             bMaster = DDMASTER(dd);
4103
4104     if (bMaster)
4105     {
4106         ma = dd->ma;
4107
4108         if (dd->bScrewPBC)
4109         {
4110             check_screw_box(box);
4111         }
4112
4113         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
4114
4115         distribute_cg(fplog, box, ddbox->tric_dir, cgs, pos, dd);
4116         for (i = 0; i < dd->nnodes; i++)
4117         {
4118             ma->ibuf[2*i]   = ma->ncg[i];
4119             ma->ibuf[2*i+1] = ma->nat[i];
4120         }
4121         ibuf = ma->ibuf;
4122     }
4123     else
4124     {
4125         ibuf = NULL;
4126     }
4127     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
4128
4129     dd->ncg_home = buf2[0];
4130     dd->nat_home = buf2[1];
4131     dd->ncg_tot  = dd->ncg_home;
4132     dd->nat_tot  = dd->nat_home;
4133     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4134     {
4135         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4136         srenew(dd->index_gl, dd->cg_nalloc);
4137         srenew(dd->cgindex, dd->cg_nalloc+1);
4138     }
4139     if (bMaster)
4140     {
4141         for (i = 0; i < dd->nnodes; i++)
4142         {
4143             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
4144             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4145         }
4146     }
4147
4148     dd_scatterv(dd,
4149                 bMaster ? ma->ibuf : NULL,
4150                 bMaster ? ma->ibuf+dd->nnodes : NULL,
4151                 bMaster ? ma->cg : NULL,
4152                 dd->ncg_home*sizeof(int), dd->index_gl);
4153
4154     /* Determine the home charge group sizes */
4155     dd->cgindex[0] = 0;
4156     for (i = 0; i < dd->ncg_home; i++)
4157     {
4158         cg_gl            = dd->index_gl[i];
4159         dd->cgindex[i+1] =
4160             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4161     }
4162
4163     if (debug)
4164     {
4165         fprintf(debug, "Home charge groups:\n");
4166         for (i = 0; i < dd->ncg_home; i++)
4167         {
4168             fprintf(debug, " %d", dd->index_gl[i]);
4169             if (i % 10 == 9)
4170             {
4171                 fprintf(debug, "\n");
4172             }
4173         }
4174         fprintf(debug, "\n");
4175     }
4176 }
4177
4178 static int compact_and_copy_vec_at(int ncg, int *move,
4179                                    int *cgindex,
4180                                    int nvec, int vec,
4181                                    rvec *src, gmx_domdec_comm_t *comm,
4182                                    gmx_bool bCompact)
4183 {
4184     int m, icg, i, i0, i1, nrcg;
4185     int home_pos;
4186     int pos_vec[DIM*2];
4187
4188     home_pos = 0;
4189
4190     for (m = 0; m < DIM*2; m++)
4191     {
4192         pos_vec[m] = 0;
4193     }
4194
4195     i0 = 0;
4196     for (icg = 0; icg < ncg; icg++)
4197     {
4198         i1 = cgindex[icg+1];
4199         m  = move[icg];
4200         if (m == -1)
4201         {
4202             if (bCompact)
4203             {
4204                 /* Compact the home array in place */
4205                 for (i = i0; i < i1; i++)
4206                 {
4207                     copy_rvec(src[i], src[home_pos++]);
4208                 }
4209             }
4210         }
4211         else
4212         {
4213             /* Copy to the communication buffer */
4214             nrcg        = i1 - i0;
4215             pos_vec[m] += 1 + vec*nrcg;
4216             for (i = i0; i < i1; i++)
4217             {
4218                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
4219             }
4220             pos_vec[m] += (nvec - vec - 1)*nrcg;
4221         }
4222         if (!bCompact)
4223         {
4224             home_pos += i1 - i0;
4225         }
4226         i0 = i1;
4227     }
4228
4229     return home_pos;
4230 }
4231
4232 static int compact_and_copy_vec_cg(int ncg, int *move,
4233                                    int *cgindex,
4234                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
4235                                    gmx_bool bCompact)
4236 {
4237     int m, icg, i0, i1, nrcg;
4238     int home_pos;
4239     int pos_vec[DIM*2];
4240
4241     home_pos = 0;
4242
4243     for (m = 0; m < DIM*2; m++)
4244     {
4245         pos_vec[m] = 0;
4246     }
4247
4248     i0 = 0;
4249     for (icg = 0; icg < ncg; icg++)
4250     {
4251         i1 = cgindex[icg+1];
4252         m  = move[icg];
4253         if (m == -1)
4254         {
4255             if (bCompact)
4256             {
4257                 /* Compact the home array in place */
4258                 copy_rvec(src[icg], src[home_pos++]);
4259             }
4260         }
4261         else
4262         {
4263             nrcg = i1 - i0;
4264             /* Copy to the communication buffer */
4265             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
4266             pos_vec[m] += 1 + nrcg*nvec;
4267         }
4268         i0 = i1;
4269     }
4270     if (!bCompact)
4271     {
4272         home_pos = ncg;
4273     }
4274
4275     return home_pos;
4276 }
4277
4278 static int compact_ind(int ncg, int *move,
4279                        int *index_gl, int *cgindex,
4280                        int *gatindex,
4281                        gmx_ga2la_t ga2la, char *bLocalCG,
4282                        int *cginfo)
4283 {
4284     int cg, nat, a0, a1, a, a_gl;
4285     int home_pos;
4286
4287     home_pos = 0;
4288     nat      = 0;
4289     for (cg = 0; cg < ncg; cg++)
4290     {
4291         a0 = cgindex[cg];
4292         a1 = cgindex[cg+1];
4293         if (move[cg] == -1)
4294         {
4295             /* Compact the home arrays in place.
4296              * Anything that can be done here avoids access to global arrays.
4297              */
4298             cgindex[home_pos] = nat;
4299             for (a = a0; a < a1; a++)
4300             {
4301                 a_gl          = gatindex[a];
4302                 gatindex[nat] = a_gl;
4303                 /* The cell number stays 0, so we don't need to set it */
4304                 ga2la_change_la(ga2la, a_gl, nat);
4305                 nat++;
4306             }
4307             index_gl[home_pos] = index_gl[cg];
4308             cginfo[home_pos]   = cginfo[cg];
4309             /* The charge group remains local, so bLocalCG does not change */
4310             home_pos++;
4311         }
4312         else
4313         {
4314             /* Clear the global indices */
4315             for (a = a0; a < a1; a++)
4316             {
4317                 ga2la_del(ga2la, gatindex[a]);
4318             }
4319             if (bLocalCG)
4320             {
4321                 bLocalCG[index_gl[cg]] = FALSE;
4322             }
4323         }
4324     }
4325     cgindex[home_pos] = nat;
4326
4327     return home_pos;
4328 }
4329
4330 static void clear_and_mark_ind(int ncg, int *move,
4331                                int *index_gl, int *cgindex, int *gatindex,
4332                                gmx_ga2la_t ga2la, char *bLocalCG,
4333                                int *cell_index)
4334 {
4335     int cg, a0, a1, a;
4336
4337     for (cg = 0; cg < ncg; cg++)
4338     {
4339         if (move[cg] >= 0)
4340         {
4341             a0 = cgindex[cg];
4342             a1 = cgindex[cg+1];
4343             /* Clear the global indices */
4344             for (a = a0; a < a1; a++)
4345             {
4346                 ga2la_del(ga2la, gatindex[a]);
4347             }
4348             if (bLocalCG)
4349             {
4350                 bLocalCG[index_gl[cg]] = FALSE;
4351             }
4352             /* Signal that this cg has moved using the ns cell index.
4353              * Here we set it to -1. fill_grid will change it
4354              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4355              */
4356             cell_index[cg] = -1;
4357         }
4358     }
4359 }
4360
4361 static void print_cg_move(FILE *fplog,
4362                           gmx_domdec_t *dd,
4363                           gmx_int64_t step, int cg, int dim, int dir,
4364                           gmx_bool bHaveCgcmOld, real limitd,
4365                           rvec cm_old, rvec cm_new, real pos_d)
4366 {
4367     gmx_domdec_comm_t *comm;
4368     char               buf[22];
4369
4370     comm = dd->comm;
4371
4372     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4373     if (limitd > 0)
4374     {
4375         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4376                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4377                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4378     }
4379     else
4380     {
4381         /* We don't have a limiting distance available: don't print it */
4382         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4383                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4384                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4385     }
4386     fprintf(fplog, "distance out of cell %f\n",
4387             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4388     if (bHaveCgcmOld)
4389     {
4390         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4391                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4392     }
4393     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4394             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4395     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4396             dim2char(dim),
4397             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4398     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4399             dim2char(dim),
4400             comm->cell_x0[dim], comm->cell_x1[dim]);
4401 }
4402
4403 static void cg_move_error(FILE *fplog,
4404                           gmx_domdec_t *dd,
4405                           gmx_int64_t step, int cg, int dim, int dir,
4406                           gmx_bool bHaveCgcmOld, real limitd,
4407                           rvec cm_old, rvec cm_new, real pos_d)
4408 {
4409     if (fplog)
4410     {
4411         print_cg_move(fplog, dd, step, cg, dim, dir,
4412                       bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4413     }
4414     print_cg_move(stderr, dd, step, cg, dim, dir,
4415                   bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4416     gmx_fatal(FARGS,
4417               "%s moved too far between two domain decomposition steps\n"
4418               "This usually means that your system is not well equilibrated",
4419               dd->comm->bCGs ? "A charge group" : "An atom");
4420 }
4421
4422 static void rotate_state_atom(t_state *state, int a)
4423 {
4424     int est;
4425
4426     for (est = 0; est < estNR; est++)
4427     {
4428         if (EST_DISTR(est) && (state->flags & (1<<est)))
4429         {
4430             switch (est)
4431             {
4432                 case estX:
4433                     /* Rotate the complete state; for a rectangular box only */
4434                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4435                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4436                     break;
4437                 case estV:
4438                     state->v[a][YY] = -state->v[a][YY];
4439                     state->v[a][ZZ] = -state->v[a][ZZ];
4440                     break;
4441                 case estSDX:
4442                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4443                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4444                     break;
4445                 case estCGP:
4446                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4447                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4448                     break;
4449                 case estDISRE_INITF:
4450                 case estDISRE_RM3TAV:
4451                 case estORIRE_INITF:
4452                 case estORIRE_DTAV:
4453                     /* These are distances, so not affected by rotation */
4454                     break;
4455                 default:
4456                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4457             }
4458         }
4459     }
4460 }
4461
4462 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4463 {
4464     if (natoms > comm->moved_nalloc)
4465     {
4466         /* Contents should be preserved here */
4467         comm->moved_nalloc = over_alloc_dd(natoms);
4468         srenew(comm->moved, comm->moved_nalloc);
4469     }
4470
4471     return comm->moved;
4472 }
4473
4474 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4475                          gmx_domdec_t *dd,
4476                          t_state *state,
4477                          ivec tric_dir, matrix tcm,
4478                          rvec cell_x0, rvec cell_x1,
4479                          rvec limitd, rvec limit0, rvec limit1,
4480                          const int *cgindex,
4481                          int cg_start, int cg_end,
4482                          rvec *cg_cm,
4483                          int *move)
4484 {
4485     int      npbcdim;
4486     int      cg, k, k0, k1, d, dim, d2;
4487     int      mc, nrcg;
4488     int      flag;
4489     gmx_bool bScrew;
4490     ivec     dev;
4491     real     inv_ncg, pos_d;
4492     rvec     cm_new;
4493
4494     npbcdim = dd->npbcdim;
4495
4496     for (cg = cg_start; cg < cg_end; cg++)
4497     {
4498         k0   = cgindex[cg];
4499         k1   = cgindex[cg+1];
4500         nrcg = k1 - k0;
4501         if (nrcg == 1)
4502         {
4503             copy_rvec(state->x[k0], cm_new);
4504         }
4505         else
4506         {
4507             inv_ncg = 1.0/nrcg;
4508
4509             clear_rvec(cm_new);
4510             for (k = k0; (k < k1); k++)
4511             {
4512                 rvec_inc(cm_new, state->x[k]);
4513             }
4514             for (d = 0; (d < DIM); d++)
4515             {
4516                 cm_new[d] = inv_ncg*cm_new[d];
4517             }
4518         }
4519
4520         clear_ivec(dev);
4521         /* Do pbc and check DD cell boundary crossings */
4522         for (d = DIM-1; d >= 0; d--)
4523         {
4524             if (dd->nc[d] > 1)
4525             {
4526                 bScrew = (dd->bScrewPBC && d == XX);
4527                 /* Determine the location of this cg in lattice coordinates */
4528                 pos_d = cm_new[d];
4529                 if (tric_dir[d])
4530                 {
4531                     for (d2 = d+1; d2 < DIM; d2++)
4532                     {
4533                         pos_d += cm_new[d2]*tcm[d2][d];
4534                     }
4535                 }
4536                 /* Put the charge group in the triclinic unit-cell */
4537                 if (pos_d >= cell_x1[d])
4538                 {
4539                     if (pos_d >= limit1[d])
4540                     {
4541                         cg_move_error(fplog, dd, step, cg, d, 1,
4542                                       cg_cm != state->x, limitd[d],
4543                                       cg_cm[cg], cm_new, pos_d);
4544                     }
4545                     dev[d] = 1;
4546                     if (dd->ci[d] == dd->nc[d] - 1)
4547                     {
4548                         rvec_dec(cm_new, state->box[d]);
4549                         if (bScrew)
4550                         {
4551                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4552                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4553                         }
4554                         for (k = k0; (k < k1); k++)
4555                         {
4556                             rvec_dec(state->x[k], state->box[d]);
4557                             if (bScrew)
4558                             {
4559                                 rotate_state_atom(state, k);
4560                             }
4561                         }
4562                     }
4563                 }
4564                 else if (pos_d < cell_x0[d])
4565                 {
4566                     if (pos_d < limit0[d])
4567                     {
4568                         cg_move_error(fplog, dd, step, cg, d, -1,
4569                                       cg_cm != state->x, limitd[d],
4570                                       cg_cm[cg], cm_new, pos_d);
4571                     }
4572                     dev[d] = -1;
4573                     if (dd->ci[d] == 0)
4574                     {
4575                         rvec_inc(cm_new, state->box[d]);
4576                         if (bScrew)
4577                         {
4578                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4579                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4580                         }
4581                         for (k = k0; (k < k1); k++)
4582                         {
4583                             rvec_inc(state->x[k], state->box[d]);
4584                             if (bScrew)
4585                             {
4586                                 rotate_state_atom(state, k);
4587                             }
4588                         }
4589                     }
4590                 }
4591             }
4592             else if (d < npbcdim)
4593             {
4594                 /* Put the charge group in the rectangular unit-cell */
4595                 while (cm_new[d] >= state->box[d][d])
4596                 {
4597                     rvec_dec(cm_new, state->box[d]);
4598                     for (k = k0; (k < k1); k++)
4599                     {
4600                         rvec_dec(state->x[k], state->box[d]);
4601                     }
4602                 }
4603                 while (cm_new[d] < 0)
4604                 {
4605                     rvec_inc(cm_new, state->box[d]);
4606                     for (k = k0; (k < k1); k++)
4607                     {
4608                         rvec_inc(state->x[k], state->box[d]);
4609                     }
4610                 }
4611             }
4612         }
4613
4614         copy_rvec(cm_new, cg_cm[cg]);
4615
4616         /* Determine where this cg should go */
4617         flag = 0;
4618         mc   = -1;
4619         for (d = 0; d < dd->ndim; d++)
4620         {
4621             dim = dd->dim[d];
4622             if (dev[dim] == 1)
4623             {
4624                 flag |= DD_FLAG_FW(d);
4625                 if (mc == -1)
4626                 {
4627                     mc = d*2;
4628                 }
4629             }
4630             else if (dev[dim] == -1)
4631             {
4632                 flag |= DD_FLAG_BW(d);
4633                 if (mc == -1)
4634                 {
4635                     if (dd->nc[dim] > 2)
4636                     {
4637                         mc = d*2 + 1;
4638                     }
4639                     else
4640                     {
4641                         mc = d*2;
4642                     }
4643                 }
4644             }
4645         }
4646         /* Temporarily store the flag in move */
4647         move[cg] = mc + flag;
4648     }
4649 }
4650
4651 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4652                                gmx_domdec_t *dd, ivec tric_dir,
4653                                t_state *state, rvec **f,
4654                                t_forcerec *fr,
4655                                gmx_bool bCompact,
4656                                t_nrnb *nrnb,
4657                                int *ncg_stay_home,
4658                                int *ncg_moved)
4659 {
4660     int               *move;
4661     int                npbcdim;
4662     int                ncg[DIM*2], nat[DIM*2];
4663     int                c, i, cg, k, d, dim, dim2, dir, d2, d3;
4664     int                mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
4665     int                sbuf[2], rbuf[2];
4666     int                home_pos_cg, home_pos_at, buf_pos;
4667     int                flag;
4668     gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4669     real               pos_d;
4670     matrix             tcm;
4671     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1;
4672     atom_id           *cgindex;
4673     cginfo_mb_t       *cginfo_mb;
4674     gmx_domdec_comm_t *comm;
4675     int               *moved;
4676     int                nthread, thread;
4677
4678     if (dd->bScrewPBC)
4679     {
4680         check_screw_box(state->box);
4681     }
4682
4683     comm  = dd->comm;
4684     if (fr->cutoff_scheme == ecutsGROUP)
4685     {
4686         cg_cm = fr->cg_cm;
4687     }
4688
4689     for (i = 0; i < estNR; i++)
4690     {
4691         if (EST_DISTR(i))
4692         {
4693             switch (i)
4694             {
4695                 case estX: /* Always present */ break;
4696                 case estV:   bV   = (state->flags & (1<<i)); break;
4697                 case estSDX: bSDX = (state->flags & (1<<i)); break;
4698                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4699                 case estLD_RNG:
4700                 case estLD_RNGI:
4701                 case estDISRE_INITF:
4702                 case estDISRE_RM3TAV:
4703                 case estORIRE_INITF:
4704                 case estORIRE_DTAV:
4705                     /* No processing required */
4706                     break;
4707                 default:
4708                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4709             }
4710         }
4711     }
4712
4713     if (dd->ncg_tot > comm->nalloc_int)
4714     {
4715         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4716         srenew(comm->buf_int, comm->nalloc_int);
4717     }
4718     move = comm->buf_int;
4719
4720     /* Clear the count */
4721     for (c = 0; c < dd->ndim*2; c++)
4722     {
4723         ncg[c] = 0;
4724         nat[c] = 0;
4725     }
4726
4727     npbcdim = dd->npbcdim;
4728
4729     for (d = 0; (d < DIM); d++)
4730     {
4731         limitd[d] = dd->comm->cellsize_min[d];
4732         if (d >= npbcdim && dd->ci[d] == 0)
4733         {
4734             cell_x0[d] = -GMX_FLOAT_MAX;
4735         }
4736         else
4737         {
4738             cell_x0[d] = comm->cell_x0[d];
4739         }
4740         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4741         {
4742             cell_x1[d] = GMX_FLOAT_MAX;
4743         }
4744         else
4745         {
4746             cell_x1[d] = comm->cell_x1[d];
4747         }
4748         if (d < npbcdim)
4749         {
4750             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4751             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4752         }
4753         else
4754         {
4755             /* We check after communication if a charge group moved
4756              * more than one cell. Set the pre-comm check limit to float_max.
4757              */
4758             limit0[d] = -GMX_FLOAT_MAX;
4759             limit1[d] =  GMX_FLOAT_MAX;
4760         }
4761     }
4762
4763     make_tric_corr_matrix(npbcdim, state->box, tcm);
4764
4765     cgindex = dd->cgindex;
4766
4767     nthread = gmx_omp_nthreads_get(emntDomdec);
4768
4769     /* Compute the center of geometry for all home charge groups
4770      * and put them in the box and determine where they should go.
4771      */
4772 #pragma omp parallel for num_threads(nthread) schedule(static)
4773     for (thread = 0; thread < nthread; thread++)
4774     {
4775         calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4776                      cell_x0, cell_x1, limitd, limit0, limit1,
4777                      cgindex,
4778                      ( thread   *dd->ncg_home)/nthread,
4779                      ((thread+1)*dd->ncg_home)/nthread,
4780                      fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4781                      move);
4782     }
4783
4784     for (cg = 0; cg < dd->ncg_home; cg++)
4785     {
4786         if (move[cg] >= 0)
4787         {
4788             mc       = move[cg];
4789             flag     = mc & ~DD_FLAG_NRCG;
4790             mc       = mc & DD_FLAG_NRCG;
4791             move[cg] = mc;
4792
4793             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4794             {
4795                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4796                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4797             }
4798             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4799             /* We store the cg size in the lower 16 bits
4800              * and the place where the charge group should go
4801              * in the next 6 bits. This saves some communication volume.
4802              */
4803             nrcg = cgindex[cg+1] - cgindex[cg];
4804             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4805             ncg[mc] += 1;
4806             nat[mc] += nrcg;
4807         }
4808     }
4809
4810     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4811     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4812
4813     *ncg_moved = 0;
4814     for (i = 0; i < dd->ndim*2; i++)
4815     {
4816         *ncg_moved += ncg[i];
4817     }
4818
4819     nvec = 1;
4820     if (bV)
4821     {
4822         nvec++;
4823     }
4824     if (bSDX)
4825     {
4826         nvec++;
4827     }
4828     if (bCGP)
4829     {
4830         nvec++;
4831     }
4832
4833     /* Make sure the communication buffers are large enough */
4834     for (mc = 0; mc < dd->ndim*2; mc++)
4835     {
4836         nvr = ncg[mc] + nat[mc]*nvec;
4837         if (nvr > comm->cgcm_state_nalloc[mc])
4838         {
4839             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4840             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4841         }
4842     }
4843
4844     switch (fr->cutoff_scheme)
4845     {
4846         case ecutsGROUP:
4847             /* Recalculating cg_cm might be cheaper than communicating,
4848              * but that could give rise to rounding issues.
4849              */
4850             home_pos_cg =
4851                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4852                                         nvec, cg_cm, comm, bCompact);
4853             break;
4854         case ecutsVERLET:
4855             /* Without charge groups we send the moved atom coordinates
4856              * over twice. This is so the code below can be used without
4857              * many conditionals for both for with and without charge groups.
4858              */
4859             home_pos_cg =
4860                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4861                                         nvec, state->x, comm, FALSE);
4862             if (bCompact)
4863             {
4864                 home_pos_cg -= *ncg_moved;
4865             }
4866             break;
4867         default:
4868             gmx_incons("unimplemented");
4869             home_pos_cg = 0;
4870     }
4871
4872     vec         = 0;
4873     home_pos_at =
4874         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4875                                 nvec, vec++, state->x, comm, bCompact);
4876     if (bV)
4877     {
4878         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4879                                 nvec, vec++, state->v, comm, bCompact);
4880     }
4881     if (bSDX)
4882     {
4883         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4884                                 nvec, vec++, state->sd_X, comm, bCompact);
4885     }
4886     if (bCGP)
4887     {
4888         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4889                                 nvec, vec++, state->cg_p, comm, bCompact);
4890     }
4891
4892     if (bCompact)
4893     {
4894         compact_ind(dd->ncg_home, move,
4895                     dd->index_gl, dd->cgindex, dd->gatindex,
4896                     dd->ga2la, comm->bLocalCG,
4897                     fr->cginfo);
4898     }
4899     else
4900     {
4901         if (fr->cutoff_scheme == ecutsVERLET)
4902         {
4903             moved = get_moved(comm, dd->ncg_home);
4904
4905             for (k = 0; k < dd->ncg_home; k++)
4906             {
4907                 moved[k] = 0;
4908             }
4909         }
4910         else
4911         {
4912             moved = fr->ns.grid->cell_index;
4913         }
4914
4915         clear_and_mark_ind(dd->ncg_home, move,
4916                            dd->index_gl, dd->cgindex, dd->gatindex,
4917                            dd->ga2la, comm->bLocalCG,
4918                            moved);
4919     }
4920
4921     cginfo_mb = fr->cginfo_mb;
4922
4923     *ncg_stay_home = home_pos_cg;
4924     for (d = 0; d < dd->ndim; d++)
4925     {
4926         dim      = dd->dim[d];
4927         ncg_recv = 0;
4928         nvr      = 0;
4929         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4930         {
4931             cdd = d*2 + dir;
4932             /* Communicate the cg and atom counts */
4933             sbuf[0] = ncg[cdd];
4934             sbuf[1] = nat[cdd];
4935             if (debug)
4936             {
4937                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4938                         d, dir, sbuf[0], sbuf[1]);
4939             }
4940             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4941
4942             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4943             {
4944                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4945                 srenew(comm->buf_int, comm->nalloc_int);
4946             }
4947
4948             /* Communicate the charge group indices, sizes and flags */
4949             dd_sendrecv_int(dd, d, dir,
4950                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4951                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4952
4953             nvs = ncg[cdd] + nat[cdd]*nvec;
4954             i   = rbuf[0]  + rbuf[1] *nvec;
4955             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4956
4957             /* Communicate cgcm and state */
4958             dd_sendrecv_rvec(dd, d, dir,
4959                              comm->cgcm_state[cdd], nvs,
4960                              comm->vbuf.v+nvr, i);
4961             ncg_recv += rbuf[0];
4962             nvr      += i;
4963         }
4964
4965         /* Process the received charge groups */
4966         buf_pos = 0;
4967         for (cg = 0; cg < ncg_recv; cg++)
4968         {
4969             flag = comm->buf_int[cg*DD_CGIBS+1];
4970
4971             if (dim >= npbcdim && dd->nc[dim] > 2)
4972             {
4973                 /* No pbc in this dim and more than one domain boundary.
4974                  * We do a separate check if a charge group didn't move too far.
4975                  */
4976                 if (((flag & DD_FLAG_FW(d)) &&
4977                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4978                     ((flag & DD_FLAG_BW(d)) &&
4979                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4980                 {
4981                     cg_move_error(fplog, dd, step, cg, dim,
4982                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4983                                   fr->cutoff_scheme == ecutsGROUP, 0,
4984                                   comm->vbuf.v[buf_pos],
4985                                   comm->vbuf.v[buf_pos],
4986                                   comm->vbuf.v[buf_pos][dim]);
4987                 }
4988             }
4989
4990             mc = -1;
4991             if (d < dd->ndim-1)
4992             {
4993                 /* Check which direction this cg should go */
4994                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4995                 {
4996                     if (dd->bGridJump)
4997                     {
4998                         /* The cell boundaries for dimension d2 are not equal
4999                          * for each cell row of the lower dimension(s),
5000                          * therefore we might need to redetermine where
5001                          * this cg should go.
5002                          */
5003                         dim2 = dd->dim[d2];
5004                         /* If this cg crosses the box boundary in dimension d2
5005                          * we can use the communicated flag, so we do not
5006                          * have to worry about pbc.
5007                          */
5008                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
5009                                (flag & DD_FLAG_FW(d2))) ||
5010                               (dd->ci[dim2] == 0 &&
5011                                (flag & DD_FLAG_BW(d2)))))
5012                         {
5013                             /* Clear the two flags for this dimension */
5014                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
5015                             /* Determine the location of this cg
5016                              * in lattice coordinates
5017                              */
5018                             pos_d = comm->vbuf.v[buf_pos][dim2];
5019                             if (tric_dir[dim2])
5020                             {
5021                                 for (d3 = dim2+1; d3 < DIM; d3++)
5022                                 {
5023                                     pos_d +=
5024                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
5025                                 }
5026                             }
5027                             /* Check of we are not at the box edge.
5028                              * pbc is only handled in the first step above,
5029                              * but this check could move over pbc while
5030                              * the first step did not due to different rounding.
5031                              */
5032                             if (pos_d >= cell_x1[dim2] &&
5033                                 dd->ci[dim2] != dd->nc[dim2]-1)
5034                             {
5035                                 flag |= DD_FLAG_FW(d2);
5036                             }
5037                             else if (pos_d < cell_x0[dim2] &&
5038                                      dd->ci[dim2] != 0)
5039                             {
5040                                 flag |= DD_FLAG_BW(d2);
5041                             }
5042                             comm->buf_int[cg*DD_CGIBS+1] = flag;
5043                         }
5044                     }
5045                     /* Set to which neighboring cell this cg should go */
5046                     if (flag & DD_FLAG_FW(d2))
5047                     {
5048                         mc = d2*2;
5049                     }
5050                     else if (flag & DD_FLAG_BW(d2))
5051                     {
5052                         if (dd->nc[dd->dim[d2]] > 2)
5053                         {
5054                             mc = d2*2+1;
5055                         }
5056                         else
5057                         {
5058                             mc = d2*2;
5059                         }
5060                     }
5061                 }
5062             }
5063
5064             nrcg = flag & DD_FLAG_NRCG;
5065             if (mc == -1)
5066             {
5067                 if (home_pos_cg+1 > dd->cg_nalloc)
5068                 {
5069                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
5070                     srenew(dd->index_gl, dd->cg_nalloc);
5071                     srenew(dd->cgindex, dd->cg_nalloc+1);
5072                 }
5073                 /* Set the global charge group index and size */
5074                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
5075                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
5076                 /* Copy the state from the buffer */
5077                 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
5078                 if (fr->cutoff_scheme == ecutsGROUP)
5079                 {
5080                     cg_cm = fr->cg_cm;
5081                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
5082                 }
5083                 buf_pos++;
5084
5085                 /* Set the cginfo */
5086                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
5087                                                    dd->index_gl[home_pos_cg]);
5088                 if (comm->bLocalCG)
5089                 {
5090                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
5091                 }
5092
5093                 if (home_pos_at+nrcg > state->nalloc)
5094                 {
5095                     dd_realloc_state(state, f, home_pos_at+nrcg);
5096                 }
5097                 for (i = 0; i < nrcg; i++)
5098                 {
5099                     copy_rvec(comm->vbuf.v[buf_pos++],
5100                               state->x[home_pos_at+i]);
5101                 }
5102                 if (bV)
5103                 {
5104                     for (i = 0; i < nrcg; i++)
5105                     {
5106                         copy_rvec(comm->vbuf.v[buf_pos++],
5107                                   state->v[home_pos_at+i]);
5108                     }
5109                 }
5110                 if (bSDX)
5111                 {
5112                     for (i = 0; i < nrcg; i++)
5113                     {
5114                         copy_rvec(comm->vbuf.v[buf_pos++],
5115                                   state->sd_X[home_pos_at+i]);
5116                     }
5117                 }
5118                 if (bCGP)
5119                 {
5120                     for (i = 0; i < nrcg; i++)
5121                     {
5122                         copy_rvec(comm->vbuf.v[buf_pos++],
5123                                   state->cg_p[home_pos_at+i]);
5124                     }
5125                 }
5126                 home_pos_cg += 1;
5127                 home_pos_at += nrcg;
5128             }
5129             else
5130             {
5131                 /* Reallocate the buffers if necessary  */
5132                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
5133                 {
5134                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
5135                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
5136                 }
5137                 nvr = ncg[mc] + nat[mc]*nvec;
5138                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
5139                 {
5140                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
5141                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
5142                 }
5143                 /* Copy from the receive to the send buffers */
5144                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
5145                        comm->buf_int + cg*DD_CGIBS,
5146                        DD_CGIBS*sizeof(int));
5147                 memcpy(comm->cgcm_state[mc][nvr],
5148                        comm->vbuf.v[buf_pos],
5149                        (1+nrcg*nvec)*sizeof(rvec));
5150                 buf_pos += 1 + nrcg*nvec;
5151                 ncg[mc] += 1;
5152                 nat[mc] += nrcg;
5153             }
5154         }
5155     }
5156
5157     /* With sorting (!bCompact) the indices are now only partially up to date
5158      * and ncg_home and nat_home are not the real count, since there are
5159      * "holes" in the arrays for the charge groups that moved to neighbors.
5160      */
5161     if (fr->cutoff_scheme == ecutsVERLET)
5162     {
5163         moved = get_moved(comm, home_pos_cg);
5164
5165         for (i = dd->ncg_home; i < home_pos_cg; i++)
5166         {
5167             moved[i] = 0;
5168         }
5169     }
5170     dd->ncg_home = home_pos_cg;
5171     dd->nat_home = home_pos_at;
5172
5173     if (debug)
5174     {
5175         fprintf(debug,
5176                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5177                 *ncg_moved, dd->ncg_home-*ncg_moved);
5178
5179     }
5180 }
5181
5182 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
5183 {
5184     dd->comm->cycl[ddCycl] += cycles;
5185     dd->comm->cycl_n[ddCycl]++;
5186     if (cycles > dd->comm->cycl_max[ddCycl])
5187     {
5188         dd->comm->cycl_max[ddCycl] = cycles;
5189     }
5190 }
5191
5192 static double force_flop_count(t_nrnb *nrnb)
5193 {
5194     int         i;
5195     double      sum;
5196     const char *name;
5197
5198     sum = 0;
5199     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
5200     {
5201         /* To get closer to the real timings, we half the count
5202          * for the normal loops and again half it for water loops.
5203          */
5204         name = nrnb_str(i);
5205         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5206         {
5207             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5208         }
5209         else
5210         {
5211             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5212         }
5213     }
5214     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
5215     {
5216         name = nrnb_str(i);
5217         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5218         {
5219             sum += nrnb->n[i]*cost_nrnb(i);
5220         }
5221     }
5222     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
5223     {
5224         sum += nrnb->n[i]*cost_nrnb(i);
5225     }
5226
5227     return sum;
5228 }
5229
5230 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
5231 {
5232     if (dd->comm->eFlop)
5233     {
5234         dd->comm->flop -= force_flop_count(nrnb);
5235     }
5236 }
5237 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
5238 {
5239     if (dd->comm->eFlop)
5240     {
5241         dd->comm->flop += force_flop_count(nrnb);
5242         dd->comm->flop_n++;
5243     }
5244 }
5245
5246 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5247 {
5248     int i;
5249
5250     for (i = 0; i < ddCyclNr; i++)
5251     {
5252         dd->comm->cycl[i]     = 0;
5253         dd->comm->cycl_n[i]   = 0;
5254         dd->comm->cycl_max[i] = 0;
5255     }
5256     dd->comm->flop   = 0;
5257     dd->comm->flop_n = 0;
5258 }
5259
5260 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
5261 {
5262     gmx_domdec_comm_t *comm;
5263     gmx_domdec_load_t *load;
5264     gmx_domdec_root_t *root = NULL;
5265     int                d, dim, i, pos;
5266     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
5267     gmx_bool           bSepPME;
5268
5269     if (debug)
5270     {
5271         fprintf(debug, "get_load_distribution start\n");
5272     }
5273
5274     wallcycle_start(wcycle, ewcDDCOMMLOAD);
5275
5276     comm = dd->comm;
5277
5278     bSepPME = (dd->pme_nodeid >= 0);
5279
5280     for (d = dd->ndim-1; d >= 0; d--)
5281     {
5282         dim = dd->dim[d];
5283         /* Check if we participate in the communication in this dimension */
5284         if (d == dd->ndim-1 ||
5285             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5286         {
5287             load = &comm->load[d];
5288             if (dd->bGridJump)
5289             {
5290                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5291             }
5292             pos = 0;
5293             if (d == dd->ndim-1)
5294             {
5295                 sbuf[pos++] = dd_force_load(comm);
5296                 sbuf[pos++] = sbuf[0];
5297                 if (dd->bGridJump)
5298                 {
5299                     sbuf[pos++] = sbuf[0];
5300                     sbuf[pos++] = cell_frac;
5301                     if (d > 0)
5302                     {
5303                         sbuf[pos++] = comm->cell_f_max0[d];
5304                         sbuf[pos++] = comm->cell_f_min1[d];
5305                     }
5306                 }
5307                 if (bSepPME)
5308                 {
5309                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5310                     sbuf[pos++] = comm->cycl[ddCyclPME];
5311                 }
5312             }
5313             else
5314             {
5315                 sbuf[pos++] = comm->load[d+1].sum;
5316                 sbuf[pos++] = comm->load[d+1].max;
5317                 if (dd->bGridJump)
5318                 {
5319                     sbuf[pos++] = comm->load[d+1].sum_m;
5320                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5321                     sbuf[pos++] = comm->load[d+1].flags;
5322                     if (d > 0)
5323                     {
5324                         sbuf[pos++] = comm->cell_f_max0[d];
5325                         sbuf[pos++] = comm->cell_f_min1[d];
5326                     }
5327                 }
5328                 if (bSepPME)
5329                 {
5330                     sbuf[pos++] = comm->load[d+1].mdf;
5331                     sbuf[pos++] = comm->load[d+1].pme;
5332                 }
5333             }
5334             load->nload = pos;
5335             /* Communicate a row in DD direction d.
5336              * The communicators are setup such that the root always has rank 0.
5337              */
5338 #ifdef GMX_MPI
5339             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5340                        load->load, load->nload*sizeof(float), MPI_BYTE,
5341                        0, comm->mpi_comm_load[d]);
5342 #endif
5343             if (dd->ci[dim] == dd->master_ci[dim])
5344             {
5345                 /* We are the root, process this row */
5346                 if (comm->bDynLoadBal)
5347                 {
5348                     root = comm->root[d];
5349                 }
5350                 load->sum      = 0;
5351                 load->max      = 0;
5352                 load->sum_m    = 0;
5353                 load->cvol_min = 1;
5354                 load->flags    = 0;
5355                 load->mdf      = 0;
5356                 load->pme      = 0;
5357                 pos            = 0;
5358                 for (i = 0; i < dd->nc[dim]; i++)
5359                 {
5360                     load->sum += load->load[pos++];
5361                     load->max  = std::max(load->max, load->load[pos]);
5362                     pos++;
5363                     if (dd->bGridJump)
5364                     {
5365                         if (root->bLimited)
5366                         {
5367                             /* This direction could not be load balanced properly,
5368                              * therefore we need to use the maximum iso the average load.
5369                              */
5370                             load->sum_m = std::max(load->sum_m, load->load[pos]);
5371                         }
5372                         else
5373                         {
5374                             load->sum_m += load->load[pos];
5375                         }
5376                         pos++;
5377                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
5378                         pos++;
5379                         if (d < dd->ndim-1)
5380                         {
5381                             load->flags = (int)(load->load[pos++] + 0.5);
5382                         }
5383                         if (d > 0)
5384                         {
5385                             root->cell_f_max0[i] = load->load[pos++];
5386                             root->cell_f_min1[i] = load->load[pos++];
5387                         }
5388                     }
5389                     if (bSepPME)
5390                     {
5391                         load->mdf = std::max(load->mdf, load->load[pos]);
5392                         pos++;
5393                         load->pme = std::max(load->pme, load->load[pos]);
5394                         pos++;
5395                     }
5396                 }
5397                 if (comm->bDynLoadBal && root->bLimited)
5398                 {
5399                     load->sum_m *= dd->nc[dim];
5400                     load->flags |= (1<<d);
5401                 }
5402             }
5403         }
5404     }
5405
5406     if (DDMASTER(dd))
5407     {
5408         comm->nload      += dd_load_count(comm);
5409         comm->load_step  += comm->cycl[ddCyclStep];
5410         comm->load_sum   += comm->load[0].sum;
5411         comm->load_max   += comm->load[0].max;
5412         if (comm->bDynLoadBal)
5413         {
5414             for (d = 0; d < dd->ndim; d++)
5415             {
5416                 if (comm->load[0].flags & (1<<d))
5417                 {
5418                     comm->load_lim[d]++;
5419                 }
5420             }
5421         }
5422         if (bSepPME)
5423         {
5424             comm->load_mdf += comm->load[0].mdf;
5425             comm->load_pme += comm->load[0].pme;
5426         }
5427     }
5428
5429     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5430
5431     if (debug)
5432     {
5433         fprintf(debug, "get_load_distribution finished\n");
5434     }
5435 }
5436
5437 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5438 {
5439     /* Return the relative performance loss on the total run time
5440      * due to the force calculation load imbalance.
5441      */
5442     if (dd->comm->nload > 0)
5443     {
5444         return
5445             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5446             (dd->comm->load_step*dd->nnodes);
5447     }
5448     else
5449     {
5450         return 0;
5451     }
5452 }
5453
5454 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5455 {
5456     char               buf[STRLEN];
5457     int                npp, npme, nnodes, d, limp;
5458     float              imbal, pme_f_ratio, lossf, lossp = 0;
5459     gmx_bool           bLim;
5460     gmx_domdec_comm_t *comm;
5461
5462     comm = dd->comm;
5463     if (DDMASTER(dd) && comm->nload > 0)
5464     {
5465         npp    = dd->nnodes;
5466         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5467         nnodes = npp + npme;
5468         imbal  = comm->load_max*npp/comm->load_sum - 1;
5469         lossf  = dd_force_imb_perf_loss(dd);
5470         sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5471         fprintf(fplog, "%s", buf);
5472         fprintf(stderr, "\n");
5473         fprintf(stderr, "%s", buf);
5474         sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5475         fprintf(fplog, "%s", buf);
5476         fprintf(stderr, "%s", buf);
5477         bLim = FALSE;
5478         if (comm->bDynLoadBal)
5479         {
5480             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5481             for (d = 0; d < dd->ndim; d++)
5482             {
5483                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5484                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5485                 if (limp >= 50)
5486                 {
5487                     bLim = TRUE;
5488                 }
5489             }
5490             sprintf(buf+strlen(buf), "\n");
5491             fprintf(fplog, "%s", buf);
5492             fprintf(stderr, "%s", buf);
5493         }
5494         if (npme > 0)
5495         {
5496             pme_f_ratio = comm->load_pme/comm->load_mdf;
5497             lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
5498             if (lossp <= 0)
5499             {
5500                 lossp *= (float)npme/(float)nnodes;
5501             }
5502             else
5503             {
5504                 lossp *= (float)npp/(float)nnodes;
5505             }
5506             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5507             fprintf(fplog, "%s", buf);
5508             fprintf(stderr, "%s", buf);
5509             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5510             fprintf(fplog, "%s", buf);
5511             fprintf(stderr, "%s", buf);
5512         }
5513         fprintf(fplog, "\n");
5514         fprintf(stderr, "\n");
5515
5516         if (lossf >= DD_PERF_LOSS_WARN)
5517         {
5518             sprintf(buf,
5519                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5520                     "      in the domain decomposition.\n", lossf*100);
5521             if (!comm->bDynLoadBal)
5522             {
5523                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5524             }
5525             else if (bLim)
5526             {
5527                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5528             }
5529             fprintf(fplog, "%s\n", buf);
5530             fprintf(stderr, "%s\n", buf);
5531         }
5532         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS_WARN)
5533         {
5534             sprintf(buf,
5535                     "NOTE: %.1f %% performance was lost because the PME ranks\n"
5536                     "      had %s work to do than the PP ranks.\n"
5537                     "      You might want to %s the number of PME ranks\n"
5538                     "      or %s the cut-off and the grid spacing.\n",
5539                     fabs(lossp*100),
5540                     (lossp < 0) ? "less"     : "more",
5541                     (lossp < 0) ? "decrease" : "increase",
5542                     (lossp < 0) ? "decrease" : "increase");
5543             fprintf(fplog, "%s\n", buf);
5544             fprintf(stderr, "%s\n", buf);
5545         }
5546     }
5547 }
5548
5549 static float dd_vol_min(gmx_domdec_t *dd)
5550 {
5551     return dd->comm->load[0].cvol_min*dd->nnodes;
5552 }
5553
5554 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5555 {
5556     return dd->comm->load[0].flags;
5557 }
5558
5559 static float dd_f_imbal(gmx_domdec_t *dd)
5560 {
5561     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5562 }
5563
5564 float dd_pme_f_ratio(gmx_domdec_t *dd)
5565 {
5566     if (dd->comm->load[0].mdf > 0 && dd->comm->cycl_n[ddCyclPME] > 0)
5567     {
5568         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5569     }
5570     else
5571     {
5572         return -1.0;
5573     }
5574 }
5575
5576 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5577 {
5578     int  flags, d;
5579     char buf[22];
5580
5581     flags = dd_load_flags(dd);
5582     if (flags)
5583     {
5584         fprintf(fplog,
5585                 "DD  load balancing is limited by minimum cell size in dimension");
5586         for (d = 0; d < dd->ndim; d++)
5587         {
5588             if (flags & (1<<d))
5589             {
5590                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5591             }
5592         }
5593         fprintf(fplog, "\n");
5594     }
5595     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5596     if (dd->comm->bDynLoadBal)
5597     {
5598         fprintf(fplog, "  vol min/aver %5.3f%c",
5599                 dd_vol_min(dd), flags ? '!' : ' ');
5600     }
5601     fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5602     if (dd->comm->cycl_n[ddCyclPME])
5603     {
5604         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5605     }
5606     fprintf(fplog, "\n\n");
5607 }
5608
5609 static void dd_print_load_verbose(gmx_domdec_t *dd)
5610 {
5611     if (dd->comm->bDynLoadBal)
5612     {
5613         fprintf(stderr, "vol %4.2f%c ",
5614                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5615     }
5616     fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5617     if (dd->comm->cycl_n[ddCyclPME])
5618     {
5619         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5620     }
5621 }
5622
5623 #ifdef GMX_MPI
5624 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5625 {
5626     MPI_Comm           c_row;
5627     int                dim, i, rank;
5628     ivec               loc_c;
5629     gmx_domdec_root_t *root;
5630     gmx_bool           bPartOfGroup = FALSE;
5631
5632     dim = dd->dim[dim_ind];
5633     copy_ivec(loc, loc_c);
5634     for (i = 0; i < dd->nc[dim]; i++)
5635     {
5636         loc_c[dim] = i;
5637         rank       = dd_index(dd->nc, loc_c);
5638         if (rank == dd->rank)
5639         {
5640             /* This process is part of the group */
5641             bPartOfGroup = TRUE;
5642         }
5643     }
5644     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5645                    &c_row);
5646     if (bPartOfGroup)
5647     {
5648         dd->comm->mpi_comm_load[dim_ind] = c_row;
5649         if (dd->comm->eDLB != edlbNO)
5650         {
5651             if (dd->ci[dim] == dd->master_ci[dim])
5652             {
5653                 /* This is the root process of this row */
5654                 snew(dd->comm->root[dim_ind], 1);
5655                 root = dd->comm->root[dim_ind];
5656                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5657                 snew(root->old_cell_f, dd->nc[dim]+1);
5658                 snew(root->bCellMin, dd->nc[dim]);
5659                 if (dim_ind > 0)
5660                 {
5661                     snew(root->cell_f_max0, dd->nc[dim]);
5662                     snew(root->cell_f_min1, dd->nc[dim]);
5663                     snew(root->bound_min, dd->nc[dim]);
5664                     snew(root->bound_max, dd->nc[dim]);
5665                 }
5666                 snew(root->buf_ncd, dd->nc[dim]);
5667             }
5668             else
5669             {
5670                 /* This is not a root process, we only need to receive cell_f */
5671                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5672             }
5673         }
5674         if (dd->ci[dim] == dd->master_ci[dim])
5675         {
5676             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5677         }
5678     }
5679 }
5680 #endif
5681
5682 void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
5683                                    const gmx_hw_info_t gmx_unused *hwinfo,
5684                                    const gmx_hw_opt_t  gmx_unused *hw_opt)
5685 {
5686 #ifdef GMX_MPI
5687     int           physicalnode_id_hash;
5688     int           gpu_id;
5689     gmx_domdec_t *dd;
5690     MPI_Comm      mpi_comm_pp_physicalnode;
5691
5692     if (!(cr->duty & DUTY_PP) ||
5693         hw_opt->gpu_opt.ncuda_dev_use == 0)
5694     {
5695         /* Only PP nodes (currently) use GPUs.
5696          * If we don't have GPUs, there are no resources to share.
5697          */
5698         return;
5699     }
5700
5701     physicalnode_id_hash = gmx_physicalnode_id_hash();
5702
5703     gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5704
5705     dd = cr->dd;
5706
5707     if (debug)
5708     {
5709         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5710         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5711                 dd->rank, physicalnode_id_hash, gpu_id);
5712     }
5713     /* Split the PP communicator over the physical nodes */
5714     /* TODO: See if we should store this (before), as it's also used for
5715      * for the nodecomm summution.
5716      */
5717     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5718                    &mpi_comm_pp_physicalnode);
5719     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5720                    &dd->comm->mpi_comm_gpu_shared);
5721     MPI_Comm_free(&mpi_comm_pp_physicalnode);
5722     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5723
5724     if (debug)
5725     {
5726         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5727     }
5728
5729     /* Note that some ranks could share a GPU, while others don't */
5730
5731     if (dd->comm->nrank_gpu_shared == 1)
5732     {
5733         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5734     }
5735 #endif
5736 }
5737
5738 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5739 {
5740 #ifdef GMX_MPI
5741     int  dim0, dim1, i, j;
5742     ivec loc;
5743
5744     if (debug)
5745     {
5746         fprintf(debug, "Making load communicators\n");
5747     }
5748
5749     snew(dd->comm->load, dd->ndim);
5750     snew(dd->comm->mpi_comm_load, dd->ndim);
5751
5752     clear_ivec(loc);
5753     make_load_communicator(dd, 0, loc);
5754     if (dd->ndim > 1)
5755     {
5756         dim0 = dd->dim[0];
5757         for (i = 0; i < dd->nc[dim0]; i++)
5758         {
5759             loc[dim0] = i;
5760             make_load_communicator(dd, 1, loc);
5761         }
5762     }
5763     if (dd->ndim > 2)
5764     {
5765         dim0 = dd->dim[0];
5766         for (i = 0; i < dd->nc[dim0]; i++)
5767         {
5768             loc[dim0] = i;
5769             dim1      = dd->dim[1];
5770             for (j = 0; j < dd->nc[dim1]; j++)
5771             {
5772                 loc[dim1] = j;
5773                 make_load_communicator(dd, 2, loc);
5774             }
5775         }
5776     }
5777
5778     if (debug)
5779     {
5780         fprintf(debug, "Finished making load communicators\n");
5781     }
5782 #endif
5783 }
5784
5785 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5786 {
5787     int                     d, dim, i, j, m;
5788     ivec                    tmp, s;
5789     int                     nzone, nzonep;
5790     ivec                    dd_zp[DD_MAXIZONE];
5791     gmx_domdec_zones_t     *zones;
5792     gmx_domdec_ns_ranges_t *izone;
5793
5794     for (d = 0; d < dd->ndim; d++)
5795     {
5796         dim = dd->dim[d];
5797         copy_ivec(dd->ci, tmp);
5798         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5799         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5800         copy_ivec(dd->ci, tmp);
5801         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5802         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5803         if (debug)
5804         {
5805             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5806                     dd->rank, dim,
5807                     dd->neighbor[d][0],
5808                     dd->neighbor[d][1]);
5809         }
5810     }
5811
5812     if (fplog)
5813     {
5814         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5815                 dd->ndim,
5816                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5817                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5818     }
5819     switch (dd->ndim)
5820     {
5821         case 3:
5822             nzone  = dd_z3n;
5823             nzonep = dd_zp3n;
5824             for (i = 0; i < nzonep; i++)
5825             {
5826                 copy_ivec(dd_zp3[i], dd_zp[i]);
5827             }
5828             break;
5829         case 2:
5830             nzone  = dd_z2n;
5831             nzonep = dd_zp2n;
5832             for (i = 0; i < nzonep; i++)
5833             {
5834                 copy_ivec(dd_zp2[i], dd_zp[i]);
5835             }
5836             break;
5837         case 1:
5838             nzone  = dd_z1n;
5839             nzonep = dd_zp1n;
5840             for (i = 0; i < nzonep; i++)
5841             {
5842                 copy_ivec(dd_zp1[i], dd_zp[i]);
5843             }
5844             break;
5845         default:
5846             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5847             nzone  = 0;
5848             nzonep = 0;
5849     }
5850
5851     zones = &dd->comm->zones;
5852
5853     for (i = 0; i < nzone; i++)
5854     {
5855         m = 0;
5856         clear_ivec(zones->shift[i]);
5857         for (d = 0; d < dd->ndim; d++)
5858         {
5859             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5860         }
5861     }
5862
5863     zones->n = nzone;
5864     for (i = 0; i < nzone; i++)
5865     {
5866         for (d = 0; d < DIM; d++)
5867         {
5868             s[d] = dd->ci[d] - zones->shift[i][d];
5869             if (s[d] < 0)
5870             {
5871                 s[d] += dd->nc[d];
5872             }
5873             else if (s[d] >= dd->nc[d])
5874             {
5875                 s[d] -= dd->nc[d];
5876             }
5877         }
5878     }
5879     zones->nizone = nzonep;
5880     for (i = 0; i < zones->nizone; i++)
5881     {
5882         if (dd_zp[i][0] != i)
5883         {
5884             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5885         }
5886         izone     = &zones->izone[i];
5887         izone->j0 = dd_zp[i][1];
5888         izone->j1 = dd_zp[i][2];
5889         for (dim = 0; dim < DIM; dim++)
5890         {
5891             if (dd->nc[dim] == 1)
5892             {
5893                 /* All shifts should be allowed */
5894                 izone->shift0[dim] = -1;
5895                 izone->shift1[dim] = 1;
5896             }
5897             else
5898             {
5899                 /*
5900                    izone->shift0[d] = 0;
5901                    izone->shift1[d] = 0;
5902                    for(j=izone->j0; j<izone->j1; j++) {
5903                    if (dd->shift[j][d] > dd->shift[i][d])
5904                    izone->shift0[d] = -1;
5905                    if (dd->shift[j][d] < dd->shift[i][d])
5906                    izone->shift1[d] = 1;
5907                    }
5908                  */
5909
5910                 int shift_diff;
5911
5912                 /* Assume the shift are not more than 1 cell */
5913                 izone->shift0[dim] = 1;
5914                 izone->shift1[dim] = -1;
5915                 for (j = izone->j0; j < izone->j1; j++)
5916                 {
5917                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5918                     if (shift_diff < izone->shift0[dim])
5919                     {
5920                         izone->shift0[dim] = shift_diff;
5921                     }
5922                     if (shift_diff > izone->shift1[dim])
5923                     {
5924                         izone->shift1[dim] = shift_diff;
5925                     }
5926                 }
5927             }
5928         }
5929     }
5930
5931     if (dd->comm->eDLB != edlbNO)
5932     {
5933         snew(dd->comm->root, dd->ndim);
5934     }
5935
5936     if (dd->comm->bRecordLoad)
5937     {
5938         make_load_communicators(dd);
5939     }
5940 }
5941
5942 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
5943 {
5944     gmx_domdec_t      *dd;
5945     dd   = cr->dd;
5946
5947 #ifdef GMX_MPI
5948     gmx_domdec_comm_t *comm;
5949     int                rank, *buf;
5950     ivec               periods;
5951     MPI_Comm           comm_cart;
5952
5953     comm = dd->comm;
5954
5955     if (comm->bCartesianPP)
5956     {
5957         /* Set up cartesian communication for the particle-particle part */
5958         if (fplog)
5959         {
5960             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5961                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5962         }
5963
5964         for (int i = 0; i < DIM; i++)
5965         {
5966             periods[i] = TRUE;
5967         }
5968         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5969                         &comm_cart);
5970         /* We overwrite the old communicator with the new cartesian one */
5971         cr->mpi_comm_mygroup = comm_cart;
5972     }
5973
5974     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5975     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5976
5977     if (comm->bCartesianPP_PME)
5978     {
5979         /* Since we want to use the original cartesian setup for sim,
5980          * and not the one after split, we need to make an index.
5981          */
5982         snew(comm->ddindex2ddnodeid, dd->nnodes);
5983         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5984         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5985         /* Get the rank of the DD master,
5986          * above we made sure that the master node is a PP node.
5987          */
5988         if (MASTER(cr))
5989         {
5990             rank = dd->rank;
5991         }
5992         else
5993         {
5994             rank = 0;
5995         }
5996         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5997     }
5998     else if (comm->bCartesianPP)
5999     {
6000         if (cr->npmenodes == 0)
6001         {
6002             /* The PP communicator is also
6003              * the communicator for this simulation
6004              */
6005             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
6006         }
6007         cr->nodeid = dd->rank;
6008
6009         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
6010
6011         /* We need to make an index to go from the coordinates
6012          * to the nodeid of this simulation.
6013          */
6014         snew(comm->ddindex2simnodeid, dd->nnodes);
6015         snew(buf, dd->nnodes);
6016         if (cr->duty & DUTY_PP)
6017         {
6018             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
6019         }
6020         /* Communicate the ddindex to simulation nodeid index */
6021         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
6022                       cr->mpi_comm_mysim);
6023         sfree(buf);
6024
6025         /* Determine the master coordinates and rank.
6026          * The DD master should be the same node as the master of this sim.
6027          */
6028         for (int i = 0; i < dd->nnodes; i++)
6029         {
6030             if (comm->ddindex2simnodeid[i] == 0)
6031             {
6032                 ddindex2xyz(dd->nc, i, dd->master_ci);
6033                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
6034             }
6035         }
6036         if (debug)
6037         {
6038             fprintf(debug, "The master rank is %d\n", dd->masterrank);
6039         }
6040     }
6041     else
6042     {
6043         /* No Cartesian communicators */
6044         /* We use the rank in dd->comm->all as DD index */
6045         ddindex2xyz(dd->nc, dd->rank, dd->ci);
6046         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
6047         dd->masterrank = 0;
6048         clear_ivec(dd->master_ci);
6049     }
6050 #endif
6051
6052     if (fplog)
6053     {
6054         fprintf(fplog,
6055                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
6056                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6057     }
6058     if (debug)
6059     {
6060         fprintf(debug,
6061                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
6062                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6063     }
6064 }
6065
6066 static void receive_ddindex2simnodeid(t_commrec gmx_unused *cr)
6067 {
6068 #ifdef GMX_MPI
6069     gmx_domdec_t      *dd;
6070     gmx_domdec_comm_t *comm;
6071
6072     dd   = cr->dd;
6073     comm = dd->comm;
6074
6075     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
6076     {
6077         int *buf;
6078         snew(comm->ddindex2simnodeid, dd->nnodes);
6079         snew(buf, dd->nnodes);
6080         if (cr->duty & DUTY_PP)
6081         {
6082             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
6083         }
6084         /* Communicate the ddindex to simulation nodeid index */
6085         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
6086                       cr->mpi_comm_mysim);
6087         sfree(buf);
6088     }
6089 #endif
6090 }
6091
6092 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
6093                                                      int ncg, int natoms)
6094 {
6095     gmx_domdec_master_t *ma;
6096     int                  i;
6097
6098     snew(ma, 1);
6099
6100     snew(ma->ncg, dd->nnodes);
6101     snew(ma->index, dd->nnodes+1);
6102     snew(ma->cg, ncg);
6103     snew(ma->nat, dd->nnodes);
6104     snew(ma->ibuf, dd->nnodes*2);
6105     snew(ma->cell_x, DIM);
6106     for (i = 0; i < DIM; i++)
6107     {
6108         snew(ma->cell_x[i], dd->nc[i]+1);
6109     }
6110
6111     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
6112     {
6113         ma->vbuf = NULL;
6114     }
6115     else
6116     {
6117         snew(ma->vbuf, natoms);
6118     }
6119
6120     return ma;
6121 }
6122
6123 static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
6124                                int gmx_unused reorder)
6125 {
6126     gmx_domdec_t      *dd;
6127     gmx_domdec_comm_t *comm;
6128     int                i;
6129     gmx_bool           bDiv[DIM];
6130 #ifdef GMX_MPI
6131     MPI_Comm           comm_cart;
6132 #endif
6133
6134     dd   = cr->dd;
6135     comm = dd->comm;
6136
6137     if (comm->bCartesianPP)
6138     {
6139         for (i = 1; i < DIM; i++)
6140         {
6141             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
6142         }
6143         if (bDiv[YY] || bDiv[ZZ])
6144         {
6145             comm->bCartesianPP_PME = TRUE;
6146             /* If we have 2D PME decomposition, which is always in x+y,
6147              * we stack the PME only nodes in z.
6148              * Otherwise we choose the direction that provides the thinnest slab
6149              * of PME only nodes as this will have the least effect
6150              * on the PP communication.
6151              * But for the PME communication the opposite might be better.
6152              */
6153             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
6154                              !bDiv[YY] ||
6155                              dd->nc[YY] > dd->nc[ZZ]))
6156             {
6157                 comm->cartpmedim = ZZ;
6158             }
6159             else
6160             {
6161                 comm->cartpmedim = YY;
6162             }
6163             comm->ntot[comm->cartpmedim]
6164                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
6165         }
6166         else if (fplog)
6167         {
6168             fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
6169             fprintf(fplog,
6170                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
6171         }
6172     }
6173
6174 #ifdef GMX_MPI
6175     if (comm->bCartesianPP_PME)
6176     {
6177         int  rank;
6178         ivec periods;
6179
6180         if (fplog)
6181         {
6182             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
6183         }
6184
6185         for (i = 0; i < DIM; i++)
6186         {
6187             periods[i] = TRUE;
6188         }
6189         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
6190                         &comm_cart);
6191         MPI_Comm_rank(comm_cart, &rank);
6192         if (MASTERNODE(cr) && rank != 0)
6193         {
6194             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
6195         }
6196
6197         /* With this assigment we loose the link to the original communicator
6198          * which will usually be MPI_COMM_WORLD, unless have multisim.
6199          */
6200         cr->mpi_comm_mysim = comm_cart;
6201         cr->sim_nodeid     = rank;
6202
6203         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
6204
6205         if (fplog)
6206         {
6207             fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
6208                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6209         }
6210
6211         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
6212         {
6213             cr->duty = DUTY_PP;
6214         }
6215         if (cr->npmenodes == 0 ||
6216             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
6217         {
6218             cr->duty = DUTY_PME;
6219         }
6220
6221         /* Split the sim communicator into PP and PME only nodes */
6222         MPI_Comm_split(cr->mpi_comm_mysim,
6223                        cr->duty,
6224                        dd_index(comm->ntot, dd->ci),
6225                        &cr->mpi_comm_mygroup);
6226     }
6227     else
6228     {
6229         switch (dd_node_order)
6230         {
6231             case ddnoPP_PME:
6232                 if (fplog)
6233                 {
6234                     fprintf(fplog, "Order of the ranks: PP first, PME last\n");
6235                 }
6236                 break;
6237             case ddnoINTERLEAVE:
6238                 /* Interleave the PP-only and PME-only nodes,
6239                  * as on clusters with dual-core machines this will double
6240                  * the communication bandwidth of the PME processes
6241                  * and thus speed up the PP <-> PME and inter PME communication.
6242                  */
6243                 if (fplog)
6244                 {
6245                     fprintf(fplog, "Interleaving PP and PME ranks\n");
6246                 }
6247                 comm->pmenodes = dd_pmenodes(cr);
6248                 break;
6249             case ddnoCARTESIAN:
6250                 break;
6251             default:
6252                 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
6253         }
6254
6255         if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6256         {
6257             cr->duty = DUTY_PME;
6258         }
6259         else
6260         {
6261             cr->duty = DUTY_PP;
6262         }
6263
6264         /* Split the sim communicator into PP and PME only nodes */
6265         MPI_Comm_split(cr->mpi_comm_mysim,
6266                        cr->duty,
6267                        cr->nodeid,
6268                        &cr->mpi_comm_mygroup);
6269         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6270     }
6271 #endif
6272
6273     if (fplog)
6274     {
6275         fprintf(fplog, "This rank does only %s work.\n\n",
6276                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6277     }
6278 }
6279
6280 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6281 {
6282     gmx_domdec_t      *dd;
6283     gmx_domdec_comm_t *comm;
6284     int                CartReorder;
6285
6286     dd   = cr->dd;
6287     comm = dd->comm;
6288
6289     copy_ivec(dd->nc, comm->ntot);
6290
6291     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6292     comm->bCartesianPP_PME = FALSE;
6293
6294     /* Reorder the nodes by default. This might change the MPI ranks.
6295      * Real reordering is only supported on very few architectures,
6296      * Blue Gene is one of them.
6297      */
6298     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6299
6300     if (cr->npmenodes > 0)
6301     {
6302         /* Split the communicator into a PP and PME part */
6303         split_communicator(fplog, cr, dd_node_order, CartReorder);
6304         if (comm->bCartesianPP_PME)
6305         {
6306             /* We (possibly) reordered the nodes in split_communicator,
6307              * so it is no longer required in make_pp_communicator.
6308              */
6309             CartReorder = FALSE;
6310         }
6311     }
6312     else
6313     {
6314         /* All nodes do PP and PME */
6315 #ifdef GMX_MPI
6316         /* We do not require separate communicators */
6317         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6318 #endif
6319     }
6320
6321     if (cr->duty & DUTY_PP)
6322     {
6323         /* Copy or make a new PP communicator */
6324         make_pp_communicator(fplog, cr, CartReorder);
6325     }
6326     else
6327     {
6328         receive_ddindex2simnodeid(cr);
6329     }
6330
6331     if (!(cr->duty & DUTY_PME))
6332     {
6333         /* Set up the commnuication to our PME node */
6334         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6335         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6336         if (debug)
6337         {
6338             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6339                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6340         }
6341     }
6342     else
6343     {
6344         dd->pme_nodeid = -1;
6345     }
6346
6347     if (DDMASTER(dd))
6348     {
6349         dd->ma = init_gmx_domdec_master_t(dd,
6350                                           comm->cgs_gl.nr,
6351                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6352     }
6353 }
6354
6355 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6356 {
6357     real  *slb_frac, tot;
6358     int    i, n;
6359     double dbl;
6360
6361     slb_frac = NULL;
6362     if (nc > 1 && size_string != NULL)
6363     {
6364         if (fplog)
6365         {
6366             fprintf(fplog, "Using static load balancing for the %s direction\n",
6367                     dir);
6368         }
6369         snew(slb_frac, nc);
6370         tot = 0;
6371         for (i = 0; i < nc; i++)
6372         {
6373             dbl = 0;
6374             sscanf(size_string, "%20lf%n", &dbl, &n);
6375             if (dbl == 0)
6376             {
6377                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6378             }
6379             slb_frac[i]  = dbl;
6380             size_string += n;
6381             tot         += slb_frac[i];
6382         }
6383         /* Normalize */
6384         if (fplog)
6385         {
6386             fprintf(fplog, "Relative cell sizes:");
6387         }
6388         for (i = 0; i < nc; i++)
6389         {
6390             slb_frac[i] /= tot;
6391             if (fplog)
6392             {
6393                 fprintf(fplog, " %5.3f", slb_frac[i]);
6394             }
6395         }
6396         if (fplog)
6397         {
6398             fprintf(fplog, "\n");
6399         }
6400     }
6401
6402     return slb_frac;
6403 }
6404
6405 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6406 {
6407     int                  n, nmol, ftype;
6408     gmx_mtop_ilistloop_t iloop;
6409     t_ilist             *il;
6410
6411     n     = 0;
6412     iloop = gmx_mtop_ilistloop_init(mtop);
6413     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6414     {
6415         for (ftype = 0; ftype < F_NRE; ftype++)
6416         {
6417             if ((interaction_function[ftype].flags & IF_BOND) &&
6418                 NRAL(ftype) >  2)
6419             {
6420                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6421             }
6422         }
6423     }
6424
6425     return n;
6426 }
6427
6428 static int dd_getenv(FILE *fplog, const char *env_var, int def)
6429 {
6430     char *val;
6431     int   nst;
6432
6433     nst = def;
6434     val = getenv(env_var);
6435     if (val)
6436     {
6437         if (sscanf(val, "%20d", &nst) <= 0)
6438         {
6439             nst = 1;
6440         }
6441         if (fplog)
6442         {
6443             fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
6444                     env_var, val, nst);
6445         }
6446     }
6447
6448     return nst;
6449 }
6450
6451 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6452 {
6453     if (MASTER(cr))
6454     {
6455         fprintf(stderr, "\n%s\n", warn_string);
6456     }
6457     if (fplog)
6458     {
6459         fprintf(fplog, "\n%s\n", warn_string);
6460     }
6461 }
6462
6463 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6464                                   t_inputrec *ir, FILE *fplog)
6465 {
6466     if (ir->ePBC == epbcSCREW &&
6467         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6468     {
6469         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6470     }
6471
6472     if (ir->ns_type == ensSIMPLE)
6473     {
6474         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6475     }
6476
6477     if (ir->nstlist == 0)
6478     {
6479         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6480     }
6481
6482     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6483     {
6484         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6485     }
6486 }
6487
6488 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6489 {
6490     int  di, d;
6491     real r;
6492
6493     r = ddbox->box_size[XX];
6494     for (di = 0; di < dd->ndim; di++)
6495     {
6496         d = dd->dim[di];
6497         /* Check using the initial average cell size */
6498         r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6499     }
6500
6501     return r;
6502 }
6503
6504 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6505                              const char *dlb_opt, gmx_bool bRecordLoad,
6506                              unsigned long Flags, t_inputrec *ir)
6507 {
6508     int           eDLB = -1;
6509     char          buf[STRLEN];
6510
6511     switch (dlb_opt[0])
6512     {
6513         case 'a': eDLB = edlbAUTO; break;
6514         case 'n': eDLB = edlbNO;   break;
6515         case 'y': eDLB = edlbYES;  break;
6516         default: gmx_incons("Unknown dlb_opt");
6517     }
6518
6519     if (Flags & MD_RERUN)
6520     {
6521         return edlbNO;
6522     }
6523
6524     if (!EI_DYNAMICS(ir->eI))
6525     {
6526         if (eDLB == edlbYES)
6527         {
6528             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6529             dd_warning(cr, fplog, buf);
6530         }
6531
6532         return edlbNO;
6533     }
6534
6535     if (!bRecordLoad)
6536     {
6537         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6538
6539         return edlbNO;
6540     }
6541
6542     if (Flags & MD_REPRODUCIBLE)
6543     {
6544         switch (eDLB)
6545         {
6546             case edlbNO:
6547                 break;
6548             case edlbAUTO:
6549                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6550                 eDLB = edlbNO;
6551                 break;
6552             case edlbYES:
6553                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6554                 break;
6555             default:
6556                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
6557                 break;
6558         }
6559     }
6560
6561     return eDLB;
6562 }
6563
6564 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6565 {
6566     int dim;
6567
6568     dd->ndim = 0;
6569     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6570     {
6571         /* Decomposition order z,y,x */
6572         if (fplog)
6573         {
6574             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6575         }
6576         for (dim = DIM-1; dim >= 0; dim--)
6577         {
6578             if (dd->nc[dim] > 1)
6579             {
6580                 dd->dim[dd->ndim++] = dim;
6581             }
6582         }
6583     }
6584     else
6585     {
6586         /* Decomposition order x,y,z */
6587         for (dim = 0; dim < DIM; dim++)
6588         {
6589             if (dd->nc[dim] > 1)
6590             {
6591                 dd->dim[dd->ndim++] = dim;
6592             }
6593         }
6594     }
6595 }
6596
6597 static gmx_domdec_comm_t *init_dd_comm()
6598 {
6599     gmx_domdec_comm_t *comm;
6600     int                i;
6601
6602     snew(comm, 1);
6603     snew(comm->cggl_flag, DIM*2);
6604     snew(comm->cgcm_state, DIM*2);
6605     for (i = 0; i < DIM*2; i++)
6606     {
6607         comm->cggl_flag_nalloc[i]  = 0;
6608         comm->cgcm_state_nalloc[i] = 0;
6609     }
6610
6611     comm->nalloc_int = 0;
6612     comm->buf_int    = NULL;
6613
6614     vec_rvec_init(&comm->vbuf);
6615
6616     comm->n_load_have    = 0;
6617     comm->n_load_collect = 0;
6618
6619     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6620     {
6621         comm->sum_nat[i] = 0;
6622     }
6623     comm->ndecomp   = 0;
6624     comm->nload     = 0;
6625     comm->load_step = 0;
6626     comm->load_sum  = 0;
6627     comm->load_max  = 0;
6628     clear_ivec(comm->load_lim);
6629     comm->load_mdf  = 0;
6630     comm->load_pme  = 0;
6631
6632     return comm;
6633 }
6634
6635 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6636                                         unsigned long Flags,
6637                                         ivec nc,
6638                                         real comm_distance_min, real rconstr,
6639                                         const char *dlb_opt, real dlb_scale,
6640                                         const char *sizex, const char *sizey, const char *sizez,
6641                                         gmx_mtop_t *mtop, t_inputrec *ir,
6642                                         matrix box, rvec *x,
6643                                         gmx_ddbox_t *ddbox,
6644                                         int *npme_x, int *npme_y)
6645 {
6646     gmx_domdec_t      *dd;
6647     gmx_domdec_comm_t *comm;
6648     int                recload;
6649     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6650     gmx_bool           bC;
6651     char               buf[STRLEN];
6652     const real         tenPercentMargin = 1.1;
6653
6654     if (fplog)
6655     {
6656         fprintf(fplog,
6657                 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
6658     }
6659
6660     snew(dd, 1);
6661
6662     dd->comm = init_dd_comm();
6663     comm     = dd->comm;
6664     snew(comm->cggl_flag, DIM*2);
6665     snew(comm->cgcm_state, DIM*2);
6666
6667     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6668     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6669
6670     dd->bSendRecv2      = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
6671     comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
6672     comm->eFlop         = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
6673     recload             = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
6674     comm->nstSortCG     = dd_getenv(fplog, "GMX_DD_NST_SORT_CHARGE_GROUPS", 1);
6675     comm->nstDDDump     = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
6676     comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
6677     comm->DD_debug      = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
6678
6679     dd->pme_recv_f_alloc = 0;
6680     dd->pme_recv_f_buf   = NULL;
6681
6682     if (dd->bSendRecv2 && fplog)
6683     {
6684         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6685     }
6686     if (comm->eFlop)
6687     {
6688         if (fplog)
6689         {
6690             fprintf(fplog, "Will load balance based on FLOP count\n");
6691         }
6692         if (comm->eFlop > 1)
6693         {
6694             srand(1+cr->nodeid);
6695         }
6696         comm->bRecordLoad = TRUE;
6697     }
6698     else
6699     {
6700         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6701
6702     }
6703
6704     /* Initialize to GPU share count to 0, might change later */
6705     comm->nrank_gpu_shared = 0;
6706
6707     comm->eDLB        = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6708     comm->bDLB_locked = FALSE;
6709
6710     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6711     if (fplog)
6712     {
6713         fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
6714     }
6715     dd->bGridJump              = comm->bDynLoadBal;
6716     comm->bPMELoadBalDLBLimits = FALSE;
6717
6718     if (comm->nstSortCG)
6719     {
6720         if (fplog)
6721         {
6722             if (comm->nstSortCG == 1)
6723             {
6724                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6725             }
6726             else
6727             {
6728                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6729                         comm->nstSortCG);
6730             }
6731         }
6732         snew(comm->sort, 1);
6733     }
6734     else
6735     {
6736         if (fplog)
6737         {
6738             fprintf(fplog, "Will not sort the charge groups\n");
6739         }
6740     }
6741
6742     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6743
6744     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6745     if (comm->bInterCGBondeds)
6746     {
6747         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6748     }
6749     else
6750     {
6751         comm->bInterCGMultiBody = FALSE;
6752     }
6753
6754     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6755     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6756
6757     if (ir->rlistlong == 0)
6758     {
6759         /* Set the cut-off to some very large value,
6760          * so we don't need if statements everywhere in the code.
6761          * We use sqrt, since the cut-off is squared in some places.
6762          */
6763         comm->cutoff   = GMX_CUTOFF_INF;
6764     }
6765     else
6766     {
6767         comm->cutoff   = ir->rlistlong;
6768     }
6769     comm->cutoff_mbody = 0;
6770
6771     comm->cellsize_limit = 0;
6772     comm->bBondComm      = FALSE;
6773
6774     /* Atoms should be able to move by up to half the list buffer size (if > 0)
6775      * within nstlist steps. Since boundaries are allowed to displace by half
6776      * a cell size, DD cells should be at least the size of the list buffer.
6777      */
6778     comm->cellsize_limit = std::max(comm->cellsize_limit,
6779                                     ir->rlistlong - std::max(ir->rvdw, ir->rcoulomb));
6780
6781     if (comm->bInterCGBondeds)
6782     {
6783         if (comm_distance_min > 0)
6784         {
6785             comm->cutoff_mbody = comm_distance_min;
6786             if (Flags & MD_DDBONDCOMM)
6787             {
6788                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6789             }
6790             else
6791             {
6792                 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6793             }
6794             r_bonded_limit = comm->cutoff_mbody;
6795         }
6796         else if (ir->bPeriodicMols)
6797         {
6798             /* Can not easily determine the required cut-off */
6799             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6800             comm->cutoff_mbody = comm->cutoff/2;
6801             r_bonded_limit     = comm->cutoff_mbody;
6802         }
6803         else
6804         {
6805             if (MASTER(cr))
6806             {
6807                 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6808                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6809             }
6810             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6811             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6812
6813             /* We use an initial margin of 10% for the minimum cell size,
6814              * except when we are just below the non-bonded cut-off.
6815              */
6816             if (Flags & MD_DDBONDCOMM)
6817             {
6818                 if (std::max(r_2b, r_mb) > comm->cutoff)
6819                 {
6820                     r_bonded        = std::max(r_2b, r_mb);
6821                     r_bonded_limit  = tenPercentMargin*r_bonded;
6822                     comm->bBondComm = TRUE;
6823                 }
6824                 else
6825                 {
6826                     r_bonded       = r_mb;
6827                     r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
6828                 }
6829                 /* We determine cutoff_mbody later */
6830             }
6831             else
6832             {
6833                 /* No special bonded communication,
6834                  * simply increase the DD cut-off.
6835                  */
6836                 r_bonded_limit     = tenPercentMargin*std::max(r_2b, r_mb);
6837                 comm->cutoff_mbody = r_bonded_limit;
6838                 comm->cutoff       = std::max(comm->cutoff, comm->cutoff_mbody);
6839             }
6840         }
6841         if (fplog)
6842         {
6843             fprintf(fplog,
6844                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6845                     r_bonded_limit);
6846         }
6847         comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
6848     }
6849
6850     if (dd->bInterCGcons && rconstr <= 0)
6851     {
6852         /* There is a cell size limit due to the constraints (P-LINCS) */
6853         rconstr = constr_r_max(fplog, mtop, ir);
6854         if (fplog)
6855         {
6856             fprintf(fplog,
6857                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6858                     rconstr);
6859             if (rconstr > comm->cellsize_limit)
6860             {
6861                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6862             }
6863         }
6864     }
6865     else if (rconstr > 0 && fplog)
6866     {
6867         /* Here we do not check for dd->bInterCGcons,
6868          * because one can also set a cell size limit for virtual sites only
6869          * and at this point we don't know yet if there are intercg v-sites.
6870          */
6871         fprintf(fplog,
6872                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6873                 rconstr);
6874     }
6875     comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
6876
6877     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6878
6879     if (nc[XX] > 0)
6880     {
6881         copy_ivec(nc, dd->nc);
6882         set_dd_dim(fplog, dd);
6883         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6884
6885         if (cr->npmenodes == -1)
6886         {
6887             cr->npmenodes = 0;
6888         }
6889         acs = average_cellsize_min(dd, ddbox);
6890         if (acs < comm->cellsize_limit)
6891         {
6892             if (fplog)
6893             {
6894                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6895             }
6896             gmx_fatal_collective(FARGS, cr, NULL,
6897                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6898                                  acs, comm->cellsize_limit);
6899         }
6900     }
6901     else
6902     {
6903         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6904
6905         /* We need to choose the optimal DD grid and possibly PME nodes */
6906         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6907                                comm->eDLB != edlbNO, dlb_scale,
6908                                comm->cellsize_limit, comm->cutoff,
6909                                comm->bInterCGBondeds);
6910
6911         if (dd->nc[XX] == 0)
6912         {
6913             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6914             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6915                     !bC ? "-rdd" : "-rcon",
6916                     comm->eDLB != edlbNO ? " or -dds" : "",
6917                     bC ? " or your LINCS settings" : "");
6918
6919             gmx_fatal_collective(FARGS, cr, NULL,
6920                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6921                                  "%s\n"
6922                                  "Look in the log file for details on the domain decomposition",
6923                                  cr->nnodes-cr->npmenodes, limit, buf);
6924         }
6925         set_dd_dim(fplog, dd);
6926     }
6927
6928     if (fplog)
6929     {
6930         fprintf(fplog,
6931                 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6932                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6933     }
6934
6935     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6936     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6937     {
6938         gmx_fatal_collective(FARGS, cr, NULL,
6939                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6940                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6941     }
6942     if (cr->npmenodes > dd->nnodes)
6943     {
6944         gmx_fatal_collective(FARGS, cr, NULL,
6945                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6946     }
6947     if (cr->npmenodes > 0)
6948     {
6949         comm->npmenodes = cr->npmenodes;
6950     }
6951     else
6952     {
6953         comm->npmenodes = dd->nnodes;
6954     }
6955
6956     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6957     {
6958         /* The following choices should match those
6959          * in comm_cost_est in domdec_setup.c.
6960          * Note that here the checks have to take into account
6961          * that the decomposition might occur in a different order than xyz
6962          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6963          * in which case they will not match those in comm_cost_est,
6964          * but since that is mainly for testing purposes that's fine.
6965          */
6966         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6967             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6968             getenv("GMX_PMEONEDD") == NULL)
6969         {
6970             comm->npmedecompdim = 2;
6971             comm->npmenodes_x   = dd->nc[XX];
6972             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6973         }
6974         else
6975         {
6976             /* In case nc is 1 in both x and y we could still choose to
6977              * decompose pme in y instead of x, but we use x for simplicity.
6978              */
6979             comm->npmedecompdim = 1;
6980             if (dd->dim[0] == YY)
6981             {
6982                 comm->npmenodes_x = 1;
6983                 comm->npmenodes_y = comm->npmenodes;
6984             }
6985             else
6986             {
6987                 comm->npmenodes_x = comm->npmenodes;
6988                 comm->npmenodes_y = 1;
6989             }
6990         }
6991         if (fplog)
6992         {
6993             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6994                     comm->npmenodes_x, comm->npmenodes_y, 1);
6995         }
6996     }
6997     else
6998     {
6999         comm->npmedecompdim = 0;
7000         comm->npmenodes_x   = 0;
7001         comm->npmenodes_y   = 0;
7002     }
7003
7004     /* Technically we don't need both of these,
7005      * but it simplifies code not having to recalculate it.
7006      */
7007     *npme_x = comm->npmenodes_x;
7008     *npme_y = comm->npmenodes_y;
7009
7010     snew(comm->slb_frac, DIM);
7011     if (comm->eDLB == edlbNO)
7012     {
7013         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
7014         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
7015         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
7016     }
7017
7018     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
7019     {
7020         if (comm->bBondComm || comm->eDLB != edlbNO)
7021         {
7022             /* Set the bonded communication distance to halfway
7023              * the minimum and the maximum,
7024              * since the extra communication cost is nearly zero.
7025              */
7026             acs                = average_cellsize_min(dd, ddbox);
7027             comm->cutoff_mbody = 0.5*(r_bonded + acs);
7028             if (comm->eDLB != edlbNO)
7029             {
7030                 /* Check if this does not limit the scaling */
7031                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
7032             }
7033             if (!comm->bBondComm)
7034             {
7035                 /* Without bBondComm do not go beyond the n.b. cut-off */
7036                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
7037                 if (comm->cellsize_limit >= comm->cutoff)
7038                 {
7039                     /* We don't loose a lot of efficieny
7040                      * when increasing it to the n.b. cut-off.
7041                      * It can even be slightly faster, because we need
7042                      * less checks for the communication setup.
7043                      */
7044                     comm->cutoff_mbody = comm->cutoff;
7045                 }
7046             }
7047             /* Check if we did not end up below our original limit */
7048             comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
7049
7050             if (comm->cutoff_mbody > comm->cellsize_limit)
7051             {
7052                 comm->cellsize_limit = comm->cutoff_mbody;
7053             }
7054         }
7055         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
7056     }
7057
7058     if (debug)
7059     {
7060         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
7061                 "cellsize limit %f\n",
7062                 comm->bBondComm, comm->cellsize_limit);
7063     }
7064
7065     if (MASTER(cr))
7066     {
7067         check_dd_restrictions(cr, dd, ir, fplog);
7068     }
7069
7070     comm->partition_step = INT_MIN;
7071     dd->ddp_count        = 0;
7072
7073     clear_dd_cycle_counts(dd);
7074
7075     return dd;
7076 }
7077
7078 static void set_dlb_limits(gmx_domdec_t *dd)
7079
7080 {
7081     int d;
7082
7083     for (d = 0; d < dd->ndim; d++)
7084     {
7085         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
7086         dd->comm->cellsize_min[dd->dim[d]] =
7087             dd->comm->cellsize_min_dlb[dd->dim[d]];
7088     }
7089 }
7090
7091
7092 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
7093 {
7094     gmx_domdec_t      *dd;
7095     gmx_domdec_comm_t *comm;
7096     real               cellsize_min;
7097     int                d, nc, i;
7098     char               buf[STRLEN];
7099
7100     dd   = cr->dd;
7101     comm = dd->comm;
7102
7103     if (fplog)
7104     {
7105         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
7106     }
7107
7108     cellsize_min = comm->cellsize_min[dd->dim[0]];
7109     for (d = 1; d < dd->ndim; d++)
7110     {
7111         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
7112     }
7113
7114     if (cellsize_min < comm->cellsize_limit*1.05)
7115     {
7116         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
7117
7118         /* Change DLB from "auto" to "no". */
7119         comm->eDLB = edlbNO;
7120
7121         return;
7122     }
7123
7124     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
7125     comm->bDynLoadBal = TRUE;
7126     dd->bGridJump     = TRUE;
7127
7128     set_dlb_limits(dd);
7129
7130     /* We can set the required cell size info here,
7131      * so we do not need to communicate this.
7132      * The grid is completely uniform.
7133      */
7134     for (d = 0; d < dd->ndim; d++)
7135     {
7136         if (comm->root[d])
7137         {
7138             comm->load[d].sum_m = comm->load[d].sum;
7139
7140             nc = dd->nc[dd->dim[d]];
7141             for (i = 0; i < nc; i++)
7142             {
7143                 comm->root[d]->cell_f[i]    = i/(real)nc;
7144                 if (d > 0)
7145                 {
7146                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
7147                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
7148                 }
7149             }
7150             comm->root[d]->cell_f[nc] = 1.0;
7151         }
7152     }
7153 }
7154
7155 static char *init_bLocalCG(gmx_mtop_t *mtop)
7156 {
7157     int   ncg, cg;
7158     char *bLocalCG;
7159
7160     ncg = ncg_mtop(mtop);
7161     snew(bLocalCG, ncg);
7162     for (cg = 0; cg < ncg; cg++)
7163     {
7164         bLocalCG[cg] = FALSE;
7165     }
7166
7167     return bLocalCG;
7168 }
7169
7170 void dd_init_bondeds(FILE *fplog,
7171                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
7172                      gmx_vsite_t *vsite,
7173                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
7174 {
7175     gmx_domdec_comm_t *comm;
7176
7177     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
7178
7179     comm = dd->comm;
7180
7181     if (comm->bBondComm)
7182     {
7183         /* Communicate atoms beyond the cut-off for bonded interactions */
7184         comm = dd->comm;
7185
7186         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
7187
7188         comm->bLocalCG = init_bLocalCG(mtop);
7189     }
7190     else
7191     {
7192         /* Only communicate atoms based on cut-off */
7193         comm->cglink   = NULL;
7194         comm->bLocalCG = NULL;
7195     }
7196 }
7197
7198 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
7199                               t_inputrec *ir,
7200                               gmx_bool bDynLoadBal, real dlb_scale,
7201                               gmx_ddbox_t *ddbox)
7202 {
7203     gmx_domdec_comm_t *comm;
7204     int                d;
7205     ivec               np;
7206     real               limit, shrink;
7207     char               buf[64];
7208
7209     if (fplog == NULL)
7210     {
7211         return;
7212     }
7213
7214     comm = dd->comm;
7215
7216     if (bDynLoadBal)
7217     {
7218         fprintf(fplog, "The maximum number of communication pulses is:");
7219         for (d = 0; d < dd->ndim; d++)
7220         {
7221             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
7222         }
7223         fprintf(fplog, "\n");
7224         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
7225         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
7226         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
7227         for (d = 0; d < DIM; d++)
7228         {
7229             if (dd->nc[d] > 1)
7230             {
7231                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7232                 {
7233                     shrink = 0;
7234                 }
7235                 else
7236                 {
7237                     shrink =
7238                         comm->cellsize_min_dlb[d]/
7239                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7240                 }
7241                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
7242             }
7243         }
7244         fprintf(fplog, "\n");
7245     }
7246     else
7247     {
7248         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
7249         fprintf(fplog, "The initial number of communication pulses is:");
7250         for (d = 0; d < dd->ndim; d++)
7251         {
7252             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
7253         }
7254         fprintf(fplog, "\n");
7255         fprintf(fplog, "The initial domain decomposition cell size is:");
7256         for (d = 0; d < DIM; d++)
7257         {
7258             if (dd->nc[d] > 1)
7259             {
7260                 fprintf(fplog, " %c %.2f nm",
7261                         dim2char(d), dd->comm->cellsize_min[d]);
7262             }
7263         }
7264         fprintf(fplog, "\n\n");
7265     }
7266
7267     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7268     {
7269         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7270         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7271                 "non-bonded interactions", "", comm->cutoff);
7272
7273         if (bDynLoadBal)
7274         {
7275             limit = dd->comm->cellsize_limit;
7276         }
7277         else
7278         {
7279             if (dynamic_dd_box(ddbox, ir))
7280             {
7281                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7282             }
7283             limit = dd->comm->cellsize_min[XX];
7284             for (d = 1; d < DIM; d++)
7285             {
7286                 limit = std::min(limit, dd->comm->cellsize_min[d]);
7287             }
7288         }
7289
7290         if (comm->bInterCGBondeds)
7291         {
7292             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7293                     "two-body bonded interactions", "(-rdd)",
7294                     std::max(comm->cutoff, comm->cutoff_mbody));
7295             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7296                     "multi-body bonded interactions", "(-rdd)",
7297                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
7298         }
7299         if (dd->vsite_comm)
7300         {
7301             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7302                     "virtual site constructions", "(-rcon)", limit);
7303         }
7304         if (dd->constraint_comm)
7305         {
7306             sprintf(buf, "atoms separated by up to %d constraints",
7307                     1+ir->nProjOrder);
7308             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7309                     buf, "(-rcon)", limit);
7310         }
7311         fprintf(fplog, "\n");
7312     }
7313
7314     fflush(fplog);
7315 }
7316
7317 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7318                                 real               dlb_scale,
7319                                 const t_inputrec  *ir,
7320                                 const gmx_ddbox_t *ddbox)
7321 {
7322     gmx_domdec_comm_t *comm;
7323     int                d, dim, npulse, npulse_d_max, npulse_d;
7324     gmx_bool           bNoCutOff;
7325
7326     comm = dd->comm;
7327
7328     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7329
7330     /* Determine the maximum number of comm. pulses in one dimension */
7331
7332     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7333
7334     /* Determine the maximum required number of grid pulses */
7335     if (comm->cellsize_limit >= comm->cutoff)
7336     {
7337         /* Only a single pulse is required */
7338         npulse = 1;
7339     }
7340     else if (!bNoCutOff && comm->cellsize_limit > 0)
7341     {
7342         /* We round down slightly here to avoid overhead due to the latency
7343          * of extra communication calls when the cut-off
7344          * would be only slightly longer than the cell size.
7345          * Later cellsize_limit is redetermined,
7346          * so we can not miss interactions due to this rounding.
7347          */
7348         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7349     }
7350     else
7351     {
7352         /* There is no cell size limit */
7353         npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7354     }
7355
7356     if (!bNoCutOff && npulse > 1)
7357     {
7358         /* See if we can do with less pulses, based on dlb_scale */
7359         npulse_d_max = 0;
7360         for (d = 0; d < dd->ndim; d++)
7361         {
7362             dim      = dd->dim[d];
7363             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7364                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7365             npulse_d_max = std::max(npulse_d_max, npulse_d);
7366         }
7367         npulse = std::min(npulse, npulse_d_max);
7368     }
7369
7370     /* This env var can override npulse */
7371     d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7372     if (d > 0)
7373     {
7374         npulse = d;
7375     }
7376
7377     comm->maxpulse       = 1;
7378     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7379     for (d = 0; d < dd->ndim; d++)
7380     {
7381         comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
7382         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7383         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7384         comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
7385         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7386         {
7387             comm->bVacDLBNoLimit = FALSE;
7388         }
7389     }
7390
7391     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7392     if (!comm->bVacDLBNoLimit)
7393     {
7394         comm->cellsize_limit = std::max(comm->cellsize_limit,
7395                                         comm->cutoff/comm->maxpulse);
7396     }
7397     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7398     /* Set the minimum cell size for each DD dimension */
7399     for (d = 0; d < dd->ndim; d++)
7400     {
7401         if (comm->bVacDLBNoLimit ||
7402             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7403         {
7404             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7405         }
7406         else
7407         {
7408             comm->cellsize_min_dlb[dd->dim[d]] =
7409                 comm->cutoff/comm->cd[d].np_dlb;
7410         }
7411     }
7412     if (comm->cutoff_mbody <= 0)
7413     {
7414         comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
7415     }
7416     if (comm->bDynLoadBal)
7417     {
7418         set_dlb_limits(dd);
7419     }
7420 }
7421
7422 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7423 {
7424     /* If each molecule is a single charge group
7425      * or we use domain decomposition for each periodic dimension,
7426      * we do not need to take pbc into account for the bonded interactions.
7427      */
7428     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7429             !(dd->nc[XX] > 1 &&
7430               dd->nc[YY] > 1 &&
7431               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7432 }
7433
7434 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7435                        t_inputrec *ir, gmx_ddbox_t *ddbox)
7436 {
7437     gmx_domdec_comm_t *comm;
7438     int                natoms_tot;
7439     real               vol_frac;
7440
7441     comm = dd->comm;
7442
7443     /* Initialize the thread data.
7444      * This can not be done in init_domain_decomposition,
7445      * as the numbers of threads is determined later.
7446      */
7447     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7448     if (comm->nth > 1)
7449     {
7450         snew(comm->dth, comm->nth);
7451     }
7452
7453     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7454     {
7455         init_ddpme(dd, &comm->ddpme[0], 0);
7456         if (comm->npmedecompdim >= 2)
7457         {
7458             init_ddpme(dd, &comm->ddpme[1], 1);
7459         }
7460     }
7461     else
7462     {
7463         comm->npmenodes = 0;
7464         if (dd->pme_nodeid >= 0)
7465         {
7466             gmx_fatal_collective(FARGS, NULL, dd,
7467                                  "Can not have separate PME ranks without PME electrostatics");
7468         }
7469     }
7470
7471     if (debug)
7472     {
7473         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7474     }
7475     if (comm->eDLB != edlbNO)
7476     {
7477         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7478     }
7479
7480     print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
7481     if (comm->eDLB == edlbAUTO)
7482     {
7483         if (fplog)
7484         {
7485             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7486         }
7487         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7488     }
7489
7490     if (ir->ePBC == epbcNONE)
7491     {
7492         vol_frac = 1 - 1/(double)dd->nnodes;
7493     }
7494     else
7495     {
7496         vol_frac =
7497             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7498     }
7499     if (debug)
7500     {
7501         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7502     }
7503     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7504
7505     dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
7506 }
7507
7508 static gmx_bool test_dd_cutoff(t_commrec *cr,
7509                                t_state *state, t_inputrec *ir,
7510                                real cutoff_req)
7511 {
7512     gmx_domdec_t *dd;
7513     gmx_ddbox_t   ddbox;
7514     int           d, dim, np;
7515     real          inv_cell_size;
7516     int           LocallyLimited;
7517
7518     dd = cr->dd;
7519
7520     set_ddbox(dd, FALSE, cr, ir, state->box,
7521               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7522
7523     LocallyLimited = 0;
7524
7525     for (d = 0; d < dd->ndim; d++)
7526     {
7527         dim = dd->dim[d];
7528
7529         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7530         if (dynamic_dd_box(&ddbox, ir))
7531         {
7532             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7533         }
7534
7535         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7536
7537         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7538             dd->comm->cd[d].np_dlb > 0)
7539         {
7540             if (np > dd->comm->cd[d].np_dlb)
7541             {
7542                 return FALSE;
7543             }
7544
7545             /* If a current local cell size is smaller than the requested
7546              * cut-off, we could still fix it, but this gets very complicated.
7547              * Without fixing here, we might actually need more checks.
7548              */
7549             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7550             {
7551                 LocallyLimited = 1;
7552             }
7553         }
7554     }
7555
7556     if (dd->comm->eDLB != edlbNO)
7557     {
7558         /* If DLB is not active yet, we don't need to check the grid jumps.
7559          * Actually we shouldn't, because then the grid jump data is not set.
7560          */
7561         if (dd->comm->bDynLoadBal &&
7562             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7563         {
7564             LocallyLimited = 1;
7565         }
7566
7567         gmx_sumi(1, &LocallyLimited, cr);
7568
7569         if (LocallyLimited > 0)
7570         {
7571             return FALSE;
7572         }
7573     }
7574
7575     return TRUE;
7576 }
7577
7578 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7579                           real cutoff_req)
7580 {
7581     gmx_bool bCutoffAllowed;
7582
7583     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7584
7585     if (bCutoffAllowed)
7586     {
7587         cr->dd->comm->cutoff = cutoff_req;
7588     }
7589
7590     return bCutoffAllowed;
7591 }
7592
7593 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7594 {
7595     gmx_domdec_comm_t *comm;
7596
7597     comm = cr->dd->comm;
7598
7599     /* Turn on the DLB limiting (might have been on already) */
7600     comm->bPMELoadBalDLBLimits = TRUE;
7601
7602     /* Change the cut-off limit */
7603     comm->PMELoadBal_max_cutoff = comm->cutoff;
7604 }
7605
7606 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7607 {
7608     return dd->comm->bDLB_locked;
7609 }
7610
7611 void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue)
7612 {
7613     /* We can only lock the DLB when it is set to auto, otherwise don't lock */
7614     if (dd->comm->eDLB == edlbAUTO)
7615     {
7616         dd->comm->bDLB_locked = bValue;
7617     }
7618 }
7619
7620 static void merge_cg_buffers(int ncell,
7621                              gmx_domdec_comm_dim_t *cd, int pulse,
7622                              int  *ncg_cell,
7623                              int  *index_gl, int  *recv_i,
7624                              rvec *cg_cm,    rvec *recv_vr,
7625                              int *cgindex,
7626                              cginfo_mb_t *cginfo_mb, int *cginfo)
7627 {
7628     gmx_domdec_ind_t *ind, *ind_p;
7629     int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
7630     int               shift, shift_at;
7631
7632     ind = &cd->ind[pulse];
7633
7634     /* First correct the already stored data */
7635     shift = ind->nrecv[ncell];
7636     for (cell = ncell-1; cell >= 0; cell--)
7637     {
7638         shift -= ind->nrecv[cell];
7639         if (shift > 0)
7640         {
7641             /* Move the cg's present from previous grid pulses */
7642             cg0                = ncg_cell[ncell+cell];
7643             cg1                = ncg_cell[ncell+cell+1];
7644             cgindex[cg1+shift] = cgindex[cg1];
7645             for (cg = cg1-1; cg >= cg0; cg--)
7646             {
7647                 index_gl[cg+shift] = index_gl[cg];
7648                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7649                 cgindex[cg+shift] = cgindex[cg];
7650                 cginfo[cg+shift]  = cginfo[cg];
7651             }
7652             /* Correct the already stored send indices for the shift */
7653             for (p = 1; p <= pulse; p++)
7654             {
7655                 ind_p = &cd->ind[p];
7656                 cg0   = 0;
7657                 for (c = 0; c < cell; c++)
7658                 {
7659                     cg0 += ind_p->nsend[c];
7660                 }
7661                 cg1 = cg0 + ind_p->nsend[cell];
7662                 for (cg = cg0; cg < cg1; cg++)
7663                 {
7664                     ind_p->index[cg] += shift;
7665                 }
7666             }
7667         }
7668     }
7669
7670     /* Merge in the communicated buffers */
7671     shift    = 0;
7672     shift_at = 0;
7673     cg0      = 0;
7674     for (cell = 0; cell < ncell; cell++)
7675     {
7676         cg1 = ncg_cell[ncell+cell+1] + shift;
7677         if (shift_at > 0)
7678         {
7679             /* Correct the old cg indices */
7680             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7681             {
7682                 cgindex[cg+1] += shift_at;
7683             }
7684         }
7685         for (cg = 0; cg < ind->nrecv[cell]; cg++)
7686         {
7687             /* Copy this charge group from the buffer */
7688             index_gl[cg1] = recv_i[cg0];
7689             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7690             /* Add it to the cgindex */
7691             cg_gl          = index_gl[cg1];
7692             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
7693             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
7694             cgindex[cg1+1] = cgindex[cg1] + nat;
7695             cg0++;
7696             cg1++;
7697             shift_at += nat;
7698         }
7699         shift                 += ind->nrecv[cell];
7700         ncg_cell[ncell+cell+1] = cg1;
7701     }
7702 }
7703
7704 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7705                                int nzone, int cg0, const int *cgindex)
7706 {
7707     int cg, zone, p;
7708
7709     /* Store the atom block boundaries for easy copying of communication buffers
7710      */
7711     cg = cg0;
7712     for (zone = 0; zone < nzone; zone++)
7713     {
7714         for (p = 0; p < cd->np; p++)
7715         {
7716             cd->ind[p].cell2at0[zone] = cgindex[cg];
7717             cg += cd->ind[p].nrecv[zone];
7718             cd->ind[p].cell2at1[zone] = cgindex[cg];
7719         }
7720     }
7721 }
7722
7723 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7724 {
7725     int      i;
7726     gmx_bool bMiss;
7727
7728     bMiss = FALSE;
7729     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7730     {
7731         if (!bLocalCG[link->a[i]])
7732         {
7733             bMiss = TRUE;
7734         }
7735     }
7736
7737     return bMiss;
7738 }
7739
7740 /* Domain corners for communication, a maximum of 4 i-zones see a j domain */
7741 typedef struct {
7742     real c[DIM][4]; /* the corners for the non-bonded communication */
7743     real cr0;       /* corner for rounding */
7744     real cr1[4];    /* corners for rounding */
7745     real bc[DIM];   /* corners for bounded communication */
7746     real bcr1;      /* corner for rounding for bonded communication */
7747 } dd_corners_t;
7748
7749 /* Determine the corners of the domain(s) we are communicating with */
7750 static void
7751 set_dd_corners(const gmx_domdec_t *dd,
7752                int dim0, int dim1, int dim2,
7753                gmx_bool bDistMB,
7754                dd_corners_t *c)
7755 {
7756     const gmx_domdec_comm_t  *comm;
7757     const gmx_domdec_zones_t *zones;
7758     int i, j;
7759
7760     comm = dd->comm;
7761
7762     zones = &comm->zones;
7763
7764     /* Keep the compiler happy */
7765     c->cr0  = 0;
7766     c->bcr1 = 0;
7767
7768     /* The first dimension is equal for all cells */
7769     c->c[0][0] = comm->cell_x0[dim0];
7770     if (bDistMB)
7771     {
7772         c->bc[0] = c->c[0][0];
7773     }
7774     if (dd->ndim >= 2)
7775     {
7776         dim1 = dd->dim[1];
7777         /* This cell row is only seen from the first row */
7778         c->c[1][0] = comm->cell_x0[dim1];
7779         /* All rows can see this row */
7780         c->c[1][1] = comm->cell_x0[dim1];
7781         if (dd->bGridJump)
7782         {
7783             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7784             if (bDistMB)
7785             {
7786                 /* For the multi-body distance we need the maximum */
7787                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7788             }
7789         }
7790         /* Set the upper-right corner for rounding */
7791         c->cr0 = comm->cell_x1[dim0];
7792
7793         if (dd->ndim >= 3)
7794         {
7795             dim2 = dd->dim[2];
7796             for (j = 0; j < 4; j++)
7797             {
7798                 c->c[2][j] = comm->cell_x0[dim2];
7799             }
7800             if (dd->bGridJump)
7801             {
7802                 /* Use the maximum of the i-cells that see a j-cell */
7803                 for (i = 0; i < zones->nizone; i++)
7804                 {
7805                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7806                     {
7807                         if (j >= 4)
7808                         {
7809                             c->c[2][j-4] =
7810                                 std::max(c->c[2][j-4],
7811                                          comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7812                         }
7813                     }
7814                 }
7815                 if (bDistMB)
7816                 {
7817                     /* For the multi-body distance we need the maximum */
7818                     c->bc[2] = comm->cell_x0[dim2];
7819                     for (i = 0; i < 2; i++)
7820                     {
7821                         for (j = 0; j < 2; j++)
7822                         {
7823                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
7824                         }
7825                     }
7826                 }
7827             }
7828
7829             /* Set the upper-right corner for rounding */
7830             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7831              * Only cell (0,0,0) can see cell 7 (1,1,1)
7832              */
7833             c->cr1[0] = comm->cell_x1[dim1];
7834             c->cr1[3] = comm->cell_x1[dim1];
7835             if (dd->bGridJump)
7836             {
7837                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7838                 if (bDistMB)
7839                 {
7840                     /* For the multi-body distance we need the maximum */
7841                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7842                 }
7843             }
7844         }
7845     }
7846 }
7847
7848 /* Determine which cg's we need to send in this pulse from this zone */
7849 static void
7850 get_zone_pulse_cgs(gmx_domdec_t *dd,
7851                    int zonei, int zone,
7852                    int cg0, int cg1,
7853                    const int *index_gl,
7854                    const int *cgindex,
7855                    int dim, int dim_ind,
7856                    int dim0, int dim1, int dim2,
7857                    real r_comm2, real r_bcomm2,
7858                    matrix box,
7859                    ivec tric_dist,
7860                    rvec *normal,
7861                    real skew_fac2_d, real skew_fac_01,
7862                    rvec *v_d, rvec *v_0, rvec *v_1,
7863                    const dd_corners_t *c,
7864                    rvec sf2_round,
7865                    gmx_bool bDistBonded,
7866                    gmx_bool bBondComm,
7867                    gmx_bool bDist2B,
7868                    gmx_bool bDistMB,
7869                    rvec *cg_cm,
7870                    int *cginfo,
7871                    gmx_domdec_ind_t *ind,
7872                    int **ibuf, int *ibuf_nalloc,
7873                    vec_rvec_t *vbuf,
7874                    int *nsend_ptr,
7875                    int *nat_ptr,
7876                    int *nsend_z_ptr)
7877 {
7878     gmx_domdec_comm_t *comm;
7879     gmx_bool           bScrew;
7880     gmx_bool           bDistMB_pulse;
7881     int                cg, i;
7882     real               r2, rb2, r, tric_sh;
7883     rvec               rn, rb;
7884     int                dimd;
7885     int                nsend_z, nsend, nat;
7886
7887     comm = dd->comm;
7888
7889     bScrew = (dd->bScrewPBC && dim == XX);
7890
7891     bDistMB_pulse = (bDistMB && bDistBonded);
7892
7893     nsend_z = 0;
7894     nsend   = *nsend_ptr;
7895     nat     = *nat_ptr;
7896
7897     for (cg = cg0; cg < cg1; cg++)
7898     {
7899         r2  = 0;
7900         rb2 = 0;
7901         if (tric_dist[dim_ind] == 0)
7902         {
7903             /* Rectangular direction, easy */
7904             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7905             if (r > 0)
7906             {
7907                 r2 += r*r;
7908             }
7909             if (bDistMB_pulse)
7910             {
7911                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7912                 if (r > 0)
7913                 {
7914                     rb2 += r*r;
7915                 }
7916             }
7917             /* Rounding gives at most a 16% reduction
7918              * in communicated atoms
7919              */
7920             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7921             {
7922                 r = cg_cm[cg][dim0] - c->cr0;
7923                 /* This is the first dimension, so always r >= 0 */
7924                 r2 += r*r;
7925                 if (bDistMB_pulse)
7926                 {
7927                     rb2 += r*r;
7928                 }
7929             }
7930             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7931             {
7932                 r = cg_cm[cg][dim1] - c->cr1[zone];
7933                 if (r > 0)
7934                 {
7935                     r2 += r*r;
7936                 }
7937                 if (bDistMB_pulse)
7938                 {
7939                     r = cg_cm[cg][dim1] - c->bcr1;
7940                     if (r > 0)
7941                     {
7942                         rb2 += r*r;
7943                     }
7944                 }
7945             }
7946         }
7947         else
7948         {
7949             /* Triclinic direction, more complicated */
7950             clear_rvec(rn);
7951             clear_rvec(rb);
7952             /* Rounding, conservative as the skew_fac multiplication
7953              * will slightly underestimate the distance.
7954              */
7955             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7956             {
7957                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7958                 for (i = dim0+1; i < DIM; i++)
7959                 {
7960                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7961                 }
7962                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7963                 if (bDistMB_pulse)
7964                 {
7965                     rb[dim0] = rn[dim0];
7966                     rb2      = r2;
7967                 }
7968                 /* Take care that the cell planes along dim0 might not
7969                  * be orthogonal to those along dim1 and dim2.
7970                  */
7971                 for (i = 1; i <= dim_ind; i++)
7972                 {
7973                     dimd = dd->dim[i];
7974                     if (normal[dim0][dimd] > 0)
7975                     {
7976                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7977                         if (bDistMB_pulse)
7978                         {
7979                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7980                         }
7981                     }
7982                 }
7983             }
7984             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7985             {
7986                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7987                 tric_sh   = 0;
7988                 for (i = dim1+1; i < DIM; i++)
7989                 {
7990                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7991                 }
7992                 rn[dim1] += tric_sh;
7993                 if (rn[dim1] > 0)
7994                 {
7995                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7996                     /* Take care of coupling of the distances
7997                      * to the planes along dim0 and dim1 through dim2.
7998                      */
7999                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
8000                     /* Take care that the cell planes along dim1
8001                      * might not be orthogonal to that along dim2.
8002                      */
8003                     if (normal[dim1][dim2] > 0)
8004                     {
8005                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
8006                     }
8007                 }
8008                 if (bDistMB_pulse)
8009                 {
8010                     rb[dim1] +=
8011                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
8012                     if (rb[dim1] > 0)
8013                     {
8014                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
8015                         /* Take care of coupling of the distances
8016                          * to the planes along dim0 and dim1 through dim2.
8017                          */
8018                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
8019                         /* Take care that the cell planes along dim1
8020                          * might not be orthogonal to that along dim2.
8021                          */
8022                         if (normal[dim1][dim2] > 0)
8023                         {
8024                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
8025                         }
8026                     }
8027                 }
8028             }
8029             /* The distance along the communication direction */
8030             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
8031             tric_sh  = 0;
8032             for (i = dim+1; i < DIM; i++)
8033             {
8034                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
8035             }
8036             rn[dim] += tric_sh;
8037             if (rn[dim] > 0)
8038             {
8039                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
8040                 /* Take care of coupling of the distances
8041                  * to the planes along dim0 and dim1 through dim2.
8042                  */
8043                 if (dim_ind == 1 && zonei == 1)
8044                 {
8045                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
8046                 }
8047             }
8048             if (bDistMB_pulse)
8049             {
8050                 clear_rvec(rb);
8051                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
8052                 if (rb[dim] > 0)
8053                 {
8054                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
8055                     /* Take care of coupling of the distances
8056                      * to the planes along dim0 and dim1 through dim2.
8057                      */
8058                     if (dim_ind == 1 && zonei == 1)
8059                     {
8060                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
8061                     }
8062                 }
8063             }
8064         }
8065
8066         if (r2 < r_comm2 ||
8067             (bDistBonded &&
8068              ((bDistMB && rb2 < r_bcomm2) ||
8069               (bDist2B && r2  < r_bcomm2)) &&
8070              (!bBondComm ||
8071               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
8072                missing_link(comm->cglink, index_gl[cg],
8073                             comm->bLocalCG)))))
8074         {
8075             /* Make an index to the local charge groups */
8076             if (nsend+1 > ind->nalloc)
8077             {
8078                 ind->nalloc = over_alloc_large(nsend+1);
8079                 srenew(ind->index, ind->nalloc);
8080             }
8081             if (nsend+1 > *ibuf_nalloc)
8082             {
8083                 *ibuf_nalloc = over_alloc_large(nsend+1);
8084                 srenew(*ibuf, *ibuf_nalloc);
8085             }
8086             ind->index[nsend] = cg;
8087             (*ibuf)[nsend]    = index_gl[cg];
8088             nsend_z++;
8089             vec_rvec_check_alloc(vbuf, nsend+1);
8090
8091             if (dd->ci[dim] == 0)
8092             {
8093                 /* Correct cg_cm for pbc */
8094                 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
8095                 if (bScrew)
8096                 {
8097                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
8098                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
8099                 }
8100             }
8101             else
8102             {
8103                 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
8104             }
8105             nsend++;
8106             nat += cgindex[cg+1] - cgindex[cg];
8107         }
8108     }
8109
8110     *nsend_ptr   = nsend;
8111     *nat_ptr     = nat;
8112     *nsend_z_ptr = nsend_z;
8113 }
8114
8115 static void setup_dd_communication(gmx_domdec_t *dd,
8116                                    matrix box, gmx_ddbox_t *ddbox,
8117                                    t_forcerec *fr, t_state *state, rvec **f)
8118 {
8119     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
8120     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
8121     int                    c, i, cg, cg_gl, nrcg;
8122     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
8123     gmx_domdec_comm_t     *comm;
8124     gmx_domdec_zones_t    *zones;
8125     gmx_domdec_comm_dim_t *cd;
8126     gmx_domdec_ind_t      *ind;
8127     cginfo_mb_t           *cginfo_mb;
8128     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
8129     real                   r_comm2, r_bcomm2;
8130     dd_corners_t           corners;
8131     ivec                   tric_dist;
8132     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
8133     real                   skew_fac2_d, skew_fac_01;
8134     rvec                   sf2_round;
8135     int                    nsend, nat;
8136     int                    th;
8137
8138     if (debug)
8139     {
8140         fprintf(debug, "Setting up DD communication\n");
8141     }
8142
8143     comm  = dd->comm;
8144
8145     switch (fr->cutoff_scheme)
8146     {
8147         case ecutsGROUP:
8148             cg_cm = fr->cg_cm;
8149             break;
8150         case ecutsVERLET:
8151             cg_cm = state->x;
8152             break;
8153         default:
8154             gmx_incons("unimplemented");
8155             cg_cm = NULL;
8156     }
8157
8158     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8159     {
8160         /* Check if we need to use triclinic distances */
8161         tric_dist[dim_ind] = 0;
8162         for (i = 0; i <= dim_ind; i++)
8163         {
8164             if (ddbox->tric_dir[dd->dim[i]])
8165             {
8166                 tric_dist[dim_ind] = 1;
8167             }
8168         }
8169     }
8170
8171     bBondComm = comm->bBondComm;
8172
8173     /* Do we need to determine extra distances for multi-body bondeds? */
8174     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8175
8176     /* Do we need to determine extra distances for only two-body bondeds? */
8177     bDist2B = (bBondComm && !bDistMB);
8178
8179     r_comm2  = sqr(comm->cutoff);
8180     r_bcomm2 = sqr(comm->cutoff_mbody);
8181
8182     if (debug)
8183     {
8184         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
8185     }
8186
8187     zones = &comm->zones;
8188
8189     dim0 = dd->dim[0];
8190     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8191     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8192
8193     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8194
8195     /* Triclinic stuff */
8196     normal      = ddbox->normal;
8197     skew_fac_01 = 0;
8198     if (dd->ndim >= 2)
8199     {
8200         v_0 = ddbox->v[dim0];
8201         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8202         {
8203             /* Determine the coupling coefficient for the distances
8204              * to the cell planes along dim0 and dim1 through dim2.
8205              * This is required for correct rounding.
8206              */
8207             skew_fac_01 =
8208                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8209             if (debug)
8210             {
8211                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8212             }
8213         }
8214     }
8215     if (dd->ndim >= 3)
8216     {
8217         v_1 = ddbox->v[dim1];
8218     }
8219
8220     zone_cg_range = zones->cg_range;
8221     index_gl      = dd->index_gl;
8222     cgindex       = dd->cgindex;
8223     cginfo_mb     = fr->cginfo_mb;
8224
8225     zone_cg_range[0]   = 0;
8226     zone_cg_range[1]   = dd->ncg_home;
8227     comm->zone_ncg1[0] = dd->ncg_home;
8228     pos_cg             = dd->ncg_home;
8229
8230     nat_tot = dd->nat_home;
8231     nzone   = 1;
8232     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8233     {
8234         dim = dd->dim[dim_ind];
8235         cd  = &comm->cd[dim_ind];
8236
8237         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8238         {
8239             /* No pbc in this dimension, the first node should not comm. */
8240             nzone_send = 0;
8241         }
8242         else
8243         {
8244             nzone_send = nzone;
8245         }
8246
8247         v_d         = ddbox->v[dim];
8248         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8249
8250         cd->bInPlace = TRUE;
8251         for (p = 0; p < cd->np; p++)
8252         {
8253             /* Only atoms communicated in the first pulse are used
8254              * for multi-body bonded interactions or for bBondComm.
8255              */
8256             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8257
8258             ind   = &cd->ind[p];
8259             nsend = 0;
8260             nat   = 0;
8261             for (zone = 0; zone < nzone_send; zone++)
8262             {
8263                 if (tric_dist[dim_ind] && dim_ind > 0)
8264                 {
8265                     /* Determine slightly more optimized skew_fac's
8266                      * for rounding.
8267                      * This reduces the number of communicated atoms
8268                      * by about 10% for 3D DD of rhombic dodecahedra.
8269                      */
8270                     for (dimd = 0; dimd < dim; dimd++)
8271                     {
8272                         sf2_round[dimd] = 1;
8273                         if (ddbox->tric_dir[dimd])
8274                         {
8275                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8276                             {
8277                                 /* If we are shifted in dimension i
8278                                  * and the cell plane is tilted forward
8279                                  * in dimension i, skip this coupling.
8280                                  */
8281                                 if (!(zones->shift[nzone+zone][i] &&
8282                                       ddbox->v[dimd][i][dimd] >= 0))
8283                                 {
8284                                     sf2_round[dimd] +=
8285                                         sqr(ddbox->v[dimd][i][dimd]);
8286                                 }
8287                             }
8288                             sf2_round[dimd] = 1/sf2_round[dimd];
8289                         }
8290                     }
8291                 }
8292
8293                 zonei = zone_perm[dim_ind][zone];
8294                 if (p == 0)
8295                 {
8296                     /* Here we permutate the zones to obtain a convenient order
8297                      * for neighbor searching
8298                      */
8299                     cg0 = zone_cg_range[zonei];
8300                     cg1 = zone_cg_range[zonei+1];
8301                 }
8302                 else
8303                 {
8304                     /* Look only at the cg's received in the previous grid pulse
8305                      */
8306                     cg1 = zone_cg_range[nzone+zone+1];
8307                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8308                 }
8309
8310 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8311                 for (th = 0; th < comm->nth; th++)
8312                 {
8313                     gmx_domdec_ind_t *ind_p;
8314                     int             **ibuf_p, *ibuf_nalloc_p;
8315                     vec_rvec_t       *vbuf_p;
8316                     int              *nsend_p, *nat_p;
8317                     int              *nsend_zone_p;
8318                     int               cg0_th, cg1_th;
8319
8320                     if (th == 0)
8321                     {
8322                         /* Thread 0 writes in the comm buffers */
8323                         ind_p         = ind;
8324                         ibuf_p        = &comm->buf_int;
8325                         ibuf_nalloc_p = &comm->nalloc_int;
8326                         vbuf_p        = &comm->vbuf;
8327                         nsend_p       = &nsend;
8328                         nat_p         = &nat;
8329                         nsend_zone_p  = &ind->nsend[zone];
8330                     }
8331                     else
8332                     {
8333                         /* Other threads write into temp buffers */
8334                         ind_p         = &comm->dth[th].ind;
8335                         ibuf_p        = &comm->dth[th].ibuf;
8336                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8337                         vbuf_p        = &comm->dth[th].vbuf;
8338                         nsend_p       = &comm->dth[th].nsend;
8339                         nat_p         = &comm->dth[th].nat;
8340                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8341
8342                         comm->dth[th].nsend      = 0;
8343                         comm->dth[th].nat        = 0;
8344                         comm->dth[th].nsend_zone = 0;
8345                     }
8346
8347                     if (comm->nth == 1)
8348                     {
8349                         cg0_th = cg0;
8350                         cg1_th = cg1;
8351                     }
8352                     else
8353                     {
8354                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8355                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8356                     }
8357
8358                     /* Get the cg's for this pulse in this zone */
8359                     get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8360                                        index_gl, cgindex,
8361                                        dim, dim_ind, dim0, dim1, dim2,
8362                                        r_comm2, r_bcomm2,
8363                                        box, tric_dist,
8364                                        normal, skew_fac2_d, skew_fac_01,
8365                                        v_d, v_0, v_1, &corners, sf2_round,
8366                                        bDistBonded, bBondComm,
8367                                        bDist2B, bDistMB,
8368                                        cg_cm, fr->cginfo,
8369                                        ind_p,
8370                                        ibuf_p, ibuf_nalloc_p,
8371                                        vbuf_p,
8372                                        nsend_p, nat_p,
8373                                        nsend_zone_p);
8374                 }
8375
8376                 /* Append data of threads>=1 to the communication buffers */
8377                 for (th = 1; th < comm->nth; th++)
8378                 {
8379                     dd_comm_setup_work_t *dth;
8380                     int                   i, ns1;
8381
8382                     dth = &comm->dth[th];
8383
8384                     ns1 = nsend + dth->nsend_zone;
8385                     if (ns1 > ind->nalloc)
8386                     {
8387                         ind->nalloc = over_alloc_dd(ns1);
8388                         srenew(ind->index, ind->nalloc);
8389                     }
8390                     if (ns1 > comm->nalloc_int)
8391                     {
8392                         comm->nalloc_int = over_alloc_dd(ns1);
8393                         srenew(comm->buf_int, comm->nalloc_int);
8394                     }
8395                     if (ns1 > comm->vbuf.nalloc)
8396                     {
8397                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8398                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8399                     }
8400
8401                     for (i = 0; i < dth->nsend_zone; i++)
8402                     {
8403                         ind->index[nsend]    = dth->ind.index[i];
8404                         comm->buf_int[nsend] = dth->ibuf[i];
8405                         copy_rvec(dth->vbuf.v[i],
8406                                   comm->vbuf.v[nsend]);
8407                         nsend++;
8408                     }
8409                     nat              += dth->nat;
8410                     ind->nsend[zone] += dth->nsend_zone;
8411                 }
8412             }
8413             /* Clear the counts in case we do not have pbc */
8414             for (zone = nzone_send; zone < nzone; zone++)
8415             {
8416                 ind->nsend[zone] = 0;
8417             }
8418             ind->nsend[nzone]   = nsend;
8419             ind->nsend[nzone+1] = nat;
8420             /* Communicate the number of cg's and atoms to receive */
8421             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8422                             ind->nsend, nzone+2,
8423                             ind->nrecv, nzone+2);
8424
8425             /* The rvec buffer is also required for atom buffers of size nsend
8426              * in dd_move_x and dd_move_f.
8427              */
8428             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8429
8430             if (p > 0)
8431             {
8432                 /* We can receive in place if only the last zone is not empty */
8433                 for (zone = 0; zone < nzone-1; zone++)
8434                 {
8435                     if (ind->nrecv[zone] > 0)
8436                     {
8437                         cd->bInPlace = FALSE;
8438                     }
8439                 }
8440                 if (!cd->bInPlace)
8441                 {
8442                     /* The int buffer is only required here for the cg indices */
8443                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8444                     {
8445                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8446                         srenew(comm->buf_int2, comm->nalloc_int2);
8447                     }
8448                     /* The rvec buffer is also required for atom buffers
8449                      * of size nrecv in dd_move_x and dd_move_f.
8450                      */
8451                     i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8452                     vec_rvec_check_alloc(&comm->vbuf2, i);
8453                 }
8454             }
8455
8456             /* Make space for the global cg indices */
8457             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8458                 || dd->cg_nalloc == 0)
8459             {
8460                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8461                 srenew(index_gl, dd->cg_nalloc);
8462                 srenew(cgindex, dd->cg_nalloc+1);
8463             }
8464             /* Communicate the global cg indices */
8465             if (cd->bInPlace)
8466             {
8467                 recv_i = index_gl + pos_cg;
8468             }
8469             else
8470             {
8471                 recv_i = comm->buf_int2;
8472             }
8473             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8474                             comm->buf_int, nsend,
8475                             recv_i,        ind->nrecv[nzone]);
8476
8477             /* Make space for cg_cm */
8478             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8479             if (fr->cutoff_scheme == ecutsGROUP)
8480             {
8481                 cg_cm = fr->cg_cm;
8482             }
8483             else
8484             {
8485                 cg_cm = state->x;
8486             }
8487             /* Communicate cg_cm */
8488             if (cd->bInPlace)
8489             {
8490                 recv_vr = cg_cm + pos_cg;
8491             }
8492             else
8493             {
8494                 recv_vr = comm->vbuf2.v;
8495             }
8496             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8497                              comm->vbuf.v, nsend,
8498                              recv_vr,      ind->nrecv[nzone]);
8499
8500             /* Make the charge group index */
8501             if (cd->bInPlace)
8502             {
8503                 zone = (p == 0 ? 0 : nzone - 1);
8504                 while (zone < nzone)
8505                 {
8506                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8507                     {
8508                         cg_gl              = index_gl[pos_cg];
8509                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8510                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8511                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8512                         if (bBondComm)
8513                         {
8514                             /* Update the charge group presence,
8515                              * so we can use it in the next pass of the loop.
8516                              */
8517                             comm->bLocalCG[cg_gl] = TRUE;
8518                         }
8519                         pos_cg++;
8520                     }
8521                     if (p == 0)
8522                     {
8523                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8524                     }
8525                     zone++;
8526                     zone_cg_range[nzone+zone] = pos_cg;
8527                 }
8528             }
8529             else
8530             {
8531                 /* This part of the code is never executed with bBondComm. */
8532                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8533                                  index_gl, recv_i, cg_cm, recv_vr,
8534                                  cgindex, fr->cginfo_mb, fr->cginfo);
8535                 pos_cg += ind->nrecv[nzone];
8536             }
8537             nat_tot += ind->nrecv[nzone+1];
8538         }
8539         if (!cd->bInPlace)
8540         {
8541             /* Store the atom block for easy copying of communication buffers */
8542             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8543         }
8544         nzone += nzone;
8545     }
8546     dd->index_gl = index_gl;
8547     dd->cgindex  = cgindex;
8548
8549     dd->ncg_tot          = zone_cg_range[zones->n];
8550     dd->nat_tot          = nat_tot;
8551     comm->nat[ddnatHOME] = dd->nat_home;
8552     for (i = ddnatZONE; i < ddnatNR; i++)
8553     {
8554         comm->nat[i] = dd->nat_tot;
8555     }
8556
8557     if (!bBondComm)
8558     {
8559         /* We don't need to update cginfo, since that was alrady done above.
8560          * So we pass NULL for the forcerec.
8561          */
8562         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8563                       NULL, comm->bLocalCG);
8564     }
8565
8566     if (debug)
8567     {
8568         fprintf(debug, "Finished setting up DD communication, zones:");
8569         for (c = 0; c < zones->n; c++)
8570         {
8571             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8572         }
8573         fprintf(debug, "\n");
8574     }
8575 }
8576
8577 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8578 {
8579     int c;
8580
8581     for (c = 0; c < zones->nizone; c++)
8582     {
8583         zones->izone[c].cg1  = zones->cg_range[c+1];
8584         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8585         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8586     }
8587 }
8588
8589 static void set_zones_size(gmx_domdec_t *dd,
8590                            matrix box, const gmx_ddbox_t *ddbox,
8591                            int zone_start, int zone_end)
8592 {
8593     gmx_domdec_comm_t  *comm;
8594     gmx_domdec_zones_t *zones;
8595     gmx_bool            bDistMB;
8596     int                 z, zi, d, dim;
8597     real                rcs, rcmbs;
8598     int                 i, j;
8599     real                vol;
8600
8601     comm = dd->comm;
8602
8603     zones = &comm->zones;
8604
8605     /* Do we need to determine extra distances for multi-body bondeds? */
8606     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8607
8608     for (z = zone_start; z < zone_end; z++)
8609     {
8610         /* Copy cell limits to zone limits.
8611          * Valid for non-DD dims and non-shifted dims.
8612          */
8613         copy_rvec(comm->cell_x0, zones->size[z].x0);
8614         copy_rvec(comm->cell_x1, zones->size[z].x1);
8615     }
8616
8617     for (d = 0; d < dd->ndim; d++)
8618     {
8619         dim = dd->dim[d];
8620
8621         for (z = 0; z < zones->n; z++)
8622         {
8623             /* With a staggered grid we have different sizes
8624              * for non-shifted dimensions.
8625              */
8626             if (dd->bGridJump && zones->shift[z][dim] == 0)
8627             {
8628                 if (d == 1)
8629                 {
8630                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8631                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8632                 }
8633                 else if (d == 2)
8634                 {
8635                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8636                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8637                 }
8638             }
8639         }
8640
8641         rcs   = comm->cutoff;
8642         rcmbs = comm->cutoff_mbody;
8643         if (ddbox->tric_dir[dim])
8644         {
8645             rcs   /= ddbox->skew_fac[dim];
8646             rcmbs /= ddbox->skew_fac[dim];
8647         }
8648
8649         /* Set the lower limit for the shifted zone dimensions */
8650         for (z = zone_start; z < zone_end; z++)
8651         {
8652             if (zones->shift[z][dim] > 0)
8653             {
8654                 dim = dd->dim[d];
8655                 if (!dd->bGridJump || d == 0)
8656                 {
8657                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8658                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8659                 }
8660                 else
8661                 {
8662                     /* Here we take the lower limit of the zone from
8663                      * the lowest domain of the zone below.
8664                      */
8665                     if (z < 4)
8666                     {
8667                         zones->size[z].x0[dim] =
8668                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8669                     }
8670                     else
8671                     {
8672                         if (d == 1)
8673                         {
8674                             zones->size[z].x0[dim] =
8675                                 zones->size[zone_perm[2][z-4]].x0[dim];
8676                         }
8677                         else
8678                         {
8679                             zones->size[z].x0[dim] =
8680                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8681                         }
8682                     }
8683                     /* A temporary limit, is updated below */
8684                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8685
8686                     if (bDistMB)
8687                     {
8688                         for (zi = 0; zi < zones->nizone; zi++)
8689                         {
8690                             if (zones->shift[zi][dim] == 0)
8691                             {
8692                                 /* This takes the whole zone into account.
8693                                  * With multiple pulses this will lead
8694                                  * to a larger zone then strictly necessary.
8695                                  */
8696                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8697                                                                   zones->size[zi].x1[dim]+rcmbs);
8698                             }
8699                         }
8700                     }
8701                 }
8702             }
8703         }
8704
8705         /* Loop over the i-zones to set the upper limit of each
8706          * j-zone they see.
8707          */
8708         for (zi = 0; zi < zones->nizone; zi++)
8709         {
8710             if (zones->shift[zi][dim] == 0)
8711             {
8712                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8713                 {
8714                     if (zones->shift[z][dim] > 0)
8715                     {
8716                         zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8717                                                           zones->size[zi].x1[dim]+rcs);
8718                     }
8719                 }
8720             }
8721         }
8722     }
8723
8724     for (z = zone_start; z < zone_end; z++)
8725     {
8726         /* Initialization only required to keep the compiler happy */
8727         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8728         int  nc, c;
8729
8730         /* To determine the bounding box for a zone we need to find
8731          * the extreme corners of 4, 2 or 1 corners.
8732          */
8733         nc = 1 << (ddbox->nboundeddim - 1);
8734
8735         for (c = 0; c < nc; c++)
8736         {
8737             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8738             corner[XX] = 0;
8739             if ((c & 1) == 0)
8740             {
8741                 corner[YY] = zones->size[z].x0[YY];
8742             }
8743             else
8744             {
8745                 corner[YY] = zones->size[z].x1[YY];
8746             }
8747             if ((c & 2) == 0)
8748             {
8749                 corner[ZZ] = zones->size[z].x0[ZZ];
8750             }
8751             else
8752             {
8753                 corner[ZZ] = zones->size[z].x1[ZZ];
8754             }
8755             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
8756                 box[ZZ][1 - dd->dim[0]] != 0)
8757             {
8758                 /* With 1D domain decomposition the cg's are not in
8759                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
8760                  * Shift the corner of the z-vector back to along the box
8761                  * vector of dimension d, so it will later end up at 0 along d.
8762                  * This can affect the location of this corner along dd->dim[0]
8763                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
8764                  */
8765                 int d = 1 - dd->dim[0];
8766
8767                 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
8768             }
8769             /* Apply the triclinic couplings */
8770             assert(ddbox->npbcdim <= DIM);
8771             for (i = YY; i < ddbox->npbcdim; i++)
8772             {
8773                 for (j = XX; j < i; j++)
8774                 {
8775                     corner[j] += corner[i]*box[i][j]/box[i][i];
8776                 }
8777             }
8778             if (c == 0)
8779             {
8780                 copy_rvec(corner, corner_min);
8781                 copy_rvec(corner, corner_max);
8782             }
8783             else
8784             {
8785                 for (i = 0; i < DIM; i++)
8786                 {
8787                     corner_min[i] = std::min(corner_min[i], corner[i]);
8788                     corner_max[i] = std::max(corner_max[i], corner[i]);
8789                 }
8790             }
8791         }
8792         /* Copy the extreme cornes without offset along x */
8793         for (i = 0; i < DIM; i++)
8794         {
8795             zones->size[z].bb_x0[i] = corner_min[i];
8796             zones->size[z].bb_x1[i] = corner_max[i];
8797         }
8798         /* Add the offset along x */
8799         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8800         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8801     }
8802
8803     if (zone_start == 0)
8804     {
8805         vol = 1;
8806         for (dim = 0; dim < DIM; dim++)
8807         {
8808             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8809         }
8810         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8811     }
8812
8813     if (debug)
8814     {
8815         for (z = zone_start; z < zone_end; z++)
8816         {
8817             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8818                     z,
8819                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8820                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8821                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8822             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8823                     z,
8824                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8825                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8826                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8827         }
8828     }
8829 }
8830
8831 static int comp_cgsort(const void *a, const void *b)
8832 {
8833     int           comp;
8834
8835     gmx_cgsort_t *cga, *cgb;
8836     cga = (gmx_cgsort_t *)a;
8837     cgb = (gmx_cgsort_t *)b;
8838
8839     comp = cga->nsc - cgb->nsc;
8840     if (comp == 0)
8841     {
8842         comp = cga->ind_gl - cgb->ind_gl;
8843     }
8844
8845     return comp;
8846 }
8847
8848 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8849                          int *a, int *buf)
8850 {
8851     int i;
8852
8853     /* Order the data */
8854     for (i = 0; i < n; i++)
8855     {
8856         buf[i] = a[sort[i].ind];
8857     }
8858
8859     /* Copy back to the original array */
8860     for (i = 0; i < n; i++)
8861     {
8862         a[i] = buf[i];
8863     }
8864 }
8865
8866 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8867                          rvec *v, rvec *buf)
8868 {
8869     int i;
8870
8871     /* Order the data */
8872     for (i = 0; i < n; i++)
8873     {
8874         copy_rvec(v[sort[i].ind], buf[i]);
8875     }
8876
8877     /* Copy back to the original array */
8878     for (i = 0; i < n; i++)
8879     {
8880         copy_rvec(buf[i], v[i]);
8881     }
8882 }
8883
8884 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8885                            rvec *v, rvec *buf)
8886 {
8887     int a, atot, cg, cg0, cg1, i;
8888
8889     if (cgindex == NULL)
8890     {
8891         /* Avoid the useless loop of the atoms within a cg */
8892         order_vec_cg(ncg, sort, v, buf);
8893
8894         return;
8895     }
8896
8897     /* Order the data */
8898     a = 0;
8899     for (cg = 0; cg < ncg; cg++)
8900     {
8901         cg0 = cgindex[sort[cg].ind];
8902         cg1 = cgindex[sort[cg].ind+1];
8903         for (i = cg0; i < cg1; i++)
8904         {
8905             copy_rvec(v[i], buf[a]);
8906             a++;
8907         }
8908     }
8909     atot = a;
8910
8911     /* Copy back to the original array */
8912     for (a = 0; a < atot; a++)
8913     {
8914         copy_rvec(buf[a], v[a]);
8915     }
8916 }
8917
8918 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8919                          int nsort_new, gmx_cgsort_t *sort_new,
8920                          gmx_cgsort_t *sort1)
8921 {
8922     int i1, i2, i_new;
8923
8924     /* The new indices are not very ordered, so we qsort them */
8925     gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8926
8927     /* sort2 is already ordered, so now we can merge the two arrays */
8928     i1    = 0;
8929     i2    = 0;
8930     i_new = 0;
8931     while (i2 < nsort2 || i_new < nsort_new)
8932     {
8933         if (i2 == nsort2)
8934         {
8935             sort1[i1++] = sort_new[i_new++];
8936         }
8937         else if (i_new == nsort_new)
8938         {
8939             sort1[i1++] = sort2[i2++];
8940         }
8941         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8942                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8943                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8944         {
8945             sort1[i1++] = sort2[i2++];
8946         }
8947         else
8948         {
8949             sort1[i1++] = sort_new[i_new++];
8950         }
8951     }
8952 }
8953
8954 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8955 {
8956     gmx_domdec_sort_t *sort;
8957     gmx_cgsort_t      *cgsort, *sort_i;
8958     int                ncg_new, nsort2, nsort_new, i, *a, moved;
8959
8960     sort = dd->comm->sort;
8961
8962     a = fr->ns.grid->cell_index;
8963
8964     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8965
8966     if (ncg_home_old >= 0)
8967     {
8968         /* The charge groups that remained in the same ns grid cell
8969          * are completely ordered. So we can sort efficiently by sorting
8970          * the charge groups that did move into the stationary list.
8971          */
8972         ncg_new   = 0;
8973         nsort2    = 0;
8974         nsort_new = 0;
8975         for (i = 0; i < dd->ncg_home; i++)
8976         {
8977             /* Check if this cg did not move to another node */
8978             if (a[i] < moved)
8979             {
8980                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8981                 {
8982                     /* This cg is new on this node or moved ns grid cell */
8983                     if (nsort_new >= sort->sort_new_nalloc)
8984                     {
8985                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8986                         srenew(sort->sort_new, sort->sort_new_nalloc);
8987                     }
8988                     sort_i = &(sort->sort_new[nsort_new++]);
8989                 }
8990                 else
8991                 {
8992                     /* This cg did not move */
8993                     sort_i = &(sort->sort2[nsort2++]);
8994                 }
8995                 /* Sort on the ns grid cell indices
8996                  * and the global topology index.
8997                  * index_gl is irrelevant with cell ns,
8998                  * but we set it here anyhow to avoid a conditional.
8999                  */
9000                 sort_i->nsc    = a[i];
9001                 sort_i->ind_gl = dd->index_gl[i];
9002                 sort_i->ind    = i;
9003                 ncg_new++;
9004             }
9005         }
9006         if (debug)
9007         {
9008             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
9009                     nsort2, nsort_new);
9010         }
9011         /* Sort efficiently */
9012         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
9013                      sort->sort);
9014     }
9015     else
9016     {
9017         cgsort  = sort->sort;
9018         ncg_new = 0;
9019         for (i = 0; i < dd->ncg_home; i++)
9020         {
9021             /* Sort on the ns grid cell indices
9022              * and the global topology index
9023              */
9024             cgsort[i].nsc    = a[i];
9025             cgsort[i].ind_gl = dd->index_gl[i];
9026             cgsort[i].ind    = i;
9027             if (cgsort[i].nsc < moved)
9028             {
9029                 ncg_new++;
9030             }
9031         }
9032         if (debug)
9033         {
9034             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
9035         }
9036         /* Determine the order of the charge groups using qsort */
9037         gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
9038     }
9039
9040     return ncg_new;
9041 }
9042
9043 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
9044 {
9045     gmx_cgsort_t *sort;
9046     int           ncg_new, i, *a, na;
9047
9048     sort = dd->comm->sort->sort;
9049
9050     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
9051
9052     ncg_new = 0;
9053     for (i = 0; i < na; i++)
9054     {
9055         if (a[i] >= 0)
9056         {
9057             sort[ncg_new].ind = a[i];
9058             ncg_new++;
9059         }
9060     }
9061
9062     return ncg_new;
9063 }
9064
9065 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
9066                           int ncg_home_old)
9067 {
9068     gmx_domdec_sort_t *sort;
9069     gmx_cgsort_t      *cgsort;
9070     int               *cgindex;
9071     int                ncg_new, i, *ibuf, cgsize;
9072     rvec              *vbuf;
9073
9074     sort = dd->comm->sort;
9075
9076     if (dd->ncg_home > sort->sort_nalloc)
9077     {
9078         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
9079         srenew(sort->sort, sort->sort_nalloc);
9080         srenew(sort->sort2, sort->sort_nalloc);
9081     }
9082     cgsort = sort->sort;
9083
9084     switch (fr->cutoff_scheme)
9085     {
9086         case ecutsGROUP:
9087             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
9088             break;
9089         case ecutsVERLET:
9090             ncg_new = dd_sort_order_nbnxn(dd, fr);
9091             break;
9092         default:
9093             gmx_incons("unimplemented");
9094             ncg_new = 0;
9095     }
9096
9097     /* We alloc with the old size, since cgindex is still old */
9098     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
9099     vbuf = dd->comm->vbuf.v;
9100
9101     if (dd->comm->bCGs)
9102     {
9103         cgindex = dd->cgindex;
9104     }
9105     else
9106     {
9107         cgindex = NULL;
9108     }
9109
9110     /* Remove the charge groups which are no longer at home here */
9111     dd->ncg_home = ncg_new;
9112     if (debug)
9113     {
9114         fprintf(debug, "Set the new home charge group count to %d\n",
9115                 dd->ncg_home);
9116     }
9117
9118     /* Reorder the state */
9119     for (i = 0; i < estNR; i++)
9120     {
9121         if (EST_DISTR(i) && (state->flags & (1<<i)))
9122         {
9123             switch (i)
9124             {
9125                 case estX:
9126                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
9127                     break;
9128                 case estV:
9129                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
9130                     break;
9131                 case estSDX:
9132                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
9133                     break;
9134                 case estCGP:
9135                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
9136                     break;
9137                 case estLD_RNG:
9138                 case estLD_RNGI:
9139                 case estDISRE_INITF:
9140                 case estDISRE_RM3TAV:
9141                 case estORIRE_INITF:
9142                 case estORIRE_DTAV:
9143                     /* No ordering required */
9144                     break;
9145                 default:
9146                     gmx_incons("Unknown state entry encountered in dd_sort_state");
9147                     break;
9148             }
9149         }
9150     }
9151     if (fr->cutoff_scheme == ecutsGROUP)
9152     {
9153         /* Reorder cgcm */
9154         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9155     }
9156
9157     if (dd->ncg_home+1 > sort->ibuf_nalloc)
9158     {
9159         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9160         srenew(sort->ibuf, sort->ibuf_nalloc);
9161     }
9162     ibuf = sort->ibuf;
9163     /* Reorder the global cg index */
9164     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9165     /* Reorder the cginfo */
9166     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9167     /* Rebuild the local cg index */
9168     if (dd->comm->bCGs)
9169     {
9170         ibuf[0] = 0;
9171         for (i = 0; i < dd->ncg_home; i++)
9172         {
9173             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9174             ibuf[i+1] = ibuf[i] + cgsize;
9175         }
9176         for (i = 0; i < dd->ncg_home+1; i++)
9177         {
9178             dd->cgindex[i] = ibuf[i];
9179         }
9180     }
9181     else
9182     {
9183         for (i = 0; i < dd->ncg_home+1; i++)
9184         {
9185             dd->cgindex[i] = i;
9186         }
9187     }
9188     /* Set the home atom number */
9189     dd->nat_home = dd->cgindex[dd->ncg_home];
9190
9191     if (fr->cutoff_scheme == ecutsVERLET)
9192     {
9193         /* The atoms are now exactly in grid order, update the grid order */
9194         nbnxn_set_atomorder(fr->nbv->nbs);
9195     }
9196     else
9197     {
9198         /* Copy the sorted ns cell indices back to the ns grid struct */
9199         for (i = 0; i < dd->ncg_home; i++)
9200         {
9201             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9202         }
9203         fr->ns.grid->nr = dd->ncg_home;
9204     }
9205 }
9206
9207 static void add_dd_statistics(gmx_domdec_t *dd)
9208 {
9209     gmx_domdec_comm_t *comm;
9210     int                ddnat;
9211
9212     comm = dd->comm;
9213
9214     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9215     {
9216         comm->sum_nat[ddnat-ddnatZONE] +=
9217             comm->nat[ddnat] - comm->nat[ddnat-1];
9218     }
9219     comm->ndecomp++;
9220 }
9221
9222 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9223 {
9224     gmx_domdec_comm_t *comm;
9225     int                ddnat;
9226
9227     comm = dd->comm;
9228
9229     /* Reset all the statistics and counters for total run counting */
9230     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9231     {
9232         comm->sum_nat[ddnat-ddnatZONE] = 0;
9233     }
9234     comm->ndecomp   = 0;
9235     comm->nload     = 0;
9236     comm->load_step = 0;
9237     comm->load_sum  = 0;
9238     comm->load_max  = 0;
9239     clear_ivec(comm->load_lim);
9240     comm->load_mdf = 0;
9241     comm->load_pme = 0;
9242 }
9243
9244 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9245 {
9246     gmx_domdec_comm_t *comm;
9247     int                ddnat;
9248     double             av;
9249
9250     comm = cr->dd->comm;
9251
9252     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9253
9254     if (fplog == NULL)
9255     {
9256         return;
9257     }
9258
9259     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9260
9261     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9262     {
9263         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9264         switch (ddnat)
9265         {
9266             case ddnatZONE:
9267                 fprintf(fplog,
9268                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9269                         2, av);
9270                 break;
9271             case ddnatVSITE:
9272                 if (cr->dd->vsite_comm)
9273                 {
9274                     fprintf(fplog,
9275                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9276                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9277                             av);
9278                 }
9279                 break;
9280             case ddnatCON:
9281                 if (cr->dd->constraint_comm)
9282                 {
9283                     fprintf(fplog,
9284                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9285                             1 + ir->nLincsIter, av);
9286                 }
9287                 break;
9288             default:
9289                 gmx_incons(" Unknown type for DD statistics");
9290         }
9291     }
9292     fprintf(fplog, "\n");
9293
9294     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9295     {
9296         print_dd_load_av(fplog, cr->dd);
9297     }
9298 }
9299
9300 void dd_partition_system(FILE                *fplog,
9301                          gmx_int64_t          step,
9302                          t_commrec           *cr,
9303                          gmx_bool             bMasterState,
9304                          int                  nstglobalcomm,
9305                          t_state             *state_global,
9306                          gmx_mtop_t          *top_global,
9307                          t_inputrec          *ir,
9308                          t_state             *state_local,
9309                          rvec               **f,
9310                          t_mdatoms           *mdatoms,
9311                          gmx_localtop_t      *top_local,
9312                          t_forcerec          *fr,
9313                          gmx_vsite_t         *vsite,
9314                          gmx_shellfc_t        shellfc,
9315                          gmx_constr_t         constr,
9316                          t_nrnb              *nrnb,
9317                          gmx_wallcycle_t      wcycle,
9318                          gmx_bool             bVerbose)
9319 {
9320     gmx_domdec_t      *dd;
9321     gmx_domdec_comm_t *comm;
9322     gmx_ddbox_t        ddbox = {0};
9323     t_block           *cgs_gl;
9324     gmx_int64_t        step_pcoupl;
9325     rvec               cell_ns_x0, cell_ns_x1;
9326     int                i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9327     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
9328     gmx_bool           bRedist, bSortCG, bResortAll;
9329     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9330     real               grid_density;
9331     char               sbuf[22];
9332
9333     dd   = cr->dd;
9334     comm = dd->comm;
9335
9336     bBoxChanged = (bMasterState || DEFORM(*ir));
9337     if (ir->epc != epcNO)
9338     {
9339         /* With nstpcouple > 1 pressure coupling happens.
9340          * one step after calculating the pressure.
9341          * Box scaling happens at the end of the MD step,
9342          * after the DD partitioning.
9343          * We therefore have to do DLB in the first partitioning
9344          * after an MD step where P-coupling occured.
9345          * We need to determine the last step in which p-coupling occurred.
9346          * MRS -- need to validate this for vv?
9347          */
9348         n = ir->nstpcouple;
9349         if (n == 1)
9350         {
9351             step_pcoupl = step - 1;
9352         }
9353         else
9354         {
9355             step_pcoupl = ((step - 1)/n)*n + 1;
9356         }
9357         if (step_pcoupl >= comm->partition_step)
9358         {
9359             bBoxChanged = TRUE;
9360         }
9361     }
9362
9363     bNStGlobalComm = (step % nstglobalcomm == 0);
9364
9365     if (!comm->bDynLoadBal)
9366     {
9367         bDoDLB = FALSE;
9368     }
9369     else
9370     {
9371         /* Should we do dynamic load balacing this step?
9372          * Since it requires (possibly expensive) global communication,
9373          * we might want to do DLB less frequently.
9374          */
9375         if (bBoxChanged || ir->epc != epcNO)
9376         {
9377             bDoDLB = bBoxChanged;
9378         }
9379         else
9380         {
9381             bDoDLB = bNStGlobalComm;
9382         }
9383     }
9384
9385     /* Check if we have recorded loads on the nodes */
9386     if (comm->bRecordLoad && dd_load_count(comm) > 0)
9387     {
9388         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd))
9389         {
9390             /* Check if we should use DLB at the second partitioning
9391              * and every 100 partitionings,
9392              * so the extra communication cost is negligible.
9393              */
9394             const int nddp_chk_dlb = 100;
9395             bCheckDLB = (comm->n_load_collect == 0 ||
9396                          comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
9397         }
9398         else
9399         {
9400             bCheckDLB = FALSE;
9401         }
9402
9403         /* Print load every nstlog, first and last step to the log file */
9404         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9405                     comm->n_load_collect == 0 ||
9406                     (ir->nsteps >= 0 &&
9407                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9408
9409         /* Avoid extra communication due to verbose screen output
9410          * when nstglobalcomm is set.
9411          */
9412         if (bDoDLB || bLogLoad || bCheckDLB ||
9413             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9414         {
9415             get_load_distribution(dd, wcycle);
9416             if (DDMASTER(dd))
9417             {
9418                 if (bLogLoad)
9419                 {
9420                     dd_print_load(fplog, dd, step-1);
9421                 }
9422                 if (bVerbose)
9423                 {
9424                     dd_print_load_verbose(dd);
9425                 }
9426             }
9427             comm->n_load_collect++;
9428
9429             if (bCheckDLB)
9430             {
9431                 /* Since the timings are node dependent, the master decides */
9432                 if (DDMASTER(dd))
9433                 {
9434                     /* Here we check if the max PME rank load is more than 0.98
9435                      * the max PP force load. If so, PP DLB will not help,
9436                      * since we are (almost) limited by PME. Furthermore,
9437                      * DLB will cause a significant extra x/f redistribution
9438                      * cost on the PME ranks, which will then surely result
9439                      * in lower total performance.
9440                      * This check might be fragile, since one measurement
9441                      * below 0.98 (although only done once every 100 DD part.)
9442                      * could turn on DLB for the rest of the run.
9443                      */
9444                     if (cr->npmenodes > 0 &&
9445                         dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9446                     {
9447                         bTurnOnDLB = FALSE;
9448                     }
9449                     else
9450                     {
9451                         bTurnOnDLB =
9452                             (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9453                     }
9454                     if (debug)
9455                     {
9456                         fprintf(debug, "step %s, imb loss %f\n",
9457                                 gmx_step_str(step, sbuf),
9458                                 dd_force_imb_perf_loss(dd));
9459                     }
9460                 }
9461                 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
9462                 if (bTurnOnDLB)
9463                 {
9464                     turn_on_dlb(fplog, cr, step);
9465                     bDoDLB = TRUE;
9466                 }
9467             }
9468         }
9469         comm->n_load_have++;
9470     }
9471
9472     cgs_gl = &comm->cgs_gl;
9473
9474     bRedist = FALSE;
9475     if (bMasterState)
9476     {
9477         /* Clear the old state */
9478         clear_dd_indices(dd, 0, 0);
9479         ncgindex_set = 0;
9480
9481         set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9482                   TRUE, cgs_gl, state_global->x, &ddbox);
9483
9484         get_cg_distribution(fplog, dd, cgs_gl,
9485                             state_global->box, &ddbox, state_global->x);
9486
9487         dd_distribute_state(dd, cgs_gl,
9488                             state_global, state_local, f);
9489
9490         dd_make_local_cgs(dd, &top_local->cgs);
9491
9492         /* Ensure that we have space for the new distribution */
9493         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9494
9495         if (fr->cutoff_scheme == ecutsGROUP)
9496         {
9497             calc_cgcm(fplog, 0, dd->ncg_home,
9498                       &top_local->cgs, state_local->x, fr->cg_cm);
9499         }
9500
9501         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9502
9503         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9504     }
9505     else if (state_local->ddp_count != dd->ddp_count)
9506     {
9507         if (state_local->ddp_count > dd->ddp_count)
9508         {
9509             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9510         }
9511
9512         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9513         {
9514             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9515         }
9516
9517         /* Clear the old state */
9518         clear_dd_indices(dd, 0, 0);
9519
9520         /* Build the new indices */
9521         rebuild_cgindex(dd, cgs_gl->index, state_local);
9522         make_dd_indices(dd, cgs_gl->index, 0);
9523         ncgindex_set = dd->ncg_home;
9524
9525         if (fr->cutoff_scheme == ecutsGROUP)
9526         {
9527             /* Redetermine the cg COMs */
9528             calc_cgcm(fplog, 0, dd->ncg_home,
9529                       &top_local->cgs, state_local->x, fr->cg_cm);
9530         }
9531
9532         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9533
9534         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9535
9536         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9537                   TRUE, &top_local->cgs, state_local->x, &ddbox);
9538
9539         bRedist = comm->bDynLoadBal;
9540     }
9541     else
9542     {
9543         /* We have the full state, only redistribute the cgs */
9544
9545         /* Clear the non-home indices */
9546         clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9547         ncgindex_set = 0;
9548
9549         /* Avoid global communication for dim's without pbc and -gcom */
9550         if (!bNStGlobalComm)
9551         {
9552             copy_rvec(comm->box0, ddbox.box0    );
9553             copy_rvec(comm->box_size, ddbox.box_size);
9554         }
9555         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9556                   bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
9557
9558         bBoxChanged = TRUE;
9559         bRedist     = TRUE;
9560     }
9561     /* For dim's without pbc and -gcom */
9562     copy_rvec(ddbox.box0, comm->box0    );
9563     copy_rvec(ddbox.box_size, comm->box_size);
9564
9565     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9566                       step, wcycle);
9567
9568     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9569     {
9570         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9571     }
9572
9573     /* Check if we should sort the charge groups */
9574     if (comm->nstSortCG > 0)
9575     {
9576         bSortCG = (bMasterState ||
9577                    (bRedist && (step % comm->nstSortCG == 0)));
9578     }
9579     else
9580     {
9581         bSortCG = FALSE;
9582     }
9583
9584     ncg_home_old = dd->ncg_home;
9585
9586     ncg_moved = 0;
9587     if (bRedist)
9588     {
9589         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9590
9591         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9592                            state_local, f, fr,
9593                            !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9594
9595         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9596     }
9597
9598     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9599                           dd, &ddbox,
9600                           &comm->cell_x0, &comm->cell_x1,
9601                           dd->ncg_home, fr->cg_cm,
9602                           cell_ns_x0, cell_ns_x1, &grid_density);
9603
9604     if (bBoxChanged)
9605     {
9606         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9607     }
9608
9609     switch (fr->cutoff_scheme)
9610     {
9611         case ecutsGROUP:
9612             copy_ivec(fr->ns.grid->n, ncells_old);
9613             grid_first(fplog, fr->ns.grid, dd, &ddbox,
9614                        state_local->box, cell_ns_x0, cell_ns_x1,
9615                        fr->rlistlong, grid_density);
9616             break;
9617         case ecutsVERLET:
9618             nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9619             break;
9620         default:
9621             gmx_incons("unimplemented");
9622     }
9623     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9624     copy_ivec(ddbox.tric_dir, comm->tric_dir);
9625
9626     if (bSortCG)
9627     {
9628         wallcycle_sub_start(wcycle, ewcsDD_GRID);
9629
9630         /* Sort the state on charge group position.
9631          * This enables exact restarts from this step.
9632          * It also improves performance by about 15% with larger numbers
9633          * of atoms per node.
9634          */
9635
9636         /* Fill the ns grid with the home cell,
9637          * so we can sort with the indices.
9638          */
9639         set_zones_ncg_home(dd);
9640
9641         switch (fr->cutoff_scheme)
9642         {
9643             case ecutsVERLET:
9644                 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9645
9646                 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9647                                   0,
9648                                   comm->zones.size[0].bb_x0,
9649                                   comm->zones.size[0].bb_x1,
9650                                   0, dd->ncg_home,
9651                                   comm->zones.dens_zone0,
9652                                   fr->cginfo,
9653                                   state_local->x,
9654                                   ncg_moved, bRedist ? comm->moved : NULL,
9655                                   fr->nbv->grp[eintLocal].kernel_type,
9656                                   fr->nbv->grp[eintLocal].nbat);
9657
9658                 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9659                 break;
9660             case ecutsGROUP:
9661                 fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
9662                           0, dd->ncg_home, fr->cg_cm);
9663
9664                 copy_ivec(fr->ns.grid->n, ncells_new);
9665                 break;
9666             default:
9667                 gmx_incons("unimplemented");
9668         }
9669
9670         bResortAll = bMasterState;
9671
9672         /* Check if we can user the old order and ns grid cell indices
9673          * of the charge groups to sort the charge groups efficiently.
9674          */
9675         if (ncells_new[XX] != ncells_old[XX] ||
9676             ncells_new[YY] != ncells_old[YY] ||
9677             ncells_new[ZZ] != ncells_old[ZZ])
9678         {
9679             bResortAll = TRUE;
9680         }
9681
9682         if (debug)
9683         {
9684             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9685                     gmx_step_str(step, sbuf), dd->ncg_home);
9686         }
9687         dd_sort_state(dd, fr->cg_cm, fr, state_local,
9688                       bResortAll ? -1 : ncg_home_old);
9689         /* Rebuild all the indices */
9690         ga2la_clear(dd->ga2la);
9691         ncgindex_set = 0;
9692
9693         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9694     }
9695
9696     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9697
9698     /* Setup up the communication and communicate the coordinates */
9699     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9700
9701     /* Set the indices */
9702     make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9703
9704     /* Set the charge group boundaries for neighbor searching */
9705     set_cg_boundaries(&comm->zones);
9706
9707     if (fr->cutoff_scheme == ecutsVERLET)
9708     {
9709         set_zones_size(dd, state_local->box, &ddbox,
9710                        bSortCG ? 1 : 0, comm->zones.n);
9711     }
9712
9713     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9714
9715     /*
9716        write_dd_pdb("dd_home",step,"dump",top_global,cr,
9717                  -1,state_local->x,state_local->box);
9718      */
9719
9720     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9721
9722     /* Extract a local topology from the global topology */
9723     for (i = 0; i < dd->ndim; i++)
9724     {
9725         np[dd->dim[i]] = comm->cd[i].np;
9726     }
9727     dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9728                       comm->cellsize_min, np,
9729                       fr,
9730                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
9731                       vsite, top_global, top_local);
9732
9733     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9734
9735     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9736
9737     /* Set up the special atom communication */
9738     n = comm->nat[ddnatZONE];
9739     for (i = ddnatZONE+1; i < ddnatNR; i++)
9740     {
9741         switch (i)
9742         {
9743             case ddnatVSITE:
9744                 if (vsite && vsite->n_intercg_vsite)
9745                 {
9746                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
9747                 }
9748                 break;
9749             case ddnatCON:
9750                 if (dd->bInterCGcons || dd->bInterCGsettles)
9751                 {
9752                     /* Only for inter-cg constraints we need special code */
9753                     n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9754                                                   constr, ir->nProjOrder,
9755                                                   top_local->idef.il);
9756                 }
9757                 break;
9758             default:
9759                 gmx_incons("Unknown special atom type setup");
9760         }
9761         comm->nat[i] = n;
9762     }
9763
9764     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9765
9766     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9767
9768     /* Make space for the extra coordinates for virtual site
9769      * or constraint communication.
9770      */
9771     state_local->natoms = comm->nat[ddnatNR-1];
9772     if (state_local->natoms > state_local->nalloc)
9773     {
9774         dd_realloc_state(state_local, f, state_local->natoms);
9775     }
9776
9777     if (fr->bF_NoVirSum)
9778     {
9779         if (vsite && vsite->n_intercg_vsite)
9780         {
9781             nat_f_novirsum = comm->nat[ddnatVSITE];
9782         }
9783         else
9784         {
9785             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9786             {
9787                 nat_f_novirsum = dd->nat_tot;
9788             }
9789             else
9790             {
9791                 nat_f_novirsum = dd->nat_home;
9792             }
9793         }
9794     }
9795     else
9796     {
9797         nat_f_novirsum = 0;
9798     }
9799
9800     /* Set the number of atoms required for the force calculation.
9801      * Forces need to be constrained when using a twin-range setup
9802      * or with energy minimization. For simple simulations we could
9803      * avoid some allocation, zeroing and copying, but this is
9804      * probably not worth the complications ande checking.
9805      */
9806     forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9807                         dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9808
9809     /* We make the all mdatoms up to nat_tot_con.
9810      * We could save some work by only setting invmass
9811      * between nat_tot and nat_tot_con.
9812      */
9813     /* This call also sets the new number of home particles to dd->nat_home */
9814     atoms2md(top_global, ir,
9815              comm->nat[ddnatCON], dd->gatindex, dd->nat_home, mdatoms);
9816
9817     /* Now we have the charges we can sort the FE interactions */
9818     dd_sort_local_top(dd, mdatoms, top_local);
9819
9820     if (vsite != NULL)
9821     {
9822         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9823         split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
9824                                   mdatoms, FALSE, vsite);
9825     }
9826
9827     if (shellfc)
9828     {
9829         /* Make the local shell stuff, currently no communication is done */
9830         make_local_shells(cr, mdatoms, shellfc);
9831     }
9832
9833     if (ir->implicit_solvent)
9834     {
9835         make_local_gb(cr, fr->born, ir->gb_algorithm);
9836     }
9837
9838     setup_bonded_threading(fr, &top_local->idef);
9839
9840     if (!(cr->duty & DUTY_PME))
9841     {
9842         /* Send the charges and/or c6/sigmas to our PME only node */
9843         gmx_pme_send_parameters(cr,
9844                                 fr->ic,
9845                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
9846                                 mdatoms->chargeA, mdatoms->chargeB,
9847                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
9848                                 mdatoms->sigmaA, mdatoms->sigmaB,
9849                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9850     }
9851
9852     if (constr)
9853     {
9854         set_constraints(constr, top_local, ir, mdatoms, cr);
9855     }
9856
9857     if (ir->ePull != epullNO)
9858     {
9859         /* Update the local pull groups */
9860         dd_make_local_pull_groups(dd, ir->pull, mdatoms);
9861     }
9862
9863     if (ir->bRot)
9864     {
9865         /* Update the local rotation groups */
9866         dd_make_local_rotation_groups(dd, ir->rot);
9867     }
9868
9869     if (ir->eSwapCoords != eswapNO)
9870     {
9871         /* Update the local groups needed for ion swapping */
9872         dd_make_local_swap_groups(dd, ir->swap);
9873     }
9874
9875     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
9876     dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
9877
9878     add_dd_statistics(dd);
9879
9880     /* Make sure we only count the cycles for this DD partitioning */
9881     clear_dd_cycle_counts(dd);
9882
9883     /* Because the order of the atoms might have changed since
9884      * the last vsite construction, we need to communicate the constructing
9885      * atom coordinates again (for spreading the forces this MD step).
9886      */
9887     dd_move_x_vsites(dd, state_local->box, state_local->x);
9888
9889     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9890
9891     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9892     {
9893         dd_move_x(dd, state_local->box, state_local->x);
9894         write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9895                      -1, state_local->x, state_local->box);
9896     }
9897
9898     /* Store the partitioning step */
9899     comm->partition_step = step;
9900
9901     /* Increase the DD partitioning counter */
9902     dd->ddp_count++;
9903     /* The state currently matches this DD partitioning count, store it */
9904     state_local->ddp_count = dd->ddp_count;
9905     if (bMasterState)
9906     {
9907         /* The DD master node knows the complete cg distribution,
9908          * store the count so we can possibly skip the cg info communication.
9909          */
9910         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9911     }
9912
9913     if (comm->DD_debug > 0)
9914     {
9915         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9916         check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9917                                 "after partitioning");
9918     }
9919 }