src/gromacs/domdec/domdec.cpp

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #include "gmxpre.h"
  37
  38 #include "domdec.h"
  39
  40 #include "config.h"
  41
  42 #include <assert.h>
  43 #include <limits.h>
  44 #include <math.h>
  45 #include <stdio.h>
  46 #include <stdlib.h>
  47 #include <string.h>
  48
  49 #include <algorithm>
  50
  51 #include "gromacs/domdec/domdec_network.h"
  52 #include "gromacs/ewald/pme.h"
  53 #include "gromacs/fileio/gmxfio.h"
  54 #include "gromacs/fileio/pdbio.h"
  55 #include "gromacs/gmxlib/gpu_utils/gpu_utils.h"
  56 #include "gromacs/imd/imd.h"
  57 #include "gromacs/legacyheaders/chargegroup.h"
  58 #include "gromacs/legacyheaders/constr.h"
  59 #include "gromacs/legacyheaders/force.h"
  60 #include "gromacs/legacyheaders/genborn.h"
  61 #include "gromacs/legacyheaders/gmx_ga2la.h"
  62 #include "gromacs/legacyheaders/gmx_omp_nthreads.h"
  63 #include "gromacs/legacyheaders/mdatoms.h"
  64 #include "gromacs/legacyheaders/mdrun.h"
  65 #include "gromacs/legacyheaders/names.h"
  66 #include "gromacs/legacyheaders/network.h"
  67 #include "gromacs/legacyheaders/nrnb.h"
  68 #include "gromacs/legacyheaders/nsgrid.h"
  69 #include "gromacs/legacyheaders/shellfc.h"
  70 #include "gromacs/legacyheaders/typedefs.h"
  71 #include "gromacs/legacyheaders/vsite.h"
  72 #include "gromacs/legacyheaders/types/commrec.h"
  73 #include "gromacs/legacyheaders/types/constr.h"
  74 #include "gromacs/legacyheaders/types/enums.h"
  75 #include "gromacs/legacyheaders/types/forcerec.h"
  76 #include "gromacs/legacyheaders/types/hw_info.h"
  77 #include "gromacs/legacyheaders/types/ifunc.h"
  78 #include "gromacs/legacyheaders/types/inputrec.h"
  79 #include "gromacs/legacyheaders/types/mdatom.h"
  80 #include "gromacs/legacyheaders/types/nrnb.h"
  81 #include "gromacs/legacyheaders/types/ns.h"
  82 #include "gromacs/legacyheaders/types/nsgrid.h"
  83 #include "gromacs/legacyheaders/types/shellfc.h"
  84 #include "gromacs/legacyheaders/types/simple.h"
  85 #include "gromacs/legacyheaders/types/state.h"
  86 #include "gromacs/listed-forces/manage-threading.h"
  87 #include "gromacs/math/vec.h"
  88 #include "gromacs/math/vectypes.h"
  89 #include "gromacs/mdlib/nb_verlet.h"
  90 #include "gromacs/mdlib/nbnxn_search.h"
  91 #include "gromacs/pbcutil/ishift.h"
  92 #include "gromacs/pbcutil/pbc.h"
  93 #include "gromacs/pulling/pull.h"
  94 #include "gromacs/pulling/pull_rotation.h"
  95 #include "gromacs/swap/swapcoords.h"
  96 #include "gromacs/timing/wallcycle.h"
  97 #include "gromacs/topology/block.h"
  98 #include "gromacs/topology/idef.h"
  99 #include "gromacs/topology/mtop_util.h"
 100 #include "gromacs/topology/topology.h"
 101 #include "gromacs/utility/basedefinitions.h"
 102 #include "gromacs/utility/basenetwork.h"
 103 #include "gromacs/utility/cstringutil.h"
 104 #include "gromacs/utility/fatalerror.h"
 105 #include "gromacs/utility/gmxmpi.h"
 106 #include "gromacs/utility/qsort_threadsafe.h"
 107 #include "gromacs/utility/real.h"
 108 #include "gromacs/utility/smalloc.h"
 109
 110 #include "domdec_constraints.h"
 111 #include "domdec_internal.h"
 112 #include "domdec_vsite.h"
 113
 114 #define DDRANK(dd, rank)    (rank)
 115 #define DDMASTERRANK(dd)   (dd->masterrank)
 116
/* Cell boundaries, the global charge group division over the nodes and
 * the buffers used for communication and for state scattering/gathering.
 */
typedef struct gmx_domdec_master
{
    /* The cell boundaries */
    real **cell_x;
    /* The global charge group division */
    int   *ncg;    /* Number of home charge groups for each node */
    int   *index;  /* Index of nnodes+1 into cg */
    int   *cg;     /* Global charge group index */
    int   *nat;    /* Number of home atoms for each node. */
    int   *ibuf;   /* Buffer for communication */
    rvec  *vbuf;   /* Buffer for state scattering and gathering */
} gmx_domdec_master_t;
 129
/* The charge group counts and indices to communicate;
 * gmx_domdec_comm_dim_t holds an array of these (its ind member, size np).
 */
typedef struct
{
    /* The numbers of charge groups to send and receive for each cell
     * that requires communication, the last entry contains the total
     * number of atoms that need to be communicated.
     */
    int  nsend[DD_MAXIZONE+2];
    int  nrecv[DD_MAXIZONE+2];
    /* The charge groups to send */
    int *index;
    /* NOTE(review): presumably the allocation size of index - confirm */
    int  nalloc;
    /* The atom range for non-in-place communication */
    int  cell2at0[DD_MAXIZONE];
    int  cell2at1[DD_MAXIZONE];
} gmx_domdec_ind_t;
 145
/* The communication setup for one decomposition dimension;
 * gmx_domdec_comm_t stores one of these per dimension in cd[DIM].
 */
typedef struct
{
    int               np;       /* Number of grid pulses in this dimension */
    int               np_dlb;   /* For dlb, for use with edlbAUTO          */
    gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
    int               np_nalloc;
    gmx_bool          bInPlace; /* Can we communicate in place?            */
} gmx_domdec_comm_dim_t;
 154
/* Cell boundary data for dynamic load balancing, referenced through
 * the root member of gmx_domdec_comm_t. Members marked "State var."
 * versus "Temp. var." follow the original annotations.
 */
typedef struct
{
    gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
    real     *cell_f;      /* State var.: cell boundaries, box relative      */
    real     *old_cell_f;  /* Temp. var.: old cell size                      */
    real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
    real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
    real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
    real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
    gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
    real     *buf_ncd;     /* Temp. var.                                     */
} gmx_domdec_root_t;
 167
/* NOTE(review): presumably the maximum number of entries in
 * gmx_domdec_load_t::load - confirm against the allocation code.
 */
#define DD_NLOAD_MAX 9

/* Load measurement data, referenced through the load member of
 * gmx_domdec_comm_t.
 * Here floats are accurate enough, since these variables
 * only influence the load balancing, not the actual MD results.
 */
typedef struct
{
    int    nload;
    float *load;
    float  sum;
    float  max;
    float  sum_m;
    float  cvol_min;
    float  mdf;
    float  pme;
    int    flags;
} gmx_domdec_load_t;
 185
/* One entry of the arrays held in gmx_domdec_sort_t.
 * NOTE(review): ind_gl and ind are presumably the global and local
 * charge group index, nsc the sort key - confirm at the use sites.
 */
typedef struct
{
    int  nsc;
    int  ind_gl;
    int  ind;
} gmx_cgsort_t;
 192
/* Buffers of gmx_cgsort_t entries plus an int buffer, referenced through
 * the sort member of gmx_domdec_comm_t.
 * NOTE(review): the *_nalloc members are presumably the allocated sizes
 * of the corresponding buffers - confirm.
 */
typedef struct
{
    gmx_cgsort_t *sort;
    gmx_cgsort_t *sort2;
    int           sort_nalloc;
    gmx_cgsort_t *sort_new;
    int           sort_new_nalloc;
    int          *ibuf;
    int           ibuf_nalloc;
} gmx_domdec_sort_t;
 203
/* A resizable rvec buffer; vec_rvec_check_alloc() grows v and records
 * the new allocation size in nalloc.
 */
typedef struct
{
    rvec *v;      /* The buffer, NULL after vec_rvec_init()   */
    int   nalloc; /* The number of rvec elements allocated    */
} vec_rvec_t;
 209
/* This enum determines the order of the coordinates.
 * ddnatHOME and ddnatZONE should be first and second,
 * the others can be ordered as wanted.
 * ddnatNR is the number of entries and sizes the nat array
 * in gmx_domdec_comm_t.
 */
enum {
    ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
};
 217
/* The DLB option values stored in gmx_domdec_comm_t::eDLB;
 * edlbNR is the number of options.
 */
enum {
    edlbAUTO, edlbNO, edlbYES, edlbNR
};
/* The names of the DLB options, in the order of the enum above */
const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 222
/* PME slab setup for one PME decomposition dimension;
 * gmx_domdec_comm_t stores two of these in ddpme[2].
 */
typedef struct
{
    int      dim;       /* The dimension                                          */
    gmx_bool dim_match; /* Tells if DD and PME dims match                         */
    int      nslab;     /* The number of PME slabs in this dimension              */
    real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB      */
    int     *pp_min;    /* The minimum pp node location, size nslab               */
    int     *pp_max;    /* The maximum pp node location, size nslab               */
    int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
} gmx_ddpme_t;
 233
/* The limits of one zone; gmx_domdec_comm_t stores these for DD dimensions
 * 1 and 2 in zone_d1 and zone_d2.
 */
typedef struct
{
    real min0;    /* The minimum bottom of this zone                        */
    real max1;    /* The maximum top of this zone                           */
    real min1;    /* The minimum top of this zone                           */
    real mch0;    /* The maximum bottom communication height for this zone  */
    real mch1;    /* The maximum top communication height for this zone     */
    real p1_0;    /* The bottom value of the first cell in this zone        */
    real p1_1;    /* The top value of the first cell in this zone           */
} gmx_ddzone_t;
 244
/* Temporary storage for thread parallel communication setup;
 * gmx_domdec_comm_t holds an array of these in its dth member.
 */
typedef struct
{
    gmx_domdec_ind_t ind;
    int             *ibuf;
    int              ibuf_nalloc;
    vec_rvec_t       vbuf;
    int              nsend;
    int              nat;
    int              nsend_zone;
} dd_comm_setup_work_t;
 255
/* The domain decomposition communication setup and working data:
 * PP/PME rank layout, charge group bookkeeping, DLB settings and cell
 * boundaries, zones, communication buffers, load measurement data,
 * statistics and debugging options.
 */
typedef struct gmx_domdec_comm
{
    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
     * unless stated otherwise.
     */

    /* The number of decomposition dimensions for PME, 0: no PME */
    int         npmedecompdim;
    /* The number of nodes doing PME (PP/PME or only PME) */
    int         npmenodes;
    int         npmenodes_x;
    int         npmenodes_y;
    /* The communication setup including the PME only nodes */
    gmx_bool    bCartesianPP_PME;
    ivec        ntot;
    int         cartpmedim;
    int        *pmenodes;          /* size npmenodes                         */
    int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
                                    * but with bCartesianPP_PME
                                    * NOTE(review): this condition reads as
                                    * garbled - confirm the intended wording */
    gmx_ddpme_t ddpme[2];

    /* The DD particle-particle nodes only */
    gmx_bool bCartesianPP;
    int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */

    /* The global charge groups */
    t_block cgs_gl;

    /* Should we sort the cgs */
    int                nstSortCG;
    gmx_domdec_sort_t *sort;

    /* Are there charge groups? */
    gmx_bool bCGs;

    /* Are there bonded and multi-body interactions between charge groups? */
    gmx_bool bInterCGBondeds;
    gmx_bool bInterCGMultiBody;

    /* Data for the optional bonded interaction atom communication range */
    gmx_bool  bBondComm;
    t_blocka *cglink;
    char     *bLocalCG;

    /* The DLB option */
    int      eDLB;
    /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
    gmx_bool bDLB_locked;
    /* Are we actually using DLB? */
    gmx_bool bDynLoadBal;

    /* Cell sizes for static load balancing, first index cartesian */
    real **slb_frac;

    /* The width of the communicated boundaries */
    real     cutoff_mbody;
    real     cutoff;
    /* The minimum cell size (including triclinic correction) */
    rvec     cellsize_min;
    /* For dlb, for use with edlbAUTO */
    rvec     cellsize_min_dlb;
    /* The lower limit for the DD cell size with DLB */
    real     cellsize_limit;
    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
    gmx_bool bVacDLBNoLimit;

    /* With PME load balancing we set limits on DLB */
    gmx_bool bPMELoadBalDLBLimits;
    /* DLB needs to take into account that we want to allow this maximum
     * cut-off (for PME load balancing), this could limit cell boundaries.
     */
    real PMELoadBal_max_cutoff;

    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
    ivec tric_dir;
    /* box0 and box_size are required with dim's without pbc and -gcom */
    rvec box0;
    rvec box_size;

    /* The cell boundaries */
    rvec cell_x0;
    rvec cell_x1;

    /* The old location of the cell boundaries, to check cg displacements */
    rvec old_cell_x0;
    rvec old_cell_x1;

    /* The communication setup and charge group boundaries for the zones */
    gmx_domdec_zones_t zones;

    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
     * cell boundaries of neighboring cells for dynamic load balancing.
     */
    gmx_ddzone_t zone_d1[2];
    gmx_ddzone_t zone_d2[2][2];

    /* The coordinate/force communication setup and indices */
    gmx_domdec_comm_dim_t cd[DIM];
    /* The maximum number of cells to communicate with in one dimension */
    int                   maxpulse;

    /* Which cg distribution is stored on the master node */
    int master_cg_ddp_count;

    /* The number of cg's received from the direct neighbors */
    int  zone_ncg1[DD_MAXZONE];

    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
    int  nat[ddnatNR];

    /* Array for signalling if atoms have moved to another domain */
    int  *moved;
    int   moved_nalloc;

    /* Communication buffer for general use */
    int  *buf_int;
    int   nalloc_int;

    /* Communication buffer for general use */
    vec_rvec_t vbuf;

    /* Temporary storage for thread parallel communication setup */
    int                   nth;
    dd_comm_setup_work_t *dth;

    /* Communication buffers only used with multiple grid pulses */
    int       *buf_int2;
    int        nalloc_int2;
    vec_rvec_t vbuf2;

    /* Communication buffers for local redistribution */
    int  **cggl_flag;
    int    cggl_flag_nalloc[DIM*2];
    rvec **cgcm_state;
    int    cgcm_state_nalloc[DIM*2];

    /* Cell sizes for dynamic load balancing */
    gmx_domdec_root_t **root;
    real               *cell_f_row;
    real                cell_f0[DIM];
    real                cell_f1[DIM];
    real                cell_f_max0[DIM];
    real                cell_f_min1[DIM];

    /* Stuff for load communication */
    gmx_bool           bRecordLoad;
    gmx_domdec_load_t *load;
    int                nrank_gpu_shared;
#ifdef GMX_MPI
    MPI_Comm          *mpi_comm_load;
    MPI_Comm           mpi_comm_gpu_shared;
#endif

    /* Maximum DLB scaling per load balancing step in percent */
    int dlb_scale_lim;

    /* Cycle counters */
    float  cycl[ddCyclNr];
    int    cycl_n[ddCyclNr];
    float  cycl_max[ddCyclNr];
    /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
    int    eFlop;
    double flop;
    int    flop_n;
    /* How many times did we have load measurements */
    int    n_load_have;
    /* How many times have we collected the load measurements */
    int    n_load_collect;

    /* Statistics */
    double sum_nat[ddnatNR-ddnatZONE];
    int    ndecomp;
    int    nload;
    double load_step;
    double load_sum;
    double load_max;
    ivec   load_lim;
    double load_mdf;
    double load_pme;

    /* The last partition step */
    gmx_int64_t partition_step;

    /* Debugging */
    int  nstDDDump;
    int  nstDDDumpGrid;
    int  DD_debug;
} gmx_domdec_comm_t;
 444
/* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
#define DD_CGIBS 2

/* The flags for the cggl_flag buffer in gmx_domdec_comm_t.
 * DD_FLAG_NRCG is a mask for the lowest 16 bits; DD_FLAG_FW(d) and
 * DD_FLAG_BW(d) select bit 16+2*d and bit 16+2*d+1 respectively.
 * NOTE(review): presumably FW/BW mark forward/backward movement along
 * DD dimension d - confirm at the use sites.
 */
#define DD_FLAG_NRCG  65535
#define DD_FLAG_FW(d) (1<<(16+(d)*2))
#define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 452
/* Zone permutation required to obtain consecutive charge groups
 * for neighbor searching.
 */
static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };

/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 * components see only j zones with that component 0.
 */

/* The DD zone order: the shift of each of the DD_MAXZONE zones */
static const ivec dd_zo[DD_MAXZONE] =
{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};

/* NOTE(review): each dd_zp* entry presumably holds
 * {i-zone, first j-zone, end of j-zone range} - confirm at the use sites.
 */

/* The 3D setup */
#define dd_z3n  8
#define dd_zp3n 4
static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};

/* The 2D setup */
#define dd_z2n  4
#define dd_zp2n 2
static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};

/* The 1D setup */
#define dd_z1n  2
#define dd_zp1n 1
static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 480
/* Factors used to avoid problems due to rounding issues */
#define DD_CELL_MARGIN       1.0001
#define DD_CELL_MARGIN2      1.00005
/* Factor to account for pressure scaling during nstlist steps */
#define DD_PRES_SCALE_MARGIN 1.02

/* Turn on DLB when the load imbalance causes this amount of total loss.
 * There is a bit of overhead with DLB and it's difficult to achieve
 * a load imbalance of less than 2% with DLB.
 */
#define DD_PERF_LOSS_DLB_ON  0.02

/* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
#define DD_PERF_LOSS_WARN    0.05

/* Evaluates to nc[dim[di]] + 1 + di*2 + 1 + di for DD dimension index di.
 * NOTE(review): presumably the size of the cell_f communication buffer
 * for that dimension - confirm at the use sites.
 */
#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))

/* Use separate MPI send and receive commands
 * when nnodes <= GMX_DD_NNODES_SENDRECV.
 * This saves memory (and some copying for small nnodes).
 * For high parallelization scatter and gather calls are used.
 */
#define GMX_DD_NNODES_SENDRECV 4
 504
 505
 506 /*
 507    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 508
 509    static void index2xyz(ivec nc,int ind,ivec xyz)
 510    {
 511    xyz[XX] = ind % nc[XX];
 512    xyz[YY] = (ind / nc[XX]) % nc[YY];
 513    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 514    }
 515  */
 516
/* The linear index of cell i in a grid of n cells, with x the slowest
 * and z the fastest running index; ddindex2xyz() is the inverse.
 * This order is required to minimize the coordinate communication in PME
 * which uses decomposition in the x direction.
 */
#define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 521
 522 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 523 {
 524     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 525     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 526     xyz[ZZ] = ind % nc[ZZ];
 527 }
 528
 529 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 530 {
 531     int ddindex;
 532     int ddnodeid = -1;
 533
 534     ddindex = dd_index(dd->nc, c);
 535     if (dd->comm->bCartesianPP_PME)
 536     {
 537         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 538     }
 539     else if (dd->comm->bCartesianPP)
 540     {
 541 #ifdef GMX_MPI
 542         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 543 #endif
 544     }
 545     else
 546     {
 547         ddnodeid = ddindex;
 548     }
 549
 550     return ddnodeid;
 551 }
 552
 553 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 554 {
 555     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 556 }
 557
 558 int ddglatnr(gmx_domdec_t *dd, int i)
 559 {
 560     int atnr;
 561
 562     if (dd == NULL)
 563     {
 564         atnr = i + 1;
 565     }
 566     else
 567     {
 568         if (i >= dd->comm->nat[ddnatNR-1])
 569         {
 570             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 571         }
 572         atnr = dd->gatindex[i] + 1;
 573     }
 574
 575     return atnr;
 576 }
 577
 578 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 579 {
 580     return &dd->comm->cgs_gl;
 581 }
 582
 583 static void vec_rvec_init(vec_rvec_t *v)
 584 {
 585     v->nalloc = 0;
 586     v->v      = NULL;
 587 }
 588
 589 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 590 {
 591     if (n > v->nalloc)
 592     {
 593         v->nalloc = over_alloc_dd(n);
 594         srenew(v->v, v->nalloc);
 595     }
 596 }
 597
 598 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 599 {
 600     int i;
 601
 602     if (state->ddp_count != dd->ddp_count)
 603     {
 604         gmx_incons("The state does not the domain decomposition state");
 605     }
 606
 607     state->ncg_gl = dd->ncg_home;
 608     if (state->ncg_gl > state->cg_gl_nalloc)
 609     {
 610         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 611         srenew(state->cg_gl, state->cg_gl_nalloc);
 612     }
 613     for (i = 0; i < state->ncg_gl; i++)
 614     {
 615         state->cg_gl[i] = dd->index_gl[i];
 616     }
 617
 618     state->ddp_count_cg_gl = dd->ddp_count;
 619 }
 620
 621 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 622 {
 623     return &dd->comm->zones;
 624 }
 625
 626 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 627                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 628 {
 629     gmx_domdec_zones_t *zones;
 630     int                 izone, d, dim;
 631
 632     zones = &dd->comm->zones;
 633
 634     izone = 0;
 635     while (icg >= zones->izone[izone].cg1)
 636     {
 637         izone++;
 638     }
 639
 640     if (izone == 0)
 641     {
 642         *jcg0 = icg;
 643     }
 644     else if (izone < zones->nizone)
 645     {
 646         *jcg0 = zones->izone[izone].jcg0;
 647     }
 648     else
 649     {
 650         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 651                   icg, izone, zones->nizone);
 652     }
 653
 654     *jcg1 = zones->izone[izone].jcg1;
 655
 656     for (d = 0; d < dd->ndim; d++)
 657     {
 658         dim         = dd->dim[d];
 659         shift0[dim] = zones->izone[izone].shift0[dim];
 660         shift1[dim] = zones->izone[izone].shift1[dim];
 661         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 662         {
 663             /* A conservative approach, this can be optimized */
 664             shift0[dim] -= 1;
 665             shift1[dim] += 1;
 666         }
 667     }
 668 }
 669
 670 int dd_natoms_vsite(gmx_domdec_t *dd)
 671 {
 672     return dd->comm->nat[ddnatVSITE];
 673 }
 674
 675 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 676 {
 677     *at_start = dd->comm->nat[ddnatCON-1];
 678     *at_end   = dd->comm->nat[ddnatCON];
 679 }
 680
 681 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 682 {
 683     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 684     int                   *index, *cgindex;
 685     gmx_domdec_comm_t     *comm;
 686     gmx_domdec_comm_dim_t *cd;
 687     gmx_domdec_ind_t      *ind;
 688     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 689     gmx_bool               bPBC, bScrew;
 690
 691     comm = dd->comm;
 692
 693     cgindex = dd->cgindex;
 694
 695     buf = comm->vbuf.v;
 696
 697     nzone   = 1;
 698     nat_tot = dd->nat_home;
 699     for (d = 0; d < dd->ndim; d++)
 700     {
 701         bPBC   = (dd->ci[dd->dim[d]] == 0);
 702         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 703         if (bPBC)
 704         {
 705             copy_rvec(box[dd->dim[d]], shift);
 706         }
 707         cd = &comm->cd[d];
 708         for (p = 0; p < cd->np; p++)
 709         {
 710             ind   = &cd->ind[p];
 711             index = ind->index;
 712             n     = 0;
 713             if (!bPBC)
 714             {
 715                 for (i = 0; i < ind->nsend[nzone]; i++)
 716                 {
 717                     at0 = cgindex[index[i]];
 718                     at1 = cgindex[index[i]+1];
 719                     for (j = at0; j < at1; j++)
 720                     {
 721                         copy_rvec(x[j], buf[n]);
 722                         n++;
 723                     }
 724                 }
 725             }
 726             else if (!bScrew)
 727             {
 728                 for (i = 0; i < ind->nsend[nzone]; i++)
 729                 {
 730                     at0 = cgindex[index[i]];
 731                     at1 = cgindex[index[i]+1];
 732                     for (j = at0; j < at1; j++)
 733                     {
 734                         /* We need to shift the coordinates */
 735                         rvec_add(x[j], shift, buf[n]);
 736                         n++;
 737                     }
 738                 }
 739             }
 740             else
 741             {
 742                 for (i = 0; i < ind->nsend[nzone]; i++)
 743                 {
 744                     at0 = cgindex[index[i]];
 745                     at1 = cgindex[index[i]+1];
 746                     for (j = at0; j < at1; j++)
 747                     {
 748                         /* Shift x */
 749                         buf[n][XX] = x[j][XX] + shift[XX];
 750                         /* Rotate y and z.
 751                          * This operation requires a special shift force
 752                          * treatment, which is performed in calc_vir.
 753                          */
 754                         buf[n][YY] = box[YY][YY] - x[j][YY];
 755                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 756                         n++;
 757                     }
 758                 }
 759             }
 760
 761             if (cd->bInPlace)
 762             {
 763                 rbuf = x + nat_tot;
 764             }
 765             else
 766             {
 767                 rbuf = comm->vbuf2.v;
 768             }
 769             /* Send and receive the coordinates */
 770             dd_sendrecv_rvec(dd, d, dddirBackward,
 771                              buf,  ind->nsend[nzone+1],
 772                              rbuf, ind->nrecv[nzone+1]);
 773             if (!cd->bInPlace)
 774             {
 775                 j = 0;
 776                 for (zone = 0; zone < nzone; zone++)
 777                 {
 778                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 779                     {
 780                         copy_rvec(rbuf[j], x[i]);
 781                         j++;
 782                     }
 783                 }
 784             }
 785             nat_tot += ind->nrecv[nzone+1];
 786         }
 787         nzone += nzone;
 788     }
 789 }
 790
 791 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 792 {
 793     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 794     int                   *index, *cgindex;
 795     gmx_domdec_comm_t     *comm;
 796     gmx_domdec_comm_dim_t *cd;
 797     gmx_domdec_ind_t      *ind;
 798     rvec                  *buf, *sbuf;
 799     ivec                   vis;
 800     int                    is;
 801     gmx_bool               bShiftForcesNeedPbc, bScrew;
 802
 803     comm = dd->comm;
 804
 805     cgindex = dd->cgindex;
 806
 807     buf = comm->vbuf.v;
 808
 809     nzone   = comm->zones.n/2;
 810     nat_tot = dd->nat_tot;
 811     for (d = dd->ndim-1; d >= 0; d--)
 812     {
 813         /* Only forces in domains near the PBC boundaries need to
 814            consider PBC in the treatment of fshift */
 815         bShiftForcesNeedPbc   = (dd->ci[dd->dim[d]] == 0);
 816         bScrew                = (bShiftForcesNeedPbc && dd->bScrewPBC && dd->dim[d] == XX);
 817         if (fshift == NULL && !bScrew)
 818         {
 819             bShiftForcesNeedPbc = FALSE;
 820         }
 821         /* Determine which shift vector we need */
 822         clear_ivec(vis);
 823         vis[dd->dim[d]] = 1;
 824         is              = IVEC2IS(vis);
 825
 826         cd = &comm->cd[d];
 827         for (p = cd->np-1; p >= 0; p--)
 828         {
 829             ind      = &cd->ind[p];
 830             nat_tot -= ind->nrecv[nzone+1];
 831             if (cd->bInPlace)
 832             {
 833                 sbuf = f + nat_tot;
 834             }
 835             else
 836             {
 837                 sbuf = comm->vbuf2.v;
 838                 j    = 0;
 839                 for (zone = 0; zone < nzone; zone++)
 840                 {
 841                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 842                     {
 843                         copy_rvec(f[i], sbuf[j]);
 844                         j++;
 845                     }
 846                 }
 847             }
 848             /* Communicate the forces */
 849             dd_sendrecv_rvec(dd, d, dddirForward,
 850                              sbuf, ind->nrecv[nzone+1],
 851                              buf,  ind->nsend[nzone+1]);
 852             index = ind->index;
 853             /* Add the received forces */
 854             n = 0;
 855             if (!bShiftForcesNeedPbc)
 856             {
 857                 for (i = 0; i < ind->nsend[nzone]; i++)
 858                 {
 859                     at0 = cgindex[index[i]];
 860                     at1 = cgindex[index[i]+1];
 861                     for (j = at0; j < at1; j++)
 862                     {
 863                         rvec_inc(f[j], buf[n]);
 864                         n++;
 865                     }
 866                 }
 867             }
 868             else if (!bScrew)
 869             {
 870                 /* fshift should always be defined if this function is
 871                  * called when bShiftForcesNeedPbc is true */
 872                 assert(NULL != fshift);
 873                 for (i = 0; i < ind->nsend[nzone]; i++)
 874                 {
 875                     at0 = cgindex[index[i]];
 876                     at1 = cgindex[index[i]+1];
 877                     for (j = at0; j < at1; j++)
 878                     {
 879                         rvec_inc(f[j], buf[n]);
 880                         /* Add this force to the shift force */
 881                         rvec_inc(fshift[is], buf[n]);
 882                         n++;
 883                     }
 884                 }
 885             }
 886             else
 887             {
 888                 for (i = 0; i < ind->nsend[nzone]; i++)
 889                 {
 890                     at0 = cgindex[index[i]];
 891                     at1 = cgindex[index[i]+1];
 892                     for (j = at0; j < at1; j++)
 893                     {
 894                         /* Rotate the force */
 895                         f[j][XX] += buf[n][XX];
 896                         f[j][YY] -= buf[n][YY];
 897                         f[j][ZZ] -= buf[n][ZZ];
 898                         if (fshift)
 899                         {
 900                             /* Add this force to the shift force */
 901                             rvec_inc(fshift[is], buf[n]);
 902                         }
 903                         n++;
 904                     }
 905                 }
 906             }
 907         }
 908         nzone /= 2;
 909     }
 910 }
 911
 912 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 913 {
 914     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 915     int                   *index, *cgindex;
 916     gmx_domdec_comm_t     *comm;
 917     gmx_domdec_comm_dim_t *cd;
 918     gmx_domdec_ind_t      *ind;
 919     real                  *buf, *rbuf;
 920
 921     comm = dd->comm;
 922
 923     cgindex = dd->cgindex;
 924
 925     buf = &comm->vbuf.v[0][0];
 926
 927     nzone   = 1;
 928     nat_tot = dd->nat_home;
 929     for (d = 0; d < dd->ndim; d++)
 930     {
 931         cd = &comm->cd[d];
 932         for (p = 0; p < cd->np; p++)
 933         {
 934             ind   = &cd->ind[p];
 935             index = ind->index;
 936             n     = 0;
 937             for (i = 0; i < ind->nsend[nzone]; i++)
 938             {
 939                 at0 = cgindex[index[i]];
 940                 at1 = cgindex[index[i]+1];
 941                 for (j = at0; j < at1; j++)
 942                 {
 943                     buf[n] = v[j];
 944                     n++;
 945                 }
 946             }
 947
 948             if (cd->bInPlace)
 949             {
 950                 rbuf = v + nat_tot;
 951             }
 952             else
 953             {
 954                 rbuf = &comm->vbuf2.v[0][0];
 955             }
 956             /* Send and receive the coordinates */
 957             dd_sendrecv_real(dd, d, dddirBackward,
 958                              buf,  ind->nsend[nzone+1],
 959                              rbuf, ind->nrecv[nzone+1]);
 960             if (!cd->bInPlace)
 961             {
 962                 j = 0;
 963                 for (zone = 0; zone < nzone; zone++)
 964                 {
 965                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 966                     {
 967                         v[i] = rbuf[j];
 968                         j++;
 969                     }
 970                 }
 971             }
 972             nat_tot += ind->nrecv[nzone+1];
 973         }
 974         nzone += nzone;
 975     }
 976 }
 977
 978 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 979 {
 980     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 981     int                   *index, *cgindex;
 982     gmx_domdec_comm_t     *comm;
 983     gmx_domdec_comm_dim_t *cd;
 984     gmx_domdec_ind_t      *ind;
 985     real                  *buf, *sbuf;
 986
 987     comm = dd->comm;
 988
 989     cgindex = dd->cgindex;
 990
 991     buf = &comm->vbuf.v[0][0];
 992
 993     nzone   = comm->zones.n/2;
 994     nat_tot = dd->nat_tot;
 995     for (d = dd->ndim-1; d >= 0; d--)
 996     {
 997         cd = &comm->cd[d];
 998         for (p = cd->np-1; p >= 0; p--)
 999         {
1000             ind      = &cd->ind[p];
1001             nat_tot -= ind->nrecv[nzone+1];
1002             if (cd->bInPlace)
1003             {
1004                 sbuf = v + nat_tot;
1005             }
1006             else
1007             {
1008                 sbuf = &comm->vbuf2.v[0][0];
1009                 j    = 0;
1010                 for (zone = 0; zone < nzone; zone++)
1011                 {
1012                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
1013                     {
1014                         sbuf[j] = v[i];
1015                         j++;
1016                     }
1017                 }
1018             }
1019             /* Communicate the forces */
1020             dd_sendrecv_real(dd, d, dddirForward,
1021                              sbuf, ind->nrecv[nzone+1],
1022                              buf,  ind->nsend[nzone+1]);
1023             index = ind->index;
1024             /* Add the received forces */
1025             n = 0;
1026             for (i = 0; i < ind->nsend[nzone]; i++)
1027             {
1028                 at0 = cgindex[index[i]];
1029                 at1 = cgindex[index[i]+1];
1030                 for (j = at0; j < at1; j++)
1031                 {
1032                     v[j] += buf[n];
1033                     n++;
1034                 }
1035             }
1036         }
1037         nzone /= 2;
1038     }
1039 }
1040
1041 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
1042 {
1043     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
1044             d, i, j,
1045             zone->min0, zone->max1,
1046             zone->mch0, zone->mch0,
1047             zone->p1_0, zone->p1_1);
1048 }
1049
1050
1051 #define DDZONECOMM_MAXZONE  5
1052 #define DDZONECOMM_BUFSIZE  3
1053
1054 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
1055                                int ddimind, int direction,
1056                                gmx_ddzone_t *buf_s, int n_s,
1057                                gmx_ddzone_t *buf_r, int n_r)
1058 {
1059 #define ZBS  DDZONECOMM_BUFSIZE
1060     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1061     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1062     int  i;
1063
1064     for (i = 0; i < n_s; i++)
1065     {
1066         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1067         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1068         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1069         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1070         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1071         vbuf_s[i*ZBS+1][2] = 0;
1072         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1073         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1074         vbuf_s[i*ZBS+2][2] = 0;
1075     }
1076
1077     dd_sendrecv_rvec(dd, ddimind, direction,
1078                      vbuf_s, n_s*ZBS,
1079                      vbuf_r, n_r*ZBS);
1080
1081     for (i = 0; i < n_r; i++)
1082     {
1083         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1084         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1085         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1086         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1087         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1088         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1089         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1090     }
1091
1092 #undef ZBS
1093 }
1094
     /* Exchanges cell boundary information between domains and extends the
      * neighbor-search cell limits accordingly.
      *
      * dd          domain decomposition setup; comm->zone_d1, comm->zone_d2,
      *             comm->cell_f_max0 and comm->cell_f_min1 are updated
      * ddbox       box information; npbcdim, tric_dir and v are read
      * cell_ns_x0  lower neighbor-search cell limits; on return lowered, for
      *             decomposition dimensions 1 and 2, to the minimum of the
      *             collected zone min0 values
      * cell_ns_x1  upper neighbor-search cell limits; on return raised, for
      *             decomposition dimensions 1 and 2, to the maximum of the
      *             collected zone max1 values
      *
      * Nothing is communicated when dd->ndim < 2, since all loops below are
      * then empty.
      */
1095 static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
1096                           rvec cell_ns_x0, rvec cell_ns_x1)
1097 {
1098     int                d, d1, dim, pos, buf_size, i, j, p, npulse, npulse_min;
1099     gmx_ddzone_t      *zp;
1100     gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
1101     gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
1102     gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
1103     rvec               extr_s[2], extr_r[2];
1104     rvec               dh;
1105     real               dist_d, c = 0, det;
1106     gmx_domdec_comm_t *comm;
1107     gmx_bool           bPBC, bUse;
1108
1109     comm = dd->comm;
1110
         /* Initialize the first zone entry of every decomposition dimension
          * beyond the first with the limits of our own cell.
          */
1111     for (d = 1; d < dd->ndim; d++)
1112     {
1113         dim      = dd->dim[d];
1114         zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
1115         zp->min0 = cell_ns_x0[dim];
1116         zp->max1 = cell_ns_x1[dim];
1117         zp->min1 = cell_ns_x1[dim];
1118         zp->mch0 = cell_ns_x0[dim];
1119         zp->mch1 = cell_ns_x1[dim];
1120         zp->p1_0 = cell_ns_x0[dim];
1121         zp->p1_1 = cell_ns_x1[dim];
1122     }
1123
         /* Communicate along the decomposition dimensions, from the
          * last-but-one down to the first.
          */
1124     for (d = dd->ndim-2; d >= 0; d--)
1125     {
1126         dim  = dd->dim[d];
1127         bPBC = (dim < ddbox->npbcdim);
1128
1129         /* Use an rvec to store three reals; elements 1 and 2 both start
              * from cell_f1 of the next decomposition dimension.
              */
1130         extr_s[d][0] = comm->cell_f0[d+1];
1131         extr_s[d][1] = comm->cell_f1[d+1];
1132         extr_s[d][2] = comm->cell_f1[d+1];
1133
1134         pos = 0;
1135         /* Store the extremes in the backward sending buffer,
1136          * so they get updated separately from the forward communication.
1137          */
1138         for (d1 = d; d1 < dd->ndim-1; d1++)
1139         {
1140             /* We invert the order to be able to use the same loop for buf_e */
1141             buf_s[pos].min0 = extr_s[d1][1];
1142             buf_s[pos].max1 = extr_s[d1][0];
1143             buf_s[pos].min1 = extr_s[d1][2];
1144             buf_s[pos].mch0 = 0;
1145             buf_s[pos].mch1 = 0;
1146             /* Store the cell corner of the dimension we communicate along */
1147             buf_s[pos].p1_0 = comm->cell_x0[dim];
1148             buf_s[pos].p1_1 = 0;
1149             pos++;
1150         }
1151
             /* Append our own zone entry of the last decomposition dimension */
1152         buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
1153         pos++;
1154
             /* With three dimensions, the pass along the first dimension also
              * carries zone_d2[0][1] and, as the last entry, zone_d1[0].
              * This gives 2 + 1 + 2 = 5 entries, i.e. DDZONECOMM_MAXZONE.
              */
1155         if (dd->ndim == 3 && d == 0)
1156         {
1157             buf_s[pos] = comm->zone_d2[0][1];
1158             pos++;
1159             buf_s[pos] = comm->zone_d1[0];
1160             pos++;
1161         }
1162
1163         /* We only need to communicate the extremes
1164          * in the forward direction
1165          */
1166         npulse = comm->cd[d].np;
1167         if (bPBC)
1168         {
1169             /* Take the minimum to avoid double communication */
1170             npulse_min = std::min(npulse, dd->nc[dim]-1-npulse);
1171         }
1172         else
1173         {
1174             /* Without PBC we should really not communicate over
1175              * the boundaries, but implementing that complicates
1176              * the communication setup and therefore we simply
1177              * do all communication, but ignore some data.
1178              */
1179             npulse_min = npulse;
1180         }
1181         for (p = 0; p < npulse_min; p++)
1182         {
1183             /* Communicate the extremes forward */
                 /* Without PBC the data received by the first cell is ignored */
1184             bUse = (bPBC || dd->ci[dim] > 0);
1185
1186             dd_sendrecv_rvec(dd, d, dddirForward,
1187                              extr_s+d, dd->ndim-d-1,
1188                              extr_r+d, dd->ndim-d-1);
1189
1190             if (bUse)
1191             {
1192                 for (d1 = d; d1 < dd->ndim-1; d1++)
1193                 {
1194                     extr_s[d1][0] = std::max(extr_s[d1][0], extr_r[d1][0]);
1195                     extr_s[d1][1] = std::min(extr_s[d1][1], extr_r[d1][1]);
1196                     extr_s[d1][2] = std::min(extr_s[d1][2], extr_r[d1][2]);
1197                 }
1198             }
1199         }
1200
1201         buf_size = pos;
1202         for (p = 0; p < npulse; p++)
1203         {
1204             /* Communicate all the zone information backward */
                 /* Without PBC the data received by the last cell is ignored */
1205             bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);
1206
1207             dd_sendrecv_ddzone(dd, d, dddirBackward,
1208                                buf_s, buf_size,
1209                                buf_r, buf_size);
1210
                 /* dh stays zero for the first pulse */
1211             clear_rvec(dh);
1212             if (p > 0)
1213             {
1214                 for (d1 = d+1; d1 < dd->ndim; d1++)
1215                 {
1216                     /* Determine the decrease of maximum required
1217                      * communication height along d1 due to the distance along d,
1218                      * this avoids a lot of useless atom communication.
1219                      */
1220                     dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;
1221
1222                     if (ddbox->tric_dir[dim])
1223                     {
1224                         /* c is the off-diagonal coupling between the cell planes
1225                          * along directions d and d1.
1226                          */
1227                         c = ddbox->v[dim][dd->dim[d1]][dim];
1228                     }
1229                     else
1230                     {
1231                         c = 0;
1232                     }
1233                     det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
1234                     if (det > 0)
1235                     {
1236                         dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
1237                     }
1238                     else
1239                     {
1240                         /* A negative value signals out of range */
1241                         dh[d1] = -1;
1242                     }
1243                 }
1244             }
1245
1246             /* Accumulate the extremes over all pulses */
1247             for (i = 0; i < buf_size; i++)
1248             {
1249                 if (p == 0)
1250                 {
1251                     buf_e[i] = buf_r[i];
1252                 }
1253                 else
1254                 {
1255                     if (bUse)
1256                     {
1257                         buf_e[i].min0 = std::min(buf_e[i].min0, buf_r[i].min0);
1258                         buf_e[i].max1 = std::max(buf_e[i].max1, buf_r[i].max1);
1259                         buf_e[i].min1 = std::min(buf_e[i].min1, buf_r[i].min1);
1260                     }
1261
                         /* Select the dh element to apply: the last entry in
                          * the three-dimensional, d == 0 case is zone_d1[0]
                          * (see the packing above) and uses index 1; all
                          * other entries use index d + 1.
                          */
1262                     if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
1263                     {
1264                         d1 = 1;
1265                     }
1266                     else
1267                     {
1268                         d1 = d + 1;
1269                     }
1270                     if (bUse && dh[d1] >= 0)
1271                     {
1272                         buf_e[i].mch0 = std::max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
1273                         buf_e[i].mch1 = std::max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
1274                     }
1275                 }
1276                 /* Copy the received buffer to the send buffer,
1277                  * to pass the data through with the next pulse.
1278                  */
1279                 buf_s[i] = buf_r[i];
1280             }
                 /* Store the result either after the last pulse (with PBC, or
                  * when ci + npulse is still below the number of cells) or,
                  * without PBC, at the pulse for which ci + 1 + p equals the
                  * index of the last cell.
                  */
1281             if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
1282                 (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
1283             {
1284                 /* Store the extremes */
1285                 pos = 0;
1286
                     /* Same inverted order as used when filling buf_s */
1287                 for (d1 = d; d1 < dd->ndim-1; d1++)
1288                 {
1289                     extr_s[d1][1] = std::min(extr_s[d1][1], buf_e[pos].min0);
1290                     extr_s[d1][0] = std::max(extr_s[d1][0], buf_e[pos].max1);
1291                     extr_s[d1][2] = std::min(extr_s[d1][2], buf_e[pos].min1);
1292                     pos++;
1293                 }
1294
                     /* Store the accumulated zone entries in the neighbor
                      * slots of zone_d2 and zone_d1.
                      */
1295                 if (d == 1 || (d == 0 && dd->ndim == 3))
1296                 {
1297                     for (i = d; i < 2; i++)
1298                     {
1299                         comm->zone_d2[1-d][i] = buf_e[pos];
1300                         pos++;
1301                     }
1302                 }
1303                 if (d == 0)
1304                 {
1305                     comm->zone_d1[1] = buf_e[pos];
1306                     pos++;
1307                 }
1308             }
1309         }
1310     }
1311
         /* Extend the neighbor-search limits of the second decomposition
          * dimension with the collected zone limits.
          */
1312     if (dd->ndim >= 2)
1313     {
1314         dim = dd->dim[1];
1315         for (i = 0; i < 2; i++)
1316         {
1317             if (debug)
1318             {
1319                 print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
1320             }
1321             cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d1[i].min0);
1322             cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d1[i].max1);
1323         }
1324     }
         /* Same for the third decomposition dimension */
1325     if (dd->ndim >= 3)
1326     {
1327         dim = dd->dim[2];
1328         for (i = 0; i < 2; i++)
1329         {
1330             for (j = 0; j < 2; j++)
1331             {
1332                 if (debug)
1333                 {
1334                     print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
1335                 }
1336                 cell_ns_x0[dim] = std::min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
1337                 cell_ns_x1[dim] = std::max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
1338             }
1339         }
1340     }
         /* Store the accumulated cell fraction extremes; extr_s[d-1] was
          * filled in the communication loop above for every d >= 1.
          */
1341     for (d = 1; d < dd->ndim; d++)
1342     {
1343         comm->cell_f_max0[d] = extr_s[d-1][0];
1344         comm->cell_f_min1[d] = extr_s[d-1][1];
1345         if (debug)
1346         {
1347             fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
1348                     d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
1349         }
1350     }
1351 }
1352
1353 static void dd_collect_cg(gmx_domdec_t *dd,
1354                           t_state      *state_local)
1355 {
1356     gmx_domdec_master_t *ma = NULL;
1357     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1358
1359     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1360     {
1361         /* The master has the correct distribution */
1362         return;
1363     }
1364
1365     if (state_local->ddp_count == dd->ddp_count)
1366     {
1367         /* The local state and DD are in sync, use the DD indices */
1368         ncg_home = dd->ncg_home;
1369         cg       = dd->index_gl;
1370         nat_home = dd->nat_home;
1371     }
1372     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1373     {
1374         /* The DD is out of sync with the local state, but we have stored
1375          * the cg indices with the local state, so we can use those.
1376          */
1377         t_block *cgs_gl;
1378
1379         cgs_gl = &dd->comm->cgs_gl;
1380
1381         ncg_home = state_local->ncg_gl;
1382         cg       = state_local->cg_gl;
1383         nat_home = 0;
1384         for (i = 0; i < ncg_home; i++)
1385         {
1386             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1387         }
1388     }
1389     else
1390     {
1391         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1392     }
1393
1394     buf2[0] = ncg_home;
1395     buf2[1] = nat_home;
1396     if (DDMASTER(dd))
1397     {
1398         ma   = dd->ma;
1399         ibuf = ma->ibuf;
1400     }
1401     else
1402     {
1403         ibuf = NULL;
1404     }
1405     /* Collect the charge group and atom counts on the master */
1406     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1407
1408     if (DDMASTER(dd))
1409     {
1410         ma->index[0] = 0;
1411         for (i = 0; i < dd->nnodes; i++)
1412         {
1413             ma->ncg[i]     = ma->ibuf[2*i];
1414             ma->nat[i]     = ma->ibuf[2*i+1];
1415             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1416
1417         }
1418         /* Make byte counts and indices */
1419         for (i = 0; i < dd->nnodes; i++)
1420         {
1421             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1422             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1423         }
1424         if (debug)
1425         {
1426             fprintf(debug, "Initial charge group distribution: ");
1427             for (i = 0; i < dd->nnodes; i++)
1428             {
1429                 fprintf(debug, " %d", ma->ncg[i]);
1430             }
1431             fprintf(debug, "\n");
1432         }
1433     }
1434
1435     /* Collect the charge group indices on the master */
1436     dd_gatherv(dd,
1437                ncg_home*sizeof(int), cg,
1438                DDMASTER(dd) ? ma->ibuf : NULL,
1439                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1440                DDMASTER(dd) ? ma->cg : NULL);
1441
1442     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1443 }
1444
1445 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1446                                     rvec *lv, rvec *v)
1447 {
1448     gmx_domdec_master_t *ma;
1449     int                  n, i, c, a, nalloc = 0;
1450     rvec                *buf = NULL;
1451     t_block             *cgs_gl;
1452
1453     ma = dd->ma;
1454
1455     if (!DDMASTER(dd))
1456     {
1457 #ifdef GMX_MPI
1458         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1459                  dd->rank, dd->mpi_comm_all);
1460 #endif
1461     }
1462     else
1463     {
1464         /* Copy the master coordinates to the global array */
1465         cgs_gl = &dd->comm->cgs_gl;
1466
1467         n = DDMASTERRANK(dd);
1468         a = 0;
1469         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1470         {
1471             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1472             {
1473                 copy_rvec(lv[a++], v[c]);
1474             }
1475         }
1476
1477         for (n = 0; n < dd->nnodes; n++)
1478         {
1479             if (n != dd->rank)
1480             {
1481                 if (ma->nat[n] > nalloc)
1482                 {
1483                     nalloc = over_alloc_dd(ma->nat[n]);
1484                     srenew(buf, nalloc);
1485                 }
1486 #ifdef GMX_MPI
1487                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1488                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1489 #endif
1490                 a = 0;
1491                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1492                 {
1493                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1494                     {
1495                         copy_rvec(buf[a++], v[c]);
1496                     }
1497                 }
1498             }
1499         }
1500         sfree(buf);
1501     }
1502 }
1503
1504 static void get_commbuffer_counts(gmx_domdec_t *dd,
1505                                   int **counts, int **disps)
1506 {
1507     gmx_domdec_master_t *ma;
1508     int                  n;
1509
1510     ma = dd->ma;
1511
1512     /* Make the rvec count and displacment arrays */
1513     *counts  = ma->ibuf;
1514     *disps   = ma->ibuf + dd->nnodes;
1515     for (n = 0; n < dd->nnodes; n++)
1516     {
1517         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1518         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1519     }
1520 }
1521
1522 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1523                                    rvec *lv, rvec *v)
1524 {
1525     gmx_domdec_master_t *ma;
1526     int                 *rcounts = NULL, *disps = NULL;
1527     int                  n, i, c, a;
1528     rvec                *buf = NULL;
1529     t_block             *cgs_gl;
1530
1531     ma = dd->ma;
1532
1533     if (DDMASTER(dd))
1534     {
1535         get_commbuffer_counts(dd, &rcounts, &disps);
1536
1537         buf = ma->vbuf;
1538     }
1539
1540     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1541
1542     if (DDMASTER(dd))
1543     {
1544         cgs_gl = &dd->comm->cgs_gl;
1545
1546         a = 0;
1547         for (n = 0; n < dd->nnodes; n++)
1548         {
1549             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1550             {
1551                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1552                 {
1553                     copy_rvec(buf[a++], v[c]);
1554                 }
1555             }
1556         }
1557     }
1558 }
1559
1560 void dd_collect_vec(gmx_domdec_t *dd,
1561                     t_state *state_local, rvec *lv, rvec *v)
1562 {
1563     dd_collect_cg(dd, state_local);
1564
1565     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1566     {
1567         dd_collect_vec_sendrecv(dd, lv, v);
1568     }
1569     else
1570     {
1571         dd_collect_vec_gatherv(dd, lv, v);
1572     }
1573 }
1574
1575
1576 void dd_collect_state(gmx_domdec_t *dd,
1577                       t_state *state_local, t_state *state)
1578 {
1579     int est, i, j, nh;
1580
1581     nh = state->nhchainlength;
1582
1583     if (DDMASTER(dd))
1584     {
1585         for (i = 0; i < efptNR; i++)
1586         {
1587             state->lambda[i] = state_local->lambda[i];
1588         }
1589         state->fep_state = state_local->fep_state;
1590         state->veta      = state_local->veta;
1591         state->vol0      = state_local->vol0;
1592         copy_mat(state_local->box, state->box);
1593         copy_mat(state_local->boxv, state->boxv);
1594         copy_mat(state_local->svir_prev, state->svir_prev);
1595         copy_mat(state_local->fvir_prev, state->fvir_prev);
1596         copy_mat(state_local->pres_prev, state->pres_prev);
1597
1598         for (i = 0; i < state_local->ngtc; i++)
1599         {
1600             for (j = 0; j < nh; j++)
1601             {
1602                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1603                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1604             }
1605             state->therm_integral[i] = state_local->therm_integral[i];
1606         }
1607         for (i = 0; i < state_local->nnhpres; i++)
1608         {
1609             for (j = 0; j < nh; j++)
1610             {
1611                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1612                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1613             }
1614         }
1615     }
1616     for (est = 0; est < estNR; est++)
1617     {
1618         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1619         {
1620             switch (est)
1621             {
1622                 case estX:
1623                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1624                     break;
1625                 case estV:
1626                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1627                     break;
1628                 case estSDX:
1629                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1630                     break;
1631                 case estCGP:
1632                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1633                     break;
1634                 case estDISRE_INITF:
1635                 case estDISRE_RM3TAV:
1636                 case estORIRE_INITF:
1637                 case estORIRE_DTAV:
1638                     break;
1639                 default:
1640                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1641             }
1642         }
1643     }
1644 }
1645
1646 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1647 {
1648     int est;
1649
1650     if (debug)
1651     {
1652         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1653     }
1654
1655     state->nalloc = over_alloc_dd(nalloc);
1656
1657     for (est = 0; est < estNR; est++)
1658     {
1659         if (EST_DISTR(est) && (state->flags & (1<<est)))
1660         {
1661             switch (est)
1662             {
1663                 case estX:
1664                     srenew(state->x, state->nalloc);
1665                     break;
1666                 case estV:
1667                     srenew(state->v, state->nalloc);
1668                     break;
1669                 case estSDX:
1670                     srenew(state->sd_X, state->nalloc);
1671                     break;
1672                 case estCGP:
1673                     srenew(state->cg_p, state->nalloc);
1674                     break;
1675                 case estDISRE_INITF:
1676                 case estDISRE_RM3TAV:
1677                 case estORIRE_INITF:
1678                 case estORIRE_DTAV:
1679                     /* No reallocation required */
1680                     break;
1681                 default:
1682                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1683             }
1684         }
1685     }
1686
1687     if (f != NULL)
1688     {
1689         srenew(*f, state->nalloc);
1690     }
1691 }
1692
1693 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1694                                int nalloc)
1695 {
1696     if (nalloc > fr->cg_nalloc)
1697     {
1698         if (debug)
1699         {
1700             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1701         }
1702         fr->cg_nalloc = over_alloc_dd(nalloc);
1703         srenew(fr->cginfo, fr->cg_nalloc);
1704         if (fr->cutoff_scheme == ecutsGROUP)
1705         {
1706             srenew(fr->cg_cm, fr->cg_nalloc);
1707         }
1708     }
1709     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1710     {
1711         /* We don't use charge groups, we use x in state to set up
1712          * the atom communication.
1713          */
1714         dd_realloc_state(state, f, nalloc);
1715     }
1716 }
1717
1718 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1719                                        rvec *v, rvec *lv)
1720 {
1721     gmx_domdec_master_t *ma;
1722     int                  n, i, c, a, nalloc = 0;
1723     rvec                *buf = NULL;
1724
1725     if (DDMASTER(dd))
1726     {
1727         ma  = dd->ma;
1728
1729         for (n = 0; n < dd->nnodes; n++)
1730         {
1731             if (n != dd->rank)
1732             {
1733                 if (ma->nat[n] > nalloc)
1734                 {
1735                     nalloc = over_alloc_dd(ma->nat[n]);
1736                     srenew(buf, nalloc);
1737                 }
1738                 /* Use lv as a temporary buffer */
1739                 a = 0;
1740                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1741                 {
1742                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1743                     {
1744                         copy_rvec(v[c], buf[a++]);
1745                     }
1746                 }
1747                 if (a != ma->nat[n])
1748                 {
1749                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1750                               a, ma->nat[n]);
1751                 }
1752
1753 #ifdef GMX_MPI
1754                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1755                          DDRANK(dd, n), n, dd->mpi_comm_all);
1756 #endif
1757             }
1758         }
1759         sfree(buf);
1760         n = DDMASTERRANK(dd);
1761         a = 0;
1762         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1763         {
1764             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1765             {
1766                 copy_rvec(v[c], lv[a++]);
1767             }
1768         }
1769     }
1770     else
1771     {
1772 #ifdef GMX_MPI
1773         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1774                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1775 #endif
1776     }
1777 }
1778
1779 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1780                                        rvec *v, rvec *lv)
1781 {
1782     gmx_domdec_master_t *ma;
1783     int                 *scounts = NULL, *disps = NULL;
1784     int                  n, i, c, a;
1785     rvec                *buf = NULL;
1786
1787     if (DDMASTER(dd))
1788     {
1789         ma  = dd->ma;
1790
1791         get_commbuffer_counts(dd, &scounts, &disps);
1792
1793         buf = ma->vbuf;
1794         a   = 0;
1795         for (n = 0; n < dd->nnodes; n++)
1796         {
1797             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1798             {
1799                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1800                 {
1801                     copy_rvec(v[c], buf[a++]);
1802                 }
1803             }
1804         }
1805     }
1806
1807     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1808 }
1809
1810 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1811 {
1812     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1813     {
1814         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1815     }
1816     else
1817     {
1818         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1819     }
1820 }
1821
1822 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1823 {
1824     int i;
1825     dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1826     dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1827     dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1828
1829     if (dfhist->nlambda > 0)
1830     {
1831         int nlam = dfhist->nlambda;
1832         dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1833         dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1834         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1835         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1836         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1837         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1838
1839         for (i = 0; i < nlam; i++)
1840         {
1841             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1842             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1843             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1844             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1845             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1846             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
1847         }
1848     }
1849 }
1850
1851 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1852                                 t_state *state, t_state *state_local,
1853                                 rvec **f)
1854 {
1855     int  i, j, nh;
1856
1857     nh = state->nhchainlength;
1858
1859     if (DDMASTER(dd))
1860     {
1861         for (i = 0; i < efptNR; i++)
1862         {
1863             state_local->lambda[i] = state->lambda[i];
1864         }
1865         state_local->fep_state = state->fep_state;
1866         state_local->veta      = state->veta;
1867         state_local->vol0      = state->vol0;
1868         copy_mat(state->box, state_local->box);
1869         copy_mat(state->box_rel, state_local->box_rel);
1870         copy_mat(state->boxv, state_local->boxv);
1871         copy_mat(state->svir_prev, state_local->svir_prev);
1872         copy_mat(state->fvir_prev, state_local->fvir_prev);
1873         copy_df_history(&state_local->dfhist, &state->dfhist);
1874         for (i = 0; i < state_local->ngtc; i++)
1875         {
1876             for (j = 0; j < nh; j++)
1877             {
1878                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1879                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1880             }
1881             state_local->therm_integral[i] = state->therm_integral[i];
1882         }
1883         for (i = 0; i < state_local->nnhpres; i++)
1884         {
1885             for (j = 0; j < nh; j++)
1886             {
1887                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1888                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1889             }
1890         }
1891     }
1892     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1893     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1894     dd_bcast(dd, sizeof(real), &state_local->veta);
1895     dd_bcast(dd, sizeof(real), &state_local->vol0);
1896     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1897     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1898     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1899     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1900     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1901     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1902     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1903     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1904     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1905     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1906
1907     /* communicate df_history -- required for restarting from checkpoint */
1908     dd_distribute_dfhist(dd, &state_local->dfhist);
1909
1910     if (dd->nat_home > state_local->nalloc)
1911     {
1912         dd_realloc_state(state_local, f, dd->nat_home);
1913     }
1914     for (i = 0; i < estNR; i++)
1915     {
1916         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1917         {
1918             switch (i)
1919             {
1920                 case estX:
1921                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1922                     break;
1923                 case estV:
1924                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1925                     break;
1926                 case estSDX:
1927                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1928                     break;
1929                 case estCGP:
1930                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1931                     break;
1932                 case estDISRE_INITF:
1933                 case estDISRE_RM3TAV:
1934                 case estORIRE_INITF:
1935                 case estORIRE_DTAV:
1936                     /* Not implemented yet */
1937                     break;
1938                 default:
1939                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1940             }
1941         }
1942     }
1943 }
1944
1945 static char dim2char(int dim)
1946 {
1947     char c = '?';
1948
1949     switch (dim)
1950     {
1951         case XX: c = 'X'; break;
1952         case YY: c = 'Y'; break;
1953         case ZZ: c = 'Z'; break;
1954         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1955     }
1956
1957     return c;
1958 }
1959
1960 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1961                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1962 {
1963     rvec   grid_s[2], *grid_r = NULL, cx, r;
1964     char   fname[STRLEN], buf[22];
1965     FILE  *out;
1966     int    a, i, d, z, y, x;
1967     matrix tric;
1968     real   vol;
1969
1970     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1971     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1972
1973     if (DDMASTER(dd))
1974     {
1975         snew(grid_r, 2*dd->nnodes);
1976     }
1977
1978     dd_gather(dd, 2*sizeof(rvec), grid_s, DDMASTER(dd) ? grid_r : NULL);
1979
1980     if (DDMASTER(dd))
1981     {
1982         for (d = 0; d < DIM; d++)
1983         {
1984             for (i = 0; i < DIM; i++)
1985             {
1986                 if (d == i)
1987                 {
1988                     tric[d][i] = 1;
1989                 }
1990                 else
1991                 {
1992                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1993                     {
1994                         tric[d][i] = box[i][d]/box[i][i];
1995                     }
1996                     else
1997                     {
1998                         tric[d][i] = 0;
1999                     }
2000                 }
2001             }
2002         }
2003         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
2004         out = gmx_fio_fopen(fname, "w");
2005         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2006         a = 1;
2007         for (i = 0; i < dd->nnodes; i++)
2008         {
2009             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
2010             for (d = 0; d < DIM; d++)
2011             {
2012                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
2013             }
2014             for (z = 0; z < 2; z++)
2015             {
2016                 for (y = 0; y < 2; y++)
2017                 {
2018                     for (x = 0; x < 2; x++)
2019                     {
2020                         cx[XX] = grid_r[i*2+x][XX];
2021                         cx[YY] = grid_r[i*2+y][YY];
2022                         cx[ZZ] = grid_r[i*2+z][ZZ];
2023                         mvmul(tric, cx, r);
2024                         gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
2025                                                  10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
2026                     }
2027                 }
2028             }
2029             for (d = 0; d < DIM; d++)
2030             {
2031                 for (x = 0; x < 4; x++)
2032                 {
2033                     switch (d)
2034                     {
2035                         case 0: y = 1 + i*8 + 2*x; break;
2036                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
2037                         case 2: y = 1 + i*8 + x; break;
2038                     }
2039                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
2040                 }
2041             }
2042         }
2043         gmx_fio_fclose(out);
2044         sfree(grid_r);
2045     }
2046 }
2047
2048 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
2049                   gmx_mtop_t *mtop, t_commrec *cr,
2050                   int natoms, rvec x[], matrix box)
2051 {
2052     char          fname[STRLEN], buf[22];
2053     FILE         *out;
2054     int           i, ii, resnr, c;
2055     char         *atomname, *resname;
2056     real          b;
2057     gmx_domdec_t *dd;
2058
2059     dd = cr->dd;
2060     if (natoms == -1)
2061     {
2062         natoms = dd->comm->nat[ddnatVSITE];
2063     }
2064
2065     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
2066
2067     out = gmx_fio_fopen(fname, "w");
2068
2069     fprintf(out, "TITLE     %s\n", title);
2070     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2071     for (i = 0; i < natoms; i++)
2072     {
2073         ii = dd->gatindex[i];
2074         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
2075         if (i < dd->comm->nat[ddnatZONE])
2076         {
2077             c = 0;
2078             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2079             {
2080                 c++;
2081             }
2082             b = c;
2083         }
2084         else if (i < dd->comm->nat[ddnatVSITE])
2085         {
2086             b = dd->comm->zones.n;
2087         }
2088         else
2089         {
2090             b = dd->comm->zones.n + 1;
2091         }
2092         gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
2093                                  10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
2094     }
2095     fprintf(out, "TER\n");
2096
2097     gmx_fio_fclose(out);
2098 }
2099
2100 real dd_cutoff_multibody(const gmx_domdec_t *dd)
2101 {
2102     gmx_domdec_comm_t *comm;
2103     int                di;
2104     real               r;
2105
2106     comm = dd->comm;
2107
2108     r = -1;
2109     if (comm->bInterCGBondeds)
2110     {
2111         if (comm->cutoff_mbody > 0)
2112         {
2113             r = comm->cutoff_mbody;
2114         }
2115         else
2116         {
2117             /* cutoff_mbody=0 means we do not have DLB */
2118             r = comm->cellsize_min[dd->dim[0]];
2119             for (di = 1; di < dd->ndim; di++)
2120             {
2121                 r = std::min(r, comm->cellsize_min[dd->dim[di]]);
2122             }
2123             if (comm->bBondComm)
2124             {
2125                 r = std::max(r, comm->cutoff_mbody);
2126             }
2127             else
2128             {
2129                 r = std::min(r, comm->cutoff);
2130             }
2131         }
2132     }
2133
2134     return r;
2135 }
2136
2137 real dd_cutoff_twobody(const gmx_domdec_t *dd)
2138 {
2139     real r_mb;
2140
2141     r_mb = dd_cutoff_multibody(dd);
2142
2143     return std::max(dd->comm->cutoff, r_mb);
2144 }
2145
2146
2147 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
2148 {
2149     int nc, ntot;
2150
2151     nc   = dd->nc[dd->comm->cartpmedim];
2152     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2153     copy_ivec(coord, coord_pme);
2154     coord_pme[dd->comm->cartpmedim] =
2155         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2156 }
2157
/* Returns the PME index that DD index ddindex communicates with,
 * for ndd DD nodes and npme PME nodes.
 *
 * The assignment assumes that the major index of both is x.
 * Adding npme/2 before the integer division gives an even distribution.
 */
static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
{
    const int rounding = npme/2;
    const int scaled   = ddindex*npme + rounding;

    return scaled/ndd;
}
2166
2167 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
2168 {
2169     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
2170 }
2171
2172 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
2173 {
2174     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
2175 }
2176
2177 static int *dd_pmenodes(t_commrec *cr)
2178 {
2179     int *pmenodes;
2180     int  n, i, p0, p1;
2181
2182     snew(pmenodes, cr->npmenodes);
2183     n = 0;
2184     for (i = 0; i < cr->dd->nnodes; i++)
2185     {
2186         p0 = cr_ddindex2pmeindex(cr, i);
2187         p1 = cr_ddindex2pmeindex(cr, i+1);
2188         if (i+1 == cr->dd->nnodes || p1 > p0)
2189         {
2190             if (debug)
2191             {
2192                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
2193             }
2194             pmenodes[n] = i + 1 + n;
2195             n++;
2196         }
2197     }
2198
2199     return pmenodes;
2200 }
2201
2202 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
2203 {
2204     gmx_domdec_t *dd;
2205     ivec          coords;
2206     int           slab;
2207
2208     dd = cr->dd;
2209     /*
2210        if (dd->comm->bCartesian) {
2211        gmx_ddindex2xyz(dd->nc,ddindex,coords);
2212        dd_coords2pmecoords(dd,coords,coords_pme);
2213        copy_ivec(dd->ntot,nc);
2214        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2215        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2216
2217        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2218        } else {
2219        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2220        }
2221      */
2222     coords[XX] = x;
2223     coords[YY] = y;
2224     coords[ZZ] = z;
2225     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
2226
2227     return slab;
2228 }
2229
2230 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
2231 {
2232     gmx_domdec_comm_t *comm;
2233     ivec               coords;
2234     int                ddindex, nodeid = -1;
2235
2236     comm = cr->dd->comm;
2237
2238     coords[XX] = x;
2239     coords[YY] = y;
2240     coords[ZZ] = z;
2241     if (comm->bCartesianPP_PME)
2242     {
2243 #ifdef GMX_MPI
2244         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
2245 #endif
2246     }
2247     else
2248     {
2249         ddindex = dd_index(cr->dd->nc, coords);
2250         if (comm->bCartesianPP)
2251         {
2252             nodeid = comm->ddindex2simnodeid[ddindex];
2253         }
2254         else
2255         {
2256             if (comm->pmenodes)
2257             {
2258                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
2259             }
2260             else
2261             {
2262                 nodeid = ddindex;
2263             }
2264         }
2265     }
2266
2267     return nodeid;
2268 }
2269
2270 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
2271 {
2272     gmx_domdec_t      *dd;
2273     gmx_domdec_comm_t *comm;
2274     int                i;
2275     int                pmenode = -1;
2276
2277     dd   = cr->dd;
2278     comm = dd->comm;
2279
2280     /* This assumes a uniform x domain decomposition grid cell size */
2281     if (comm->bCartesianPP_PME)
2282     {
2283 #ifdef GMX_MPI
2284         ivec coord, coord_pme;
2285         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2286         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2287         {
2288             /* This is a PP node */
2289             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2290             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2291         }
2292 #endif
2293     }
2294     else if (comm->bCartesianPP)
2295     {
2296         if (sim_nodeid < dd->nnodes)
2297         {
2298             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2299         }
2300     }
2301     else
2302     {
2303         /* This assumes DD cells with identical x coordinates
2304          * are numbered sequentially.
2305          */
2306         if (dd->comm->pmenodes == NULL)
2307         {
2308             if (sim_nodeid < dd->nnodes)
2309             {
2310                 /* The DD index equals the nodeid */
2311                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2312             }
2313         }
2314         else
2315         {
2316             i = 0;
2317             while (sim_nodeid > dd->comm->pmenodes[i])
2318             {
2319                 i++;
2320             }
2321             if (sim_nodeid < dd->comm->pmenodes[i])
2322             {
2323                 pmenode = dd->comm->pmenodes[i];
2324             }
2325         }
2326     }
2327
2328     return pmenode;
2329 }
2330
2331 void get_pme_nnodes(const gmx_domdec_t *dd,
2332                     int *npmenodes_x, int *npmenodes_y)
2333 {
2334     if (dd != NULL)
2335     {
2336         *npmenodes_x = dd->comm->npmenodes_x;
2337         *npmenodes_y = dd->comm->npmenodes_y;
2338     }
2339     else
2340     {
2341         *npmenodes_x = 1;
2342         *npmenodes_y = 1;
2343     }
2344 }
2345
2346 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2347                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2348 {
2349     gmx_domdec_t *dd;
2350     int           x, y, z;
2351     ivec          coord, coord_pme;
2352
2353     dd = cr->dd;
2354
2355     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2356
2357     *nmy_ddnodes = 0;
2358     for (x = 0; x < dd->nc[XX]; x++)
2359     {
2360         for (y = 0; y < dd->nc[YY]; y++)
2361         {
2362             for (z = 0; z < dd->nc[ZZ]; z++)
2363             {
2364                 if (dd->comm->bCartesianPP_PME)
2365                 {
2366                     coord[XX] = x;
2367                     coord[YY] = y;
2368                     coord[ZZ] = z;
2369                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2370                     if (dd->ci[XX] == coord_pme[XX] &&
2371                         dd->ci[YY] == coord_pme[YY] &&
2372                         dd->ci[ZZ] == coord_pme[ZZ])
2373                     {
2374                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2375                     }
2376                 }
2377                 else
2378                 {
2379                     /* The slab corresponds to the nodeid in the PME group */
2380                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2381                     {
2382                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2383                     }
2384                 }
2385             }
2386         }
2387     }
2388
2389     /* The last PP-only node is the peer node */
2390     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2391
2392     if (debug)
2393     {
2394         fprintf(debug, "Receive coordinates from PP ranks:");
2395         for (x = 0; x < *nmy_ddnodes; x++)
2396         {
2397             fprintf(debug, " %d", (*my_ddnodes)[x]);
2398         }
2399         fprintf(debug, "\n");
2400     }
2401 }
2402
2403 static gmx_bool receive_vir_ener(t_commrec *cr)
2404 {
2405     gmx_domdec_comm_t *comm;
2406     int                pmenode;
2407     gmx_bool           bReceive;
2408
2409     bReceive = TRUE;
2410     if (cr->npmenodes < cr->dd->nnodes)
2411     {
2412         comm = cr->dd->comm;
2413         if (comm->bCartesianPP_PME)
2414         {
2415             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2416 #ifdef GMX_MPI
2417             ivec coords;
2418             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2419             coords[comm->cartpmedim]++;
2420             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2421             {
2422                 int rank;
2423                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2424                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2425                 {
2426                     /* This is not the last PP node for pmenode */
2427                     bReceive = FALSE;
2428                 }
2429             }
2430 #endif
2431         }
2432         else
2433         {
2434             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2435             if (cr->sim_nodeid+1 < cr->nnodes &&
2436                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2437             {
2438                 /* This is not the last PP node for pmenode */
2439                 bReceive = FALSE;
2440             }
2441         }
2442     }
2443
2444     return bReceive;
2445 }
2446
2447 static void set_zones_ncg_home(gmx_domdec_t *dd)
2448 {
2449     gmx_domdec_zones_t *zones;
2450     int                 i;
2451
2452     zones = &dd->comm->zones;
2453
2454     zones->cg_range[0] = 0;
2455     for (i = 1; i < zones->n+1; i++)
2456     {
2457         zones->cg_range[i] = dd->ncg_home;
2458     }
2459     /* zone_ncg1[0] should always be equal to ncg_home */
2460     dd->comm->zone_ncg1[0] = dd->ncg_home;
2461 }
2462
2463 static void rebuild_cgindex(gmx_domdec_t *dd,
2464                             const int *gcgs_index, t_state *state)
2465 {
2466     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2467
2468     ind        = state->cg_gl;
2469     dd_cg_gl   = dd->index_gl;
2470     cgindex    = dd->cgindex;
2471     nat        = 0;
2472     cgindex[0] = nat;
2473     for (i = 0; i < state->ncg_gl; i++)
2474     {
2475         cgindex[i]  = nat;
2476         cg_gl       = ind[i];
2477         dd_cg_gl[i] = cg_gl;
2478         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2479     }
2480     cgindex[i] = nat;
2481
2482     dd->ncg_home = state->ncg_gl;
2483     dd->nat_home = nat;
2484
2485     set_zones_ncg_home(dd);
2486 }
2487
2488 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2489 {
2490     while (cg >= cginfo_mb->cg_end)
2491     {
2492         cginfo_mb++;
2493     }
2494
2495     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2496 }
2497
2498 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2499                           t_forcerec *fr, char *bLocalCG)
2500 {
2501     cginfo_mb_t *cginfo_mb;
2502     int         *cginfo;
2503     int          cg;
2504
2505     if (fr != NULL)
2506     {
2507         cginfo_mb = fr->cginfo_mb;
2508         cginfo    = fr->cginfo;
2509
2510         for (cg = cg0; cg < cg1; cg++)
2511         {
2512             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2513         }
2514     }
2515
2516     if (bLocalCG != NULL)
2517     {
2518         for (cg = cg0; cg < cg1; cg++)
2519         {
2520             bLocalCG[index_gl[cg]] = TRUE;
2521         }
2522     }
2523 }
2524
2525 static void make_dd_indices(gmx_domdec_t *dd,
2526                             const int *gcgs_index, int cg_start)
2527 {
2528     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2529     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2530     gmx_bool     bCGs;
2531
2532     if (dd->nat_tot > dd->gatindex_nalloc)
2533     {
2534         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2535         srenew(dd->gatindex, dd->gatindex_nalloc);
2536     }
2537
2538     nzone      = dd->comm->zones.n;
2539     zone2cg    = dd->comm->zones.cg_range;
2540     zone_ncg1  = dd->comm->zone_ncg1;
2541     index_gl   = dd->index_gl;
2542     gatindex   = dd->gatindex;
2543     bCGs       = dd->comm->bCGs;
2544
2545     if (zone2cg[1] != dd->ncg_home)
2546     {
2547         gmx_incons("dd->ncg_zone is not up to date");
2548     }
2549
2550     /* Make the local to global and global to local atom index */
2551     a = dd->cgindex[cg_start];
2552     for (zone = 0; zone < nzone; zone++)
2553     {
2554         if (zone == 0)
2555         {
2556             cg0 = cg_start;
2557         }
2558         else
2559         {
2560             cg0 = zone2cg[zone];
2561         }
2562         cg1    = zone2cg[zone+1];
2563         cg1_p1 = cg0 + zone_ncg1[zone];
2564
2565         for (cg = cg0; cg < cg1; cg++)
2566         {
2567             zone1 = zone;
2568             if (cg >= cg1_p1)
2569             {
2570                 /* Signal that this cg is from more than one pulse away */
2571                 zone1 += nzone;
2572             }
2573             cg_gl = index_gl[cg];
2574             if (bCGs)
2575             {
2576                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2577                 {
2578                     gatindex[a] = a_gl;
2579                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2580                     a++;
2581                 }
2582             }
2583             else
2584             {
2585                 gatindex[a] = cg_gl;
2586                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2587                 a++;
2588             }
2589         }
2590     }
2591 }
2592
2593 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2594                           const char *where)
2595 {
2596     int i, ngl, nerr;
2597
2598     nerr = 0;
2599     if (bLocalCG == NULL)
2600     {
2601         return nerr;
2602     }
2603     for (i = 0; i < dd->ncg_tot; i++)
2604     {
2605         if (!bLocalCG[dd->index_gl[i]])
2606         {
2607             fprintf(stderr,
2608                     "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2609             nerr++;
2610         }
2611     }
2612     ngl = 0;
2613     for (i = 0; i < ncg_sys; i++)
2614     {
2615         if (bLocalCG[i])
2616         {
2617             ngl++;
2618         }
2619     }
2620     if (ngl != dd->ncg_tot)
2621     {
2622         fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2623         nerr++;
2624     }
2625
2626     return nerr;
2627 }
2628
2629 static void check_index_consistency(gmx_domdec_t *dd,
2630                                     int natoms_sys, int ncg_sys,
2631                                     const char *where)
2632 {
2633     int   nerr, ngl, i, a, cell;
2634     int  *have;
2635
2636     nerr = 0;
2637
2638     if (dd->comm->DD_debug > 1)
2639     {
2640         snew(have, natoms_sys);
2641         for (a = 0; a < dd->nat_tot; a++)
2642         {
2643             if (have[dd->gatindex[a]] > 0)
2644             {
2645                 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2646             }
2647             else
2648             {
2649                 have[dd->gatindex[a]] = a + 1;
2650             }
2651         }
2652         sfree(have);
2653     }
2654
2655     snew(have, dd->nat_tot);
2656
2657     ngl  = 0;
2658     for (i = 0; i < natoms_sys; i++)
2659     {
2660         if (ga2la_get(dd->ga2la, i, &a, &cell))
2661         {
2662             if (a >= dd->nat_tot)
2663             {
2664                 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2665                 nerr++;
2666             }
2667             else
2668             {
2669                 have[a] = 1;
2670                 if (dd->gatindex[a] != i)
2671                 {
2672                     fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2673                     nerr++;
2674                 }
2675             }
2676             ngl++;
2677         }
2678     }
2679     if (ngl != dd->nat_tot)
2680     {
2681         fprintf(stderr,
2682                 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2683                 dd->rank, where, ngl, dd->nat_tot);
2684     }
2685     for (a = 0; a < dd->nat_tot; a++)
2686     {
2687         if (have[a] == 0)
2688         {
2689             fprintf(stderr,
2690                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
2691                     dd->rank, where, a+1, dd->gatindex[a]+1);
2692         }
2693     }
2694     sfree(have);
2695
2696     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2697
2698     if (nerr > 0)
2699     {
2700         gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2701                   dd->rank, where, nerr);
2702     }
2703 }
2704
2705 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2706 {
2707     int   i;
2708     char *bLocalCG;
2709
2710     if (a_start == 0)
2711     {
2712         /* Clear the whole list without searching */
2713         ga2la_clear(dd->ga2la);
2714     }
2715     else
2716     {
2717         for (i = a_start; i < dd->nat_tot; i++)
2718         {
2719             ga2la_del(dd->ga2la, dd->gatindex[i]);
2720         }
2721     }
2722
2723     bLocalCG = dd->comm->bLocalCG;
2724     if (bLocalCG)
2725     {
2726         for (i = cg_start; i < dd->ncg_tot; i++)
2727         {
2728             bLocalCG[dd->index_gl[i]] = FALSE;
2729         }
2730     }
2731
2732     dd_clear_local_vsite_indices(dd);
2733
2734     if (dd->constraints)
2735     {
2736         dd_clear_local_constraint_indices(dd);
2737     }
2738 }
2739
2740 /* This function should be used for moving the domain boudaries during DLB,
2741  * for obtaining the minimum cell size. It checks the initially set limit
2742  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2743  * and, possibly, a longer cut-off limit set for PME load balancing.
2744  */
2745 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2746 {
2747     real cellsize_min;
2748
2749     cellsize_min = comm->cellsize_min[dim];
2750
2751     if (!comm->bVacDLBNoLimit)
2752     {
2753         /* The cut-off might have changed, e.g. by PME load balacning,
2754          * from the value used to set comm->cellsize_min, so check it.
2755          */
2756         cellsize_min = std::max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2757
2758         if (comm->bPMELoadBalDLBLimits)
2759         {
2760             /* Check for the cut-off limit set by the PME load balancing */
2761             cellsize_min = std::max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2762         }
2763     }
2764
2765     return cellsize_min;
2766 }
2767
2768 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2769                             int dim_ind)
2770 {
2771     real grid_jump_limit;
2772
2773     /* The distance between the boundaries of cells at distance
2774      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2775      * and by the fact that cells should not be shifted by more than
2776      * half their size, such that cg's only shift by one cell
2777      * at redecomposition.
2778      */
2779     grid_jump_limit = comm->cellsize_limit;
2780     if (!comm->bVacDLBNoLimit)
2781     {
2782         if (comm->bPMELoadBalDLBLimits)
2783         {
2784             cutoff = std::max(cutoff, comm->PMELoadBal_max_cutoff);
2785         }
2786         grid_jump_limit = std::max(grid_jump_limit,
2787                                    cutoff/comm->cd[dim_ind].np);
2788     }
2789
2790     return grid_jump_limit;
2791 }
2792
2793 static gmx_bool check_grid_jump(gmx_int64_t     step,
2794                                 gmx_domdec_t   *dd,
2795                                 real            cutoff,
2796                                 gmx_ddbox_t    *ddbox,
2797                                 gmx_bool        bFatal)
2798 {
2799     gmx_domdec_comm_t *comm;
2800     int                d, dim;
2801     real               limit, bfac;
2802     gmx_bool           bInvalid;
2803
2804     bInvalid = FALSE;
2805
2806     comm = dd->comm;
2807
2808     for (d = 1; d < dd->ndim; d++)
2809     {
2810         dim   = dd->dim[d];
2811         limit = grid_jump_limit(comm, cutoff, d);
2812         bfac  = ddbox->box_size[dim];
2813         if (ddbox->tric_dir[dim])
2814         {
2815             bfac *= ddbox->skew_fac[dim];
2816         }
2817         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2818                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2819         {
2820             bInvalid = TRUE;
2821
2822             if (bFatal)
2823             {
2824                 char buf[22];
2825
2826                 /* This error should never be triggered under normal
2827                  * circumstances, but you never know ...
2828                  */
2829                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2830                           gmx_step_str(step, buf),
2831                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2832             }
2833         }
2834     }
2835
2836     return bInvalid;
2837 }
2838
2839 static int dd_load_count(gmx_domdec_comm_t *comm)
2840 {
2841     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2842 }
2843
2844 static float dd_force_load(gmx_domdec_comm_t *comm)
2845 {
2846     float load;
2847
2848     if (comm->eFlop)
2849     {
2850         load = comm->flop;
2851         if (comm->eFlop > 1)
2852         {
2853             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2854         }
2855     }
2856     else
2857     {
2858         load = comm->cycl[ddCyclF];
2859         if (comm->cycl_n[ddCyclF] > 1)
2860         {
2861             /* Subtract the maximum of the last n cycle counts
2862              * to get rid of possible high counts due to other sources,
2863              * for instance system activity, that would otherwise
2864              * affect the dynamic load balancing.
2865              */
2866             load -= comm->cycl_max[ddCyclF];
2867         }
2868
2869 #ifdef GMX_MPI
2870         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2871         {
2872             float gpu_wait, gpu_wait_sum;
2873
2874             gpu_wait = comm->cycl[ddCyclWaitGPU];
2875             if (comm->cycl_n[ddCyclF] > 1)
2876             {
2877                 /* We should remove the WaitGPU time of the same MD step
2878                  * as the one with the maximum F time, since the F time
2879                  * and the wait time are not independent.
2880                  * Furthermore, the step for the max F time should be chosen
2881                  * the same on all ranks that share the same GPU.
2882                  * But to keep the code simple, we remove the average instead.
2883                  * The main reason for artificially long times at some steps
2884                  * is spurious CPU activity or MPI time, so we don't expect
2885                  * that changes in the GPU wait time matter a lot here.
2886                  */
2887                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2888             }
2889             /* Sum the wait times over the ranks that share the same GPU */
2890             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2891                           comm->mpi_comm_gpu_shared);
2892             /* Replace the wait time by the average over the ranks */
2893             load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2894         }
2895 #endif
2896     }
2897
2898     return load;
2899 }
2900
2901 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2902 {
2903     gmx_domdec_comm_t *comm;
2904     int                i;
2905
2906     comm = dd->comm;
2907
2908     snew(*dim_f, dd->nc[dim]+1);
2909     (*dim_f)[0] = 0;
2910     for (i = 1; i < dd->nc[dim]; i++)
2911     {
2912         if (comm->slb_frac[dim])
2913         {
2914             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2915         }
2916         else
2917         {
2918             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2919         }
2920     }
2921     (*dim_f)[dd->nc[dim]] = 1;
2922 }
2923
2924 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2925 {
2926     int  pmeindex, slab, nso, i;
2927     ivec xyz;
2928
2929     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2930     {
2931         ddpme->dim = YY;
2932     }
2933     else
2934     {
2935         ddpme->dim = dimind;
2936     }
2937     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2938
2939     ddpme->nslab = (ddpme->dim == 0 ?
2940                     dd->comm->npmenodes_x :
2941                     dd->comm->npmenodes_y);
2942
2943     if (ddpme->nslab <= 1)
2944     {
2945         return;
2946     }
2947
2948     nso = dd->comm->npmenodes/ddpme->nslab;
2949     /* Determine for each PME slab the PP location range for dimension dim */
2950     snew(ddpme->pp_min, ddpme->nslab);
2951     snew(ddpme->pp_max, ddpme->nslab);
2952     for (slab = 0; slab < ddpme->nslab; slab++)
2953     {
2954         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2955         ddpme->pp_max[slab] = 0;
2956     }
2957     for (i = 0; i < dd->nnodes; i++)
2958     {
2959         ddindex2xyz(dd->nc, i, xyz);
2960         /* For y only use our y/z slab.
2961          * This assumes that the PME x grid size matches the DD grid size.
2962          */
2963         if (dimind == 0 || xyz[XX] == dd->ci[XX])
2964         {
2965             pmeindex = ddindex2pmeindex(dd, i);
2966             if (dimind == 0)
2967             {
2968                 slab = pmeindex/nso;
2969             }
2970             else
2971             {
2972                 slab = pmeindex % ddpme->nslab;
2973             }
2974             ddpme->pp_min[slab] = std::min(ddpme->pp_min[slab], xyz[dimind]);
2975             ddpme->pp_max[slab] = std::max(ddpme->pp_max[slab], xyz[dimind]);
2976         }
2977     }
2978
2979     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2980 }
2981
2982 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2983 {
2984     if (dd->comm->ddpme[0].dim == XX)
2985     {
2986         return dd->comm->ddpme[0].maxshift;
2987     }
2988     else
2989     {
2990         return 0;
2991     }
2992 }
2993
2994 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2995 {
2996     if (dd->comm->ddpme[0].dim == YY)
2997     {
2998         return dd->comm->ddpme[0].maxshift;
2999     }
3000     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
3001     {
3002         return dd->comm->ddpme[1].maxshift;
3003     }
3004     else
3005     {
3006         return 0;
3007     }
3008 }
3009
3010 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
3011                              gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
3012 {
3013     gmx_domdec_comm_t *comm;
3014     int                nc, ns, s;
3015     int               *xmin, *xmax;
3016     real               range, pme_boundary;
3017     int                sh;
3018
3019     comm = dd->comm;
3020     nc   = dd->nc[ddpme->dim];
3021     ns   = ddpme->nslab;
3022
3023     if (!ddpme->dim_match)
3024     {
3025         /* PP decomposition is not along dim: the worst situation */
3026         sh = ns/2;
3027     }
3028     else if (ns <= 3 || (bUniform && ns == nc))
3029     {
3030         /* The optimal situation */
3031         sh = 1;
3032     }
3033     else
3034     {
3035         /* We need to check for all pme nodes which nodes they
3036          * could possibly need to communicate with.
3037          */
3038         xmin = ddpme->pp_min;
3039         xmax = ddpme->pp_max;
3040         /* Allow for atoms to be maximally 2/3 times the cut-off
3041          * out of their DD cell. This is a reasonable balance between
3042          * between performance and support for most charge-group/cut-off
3043          * combinations.
3044          */
3045         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
3046         /* Avoid extra communication when we are exactly at a boundary */
3047         range *= 0.999;
3048
3049         sh = 1;
3050         for (s = 0; s < ns; s++)
3051         {
3052             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
3053             pme_boundary = (real)s/ns;
3054             while (sh+1 < ns &&
3055                    ((s-(sh+1) >= 0 &&
3056                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
3057                     (s-(sh+1) <  0 &&
3058                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
3059             {
3060                 sh++;
3061             }
3062             pme_boundary = (real)(s+1)/ns;
3063             while (sh+1 < ns &&
3064                    ((s+(sh+1) <  ns &&
3065                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
3066                     (s+(sh+1) >= ns &&
3067                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
3068             {
3069                 sh++;
3070             }
3071         }
3072     }
3073
3074     ddpme->maxshift = sh;
3075
3076     if (debug)
3077     {
3078         fprintf(debug, "PME slab communication range for dim %d is %d\n",
3079                 ddpme->dim, ddpme->maxshift);
3080     }
3081 }
3082
3083 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3084 {
3085     int d, dim;
3086
3087     for (d = 0; d < dd->ndim; d++)
3088     {
3089         dim = dd->dim[d];
3090         if (dim < ddbox->nboundeddim &&
3091             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3092             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3093         {
3094             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3095                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3096                       dd->nc[dim], dd->comm->cellsize_limit);
3097         }
3098     }
3099 }
3100
/* The modes for storing cell boundaries in set_dd_cell_sizes_slb() */
enum {
    /* Store the boundaries of the local cell in comm->cell_x0/cell_x1 */
    setcellsizeslbLOCAL,
    /* Store all cell boundaries in dd->ma->cell_x */
    setcellsizeslbMASTER,
    /* Do not store any cell boundaries */
    setcellsizeslbPULSE_ONLY
};
3104
3105 /* Set the domain boundaries. Use for static (or no) load balancing,
3106  * and also for the starting state for dynamic load balancing.
3107  * setmode determine if and where the boundaries are stored, use enum above.
3108  * Returns the number communication pulses in npulse.
3109  */
3110 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
3111                                   int setmode, ivec npulse)
3112 {
3113     gmx_domdec_comm_t *comm;
3114     int                d, j;
3115     rvec               cellsize_min;
3116     real              *cell_x, cell_dx, cellsize;
3117
3118     comm = dd->comm;
3119
3120     for (d = 0; d < DIM; d++)
3121     {
3122         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3123         npulse[d]       = 1;
3124         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3125         {
3126             /* Uniform grid */
3127             cell_dx = ddbox->box_size[d]/dd->nc[d];
3128             switch (setmode)
3129             {
3130                 case setcellsizeslbMASTER:
3131                     for (j = 0; j < dd->nc[d]+1; j++)
3132                     {
3133                         dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3134                     }
3135                     break;
3136                 case setcellsizeslbLOCAL:
3137                     comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3138                     comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3139                     break;
3140                 default:
3141                     break;
3142             }
3143             cellsize = cell_dx*ddbox->skew_fac[d];
3144             while (cellsize*npulse[d] < comm->cutoff)
3145             {
3146                 npulse[d]++;
3147             }
3148             cellsize_min[d] = cellsize;
3149         }
3150         else
3151         {
3152             /* Statically load balanced grid */
3153             /* Also when we are not doing a master distribution we determine
3154              * all cell borders in a loop to obtain identical values
3155              * to the master distribution case and to determine npulse.
3156              */
3157             if (setmode == setcellsizeslbMASTER)
3158             {
3159                 cell_x = dd->ma->cell_x[d];
3160             }
3161             else
3162             {
3163                 snew(cell_x, dd->nc[d]+1);
3164             }
3165             cell_x[0] = ddbox->box0[d];
3166             for (j = 0; j < dd->nc[d]; j++)
3167             {
3168                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
3169                 cell_x[j+1] = cell_x[j] + cell_dx;
3170                 cellsize    = cell_dx*ddbox->skew_fac[d];
3171                 while (cellsize*npulse[d] < comm->cutoff &&
3172                        npulse[d] < dd->nc[d]-1)
3173                 {
3174                     npulse[d]++;
3175                 }
3176                 cellsize_min[d] = std::min(cellsize_min[d], cellsize);
3177             }
3178             if (setmode == setcellsizeslbLOCAL)
3179             {
3180                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3181                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3182             }
3183             if (setmode != setcellsizeslbMASTER)
3184             {
3185                 sfree(cell_x);
3186             }
3187         }
3188         /* The following limitation is to avoid that a cell would receive
3189          * some of its own home charge groups back over the periodic boundary.
3190          * Double charge groups cause trouble with the global indices.
3191          */
3192         if (d < ddbox->npbcdim &&
3193             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3194         {
3195             char error_string[STRLEN];
3196
3197             sprintf(error_string,
3198                     "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3199                     dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
3200                     comm->cutoff,
3201                     dd->nc[d], dd->nc[d],
3202                     dd->nnodes > dd->nc[d] ? "cells" : "ranks");
3203
3204             if (setmode == setcellsizeslbLOCAL)
3205             {
3206                 gmx_fatal_collective(FARGS, NULL, dd, error_string);
3207             }
3208             else
3209             {
3210                 gmx_fatal(FARGS, error_string);
3211             }
3212         }
3213     }
3214
3215     if (!comm->bDynLoadBal)
3216     {
3217         copy_rvec(cellsize_min, comm->cellsize_min);
3218     }
3219
3220     for (d = 0; d < comm->npmedecompdim; d++)
3221     {
3222         set_pme_maxshift(dd, &comm->ddpme[d],
3223                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
3224                          comm->ddpme[d].slb_dim_f);
3225     }
3226 }
3227
3228
/* Enforces the cell size and staggering limits on the relative cell
 * boundaries root->cell_f for the cells range[0] up to, but not
 * including, range[1] along dimension dim with decomposition index d.
 *
 * On entry root->buf_ncd holds the requested cell sizes for the range
 * and root->cell_f[range[0]] and root->cell_f[range[1]] the boundaries
 * of the region to divide. The sizes are rescaled to fill the region,
 * where cells that would get smaller than cellsize_limit_f are set to
 * that limit. With !bUniform, boundaries are kept from moving more than
 * halfway into the old neighboring cells (root->old_cell_f).
 * For d > 0, either the staggering extremes are set (bUniform) or
 * the limits root->bound_min/bound_max are enforced by fixing violating
 * boundaries and calling this function recursively on sub-ranges.
 * Sets root->bLimited. step is only used for the fatal error message.
 */
static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
                                                  int d, int dim, gmx_domdec_root_t *root,
                                                  gmx_ddbox_t *ddbox,
                                                  gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
{
    gmx_domdec_comm_t *comm;
    int                ncd, i, j, nmin, nmin_old;
    gmx_bool           bLimLo, bLimHi;
    real              *cell_size;
    real               fac, halfway, cellsize_limit_f_i, region_size;
    gmx_bool           bPBC, bLastHi = FALSE;
    /* The sub-range for recursive calls, starts out as the full range */
    int                nrange[] = {range[0], range[1]};

    /* The relative size of the region that the cells in range should fill */
    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];

    comm = dd->comm;

    ncd = dd->nc[dim];

    bPBC = (dim < ddbox->npbcdim);

    cell_size = root->buf_ncd;

    if (debug)
    {
        fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
    }

    /* First we need to check if the scaling does not make cells
     * smaller than the smallest allowed size.
     * We need to do this iteratively, since if a cell is too small,
     * it needs to be enlarged, which makes all the other cells smaller,
     * which could in turn make another cell smaller than allowed.
     */
    for (i = range[0]; i < range[1]; i++)
    {
        root->bCellMin[i] = FALSE;
    }
    /* The number of cells that have been set to the minimum size */
    nmin = 0;
    do
    {
        nmin_old = nmin;
        /* We need the total for normalization */
        fac = 0;
        for (i = range[0]; i < range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                fac += cell_size[i];
            }
        }
        fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
        /* Determine the cell boundaries */
        for (i = range[0]; i < range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                cell_size[i] *= fac;
                /* Without PBC no size limit is applied to
                 * the first and last cell along this dimension.
                 */
                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
                {
                    cellsize_limit_f_i = 0;
                }
                else
                {
                    cellsize_limit_f_i = cellsize_limit_f;
                }
                if (cell_size[i] < cellsize_limit_f_i)
                {
                    root->bCellMin[i] = TRUE;
                    cell_size[i]      = cellsize_limit_f_i;
                    nmin++;
                }
            }
            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
        }
    }
    while (nmin > nmin_old);

    /* Recompute the size of the last cell in the range from the boundaries */
    i            = range[1]-1;
    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
    /* For this check we should not use DD_CELL_MARGIN,
     * but a slightly smaller factor,
     * since rounding could get us below the limit.
     */
    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
    {
        char buf[22];
        gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
                  gmx_step_str(step, buf),
                  dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
                  ncd, comm->cellsize_min[dim]);
    }

    /* The balancing is limited when a cell was set to the minimum size
     * or when we are working on a sub-range of the cells.
     */
    root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);

    if (!bUniform)
    {
        /* Check if the boundary did not displace more than halfway
         * each of the cells it bounds, as this could cause problems,
         * especially when the differences between cell sizes are large.
         * If changes are applied, they will not make cells smaller
         * than the cut-off, as we check all the boundaries which
         * might be affected by a change and if the old state was ok,
         * the cells will at most be shrunk back to their old size.
         */
        for (i = range[0]+1; i < range[1]; i++)
        {
            /* The middle of the old cell below boundary i */
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
            if (root->cell_f[i] < halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the next boundaries */
                for (j = i+1; j < range[1]; j++)
                {
                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
                    {
                        root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
                    }
                }
            }
            /* The middle of the old cell above boundary i */
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
            if (root->cell_f[i] > halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the previous boundaries */
                for (j = i-1; j >= range[0]+1; j--)
                {
                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
                    {
                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
                    }
                }
            }
        }
    }

    /* nrange is defined as [lower, upper) range for new call to enforce_limits */
    /* find highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b)
     * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
     * for a and b nrange is used */
    if (d > 0)
    {
        /* Take care of the staggering of the cell boundaries */
        if (bUniform)
        {
            for (i = range[0]; i < range[1]; i++)
            {
                root->cell_f_max0[i] = root->cell_f[i];
                root->cell_f_min1[i] = root->cell_f[i+1];
            }
        }
        else
        {
            for (i = range[0]+1; i < range[1]; i++)
            {
                bLimLo = (root->cell_f[i] < root->bound_min[i]);
                bLimHi = (root->cell_f[i] > root->bound_max[i]);
                if (bLimLo && bLimHi)
                {
                    /* Both limits violated, try the best we can */
                    /* For this case we split the original range (range) in two parts and care about the other limitations in the next iteration. */
                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
                    nrange[0]       = range[0];
                    nrange[1]       = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    nrange[0] = i;
                    nrange[1] = range[1];
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    /* The recursive calls have handled the whole range */
                    return;
                }
                else if (bLimLo)
                {
                    /* root->cell_f[i] = root->bound_min[i]; */
                    nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with an higher index */
                    bLastHi   = FALSE;
                }
                else if (bLimHi && !bLastHi)
                {
                    bLastHi = TRUE;
                    if (nrange[1] < range[1])   /* found a LimLo before */
                    {
                        /* Fix the stored LimLo boundary and redo the part below it */
                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                        nrange[0] = nrange[1];
                    }
                    /* Fix this LimHi boundary and redo the part below it */
                    root->cell_f[i] = root->bound_max[i];
                    nrange[1]       = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                    /* Continue with the remainder of the range */
                    nrange[0] = i;
                    nrange[1] = range[1];
                }
            }
            if (nrange[1] < range[1])   /* found last a LimLo */
            {
                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                nrange[0] = nrange[1];
                nrange[1] = range[1];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
            else if (nrange[0] > range[0]) /* found at least one LimHi */
            {
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
        }
    }
}
3438
3439
/* Determines new relative cell boundaries root->cell_f for the row of
 * cells along dimension dim, with decomposition index d, for dynamic
 * load balancing.
 *
 * With bUniform all cells get the same size. Otherwise, when load
 * measurements are available, the size of each cell is scaled according
 * to its load relative to the average, using underrelaxation and
 * a maximum change of comm->dlb_scale_lim percent. The cell size and,
 * for d > 0 with !bUniform, the staggering limits are then enforced by
 * dd_cell_sizes_dlb_root_enforce_limits().
 * After the dd->nc[dim]+1 boundaries, root->cell_f is extended with
 * the cell boundaries of the lower dimensions and the PME maximum shifts.
 * step is only used for messages.
 */
static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
                                       int d, int dim, gmx_domdec_root_t *root,
                                       gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
                                       gmx_bool bUniform, gmx_int64_t step)
{
    gmx_domdec_comm_t *comm;
    int                ncd, d1, i, pos;
    real              *cell_size;
    real               load_aver, load_i, imbalance, change, change_max, sc;
    real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
    real               change_limit;
    /* The underrelaxation factor for the cell size changes */
    real               relax = 0.5;
    gmx_bool           bPBC;
    int                range[] = { 0, 0 };

    comm = dd->comm;

    /* Convert the maximum change from the input percentage to a fraction */
    change_limit = comm->dlb_scale_lim*0.01;

    ncd = dd->nc[dim];

    bPBC = (dim < ddbox->npbcdim);

    cell_size = root->buf_ncd;

    /* Store the original boundaries */
    for (i = 0; i < ncd+1; i++)
    {
        root->old_cell_f[i] = root->cell_f[i];
    }
    if (bUniform)
    {
        for (i = 0; i < ncd; i++)
        {
            cell_size[i] = 1.0/ncd;
        }
    }
    else if (dd_load_count(comm) > 0)
    {
        load_aver  = comm->load[d].sum_m/ncd;
        change_max = 0;
        for (i = 0; i < ncd; i++)
        {
            /* Determine the relative imbalance of cell i */
            /* NOTE(review): the meaning of entry +2 in the load record
             * is not visible here - confirm against the load collection.
             */
            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
            /* Determine the change of the cell size using underrelaxation */
            change     = -relax*imbalance;
            /* Keep track of the largest absolute change */
            change_max = std::max(change_max, std::max(change, -change));
        }
        /* Limit the amount of scaling.
         * We need to use the same rescaling for all cells in one row,
         * otherwise the load balancing might not converge.
         */
        sc = relax;
        if (change_max > change_limit)
        {
            sc *= change_limit/change_max;
        }
        for (i = 0; i < ncd; i++)
        {
            /* Determine the relative imbalance of cell i */
            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
            /* Determine the change of the cell size using underrelaxation */
            change       = -sc*imbalance;
            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
        }
    }

    /* Convert the cell size and staggering distance limits to fractions
     * of the box size, including safety margins.
     */
    cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
    cellsize_limit_f *= DD_CELL_MARGIN;
    dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
    if (ddbox->tric_dir[dim])
    {
        cellsize_limit_f /= ddbox->skew_fac[dim];
        dist_min_f       /= ddbox->skew_fac[dim];
    }
    if (bDynamicBox && d > 0)
    {
        dist_min_f *= DD_PRES_SCALE_MARGIN;
    }
    if (d > 0 && !bUniform)
    {
        /* Make sure that the grid is not shifted too much */
        for (i = 1; i < ncd; i++)
        {
            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
            {
                gmx_incons("Inconsistent DD boundary staggering limits!");
            }
            /* The lower bound for boundary i: the minimum distance from
             * cell_f_max0, plus half of the space the boundary currently
             * has beyond that, when there is any.
             */
            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
            space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
            if (space > 0)
            {
                root->bound_min[i] += 0.5*space;
            }
            /* The upper bound for boundary i, analogous to the lower bound */
            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
            space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
            if (space < 0)
            {
                root->bound_max[i] += 0.5*space;
            }
            if (debug)
            {
                fprintf(debug,
                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
                        d, i,
                        root->cell_f_max0[i-1] + dist_min_f,
                        root->bound_min[i], root->cell_f[i], root->bound_max[i],
                        root->cell_f_min1[i] - dist_min_f);
            }
        }
    }
    /* Enforce the limits on the complete row of cells: [0, ncd) */
    range[1]          = ncd;
    root->cell_f[0]   = 0;
    root->cell_f[ncd] = 1;
    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);


    /* After the checks above, the cells should obey the cut-off
     * restrictions, but it does not hurt to check.
     */
    for (i = 0; i < ncd; i++)
    {
        if (debug)
        {
            fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
                    dim, i, root->cell_f[i], root->cell_f[i+1]);
        }

        /* Without PBC the first and last cell are not checked */
        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
            root->cell_f[i+1] - root->cell_f[i] <
            cellsize_limit_f/DD_CELL_MARGIN)
        {
            char buf[22];
            fprintf(stderr,
                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
                    gmx_step_str(step, buf), dim2char(dim), i,
                    (root->cell_f[i+1] - root->cell_f[i])
                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
        }
    }

    /* The extra data is stored after the ncd+1 cell boundaries */
    pos = ncd + 1;
    /* Store the cell boundaries of the lower dimensions at the end */
    for (d1 = 0; d1 < d; d1++)
    {
        root->cell_f[pos++] = comm->cell_f0[d1];
        root->cell_f[pos++] = comm->cell_f1[d1];
    }

    if (d < comm->npmedecompdim)
    {
        /* The master determines the maximum shift for
         * the coordinate communication between separate PME nodes.
         */
        set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
    }
    /* Append the PME maximum shifts after the lower dimension boundaries */
    root->cell_f[pos++] = comm->ddpme[0].maxshift;
    if (d >= 1)
    {
        root->cell_f[pos++] = comm->ddpme[1].maxshift;
    }
}
3607
3608 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3609                                              gmx_ddbox_t *ddbox, int dimind)
3610 {
3611     gmx_domdec_comm_t *comm;
3612     int                dim;
3613
3614     comm = dd->comm;
3615
3616     /* Set the cell dimensions */
3617     dim                = dd->dim[dimind];
3618     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3619     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3620     if (dim >= ddbox->nboundeddim)
3621     {
3622         comm->cell_x0[dim] += ddbox->box0[dim];
3623         comm->cell_x1[dim] += ddbox->box0[dim];
3624     }
3625 }
3626
3627 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3628                                          int d, int dim, real *cell_f_row,
3629                                          gmx_ddbox_t *ddbox)
3630 {
3631     gmx_domdec_comm_t *comm;
3632     int                d1, pos;
3633
3634     comm = dd->comm;
3635
3636 #ifdef GMX_MPI
3637     /* Each node would only need to know two fractions,
3638      * but it is probably cheaper to broadcast the whole array.
3639      */
3640     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3641               0, comm->mpi_comm_load[d]);
3642 #endif
3643     /* Copy the fractions for this dimension from the buffer */
3644     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3645     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3646     /* The whole array was communicated, so set the buffer position */
3647     pos = dd->nc[dim] + 1;
3648     for (d1 = 0; d1 <= d; d1++)
3649     {
3650         if (d1 < d)
3651         {
3652             /* Copy the cell fractions of the lower dimensions */
3653             comm->cell_f0[d1] = cell_f_row[pos++];
3654             comm->cell_f1[d1] = cell_f_row[pos++];
3655         }
3656         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3657     }
3658     /* Convert the communicated shift from float to int */
3659     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3660     if (d >= 1)
3661     {
3662         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3663     }
3664 }
3665
3666 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3667                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3668                                          gmx_bool bUniform, gmx_int64_t step)
3669 {
3670     gmx_domdec_comm_t *comm;
3671     int                d, dim, d1;
3672     gmx_bool           bRowMember, bRowRoot;
3673     real              *cell_f_row;
3674
3675     comm = dd->comm;
3676
3677     for (d = 0; d < dd->ndim; d++)
3678     {
3679         dim        = dd->dim[d];
3680         bRowMember = TRUE;
3681         bRowRoot   = TRUE;
3682         for (d1 = d; d1 < dd->ndim; d1++)
3683         {
3684             if (dd->ci[dd->dim[d1]] > 0)
3685             {
3686                 if (d1 != d)
3687                 {
3688                     bRowMember = FALSE;
3689                 }
3690                 bRowRoot = FALSE;
3691             }
3692         }
3693         if (bRowMember)
3694         {
3695             if (bRowRoot)
3696             {
3697                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3698                                            ddbox, bDynamicBox, bUniform, step);
3699                 cell_f_row = comm->root[d]->cell_f;
3700             }
3701             else
3702             {
3703                 cell_f_row = comm->cell_f_row;
3704             }
3705             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3706         }
3707     }
3708 }
3709
3710 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3711 {
3712     int d;
3713
3714     /* This function assumes the box is static and should therefore
3715      * not be called when the box has changed since the last
3716      * call to dd_partition_system.
3717      */
3718     for (d = 0; d < dd->ndim; d++)
3719     {
3720         relative_to_absolute_cell_bounds(dd, ddbox, d);
3721     }
3722 }
3723
3724
3725
3726 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3727                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3728                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3729                                   gmx_wallcycle_t wcycle)
3730 {
3731     gmx_domdec_comm_t *comm;
3732     int                dim;
3733
3734     comm = dd->comm;
3735
3736     if (bDoDLB)
3737     {
3738         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3739         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3740         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3741     }
3742     else if (bDynamicBox)
3743     {
3744         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3745     }
3746
3747     /* Set the dimensions for which no DD is used */
3748     for (dim = 0; dim < DIM; dim++)
3749     {
3750         if (dd->nc[dim] == 1)
3751         {
3752             comm->cell_x0[dim] = 0;
3753             comm->cell_x1[dim] = ddbox->box_size[dim];
3754             if (dim >= ddbox->nboundeddim)
3755             {
3756                 comm->cell_x0[dim] += ddbox->box0[dim];
3757                 comm->cell_x1[dim] += ddbox->box0[dim];
3758             }
3759         }
3760     }
3761 }
3762
3763 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3764 {
3765     int                    d, np, i;
3766     gmx_domdec_comm_dim_t *cd;
3767
3768     for (d = 0; d < dd->ndim; d++)
3769     {
3770         cd = &dd->comm->cd[d];
3771         np = npulse[dd->dim[d]];
3772         if (np > cd->np_nalloc)
3773         {
3774             if (debug)
3775             {
3776                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3777                         dim2char(dd->dim[d]), np);
3778             }
3779             if (DDMASTER(dd) && cd->np_nalloc > 0)
3780             {
3781                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3782             }
3783             srenew(cd->ind, np);
3784             for (i = cd->np_nalloc; i < np; i++)
3785             {
3786                 cd->ind[i].index  = NULL;
3787                 cd->ind[i].nalloc = 0;
3788             }
3789             cd->np_nalloc = np;
3790         }
3791         cd->np = np;
3792     }
3793 }
3794
3795
3796 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3797                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3798                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3799                               gmx_wallcycle_t wcycle)
3800 {
3801     gmx_domdec_comm_t *comm;
3802     int                d;
3803     ivec               npulse;
3804
3805     comm = dd->comm;
3806
3807     /* Copy the old cell boundaries for the cg displacement check */
3808     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3809     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3810
3811     if (comm->bDynLoadBal)
3812     {
3813         if (DDMASTER(dd))
3814         {
3815             check_box_size(dd, ddbox);
3816         }
3817         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3818     }
3819     else
3820     {
3821         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3822         realloc_comm_ind(dd, npulse);
3823     }
3824
3825     if (debug)
3826     {
3827         for (d = 0; d < DIM; d++)
3828         {
3829             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3830                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3831         }
3832     }
3833 }
3834
3835 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3836                                   gmx_ddbox_t *ddbox,
3837                                   rvec cell_ns_x0, rvec cell_ns_x1,
3838                                   gmx_int64_t step)
3839 {
3840     gmx_domdec_comm_t *comm;
3841     int                dim_ind, dim;
3842
3843     comm = dd->comm;
3844
3845     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3846     {
3847         dim = dd->dim[dim_ind];
3848
3849         /* Without PBC we don't have restrictions on the outer cells */
3850         if (!(dim >= ddbox->npbcdim &&
3851               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3852             comm->bDynLoadBal &&
3853             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3854             comm->cellsize_min[dim])
3855         {
3856             char buf[22];
3857             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3858                       gmx_step_str(step, buf), dim2char(dim),
3859                       comm->cell_x1[dim] - comm->cell_x0[dim],
3860                       ddbox->skew_fac[dim],
3861                       dd->comm->cellsize_min[dim],
3862                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3863         }
3864     }
3865
3866     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3867     {
3868         /* Communicate the boundaries and update cell_ns_x0/1 */
3869         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3870         if (dd->bGridJump && dd->ndim > 1)
3871         {
3872             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3873         }
3874     }
3875 }
3876
3877 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3878 {
3879     if (YY < npbcdim)
3880     {
3881         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3882     }
3883     else
3884     {
3885         tcm[YY][XX] = 0;
3886     }
3887     if (ZZ < npbcdim)
3888     {
3889         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3890         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3891     }
3892     else
3893     {
3894         tcm[ZZ][XX] = 0;
3895         tcm[ZZ][YY] = 0;
3896     }
3897 }
3898
3899 static void check_screw_box(matrix box)
3900 {
3901     /* Mathematical limitation */
3902     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3903     {
3904         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3905     }
3906
3907     /* Limitation due to the asymmetry of the eighth shell method */
3908     if (box[ZZ][YY] != 0)
3909     {
3910         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3911     }
3912 }
3913
3914 static void distribute_cg(FILE *fplog, gmx_int64_t step,
3915                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3916                           gmx_domdec_t *dd)
3917 {
3918     gmx_domdec_master_t *ma;
3919     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3920     int                  i, icg, j, k, k0, k1, d;
3921     matrix               tcm;
3922     rvec                 cg_cm;
3923     ivec                 ind;
3924     real                 nrcg, inv_ncg, pos_d;
3925     atom_id             *cgindex;
3926     gmx_bool             bScrew;
3927
3928     ma = dd->ma;
3929
3930     if (tmp_ind == NULL)
3931     {
3932         snew(tmp_nalloc, dd->nnodes);
3933         snew(tmp_ind, dd->nnodes);
3934         for (i = 0; i < dd->nnodes; i++)
3935         {
3936             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3937             snew(tmp_ind[i], tmp_nalloc[i]);
3938         }
3939     }
3940
3941     /* Clear the count */
3942     for (i = 0; i < dd->nnodes; i++)
3943     {
3944         ma->ncg[i] = 0;
3945         ma->nat[i] = 0;
3946     }
3947
3948     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3949
3950     cgindex = cgs->index;
3951
3952     /* Compute the center of geometry for all charge groups */
3953     for (icg = 0; icg < cgs->nr; icg++)
3954     {
3955         k0      = cgindex[icg];
3956         k1      = cgindex[icg+1];
3957         nrcg    = k1 - k0;
3958         if (nrcg == 1)
3959         {
3960             copy_rvec(pos[k0], cg_cm);
3961         }
3962         else
3963         {
3964             inv_ncg = 1.0/nrcg;
3965
3966             clear_rvec(cg_cm);
3967             for (k = k0; (k < k1); k++)
3968             {
3969                 rvec_inc(cg_cm, pos[k]);
3970             }
3971             for (d = 0; (d < DIM); d++)
3972             {
3973                 cg_cm[d] *= inv_ncg;
3974             }
3975         }
3976         /* Put the charge group in the box and determine the cell index */
3977         for (d = DIM-1; d >= 0; d--)
3978         {
3979             pos_d = cg_cm[d];
3980             if (d < dd->npbcdim)
3981             {
3982                 bScrew = (dd->bScrewPBC && d == XX);
3983                 if (tric_dir[d] && dd->nc[d] > 1)
3984                 {
3985                     /* Use triclinic coordintates for this dimension */
3986                     for (j = d+1; j < DIM; j++)
3987                     {
3988                         pos_d += cg_cm[j]*tcm[j][d];
3989                     }
3990                 }
3991                 while (pos_d >= box[d][d])
3992                 {
3993                     pos_d -= box[d][d];
3994                     rvec_dec(cg_cm, box[d]);
3995                     if (bScrew)
3996                     {
3997                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3998                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3999                     }
4000                     for (k = k0; (k < k1); k++)
4001                     {
4002                         rvec_dec(pos[k], box[d]);
4003                         if (bScrew)
4004                         {
4005                             pos[k][YY] = box[YY][YY] - pos[k][YY];
4006                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
4007                         }
4008                     }
4009                 }
4010                 while (pos_d < 0)
4011                 {
4012                     pos_d += box[d][d];
4013                     rvec_inc(cg_cm, box[d]);
4014                     if (bScrew)
4015                     {
4016                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
4017                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
4018                     }
4019                     for (k = k0; (k < k1); k++)
4020                     {
4021                         rvec_inc(pos[k], box[d]);
4022                         if (bScrew)
4023                         {
4024                             pos[k][YY] = box[YY][YY] - pos[k][YY];
4025                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
4026                         }
4027                     }
4028                 }
4029             }
4030             /* This could be done more efficiently */
4031             ind[d] = 0;
4032             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
4033             {
4034                 ind[d]++;
4035             }
4036         }
4037         i = dd_index(dd->nc, ind);
4038         if (ma->ncg[i] == tmp_nalloc[i])
4039         {
4040             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
4041             srenew(tmp_ind[i], tmp_nalloc[i]);
4042         }
4043         tmp_ind[i][ma->ncg[i]] = icg;
4044         ma->ncg[i]++;
4045         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
4046     }
4047
4048     k1 = 0;
4049     for (i = 0; i < dd->nnodes; i++)
4050     {
4051         ma->index[i] = k1;
4052         for (k = 0; k < ma->ncg[i]; k++)
4053         {
4054             ma->cg[k1++] = tmp_ind[i][k];
4055         }
4056     }
4057     ma->index[dd->nnodes] = k1;
4058
4059     for (i = 0; i < dd->nnodes; i++)
4060     {
4061         sfree(tmp_ind[i]);
4062     }
4063     sfree(tmp_ind);
4064     sfree(tmp_nalloc);
4065
4066     if (fplog)
4067     {
4068         char buf[22];
4069         fprintf(fplog, "Charge group distribution at step %s:",
4070                 gmx_step_str(step, buf));
4071         for (i = 0; i < dd->nnodes; i++)
4072         {
4073             fprintf(fplog, " %d", ma->ncg[i]);
4074         }
4075         fprintf(fplog, "\n");
4076     }
4077 }
4078
4079 static void get_cg_distribution(FILE *fplog, gmx_int64_t step, gmx_domdec_t *dd,
4080                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
4081                                 rvec pos[])
4082 {
4083     gmx_domdec_master_t *ma = NULL;
4084     ivec                 npulse;
4085     int                  i, cg_gl;
4086     int                 *ibuf, buf2[2] = { 0, 0 };
4087     gmx_bool             bMaster = DDMASTER(dd);
4088
4089     if (bMaster)
4090     {
4091         ma = dd->ma;
4092
4093         if (dd->bScrewPBC)
4094         {
4095             check_screw_box(box);
4096         }
4097
4098         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
4099
4100         distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
4101         for (i = 0; i < dd->nnodes; i++)
4102         {
4103             ma->ibuf[2*i]   = ma->ncg[i];
4104             ma->ibuf[2*i+1] = ma->nat[i];
4105         }
4106         ibuf = ma->ibuf;
4107     }
4108     else
4109     {
4110         ibuf = NULL;
4111     }
4112     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
4113
4114     dd->ncg_home = buf2[0];
4115     dd->nat_home = buf2[1];
4116     dd->ncg_tot  = dd->ncg_home;
4117     dd->nat_tot  = dd->nat_home;
4118     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4119     {
4120         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4121         srenew(dd->index_gl, dd->cg_nalloc);
4122         srenew(dd->cgindex, dd->cg_nalloc+1);
4123     }
4124     if (bMaster)
4125     {
4126         for (i = 0; i < dd->nnodes; i++)
4127         {
4128             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
4129             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4130         }
4131     }
4132
4133     dd_scatterv(dd,
4134                 bMaster ? ma->ibuf : NULL,
4135                 bMaster ? ma->ibuf+dd->nnodes : NULL,
4136                 bMaster ? ma->cg : NULL,
4137                 dd->ncg_home*sizeof(int), dd->index_gl);
4138
4139     /* Determine the home charge group sizes */
4140     dd->cgindex[0] = 0;
4141     for (i = 0; i < dd->ncg_home; i++)
4142     {
4143         cg_gl            = dd->index_gl[i];
4144         dd->cgindex[i+1] =
4145             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4146     }
4147
4148     if (debug)
4149     {
4150         fprintf(debug, "Home charge groups:\n");
4151         for (i = 0; i < dd->ncg_home; i++)
4152         {
4153             fprintf(debug, " %d", dd->index_gl[i]);
4154             if (i % 10 == 9)
4155             {
4156                 fprintf(debug, "\n");
4157             }
4158         }
4159         fprintf(debug, "\n");
4160     }
4161 }
4162
4163 static int compact_and_copy_vec_at(int ncg, int *move,
4164                                    int *cgindex,
4165                                    int nvec, int vec,
4166                                    rvec *src, gmx_domdec_comm_t *comm,
4167                                    gmx_bool bCompact)
4168 {
4169     int m, icg, i, i0, i1, nrcg;
4170     int home_pos;
4171     int pos_vec[DIM*2];
4172
4173     home_pos = 0;
4174
4175     for (m = 0; m < DIM*2; m++)
4176     {
4177         pos_vec[m] = 0;
4178     }
4179
4180     i0 = 0;
4181     for (icg = 0; icg < ncg; icg++)
4182     {
4183         i1 = cgindex[icg+1];
4184         m  = move[icg];
4185         if (m == -1)
4186         {
4187             if (bCompact)
4188             {
4189                 /* Compact the home array in place */
4190                 for (i = i0; i < i1; i++)
4191                 {
4192                     copy_rvec(src[i], src[home_pos++]);
4193                 }
4194             }
4195         }
4196         else
4197         {
4198             /* Copy to the communication buffer */
4199             nrcg        = i1 - i0;
4200             pos_vec[m] += 1 + vec*nrcg;
4201             for (i = i0; i < i1; i++)
4202             {
4203                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
4204             }
4205             pos_vec[m] += (nvec - vec - 1)*nrcg;
4206         }
4207         if (!bCompact)
4208         {
4209             home_pos += i1 - i0;
4210         }
4211         i0 = i1;
4212     }
4213
4214     return home_pos;
4215 }
4216
4217 static int compact_and_copy_vec_cg(int ncg, int *move,
4218                                    int *cgindex,
4219                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
4220                                    gmx_bool bCompact)
4221 {
4222     int m, icg, i0, i1, nrcg;
4223     int home_pos;
4224     int pos_vec[DIM*2];
4225
4226     home_pos = 0;
4227
4228     for (m = 0; m < DIM*2; m++)
4229     {
4230         pos_vec[m] = 0;
4231     }
4232
4233     i0 = 0;
4234     for (icg = 0; icg < ncg; icg++)
4235     {
4236         i1 = cgindex[icg+1];
4237         m  = move[icg];
4238         if (m == -1)
4239         {
4240             if (bCompact)
4241             {
4242                 /* Compact the home array in place */
4243                 copy_rvec(src[icg], src[home_pos++]);
4244             }
4245         }
4246         else
4247         {
4248             nrcg = i1 - i0;
4249             /* Copy to the communication buffer */
4250             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
4251             pos_vec[m] += 1 + nrcg*nvec;
4252         }
4253         i0 = i1;
4254     }
4255     if (!bCompact)
4256     {
4257         home_pos = ncg;
4258     }
4259
4260     return home_pos;
4261 }
4262
4263 static int compact_ind(int ncg, int *move,
4264                        int *index_gl, int *cgindex,
4265                        int *gatindex,
4266                        gmx_ga2la_t ga2la, char *bLocalCG,
4267                        int *cginfo)
4268 {
4269     int cg, nat, a0, a1, a, a_gl;
4270     int home_pos;
4271
4272     home_pos = 0;
4273     nat      = 0;
4274     for (cg = 0; cg < ncg; cg++)
4275     {
4276         a0 = cgindex[cg];
4277         a1 = cgindex[cg+1];
4278         if (move[cg] == -1)
4279         {
4280             /* Compact the home arrays in place.
4281              * Anything that can be done here avoids access to global arrays.
4282              */
4283             cgindex[home_pos] = nat;
4284             for (a = a0; a < a1; a++)
4285             {
4286                 a_gl          = gatindex[a];
4287                 gatindex[nat] = a_gl;
4288                 /* The cell number stays 0, so we don't need to set it */
4289                 ga2la_change_la(ga2la, a_gl, nat);
4290                 nat++;
4291             }
4292             index_gl[home_pos] = index_gl[cg];
4293             cginfo[home_pos]   = cginfo[cg];
4294             /* The charge group remains local, so bLocalCG does not change */
4295             home_pos++;
4296         }
4297         else
4298         {
4299             /* Clear the global indices */
4300             for (a = a0; a < a1; a++)
4301             {
4302                 ga2la_del(ga2la, gatindex[a]);
4303             }
4304             if (bLocalCG)
4305             {
4306                 bLocalCG[index_gl[cg]] = FALSE;
4307             }
4308         }
4309     }
4310     cgindex[home_pos] = nat;
4311
4312     return home_pos;
4313 }
4314
4315 static void clear_and_mark_ind(int ncg, int *move,
4316                                int *index_gl, int *cgindex, int *gatindex,
4317                                gmx_ga2la_t ga2la, char *bLocalCG,
4318                                int *cell_index)
4319 {
4320     int cg, a0, a1, a;
4321
4322     for (cg = 0; cg < ncg; cg++)
4323     {
4324         if (move[cg] >= 0)
4325         {
4326             a0 = cgindex[cg];
4327             a1 = cgindex[cg+1];
4328             /* Clear the global indices */
4329             for (a = a0; a < a1; a++)
4330             {
4331                 ga2la_del(ga2la, gatindex[a]);
4332             }
4333             if (bLocalCG)
4334             {
4335                 bLocalCG[index_gl[cg]] = FALSE;
4336             }
4337             /* Signal that this cg has moved using the ns cell index.
4338              * Here we set it to -1. fill_grid will change it
4339              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4340              */
4341             cell_index[cg] = -1;
4342         }
4343     }
4344 }
4345
4346 static void print_cg_move(FILE *fplog,
4347                           gmx_domdec_t *dd,
4348                           gmx_int64_t step, int cg, int dim, int dir,
4349                           gmx_bool bHaveCgcmOld, real limitd,
4350                           rvec cm_old, rvec cm_new, real pos_d)
4351 {
4352     gmx_domdec_comm_t *comm;
4353     char               buf[22];
4354
4355     comm = dd->comm;
4356
4357     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4358     if (limitd > 0)
4359     {
4360         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4361                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4362                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4363     }
4364     else
4365     {
4366         /* We don't have a limiting distance available: don't print it */
4367         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4368                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4369                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4370     }
4371     fprintf(fplog, "distance out of cell %f\n",
4372             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4373     if (bHaveCgcmOld)
4374     {
4375         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4376                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4377     }
4378     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4379             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4380     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4381             dim2char(dim),
4382             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4383     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4384             dim2char(dim),
4385             comm->cell_x0[dim], comm->cell_x1[dim]);
4386 }
4387
4388 static void cg_move_error(FILE *fplog,
4389                           gmx_domdec_t *dd,
4390                           gmx_int64_t step, int cg, int dim, int dir,
4391                           gmx_bool bHaveCgcmOld, real limitd,
4392                           rvec cm_old, rvec cm_new, real pos_d)
4393 {
4394     if (fplog)
4395     {
4396         print_cg_move(fplog, dd, step, cg, dim, dir,
4397                       bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4398     }
4399     print_cg_move(stderr, dd, step, cg, dim, dir,
4400                   bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4401     gmx_fatal(FARGS,
4402               "%s moved too far between two domain decomposition steps\n"
4403               "This usually means that your system is not well equilibrated",
4404               dd->comm->bCGs ? "A charge group" : "An atom");
4405 }
4406
4407 static void rotate_state_atom(t_state *state, int a)
4408 {
4409     int est;
4410
4411     for (est = 0; est < estNR; est++)
4412     {
4413         if (EST_DISTR(est) && (state->flags & (1<<est)))
4414         {
4415             switch (est)
4416             {
4417                 case estX:
4418                     /* Rotate the complete state; for a rectangular box only */
4419                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4420                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4421                     break;
4422                 case estV:
4423                     state->v[a][YY] = -state->v[a][YY];
4424                     state->v[a][ZZ] = -state->v[a][ZZ];
4425                     break;
4426                 case estSDX:
4427                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4428                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4429                     break;
4430                 case estCGP:
4431                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4432                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4433                     break;
4434                 case estDISRE_INITF:
4435                 case estDISRE_RM3TAV:
4436                 case estORIRE_INITF:
4437                 case estORIRE_DTAV:
4438                     /* These are distances, so not affected by rotation */
4439                     break;
4440                 default:
4441                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4442             }
4443         }
4444     }
4445 }
4446
4447 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4448 {
4449     if (natoms > comm->moved_nalloc)
4450     {
4451         /* Contents should be preserved here */
4452         comm->moved_nalloc = over_alloc_dd(natoms);
4453         srenew(comm->moved, comm->moved_nalloc);
4454     }
4455
4456     return comm->moved;
4457 }
4458
4459 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4460                          gmx_domdec_t *dd,
4461                          t_state *state,
4462                          ivec tric_dir, matrix tcm,
4463                          rvec cell_x0, rvec cell_x1,
4464                          rvec limitd, rvec limit0, rvec limit1,
4465                          const int *cgindex,
4466                          int cg_start, int cg_end,
4467                          rvec *cg_cm,
4468                          int *move)
4469 {
4470     int      npbcdim;
4471     int      cg, k, k0, k1, d, dim, d2;
4472     int      mc, nrcg;
4473     int      flag;
4474     gmx_bool bScrew;
4475     ivec     dev;
4476     real     inv_ncg, pos_d;
4477     rvec     cm_new;
4478
4479     npbcdim = dd->npbcdim;
4480
4481     for (cg = cg_start; cg < cg_end; cg++)
4482     {
4483         k0   = cgindex[cg];
4484         k1   = cgindex[cg+1];
4485         nrcg = k1 - k0;
4486         if (nrcg == 1)
4487         {
4488             copy_rvec(state->x[k0], cm_new);
4489         }
4490         else
4491         {
4492             inv_ncg = 1.0/nrcg;
4493
4494             clear_rvec(cm_new);
4495             for (k = k0; (k < k1); k++)
4496             {
4497                 rvec_inc(cm_new, state->x[k]);
4498             }
4499             for (d = 0; (d < DIM); d++)
4500             {
4501                 cm_new[d] = inv_ncg*cm_new[d];
4502             }
4503         }
4504
4505         clear_ivec(dev);
4506         /* Do pbc and check DD cell boundary crossings */
4507         for (d = DIM-1; d >= 0; d--)
4508         {
4509             if (dd->nc[d] > 1)
4510             {
4511                 bScrew = (dd->bScrewPBC && d == XX);
4512                 /* Determine the location of this cg in lattice coordinates */
4513                 pos_d = cm_new[d];
4514                 if (tric_dir[d])
4515                 {
4516                     for (d2 = d+1; d2 < DIM; d2++)
4517                     {
4518                         pos_d += cm_new[d2]*tcm[d2][d];
4519                     }
4520                 }
4521                 /* Put the charge group in the triclinic unit-cell */
4522                 if (pos_d >= cell_x1[d])
4523                 {
4524                     if (pos_d >= limit1[d])
4525                     {
4526                         cg_move_error(fplog, dd, step, cg, d, 1,
4527                                       cg_cm != state->x, limitd[d],
4528                                       cg_cm[cg], cm_new, pos_d);
4529                     }
4530                     dev[d] = 1;
4531                     if (dd->ci[d] == dd->nc[d] - 1)
4532                     {
4533                         rvec_dec(cm_new, state->box[d]);
4534                         if (bScrew)
4535                         {
4536                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4537                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4538                         }
4539                         for (k = k0; (k < k1); k++)
4540                         {
4541                             rvec_dec(state->x[k], state->box[d]);
4542                             if (bScrew)
4543                             {
4544                                 rotate_state_atom(state, k);
4545                             }
4546                         }
4547                     }
4548                 }
4549                 else if (pos_d < cell_x0[d])
4550                 {
4551                     if (pos_d < limit0[d])
4552                     {
4553                         cg_move_error(fplog, dd, step, cg, d, -1,
4554                                       cg_cm != state->x, limitd[d],
4555                                       cg_cm[cg], cm_new, pos_d);
4556                     }
4557                     dev[d] = -1;
4558                     if (dd->ci[d] == 0)
4559                     {
4560                         rvec_inc(cm_new, state->box[d]);
4561                         if (bScrew)
4562                         {
4563                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4564                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4565                         }
4566                         for (k = k0; (k < k1); k++)
4567                         {
4568                             rvec_inc(state->x[k], state->box[d]);
4569                             if (bScrew)
4570                             {
4571                                 rotate_state_atom(state, k);
4572                             }
4573                         }
4574                     }
4575                 }
4576             }
4577             else if (d < npbcdim)
4578             {
4579                 /* Put the charge group in the rectangular unit-cell */
4580                 while (cm_new[d] >= state->box[d][d])
4581                 {
4582                     rvec_dec(cm_new, state->box[d]);
4583                     for (k = k0; (k < k1); k++)
4584                     {
4585                         rvec_dec(state->x[k], state->box[d]);
4586                     }
4587                 }
4588                 while (cm_new[d] < 0)
4589                 {
4590                     rvec_inc(cm_new, state->box[d]);
4591                     for (k = k0; (k < k1); k++)
4592                     {
4593                         rvec_inc(state->x[k], state->box[d]);
4594                     }
4595                 }
4596             }
4597         }
4598
4599         copy_rvec(cm_new, cg_cm[cg]);
4600
4601         /* Determine where this cg should go */
4602         flag = 0;
4603         mc   = -1;
4604         for (d = 0; d < dd->ndim; d++)
4605         {
4606             dim = dd->dim[d];
4607             if (dev[dim] == 1)
4608             {
4609                 flag |= DD_FLAG_FW(d);
4610                 if (mc == -1)
4611                 {
4612                     mc = d*2;
4613                 }
4614             }
4615             else if (dev[dim] == -1)
4616             {
4617                 flag |= DD_FLAG_BW(d);
4618                 if (mc == -1)
4619                 {
4620                     if (dd->nc[dim] > 2)
4621                     {
4622                         mc = d*2 + 1;
4623                     }
4624                     else
4625                     {
4626                         mc = d*2;
4627                     }
4628                 }
4629             }
4630         }
4631         /* Temporarily store the flag in move */
4632         move[cg] = mc + flag;
4633     }
4634 }
4635
/*! \brief Moves home charge groups that left this cell to neighboring cells
 * and takes in the charge groups that the neighbors send to us.
 *
 * The work is done in three stages:
 *  - calc_cg_move() puts all home charge groups in the box and stores per
 *    charge group a destination index plus direction flags in move[];
 *  - the groups that move are packed, per destination, into
 *    comm->cggl_flag (global index, size and flags) and comm->cgcm_state
 *    (center and state vectors), and are taken out of the home arrays
 *    by compacting (bCompact) or by marking them as moved (!bCompact);
 *  - for each DD dimension in turn the buffers are exchanged with the
 *    neighbors; a received group is either appended to the home arrays
 *    or copied to the send buffer of a later DD dimension.
 *
 * \param[in]     fplog          Log file, passed on to the move-error reporting
 * \param[in]     step           Current step, passed on to the move-error reporting
 * \param[in,out] dd             Domain decomposition setup; ncg_home and nat_home
 *                               are set to the new home counts at the end
 * \param[in]     tric_dir       Non-zero for dimensions that need the triclinic correction
 * \param[in,out] state          Local state, reallocated when more atoms are needed
 * \param[in,out] f              Force array pointer, handed to the reallocation routines
 * \param[in,out] fr             Force record; cg_cm, cginfo and the cut-off scheme are used
 * \param[in]     bCompact       TRUE: compact the home arrays; FALSE: mark moved groups
 * \param[in,out] nrnb           Operation counters
 * \param[out]    ncg_stay_home  Set to home_pos_cg as it is before anything is received
 * \param[out]    ncg_moved      Number of home charge groups packed for sending
 */
4636 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4637                                gmx_domdec_t *dd, ivec tric_dir,
4638                                t_state *state, rvec **f,
4639                                t_forcerec *fr,
4640                                gmx_bool bCompact,
4641                                t_nrnb *nrnb,
4642                                int *ncg_stay_home,
4643                                int *ncg_moved)
4644 {
4645     int               *move;
4646     int                npbcdim;
4647     int                ncg[DIM*2], nat[DIM*2];
4648     int                c, i, cg, k, d, dim, dim2, dir, d2, d3;
4649     int                mc, cdd, nrcg, ncg_recv, nvs, nvr, nvec, vec;
4650     int                sbuf[2], rbuf[2];
4651     int                home_pos_cg, home_pos_at, buf_pos;
4652     int                flag;
4653     gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4654     real               pos_d;
4655     matrix             tcm;
4656     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1;
4657     atom_id           *cgindex;
4658     cginfo_mb_t       *cginfo_mb;
4659     gmx_domdec_comm_t *comm;
4660     int               *moved;
4661     int                nthread, thread;
4662
4663     if (dd->bScrewPBC)
4664     {
4665         check_screw_box(state->box);
4666     }
4667
4668     comm  = dd->comm;
4669     if (fr->cutoff_scheme == ecutsGROUP)
4670     {
4671         cg_cm = fr->cg_cm;
4672     }
4673
          /* Find out which optional distributed state vectors are in use;
           * these are communicated together with the coordinates.
           */
4674     for (i = 0; i < estNR; i++)
4675     {
4676         if (EST_DISTR(i))
4677         {
4678             switch (i)
4679             {
4680                 case estX: /* Always present */ break;
4681                 case estV:   bV   = (state->flags & (1<<i)); break;
4682                 case estSDX: bSDX = (state->flags & (1<<i)); break;
4683                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4684                 case estLD_RNG:
4685                 case estLD_RNGI:
4686                 case estDISRE_INITF:
4687                 case estDISRE_RM3TAV:
4688                 case estORIRE_INITF:
4689                 case estORIRE_DTAV:
4690                     /* No processing required */
4691                     break;
4692                 default:
4693                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4694             }
4695         }
4696     }
4697
          /* move[] aliases the integer communication buffer. It is only
           * used up to the compact/mark step below; after that buf_int
           * is reused (and possibly reallocated) as receive buffer.
           */
4698     if (dd->ncg_tot > comm->nalloc_int)
4699     {
4700         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4701         srenew(comm->buf_int, comm->nalloc_int);
4702     }
4703     move = comm->buf_int;
4704
4705     /* Clear the count */
4706     for (c = 0; c < dd->ndim*2; c++)
4707     {
4708         ncg[c] = 0;
4709         nat[c] = 0;
4710     }
4711
4712     npbcdim = dd->npbcdim;
4713
          /* Set up, per dimension, the cell boundaries used for the move
           * decision and the limits beyond which a move is an error.
           * Without pbc the outermost cells get no outer boundary.
           */
4714     for (d = 0; (d < DIM); d++)
4715     {
4716         limitd[d] = dd->comm->cellsize_min[d];
4717         if (d >= npbcdim && dd->ci[d] == 0)
4718         {
4719             cell_x0[d] = -GMX_FLOAT_MAX;
4720         }
4721         else
4722         {
4723             cell_x0[d] = comm->cell_x0[d];
4724         }
4725         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4726         {
4727             cell_x1[d] = GMX_FLOAT_MAX;
4728         }
4729         else
4730         {
4731             cell_x1[d] = comm->cell_x1[d];
4732         }
4733         if (d < npbcdim)
4734         {
4735             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4736             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4737         }
4738         else
4739         {
4740             /* We check after communication if a charge group moved
4741              * more than one cell. Set the pre-comm check limit to float_max.
4742              */
4743             limit0[d] = -GMX_FLOAT_MAX;
4744             limit1[d] =  GMX_FLOAT_MAX;
4745         }
4746     }
4747
4748     make_tric_corr_matrix(npbcdim, state->box, tcm);
4749
4750     cgindex = dd->cgindex;
4751
4752     nthread = gmx_omp_nthreads_get(emntDomdec);
4753
4754     /* Compute the center of geometry for all home charge groups
4755      * and put them in the box and determine where they should go.
4756      */
          /* Each thread handles one contiguous slice of the home charge groups.
           * Without the group scheme the centers are written into state->x;
           * NOTE(review): this presumably relies on one atom per charge group
           * in that case - confirm.
           */
4757 #pragma omp parallel for num_threads(nthread) schedule(static)
4758     for (thread = 0; thread < nthread; thread++)
4759     {
4760         calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4761                      cell_x0, cell_x1, limitd, limit0, limit1,
4762                      cgindex,
4763                      ( thread   *dd->ncg_home)/nthread,
4764                      ((thread+1)*dd->ncg_home)/nthread,
4765                      fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4766                      move);
4767     }
4768
          /* Serial pass: register every moving charge group in the index/flag
           * send buffer of its destination and count groups and atoms.
           */
4769     for (cg = 0; cg < dd->ncg_home; cg++)
4770     {
4771         if (move[cg] >= 0)
4772         {
                  /* Split the temporary value stored by calc_cg_move into
                   * the direction flags and the destination index.
                   */
4773             mc       = move[cg];
4774             flag     = mc & ~DD_FLAG_NRCG;
4775             mc       = mc & DD_FLAG_NRCG;
4776             move[cg] = mc;
4777
4778             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4779             {
4780                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4781                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4782             }
4783             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4784             /* We store the cg size in the lower 16 bits
4785              * and the place where the charge group should go
4786              * in the next 6 bits. This saves some communication volume.
4787              */
4788             nrcg = cgindex[cg+1] - cgindex[cg];
4789             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4790             ncg[mc] += 1;
4791             nat[mc] += nrcg;
4792         }
4793     }
4794
4795     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4796     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4797
4798     *ncg_moved = 0;
4799     for (i = 0; i < dd->ndim*2; i++)
4800     {
4801         *ncg_moved += ncg[i];
4802     }
4803
          /* Number of rvec entries communicated per atom:
           * x, plus v, sd_X and cg_p when present.
           */
4804     nvec = 1;
4805     if (bV)
4806     {
4807         nvec++;
4808     }
4809     if (bSDX)
4810     {
4811         nvec++;
4812     }
4813     if (bCGP)
4814     {
4815         nvec++;
4816     }
4817
4818     /* Make sure the communication buffers are large enough */
4819     for (mc = 0; mc < dd->ndim*2; mc++)
4820     {
              /* One rvec per charge group (the center) plus nvec per atom */
4821         nvr = ncg[mc] + nat[mc]*nvec;
4822         if (nvr > comm->cgcm_state_nalloc[mc])
4823         {
4824             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4825             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4826         }
4827     }
4828
4829     switch (fr->cutoff_scheme)
4830     {
4831         case ecutsGROUP:
4832             /* Recalculating cg_cm might be cheaper than communicating,
4833              * but that could give rise to rounding issues.
4834              */
4835             home_pos_cg =
4836                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4837                                         nvec, cg_cm, comm, bCompact);
4838             break;
4839         case ecutsVERLET:
4840             /* Without charge groups we send the moved atom coordinates
4841              * over twice. This is so the code below can be used without
4842              * many conditionals both with and without charge groups.
4843              */
4844             home_pos_cg =
4845                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4846                                         nvec, state->x, comm, FALSE);
                  /* NOTE(review): the call above never compacts, so presumably
                   * its return value still includes the moved groups and is
                   * corrected here - confirm against compact_and_copy_vec_cg.
                   */
4847             if (bCompact)
4848             {
4849                 home_pos_cg -= *ncg_moved;
4850             }
4851             break;
4852         default:
4853             gmx_incons("unimplemented");
4854             home_pos_cg = 0;
4855     }
4856
          /* Copy (and with bCompact compact) the per-atom state vectors;
           * vec counts the vectors in the order they are packed.
           */
4857     vec         = 0;
4858     home_pos_at =
4859         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4860                                 nvec, vec++, state->x, comm, bCompact);
4861     if (bV)
4862     {
4863         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4864                                 nvec, vec++, state->v, comm, bCompact);
4865     }
4866     if (bSDX)
4867     {
4868         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4869                                 nvec, vec++, state->sd_X, comm, bCompact);
4870     }
4871     if (bCGP)
4872     {
4873         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4874                                 nvec, vec++, state->cg_p, comm, bCompact);
4875     }
4876
4877     if (bCompact)
4878     {
4879         compact_ind(dd->ncg_home, move,
4880                     dd->index_gl, dd->cgindex, dd->gatindex,
4881                     dd->ga2la, comm->bLocalCG,
4882                     fr->cginfo);
4883     }
4884     else
4885     {
              /* Select the array in which moved charge groups get marked */
4886         if (fr->cutoff_scheme == ecutsVERLET)
4887         {
4888             moved = get_moved(comm, dd->ncg_home);
4889
4890             for (k = 0; k < dd->ncg_home; k++)
4891             {
4892                 moved[k] = 0;
4893             }
4894         }
4895         else
4896         {
4897             moved = fr->ns.grid->cell_index;
4898         }
4899
4900         clear_and_mark_ind(dd->ncg_home, move,
4901                            dd->index_gl, dd->cgindex, dd->gatindex,
4902                            dd->ga2la, comm->bLocalCG,
4903                            moved);
4904     }
4905
4906     cginfo_mb = fr->cginfo_mb;
4907
4908     *ncg_stay_home = home_pos_cg;
          /* Exchange the moved charge groups, one DD dimension at a time */
4909     for (d = 0; d < dd->ndim; d++)
4910     {
4911         dim      = dd->dim[d];
4912         ncg_recv = 0;
4913         nvr      = 0;
              /* With exactly two cells along dim only direction 0 is used;
               * calc_cg_move stores backward moves under the forward index
               * in that case.
               */
4914         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4915         {
4916             cdd = d*2 + dir;
4917             /* Communicate the cg and atom counts */
4918             sbuf[0] = ncg[cdd];
4919             sbuf[1] = nat[cdd];
4920             if (debug)
4921             {
4922                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4923                         d, dir, sbuf[0], sbuf[1]);
4924             }
4925             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4926
                  /* buf_int now serves as receive buffer; the data of both
                   * directions is stored back to back.
                   */
4927             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4928             {
4929                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4930                 srenew(comm->buf_int, comm->nalloc_int);
4931             }
4932
4933             /* Communicate the charge group indices, sizes and flags */
4934             dd_sendrecv_int(dd, d, dir,
4935                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4936                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4937
                  /* Number of rvecs to send (nvs) and to receive (i) */
4938             nvs = ncg[cdd] + nat[cdd]*nvec;
4939             i   = rbuf[0]  + rbuf[1] *nvec;
4940             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4941
4942             /* Communicate cgcm and state */
4943             dd_sendrecv_rvec(dd, d, dir,
4944                              comm->cgcm_state[cdd], nvs,
4945                              comm->vbuf.v+nvr, i);
4946             ncg_recv += rbuf[0];
4947             nvr      += i;
4948         }
4949
4950         /* Process the received charge groups */
              /* In vbuf each group occupies 1 + nrcg*nvec rvecs: its center
               * followed by nrcg entries for each communicated state vector.
               * buf_pos is the read position in vbuf.
               */
4951         buf_pos = 0;
4952         for (cg = 0; cg < ncg_recv; cg++)
4953         {
4954             flag = comm->buf_int[cg*DD_CGIBS+1];
4955
4956             if (dim >= npbcdim && dd->nc[dim] > 2)
4957             {
4958                 /* No pbc in this dim and more than one domain boundary.
4959                  * We do a separate check if a charge group didn't move too far.
4960                  */
4961                 if (((flag & DD_FLAG_FW(d)) &&
4962                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4963                     ((flag & DD_FLAG_BW(d)) &&
4964                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4965                 {
4966                     cg_move_error(fplog, dd, step, cg, dim,
4967                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4968                                   fr->cutoff_scheme == ecutsGROUP, 0,
4969                                   comm->vbuf.v[buf_pos],
4970                                   comm->vbuf.v[buf_pos],
4971                                   comm->vbuf.v[buf_pos][dim]);
4972                 }
4973             }
4974
                  /* mc stays -1 when the group has reached its home cell,
                   * otherwise it becomes the send index for a later dimension.
                   */
4975             mc = -1;
4976             if (d < dd->ndim-1)
4977             {
4978                 /* Check which direction this cg should go */
4979                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4980                 {
4981                     if (dd->bGridJump)
4982                     {
4983                         /* The cell boundaries for dimension d2 are not equal
4984                          * for each cell row of the lower dimension(s),
4985                          * therefore we might need to redetermine where
4986                          * this cg should go.
4987                          */
4988                         dim2 = dd->dim[d2];
4989                         /* If this cg crosses the box boundary in dimension d2
4990                          * we can use the communicated flag, so we do not
4991                          * have to worry about pbc.
4992                          */
4993                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4994                                (flag & DD_FLAG_FW(d2))) ||
4995                               (dd->ci[dim2] == 0 &&
4996                                (flag & DD_FLAG_BW(d2)))))
4997                         {
4998                             /* Clear the two flags for this dimension */
4999                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
5000                             /* Determine the location of this cg
5001                              * in lattice coordinates
5002                              */
5003                             pos_d = comm->vbuf.v[buf_pos][dim2];
5004                             if (tric_dir[dim2])
5005                             {
5006                                 for (d3 = dim2+1; d3 < DIM; d3++)
5007                                 {
5008                                     pos_d +=
5009                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
5010                                 }
5011                             }
5012                             /* Check if we are not at the box edge.
5013                              * pbc is only handled in the first step above,
5014                              * but this check could move over pbc while
5015                              * the first step did not due to different rounding.
5016                              */
5017                             if (pos_d >= cell_x1[dim2] &&
5018                                 dd->ci[dim2] != dd->nc[dim2]-1)
5019                             {
5020                                 flag |= DD_FLAG_FW(d2);
5021                             }
5022                             else if (pos_d < cell_x0[dim2] &&
5023                                      dd->ci[dim2] != 0)
5024                             {
5025                                 flag |= DD_FLAG_BW(d2);
5026                             }
                                  /* Store the updated flags so they are
                                   * forwarded along with the group.
                                   */
5027                             comm->buf_int[cg*DD_CGIBS+1] = flag;
5028                         }
5029                     }
5030                     /* Set to which neighboring cell this cg should go */
5031                     if (flag & DD_FLAG_FW(d2))
5032                     {
5033                         mc = d2*2;
5034                     }
5035                     else if (flag & DD_FLAG_BW(d2))
5036                     {
5037                         if (dd->nc[dd->dim[d2]] > 2)
5038                         {
5039                             mc = d2*2+1;
5040                         }
5041                         else
5042                         {
5043                             mc = d2*2;
5044                         }
5045                     }
5046                 }
5047             }
5048
                  /* The charge-group size sits in the DD_FLAG_NRCG bits */
5049             nrcg = flag & DD_FLAG_NRCG;
5050             if (mc == -1)
5051             {
                      /* This group is now home: append it to the home arrays */
5052                 if (home_pos_cg+1 > dd->cg_nalloc)
5053                 {
5054                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
5055                     srenew(dd->index_gl, dd->cg_nalloc);
5056                     srenew(dd->cgindex, dd->cg_nalloc+1);
5057                 }
5058                 /* Set the global charge group index and size */
5059                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
5060                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
5061                 /* Copy the state from the buffer */
5062                 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
5063                 if (fr->cutoff_scheme == ecutsGROUP)
5064                 {
                          /* Re-read the pointer; presumably dd_check_alloc_ncg
                           * can reallocate fr->cg_cm.
                           */
5065                     cg_cm = fr->cg_cm;
5066                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
5067                 }
                      /* Step over the center entry, also when it was not copied */
5068                 buf_pos++;
5069
5070                 /* Set the cginfo */
5071                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
5072                                                    dd->index_gl[home_pos_cg]);
5073                 if (comm->bLocalCG)
5074                 {
5075                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
5076                 }
5077
5078                 if (home_pos_at+nrcg > state->nalloc)
5079                 {
5080                     dd_realloc_state(state, f, home_pos_at+nrcg);
5081                 }
                      /* Unpack the state vectors in the same order
                       * in which they were packed: x, v, sd_X, cg_p.
                       */
5082                 for (i = 0; i < nrcg; i++)
5083                 {
5084                     copy_rvec(comm->vbuf.v[buf_pos++],
5085                               state->x[home_pos_at+i]);
5086                 }
5087                 if (bV)
5088                 {
5089                     for (i = 0; i < nrcg; i++)
5090                     {
5091                         copy_rvec(comm->vbuf.v[buf_pos++],
5092                                   state->v[home_pos_at+i]);
5093                     }
5094                 }
5095                 if (bSDX)
5096                 {
5097                     for (i = 0; i < nrcg; i++)
5098                     {
5099                         copy_rvec(comm->vbuf.v[buf_pos++],
5100                                   state->sd_X[home_pos_at+i]);
5101                     }
5102                 }
5103                 if (bCGP)
5104                 {
5105                     for (i = 0; i < nrcg; i++)
5106                     {
5107                         copy_rvec(comm->vbuf.v[buf_pos++],
5108                                   state->cg_p[home_pos_at+i]);
5109                     }
5110                 }
5111                 home_pos_cg += 1;
5112                 home_pos_at += nrcg;
5113             }
5114             else
5115             {
                      /* The group has to travel on along a later DD dimension */
5116                 /* Reallocate the buffers if necessary  */
5117                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
5118                 {
5119                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
5120                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
5121                 }
                      /* Here nvr is the current fill level of the send buffer */
5122                 nvr = ncg[mc] + nat[mc]*nvec;
5123                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
5124                 {
5125                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
5126                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
5127                 }
5128                 /* Copy from the receive to the send buffers */
5129                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
5130                        comm->buf_int + cg*DD_CGIBS,
5131                        DD_CGIBS*sizeof(int));
5132                 memcpy(comm->cgcm_state[mc][nvr],
5133                        comm->vbuf.v[buf_pos],
5134                        (1+nrcg*nvec)*sizeof(rvec));
5135                 buf_pos += 1 + nrcg*nvec;
5136                 ncg[mc] += 1;
5137                 nat[mc] += nrcg;
5138             }
5139         }
5140     }
5141
5142     /* With sorting (!bCompact) the indices are now only partially up to date
5143      * and ncg_home and nat_home are not the real count, since there are
5144      * "holes" in the arrays for the charge groups that moved to neighbors.
5145      */
5146     if (fr->cutoff_scheme == ecutsVERLET)
5147     {
              /* Clear the moved marks for the newly received entries */
5148         moved = get_moved(comm, home_pos_cg);
5149
5150         for (i = dd->ncg_home; i < home_pos_cg; i++)
5151         {
5152             moved[i] = 0;
5153         }
5154     }
5155     dd->ncg_home = home_pos_cg;
5156     dd->nat_home = home_pos_at;
5157
5158     if (debug)
5159     {
5160         fprintf(debug,
5161                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5162                 *ncg_moved, dd->ncg_home-*ncg_moved);
5163
5164     }
5165 }
5166
5167 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
5168 {
5169     dd->comm->cycl[ddCycl] += cycles;
5170     dd->comm->cycl_n[ddCycl]++;
5171     if (cycles > dd->comm->cycl_max[ddCycl])
5172     {
5173         dd->comm->cycl_max[ddCycl] = cycles;
5174     }
5175 }
5176
5177 static double force_flop_count(t_nrnb *nrnb)
5178 {
5179     int         i;
5180     double      sum;
5181     const char *name;
5182
5183     sum = 0;
5184     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
5185     {
5186         /* To get closer to the real timings, we half the count
5187          * for the normal loops and again half it for water loops.
5188          */
5189         name = nrnb_str(i);
5190         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5191         {
5192             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5193         }
5194         else
5195         {
5196             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5197         }
5198     }
5199     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
5200     {
5201         name = nrnb_str(i);
5202         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5203         {
5204             sum += nrnb->n[i]*cost_nrnb(i);
5205         }
5206     }
5207     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
5208     {
5209         sum += nrnb->n[i]*cost_nrnb(i);
5210     }
5211
5212     return sum;
5213 }
5214
5215 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
5216 {
5217     if (dd->comm->eFlop)
5218     {
5219         dd->comm->flop -= force_flop_count(nrnb);
5220     }
5221 }
5222 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
5223 {
5224     if (dd->comm->eFlop)
5225     {
5226         dd->comm->flop += force_flop_count(nrnb);
5227         dd->comm->flop_n++;
5228     }
5229 }
5230
5231 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5232 {
5233     int i;
5234
5235     for (i = 0; i < ddCyclNr; i++)
5236     {
5237         dd->comm->cycl[i]     = 0;
5238         dd->comm->cycl_n[i]   = 0;
5239         dd->comm->cycl_max[i] = 0;
5240     }
5241     dd->comm->flop   = 0;
5242     dd->comm->flop_n = 0;
5243 }
5244
5245 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
5246 {
5247     gmx_domdec_comm_t *comm;
5248     gmx_domdec_load_t *load;
5249     gmx_domdec_root_t *root = NULL;
5250     int                d, dim, i, pos;
5251     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
5252     gmx_bool           bSepPME;
5253
5254     if (debug)
5255     {
5256         fprintf(debug, "get_load_distribution start\n");
5257     }
5258
5259     wallcycle_start(wcycle, ewcDDCOMMLOAD);
5260
5261     comm = dd->comm;
5262
5263     bSepPME = (dd->pme_nodeid >= 0);
5264
5265     for (d = dd->ndim-1; d >= 0; d--)
5266     {
5267         dim = dd->dim[d];
5268         /* Check if we participate in the communication in this dimension */
5269         if (d == dd->ndim-1 ||
5270             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5271         {
5272             load = &comm->load[d];
5273             if (dd->bGridJump)
5274             {
5275                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5276             }
5277             pos = 0;
5278             if (d == dd->ndim-1)
5279             {
5280                 sbuf[pos++] = dd_force_load(comm);
5281                 sbuf[pos++] = sbuf[0];
5282                 if (dd->bGridJump)
5283                 {
5284                     sbuf[pos++] = sbuf[0];
5285                     sbuf[pos++] = cell_frac;
5286                     if (d > 0)
5287                     {
5288                         sbuf[pos++] = comm->cell_f_max0[d];
5289                         sbuf[pos++] = comm->cell_f_min1[d];
5290                     }
5291                 }
5292                 if (bSepPME)
5293                 {
5294                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5295                     sbuf[pos++] = comm->cycl[ddCyclPME];
5296                 }
5297             }
5298             else
5299             {
5300                 sbuf[pos++] = comm->load[d+1].sum;
5301                 sbuf[pos++] = comm->load[d+1].max;
5302                 if (dd->bGridJump)
5303                 {
5304                     sbuf[pos++] = comm->load[d+1].sum_m;
5305                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5306                     sbuf[pos++] = comm->load[d+1].flags;
5307                     if (d > 0)
5308                     {
5309                         sbuf[pos++] = comm->cell_f_max0[d];
5310                         sbuf[pos++] = comm->cell_f_min1[d];
5311                     }
5312                 }
5313                 if (bSepPME)
5314                 {
5315                     sbuf[pos++] = comm->load[d+1].mdf;
5316                     sbuf[pos++] = comm->load[d+1].pme;
5317                 }
5318             }
5319             load->nload = pos;
5320             /* Communicate a row in DD direction d.
5321              * The communicators are setup such that the root always has rank 0.
5322              */
5323 #ifdef GMX_MPI
5324             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5325                        load->load, load->nload*sizeof(float), MPI_BYTE,
5326                        0, comm->mpi_comm_load[d]);
5327 #endif
5328             if (dd->ci[dim] == dd->master_ci[dim])
5329             {
5330                 /* We are the root, process this row */
5331                 if (comm->bDynLoadBal)
5332                 {
5333                     root = comm->root[d];
5334                 }
5335                 load->sum      = 0;
5336                 load->max      = 0;
5337                 load->sum_m    = 0;
5338                 load->cvol_min = 1;
5339                 load->flags    = 0;
5340                 load->mdf      = 0;
5341                 load->pme      = 0;
5342                 pos            = 0;
5343                 for (i = 0; i < dd->nc[dim]; i++)
5344                 {
5345                     load->sum += load->load[pos++];
5346                     load->max  = std::max(load->max, load->load[pos]);
5347                     pos++;
5348                     if (dd->bGridJump)
5349                     {
5350                         if (root->bLimited)
5351                         {
5352                             /* This direction could not be load balanced properly,
5353                              * therefore we need to use the maximum iso the average load.
5354                              */
5355                             load->sum_m = std::max(load->sum_m, load->load[pos]);
5356                         }
5357                         else
5358                         {
5359                             load->sum_m += load->load[pos];
5360                         }
5361                         pos++;
5362                         load->cvol_min = std::min(load->cvol_min, load->load[pos]);
5363                         pos++;
5364                         if (d < dd->ndim-1)
5365                         {
5366                             load->flags = (int)(load->load[pos++] + 0.5);
5367                         }
5368                         if (d > 0)
5369                         {
5370                             root->cell_f_max0[i] = load->load[pos++];
5371                             root->cell_f_min1[i] = load->load[pos++];
5372                         }
5373                     }
5374                     if (bSepPME)
5375                     {
5376                         load->mdf = std::max(load->mdf, load->load[pos]);
5377                         pos++;
5378                         load->pme = std::max(load->pme, load->load[pos]);
5379                         pos++;
5380                     }
5381                 }
5382                 if (comm->bDynLoadBal && root->bLimited)
5383                 {
5384                     load->sum_m *= dd->nc[dim];
5385                     load->flags |= (1<<d);
5386                 }
5387             }
5388         }
5389     }
5390
5391     if (DDMASTER(dd))
5392     {
5393         comm->nload      += dd_load_count(comm);
5394         comm->load_step  += comm->cycl[ddCyclStep];
5395         comm->load_sum   += comm->load[0].sum;
5396         comm->load_max   += comm->load[0].max;
5397         if (comm->bDynLoadBal)
5398         {
5399             for (d = 0; d < dd->ndim; d++)
5400             {
5401                 if (comm->load[0].flags & (1<<d))
5402                 {
5403                     comm->load_lim[d]++;
5404                 }
5405             }
5406         }
5407         if (bSepPME)
5408         {
5409             comm->load_mdf += comm->load[0].mdf;
5410             comm->load_pme += comm->load[0].pme;
5411         }
5412     }
5413
5414     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5415
5416     if (debug)
5417     {
5418         fprintf(debug, "get_load_distribution finished\n");
5419     }
5420 }
5421
5422 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5423 {
5424     /* Return the relative performance loss on the total run time
5425      * due to the force calculation load imbalance.
5426      */
5427     if (dd->comm->nload > 0)
5428     {
5429         return
5430             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5431             (dd->comm->load_step*dd->nnodes);
5432     }
5433     else
5434     {
5435         return 0;
5436     }
5437 }
5438
5439 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5440 {
5441     char               buf[STRLEN];
5442     int                npp, npme, nnodes, d, limp;
5443     float              imbal, pme_f_ratio, lossf, lossp = 0;
5444     gmx_bool           bLim;
5445     gmx_domdec_comm_t *comm;
5446
5447     comm = dd->comm;
5448     if (DDMASTER(dd) && comm->nload > 0)
5449     {
5450         npp    = dd->nnodes;
5451         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5452         nnodes = npp + npme;
5453         imbal  = comm->load_max*npp/comm->load_sum - 1;
5454         lossf  = dd_force_imb_perf_loss(dd);
5455         sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5456         fprintf(fplog, "%s", buf);
5457         fprintf(stderr, "\n");
5458         fprintf(stderr, "%s", buf);
5459         sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5460         fprintf(fplog, "%s", buf);
5461         fprintf(stderr, "%s", buf);
5462         bLim = FALSE;
5463         if (comm->bDynLoadBal)
5464         {
5465             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5466             for (d = 0; d < dd->ndim; d++)
5467             {
5468                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5469                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5470                 if (limp >= 50)
5471                 {
5472                     bLim = TRUE;
5473                 }
5474             }
5475             sprintf(buf+strlen(buf), "\n");
5476             fprintf(fplog, "%s", buf);
5477             fprintf(stderr, "%s", buf);
5478         }
5479         if (npme > 0)
5480         {
5481             pme_f_ratio = comm->load_pme/comm->load_mdf;
5482             lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
5483             if (lossp <= 0)
5484             {
5485                 lossp *= (float)npme/(float)nnodes;
5486             }
5487             else
5488             {
5489                 lossp *= (float)npp/(float)nnodes;
5490             }
5491             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5492             fprintf(fplog, "%s", buf);
5493             fprintf(stderr, "%s", buf);
5494             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5495             fprintf(fplog, "%s", buf);
5496             fprintf(stderr, "%s", buf);
5497         }
5498         fprintf(fplog, "\n");
5499         fprintf(stderr, "\n");
5500
5501         if (lossf >= DD_PERF_LOSS_WARN)
5502         {
5503             sprintf(buf,
5504                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5505                     "      in the domain decomposition.\n", lossf*100);
5506             if (!comm->bDynLoadBal)
5507             {
5508                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5509             }
5510             else if (bLim)
5511             {
5512                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5513             }
5514             fprintf(fplog, "%s\n", buf);
5515             fprintf(stderr, "%s\n", buf);
5516         }
5517         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS_WARN)
5518         {
5519             sprintf(buf,
5520                     "NOTE: %.1f %% performance was lost because the PME ranks\n"
5521                     "      had %s work to do than the PP ranks.\n"
5522                     "      You might want to %s the number of PME ranks\n"
5523                     "      or %s the cut-off and the grid spacing.\n",
5524                     fabs(lossp*100),
5525                     (lossp < 0) ? "less"     : "more",
5526                     (lossp < 0) ? "decrease" : "increase",
5527                     (lossp < 0) ? "decrease" : "increase");
5528             fprintf(fplog, "%s\n", buf);
5529             fprintf(stderr, "%s\n", buf);
5530         }
5531     }
5532 }
5533
5534 static float dd_vol_min(gmx_domdec_t *dd)
5535 {
5536     return dd->comm->load[0].cvol_min*dd->nnodes;
5537 }
5538
5539 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5540 {
5541     return dd->comm->load[0].flags;
5542 }
5543
5544 static float dd_f_imbal(gmx_domdec_t *dd)
5545 {
5546     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5547 }
5548
5549 float dd_pme_f_ratio(gmx_domdec_t *dd)
5550 {
5551     if (dd->comm->cycl_n[ddCyclPME] > 0)
5552     {
5553         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5554     }
5555     else
5556     {
5557         return -1.0;
5558     }
5559 }
5560
5561 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5562 {
5563     int  flags, d;
5564     char buf[22];
5565
5566     flags = dd_load_flags(dd);
5567     if (flags)
5568     {
5569         fprintf(fplog,
5570                 "DD  load balancing is limited by minimum cell size in dimension");
5571         for (d = 0; d < dd->ndim; d++)
5572         {
5573             if (flags & (1<<d))
5574             {
5575                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5576             }
5577         }
5578         fprintf(fplog, "\n");
5579     }
5580     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5581     if (dd->comm->bDynLoadBal)
5582     {
5583         fprintf(fplog, "  vol min/aver %5.3f%c",
5584                 dd_vol_min(dd), flags ? '!' : ' ');
5585     }
5586     fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5587     if (dd->comm->cycl_n[ddCyclPME])
5588     {
5589         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5590     }
5591     fprintf(fplog, "\n\n");
5592 }
5593
5594 static void dd_print_load_verbose(gmx_domdec_t *dd)
5595 {
5596     if (dd->comm->bDynLoadBal)
5597     {
5598         fprintf(stderr, "vol %4.2f%c ",
5599                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5600     }
5601     fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5602     if (dd->comm->cycl_n[ddCyclPME])
5603     {
5604         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5605     }
5606 }
5607
5608 #ifdef GMX_MPI
5609 static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
5610 {
5611     MPI_Comm           c_row;
5612     int                dim, i, rank;
5613     ivec               loc_c;
5614     gmx_domdec_root_t *root;
5615     gmx_bool           bPartOfGroup = FALSE;
5616
5617     dim = dd->dim[dim_ind];
5618     copy_ivec(loc, loc_c);
5619     for (i = 0; i < dd->nc[dim]; i++)
5620     {
5621         loc_c[dim] = i;
5622         rank       = dd_index(dd->nc, loc_c);
5623         if (rank == dd->rank)
5624         {
5625             /* This process is part of the group */
5626             bPartOfGroup = TRUE;
5627         }
5628     }
5629     MPI_Comm_split(dd->mpi_comm_all, bPartOfGroup ? 0 : MPI_UNDEFINED, dd->rank,
5630                    &c_row);
5631     if (bPartOfGroup)
5632     {
5633         dd->comm->mpi_comm_load[dim_ind] = c_row;
5634         if (dd->comm->eDLB != edlbNO)
5635         {
5636             if (dd->ci[dim] == dd->master_ci[dim])
5637             {
5638                 /* This is the root process of this row */
5639                 snew(dd->comm->root[dim_ind], 1);
5640                 root = dd->comm->root[dim_ind];
5641                 snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
5642                 snew(root->old_cell_f, dd->nc[dim]+1);
5643                 snew(root->bCellMin, dd->nc[dim]);
5644                 if (dim_ind > 0)
5645                 {
5646                     snew(root->cell_f_max0, dd->nc[dim]);
5647                     snew(root->cell_f_min1, dd->nc[dim]);
5648                     snew(root->bound_min, dd->nc[dim]);
5649                     snew(root->bound_max, dd->nc[dim]);
5650                 }
5651                 snew(root->buf_ncd, dd->nc[dim]);
5652             }
5653             else
5654             {
5655                 /* This is not a root process, we only need to receive cell_f */
5656                 snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
5657             }
5658         }
5659         if (dd->ci[dim] == dd->master_ci[dim])
5660         {
5661             snew(dd->comm->load[dim_ind].load, dd->nc[dim]*DD_NLOAD_MAX);
5662         }
5663     }
5664 }
5665 #endif
5666
5667 void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
5668                                    const gmx_hw_info_t gmx_unused *hwinfo,
5669                                    const gmx_hw_opt_t  gmx_unused *hw_opt)
5670 {
5671 #ifdef GMX_MPI
5672     int           physicalnode_id_hash;
5673     int           gpu_id;
5674     gmx_domdec_t *dd;
5675     MPI_Comm      mpi_comm_pp_physicalnode;
5676
5677     if (!(cr->duty & DUTY_PP) ||
5678         hw_opt->gpu_opt.ncuda_dev_use == 0)
5679     {
5680         /* Only PP nodes (currently) use GPUs.
5681          * If we don't have GPUs, there are no resources to share.
5682          */
5683         return;
5684     }
5685
5686     physicalnode_id_hash = gmx_physicalnode_id_hash();
5687
5688     gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5689
5690     dd = cr->dd;
5691
5692     if (debug)
5693     {
5694         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5695         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5696                 dd->rank, physicalnode_id_hash, gpu_id);
5697     }
5698     /* Split the PP communicator over the physical nodes */
5699     /* TODO: See if we should store this (before), as it's also used for
5700      * for the nodecomm summution.
5701      */
5702     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5703                    &mpi_comm_pp_physicalnode);
5704     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5705                    &dd->comm->mpi_comm_gpu_shared);
5706     MPI_Comm_free(&mpi_comm_pp_physicalnode);
5707     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5708
5709     if (debug)
5710     {
5711         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5712     }
5713
5714     /* Note that some ranks could share a GPU, while others don't */
5715
5716     if (dd->comm->nrank_gpu_shared == 1)
5717     {
5718         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5719     }
5720 #endif
5721 }
5722
5723 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5724 {
5725 #ifdef GMX_MPI
5726     int  dim0, dim1, i, j;
5727     ivec loc;
5728
5729     if (debug)
5730     {
5731         fprintf(debug, "Making load communicators\n");
5732     }
5733
5734     snew(dd->comm->load, dd->ndim);
5735     snew(dd->comm->mpi_comm_load, dd->ndim);
5736
5737     clear_ivec(loc);
5738     make_load_communicator(dd, 0, loc);
5739     if (dd->ndim > 1)
5740     {
5741         dim0 = dd->dim[0];
5742         for (i = 0; i < dd->nc[dim0]; i++)
5743         {
5744             loc[dim0] = i;
5745             make_load_communicator(dd, 1, loc);
5746         }
5747     }
5748     if (dd->ndim > 2)
5749     {
5750         dim0 = dd->dim[0];
5751         for (i = 0; i < dd->nc[dim0]; i++)
5752         {
5753             loc[dim0] = i;
5754             dim1      = dd->dim[1];
5755             for (j = 0; j < dd->nc[dim1]; j++)
5756             {
5757                 loc[dim1] = j;
5758                 make_load_communicator(dd, 2, loc);
5759             }
5760         }
5761     }
5762
5763     if (debug)
5764     {
5765         fprintf(debug, "Finished making load communicators\n");
5766     }
5767 #endif
5768 }
5769
5770 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5771 {
5772     int                     d, dim, i, j, m;
5773     ivec                    tmp, s;
5774     int                     nzone, nzonep;
5775     ivec                    dd_zp[DD_MAXIZONE];
5776     gmx_domdec_zones_t     *zones;
5777     gmx_domdec_ns_ranges_t *izone;
5778
5779     for (d = 0; d < dd->ndim; d++)
5780     {
5781         dim = dd->dim[d];
5782         copy_ivec(dd->ci, tmp);
5783         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5784         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5785         copy_ivec(dd->ci, tmp);
5786         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5787         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5788         if (debug)
5789         {
5790             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5791                     dd->rank, dim,
5792                     dd->neighbor[d][0],
5793                     dd->neighbor[d][1]);
5794         }
5795     }
5796
5797     if (fplog)
5798     {
5799         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5800                 dd->ndim,
5801                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5802                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5803     }
5804     switch (dd->ndim)
5805     {
5806         case 3:
5807             nzone  = dd_z3n;
5808             nzonep = dd_zp3n;
5809             for (i = 0; i < nzonep; i++)
5810             {
5811                 copy_ivec(dd_zp3[i], dd_zp[i]);
5812             }
5813             break;
5814         case 2:
5815             nzone  = dd_z2n;
5816             nzonep = dd_zp2n;
5817             for (i = 0; i < nzonep; i++)
5818             {
5819                 copy_ivec(dd_zp2[i], dd_zp[i]);
5820             }
5821             break;
5822         case 1:
5823             nzone  = dd_z1n;
5824             nzonep = dd_zp1n;
5825             for (i = 0; i < nzonep; i++)
5826             {
5827                 copy_ivec(dd_zp1[i], dd_zp[i]);
5828             }
5829             break;
5830         default:
5831             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5832             nzone  = 0;
5833             nzonep = 0;
5834     }
5835
5836     zones = &dd->comm->zones;
5837
5838     for (i = 0; i < nzone; i++)
5839     {
5840         m = 0;
5841         clear_ivec(zones->shift[i]);
5842         for (d = 0; d < dd->ndim; d++)
5843         {
5844             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5845         }
5846     }
5847
5848     zones->n = nzone;
5849     for (i = 0; i < nzone; i++)
5850     {
5851         for (d = 0; d < DIM; d++)
5852         {
5853             s[d] = dd->ci[d] - zones->shift[i][d];
5854             if (s[d] < 0)
5855             {
5856                 s[d] += dd->nc[d];
5857             }
5858             else if (s[d] >= dd->nc[d])
5859             {
5860                 s[d] -= dd->nc[d];
5861             }
5862         }
5863     }
5864     zones->nizone = nzonep;
5865     for (i = 0; i < zones->nizone; i++)
5866     {
5867         if (dd_zp[i][0] != i)
5868         {
5869             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5870         }
5871         izone     = &zones->izone[i];
5872         izone->j0 = dd_zp[i][1];
5873         izone->j1 = dd_zp[i][2];
5874         for (dim = 0; dim < DIM; dim++)
5875         {
5876             if (dd->nc[dim] == 1)
5877             {
5878                 /* All shifts should be allowed */
5879                 izone->shift0[dim] = -1;
5880                 izone->shift1[dim] = 1;
5881             }
5882             else
5883             {
5884                 /*
5885                    izone->shift0[d] = 0;
5886                    izone->shift1[d] = 0;
5887                    for(j=izone->j0; j<izone->j1; j++) {
5888                    if (dd->shift[j][d] > dd->shift[i][d])
5889                    izone->shift0[d] = -1;
5890                    if (dd->shift[j][d] < dd->shift[i][d])
5891                    izone->shift1[d] = 1;
5892                    }
5893                  */
5894
5895                 int shift_diff;
5896
5897                 /* Assume the shift are not more than 1 cell */
5898                 izone->shift0[dim] = 1;
5899                 izone->shift1[dim] = -1;
5900                 for (j = izone->j0; j < izone->j1; j++)
5901                 {
5902                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5903                     if (shift_diff < izone->shift0[dim])
5904                     {
5905                         izone->shift0[dim] = shift_diff;
5906                     }
5907                     if (shift_diff > izone->shift1[dim])
5908                     {
5909                         izone->shift1[dim] = shift_diff;
5910                     }
5911                 }
5912             }
5913         }
5914     }
5915
5916     if (dd->comm->eDLB != edlbNO)
5917     {
5918         snew(dd->comm->root, dd->ndim);
5919     }
5920
5921     if (dd->comm->bRecordLoad)
5922     {
5923         make_load_communicators(dd);
5924     }
5925 }
5926
5927 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
5928 {
5929     gmx_domdec_t      *dd;
5930     dd   = cr->dd;
5931
5932 #ifdef GMX_MPI
5933     gmx_domdec_comm_t *comm;
5934     int                rank, *buf;
5935     ivec               periods;
5936     MPI_Comm           comm_cart;
5937
5938     comm = dd->comm;
5939
5940     if (comm->bCartesianPP)
5941     {
5942         /* Set up cartesian communication for the particle-particle part */
5943         if (fplog)
5944         {
5945             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5946                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5947         }
5948
5949         for (int i = 0; i < DIM; i++)
5950         {
5951             periods[i] = TRUE;
5952         }
5953         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5954                         &comm_cart);
5955         /* We overwrite the old communicator with the new cartesian one */
5956         cr->mpi_comm_mygroup = comm_cart;
5957     }
5958
5959     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5960     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5961
5962     if (comm->bCartesianPP_PME)
5963     {
5964         /* Since we want to use the original cartesian setup for sim,
5965          * and not the one after split, we need to make an index.
5966          */
5967         snew(comm->ddindex2ddnodeid, dd->nnodes);
5968         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5969         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5970         /* Get the rank of the DD master,
5971          * above we made sure that the master node is a PP node.
5972          */
5973         if (MASTER(cr))
5974         {
5975             rank = dd->rank;
5976         }
5977         else
5978         {
5979             rank = 0;
5980         }
5981         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5982     }
5983     else if (comm->bCartesianPP)
5984     {
5985         if (cr->npmenodes == 0)
5986         {
5987             /* The PP communicator is also
5988              * the communicator for this simulation
5989              */
5990             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5991         }
5992         cr->nodeid = dd->rank;
5993
5994         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5995
5996         /* We need to make an index to go from the coordinates
5997          * to the nodeid of this simulation.
5998          */
5999         snew(comm->ddindex2simnodeid, dd->nnodes);
6000         snew(buf, dd->nnodes);
6001         if (cr->duty & DUTY_PP)
6002         {
6003             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
6004         }
6005         /* Communicate the ddindex to simulation nodeid index */
6006         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
6007                       cr->mpi_comm_mysim);
6008         sfree(buf);
6009
6010         /* Determine the master coordinates and rank.
6011          * The DD master should be the same node as the master of this sim.
6012          */
6013         for (int i = 0; i < dd->nnodes; i++)
6014         {
6015             if (comm->ddindex2simnodeid[i] == 0)
6016             {
6017                 ddindex2xyz(dd->nc, i, dd->master_ci);
6018                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
6019             }
6020         }
6021         if (debug)
6022         {
6023             fprintf(debug, "The master rank is %d\n", dd->masterrank);
6024         }
6025     }
6026     else
6027     {
6028         /* No Cartesian communicators */
6029         /* We use the rank in dd->comm->all as DD index */
6030         ddindex2xyz(dd->nc, dd->rank, dd->ci);
6031         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
6032         dd->masterrank = 0;
6033         clear_ivec(dd->master_ci);
6034     }
6035 #endif
6036
6037     if (fplog)
6038     {
6039         fprintf(fplog,
6040                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
6041                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6042     }
6043     if (debug)
6044     {
6045         fprintf(debug,
6046                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
6047                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6048     }
6049 }
6050
6051 static void receive_ddindex2simnodeid(t_commrec gmx_unused *cr)
6052 {
6053 #ifdef GMX_MPI
6054     gmx_domdec_t      *dd;
6055     gmx_domdec_comm_t *comm;
6056
6057     dd   = cr->dd;
6058     comm = dd->comm;
6059
6060     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
6061     {
6062         int *buf;
6063         snew(comm->ddindex2simnodeid, dd->nnodes);
6064         snew(buf, dd->nnodes);
6065         if (cr->duty & DUTY_PP)
6066         {
6067             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
6068         }
6069         /* Communicate the ddindex to simulation nodeid index */
6070         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
6071                       cr->mpi_comm_mysim);
6072         sfree(buf);
6073     }
6074 #endif
6075 }
6076
6077 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
6078                                                      int ncg, int natoms)
6079 {
6080     gmx_domdec_master_t *ma;
6081     int                  i;
6082
6083     snew(ma, 1);
6084
6085     snew(ma->ncg, dd->nnodes);
6086     snew(ma->index, dd->nnodes+1);
6087     snew(ma->cg, ncg);
6088     snew(ma->nat, dd->nnodes);
6089     snew(ma->ibuf, dd->nnodes*2);
6090     snew(ma->cell_x, DIM);
6091     for (i = 0; i < DIM; i++)
6092     {
6093         snew(ma->cell_x[i], dd->nc[i]+1);
6094     }
6095
6096     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
6097     {
6098         ma->vbuf = NULL;
6099     }
6100     else
6101     {
6102         snew(ma->vbuf, natoms);
6103     }
6104
6105     return ma;
6106 }
6107
6108 static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
6109                                int gmx_unused reorder)
6110 {
6111     gmx_domdec_t      *dd;
6112     gmx_domdec_comm_t *comm;
6113     int                i;
6114     gmx_bool           bDiv[DIM];
6115 #ifdef GMX_MPI
6116     MPI_Comm           comm_cart;
6117 #endif
6118
6119     dd   = cr->dd;
6120     comm = dd->comm;
6121
6122     if (comm->bCartesianPP)
6123     {
6124         for (i = 1; i < DIM; i++)
6125         {
6126             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
6127         }
6128         if (bDiv[YY] || bDiv[ZZ])
6129         {
6130             comm->bCartesianPP_PME = TRUE;
6131             /* If we have 2D PME decomposition, which is always in x+y,
6132              * we stack the PME only nodes in z.
6133              * Otherwise we choose the direction that provides the thinnest slab
6134              * of PME only nodes as this will have the least effect
6135              * on the PP communication.
6136              * But for the PME communication the opposite might be better.
6137              */
6138             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
6139                              !bDiv[YY] ||
6140                              dd->nc[YY] > dd->nc[ZZ]))
6141             {
6142                 comm->cartpmedim = ZZ;
6143             }
6144             else
6145             {
6146                 comm->cartpmedim = YY;
6147             }
6148             comm->ntot[comm->cartpmedim]
6149                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
6150         }
6151         else if (fplog)
6152         {
6153             fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
6154             fprintf(fplog,
6155                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
6156         }
6157     }
6158
6159 #ifdef GMX_MPI
6160     if (comm->bCartesianPP_PME)
6161     {
6162         int  rank;
6163         ivec periods;
6164
6165         if (fplog)
6166         {
6167             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
6168         }
6169
6170         for (i = 0; i < DIM; i++)
6171         {
6172             periods[i] = TRUE;
6173         }
6174         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
6175                         &comm_cart);
6176         MPI_Comm_rank(comm_cart, &rank);
6177         if (MASTERNODE(cr) && rank != 0)
6178         {
6179             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
6180         }
6181
6182         /* With this assigment we loose the link to the original communicator
6183          * which will usually be MPI_COMM_WORLD, unless have multisim.
6184          */
6185         cr->mpi_comm_mysim = comm_cart;
6186         cr->sim_nodeid     = rank;
6187
6188         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
6189
6190         if (fplog)
6191         {
6192             fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
6193                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6194         }
6195
6196         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
6197         {
6198             cr->duty = DUTY_PP;
6199         }
6200         if (cr->npmenodes == 0 ||
6201             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
6202         {
6203             cr->duty = DUTY_PME;
6204         }
6205
6206         /* Split the sim communicator into PP and PME only nodes */
6207         MPI_Comm_split(cr->mpi_comm_mysim,
6208                        cr->duty,
6209                        dd_index(comm->ntot, dd->ci),
6210                        &cr->mpi_comm_mygroup);
6211     }
6212     else
6213     {
6214         switch (dd_node_order)
6215         {
6216             case ddnoPP_PME:
6217                 if (fplog)
6218                 {
6219                     fprintf(fplog, "Order of the ranks: PP first, PME last\n");
6220                 }
6221                 break;
6222             case ddnoINTERLEAVE:
6223                 /* Interleave the PP-only and PME-only nodes,
6224                  * as on clusters with dual-core machines this will double
6225                  * the communication bandwidth of the PME processes
6226                  * and thus speed up the PP <-> PME and inter PME communication.
6227                  */
6228                 if (fplog)
6229                 {
6230                     fprintf(fplog, "Interleaving PP and PME ranks\n");
6231                 }
6232                 comm->pmenodes = dd_pmenodes(cr);
6233                 break;
6234             case ddnoCARTESIAN:
6235                 break;
6236             default:
6237                 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
6238         }
6239
6240         if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6241         {
6242             cr->duty = DUTY_PME;
6243         }
6244         else
6245         {
6246             cr->duty = DUTY_PP;
6247         }
6248
6249         /* Split the sim communicator into PP and PME only nodes */
6250         MPI_Comm_split(cr->mpi_comm_mysim,
6251                        cr->duty,
6252                        cr->nodeid,
6253                        &cr->mpi_comm_mygroup);
6254         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6255     }
6256 #endif
6257
6258     if (fplog)
6259     {
6260         fprintf(fplog, "This rank does only %s work.\n\n",
6261                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6262     }
6263 }
6264
6265 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6266 {
6267     gmx_domdec_t      *dd;
6268     gmx_domdec_comm_t *comm;
6269     int                CartReorder;
6270
6271     dd   = cr->dd;
6272     comm = dd->comm;
6273
6274     copy_ivec(dd->nc, comm->ntot);
6275
6276     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6277     comm->bCartesianPP_PME = FALSE;
6278
6279     /* Reorder the nodes by default. This might change the MPI ranks.
6280      * Real reordering is only supported on very few architectures,
6281      * Blue Gene is one of them.
6282      */
6283     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6284
6285     if (cr->npmenodes > 0)
6286     {
6287         /* Split the communicator into a PP and PME part */
6288         split_communicator(fplog, cr, dd_node_order, CartReorder);
6289         if (comm->bCartesianPP_PME)
6290         {
6291             /* We (possibly) reordered the nodes in split_communicator,
6292              * so it is no longer required in make_pp_communicator.
6293              */
6294             CartReorder = FALSE;
6295         }
6296     }
6297     else
6298     {
6299         /* All nodes do PP and PME */
6300 #ifdef GMX_MPI
6301         /* We do not require separate communicators */
6302         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6303 #endif
6304     }
6305
6306     if (cr->duty & DUTY_PP)
6307     {
6308         /* Copy or make a new PP communicator */
6309         make_pp_communicator(fplog, cr, CartReorder);
6310     }
6311     else
6312     {
6313         receive_ddindex2simnodeid(cr);
6314     }
6315
6316     if (!(cr->duty & DUTY_PME))
6317     {
6318         /* Set up the commnuication to our PME node */
6319         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6320         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6321         if (debug)
6322         {
6323             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6324                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6325         }
6326     }
6327     else
6328     {
6329         dd->pme_nodeid = -1;
6330     }
6331
6332     if (DDMASTER(dd))
6333     {
6334         dd->ma = init_gmx_domdec_master_t(dd,
6335                                           comm->cgs_gl.nr,
6336                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6337     }
6338 }
6339
6340 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6341 {
6342     real  *slb_frac, tot;
6343     int    i, n;
6344     double dbl;
6345
6346     slb_frac = NULL;
6347     if (nc > 1 && size_string != NULL)
6348     {
6349         if (fplog)
6350         {
6351             fprintf(fplog, "Using static load balancing for the %s direction\n",
6352                     dir);
6353         }
6354         snew(slb_frac, nc);
6355         tot = 0;
6356         for (i = 0; i < nc; i++)
6357         {
6358             dbl = 0;
6359             sscanf(size_string, "%20lf%n", &dbl, &n);
6360             if (dbl == 0)
6361             {
6362                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6363             }
6364             slb_frac[i]  = dbl;
6365             size_string += n;
6366             tot         += slb_frac[i];
6367         }
6368         /* Normalize */
6369         if (fplog)
6370         {
6371             fprintf(fplog, "Relative cell sizes:");
6372         }
6373         for (i = 0; i < nc; i++)
6374         {
6375             slb_frac[i] /= tot;
6376             if (fplog)
6377             {
6378                 fprintf(fplog, " %5.3f", slb_frac[i]);
6379             }
6380         }
6381         if (fplog)
6382         {
6383             fprintf(fplog, "\n");
6384         }
6385     }
6386
6387     return slb_frac;
6388 }
6389
6390 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6391 {
6392     int                  n, nmol, ftype;
6393     gmx_mtop_ilistloop_t iloop;
6394     t_ilist             *il;
6395
6396     n     = 0;
6397     iloop = gmx_mtop_ilistloop_init(mtop);
6398     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6399     {
6400         for (ftype = 0; ftype < F_NRE; ftype++)
6401         {
6402             if ((interaction_function[ftype].flags & IF_BOND) &&
6403                 NRAL(ftype) >  2)
6404             {
6405                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6406             }
6407         }
6408     }
6409
6410     return n;
6411 }
6412
/* Read an integer setting from the environment.
 *
 * fplog   : log file, may be NULL; a note is printed when the variable is set
 * env_var : name of the environment variable
 * def     : value returned when the variable is not set
 *
 * When the variable is set but does not start with an integer, 1 is
 * returned, so a variable without a numeric value acts as a switch.
 */
static int dd_getenv(FILE *fplog, const char *env_var, int def)
{
    const char *text  = getenv(env_var);
    int         value = def;

    if (text == NULL)
    {
        /* Variable not set: keep the default and print nothing */
        return value;
    }

    if (sscanf(text, "%20d", &value) <= 0)
    {
        value = 1;
    }

    if (fplog != NULL)
    {
        fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
                env_var, text, value);
    }

    return value;
}
6435
6436 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6437 {
6438     if (MASTER(cr))
6439     {
6440         fprintf(stderr, "\n%s\n", warn_string);
6441     }
6442     if (fplog)
6443     {
6444         fprintf(fplog, "\n%s\n", warn_string);
6445     }
6446 }
6447
6448 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6449                                   t_inputrec *ir, FILE *fplog)
6450 {
6451     if (ir->ePBC == epbcSCREW &&
6452         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6453     {
6454         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6455     }
6456
6457     if (ir->ns_type == ensSIMPLE)
6458     {
6459         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6460     }
6461
6462     if (ir->nstlist == 0)
6463     {
6464         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6465     }
6466
6467     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6468     {
6469         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6470     }
6471 }
6472
6473 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6474 {
6475     int  di, d;
6476     real r;
6477
6478     r = ddbox->box_size[XX];
6479     for (di = 0; di < dd->ndim; di++)
6480     {
6481         d = dd->dim[di];
6482         /* Check using the initial average cell size */
6483         r = std::min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6484     }
6485
6486     return r;
6487 }
6488
6489 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6490                              const char *dlb_opt, gmx_bool bRecordLoad,
6491                              unsigned long Flags, t_inputrec *ir)
6492 {
6493     int           eDLB = -1;
6494     char          buf[STRLEN];
6495
6496     switch (dlb_opt[0])
6497     {
6498         case 'a': eDLB = edlbAUTO; break;
6499         case 'n': eDLB = edlbNO;   break;
6500         case 'y': eDLB = edlbYES;  break;
6501         default: gmx_incons("Unknown dlb_opt");
6502     }
6503
6504     if (Flags & MD_RERUN)
6505     {
6506         return edlbNO;
6507     }
6508
6509     if (!EI_DYNAMICS(ir->eI))
6510     {
6511         if (eDLB == edlbYES)
6512         {
6513             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6514             dd_warning(cr, fplog, buf);
6515         }
6516
6517         return edlbNO;
6518     }
6519
6520     if (!bRecordLoad)
6521     {
6522         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6523
6524         return edlbNO;
6525     }
6526
6527     if (Flags & MD_REPRODUCIBLE)
6528     {
6529         switch (eDLB)
6530         {
6531             case edlbNO:
6532                 break;
6533             case edlbAUTO:
6534                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6535                 eDLB = edlbNO;
6536                 break;
6537             case edlbYES:
6538                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6539                 break;
6540             default:
6541                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
6542                 break;
6543         }
6544     }
6545
6546     return eDLB;
6547 }
6548
6549 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6550 {
6551     int dim;
6552
6553     dd->ndim = 0;
6554     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6555     {
6556         /* Decomposition order z,y,x */
6557         if (fplog)
6558         {
6559             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6560         }
6561         for (dim = DIM-1; dim >= 0; dim--)
6562         {
6563             if (dd->nc[dim] > 1)
6564             {
6565                 dd->dim[dd->ndim++] = dim;
6566             }
6567         }
6568     }
6569     else
6570     {
6571         /* Decomposition order x,y,z */
6572         for (dim = 0; dim < DIM; dim++)
6573         {
6574             if (dd->nc[dim] > 1)
6575             {
6576                 dd->dim[dd->ndim++] = dim;
6577             }
6578         }
6579     }
6580 }
6581
6582 static gmx_domdec_comm_t *init_dd_comm()
6583 {
6584     gmx_domdec_comm_t *comm;
6585     int                i;
6586
6587     snew(comm, 1);
6588     snew(comm->cggl_flag, DIM*2);
6589     snew(comm->cgcm_state, DIM*2);
6590     for (i = 0; i < DIM*2; i++)
6591     {
6592         comm->cggl_flag_nalloc[i]  = 0;
6593         comm->cgcm_state_nalloc[i] = 0;
6594     }
6595
6596     comm->nalloc_int = 0;
6597     comm->buf_int    = NULL;
6598
6599     vec_rvec_init(&comm->vbuf);
6600
6601     comm->n_load_have    = 0;
6602     comm->n_load_collect = 0;
6603
6604     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6605     {
6606         comm->sum_nat[i] = 0;
6607     }
6608     comm->ndecomp   = 0;
6609     comm->nload     = 0;
6610     comm->load_step = 0;
6611     comm->load_sum  = 0;
6612     comm->load_max  = 0;
6613     clear_ivec(comm->load_lim);
6614     comm->load_mdf  = 0;
6615     comm->load_pme  = 0;
6616
6617     return comm;
6618 }
6619
6620 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6621                                         unsigned long Flags,
6622                                         ivec nc,
6623                                         real comm_distance_min, real rconstr,
6624                                         const char *dlb_opt, real dlb_scale,
6625                                         const char *sizex, const char *sizey, const char *sizez,
6626                                         gmx_mtop_t *mtop, t_inputrec *ir,
6627                                         matrix box, rvec *x,
6628                                         gmx_ddbox_t *ddbox,
6629                                         int *npme_x, int *npme_y)
6630 {
6631     gmx_domdec_t      *dd;
6632     gmx_domdec_comm_t *comm;
6633     int                recload;
6634     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6635     gmx_bool           bC;
6636     char               buf[STRLEN];
6637     const real         tenPercentMargin = 1.1;
6638
6639     if (fplog)
6640     {
6641         fprintf(fplog,
6642                 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
6643     }
6644
6645     snew(dd, 1);
6646
6647     dd->comm = init_dd_comm();
6648     comm     = dd->comm;
6649     snew(comm->cggl_flag, DIM*2);
6650     snew(comm->cgcm_state, DIM*2);
6651
6652     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6653     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6654
6655     dd->bSendRecv2      = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
6656     comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
6657     comm->eFlop         = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
6658     recload             = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
6659     comm->nstSortCG     = dd_getenv(fplog, "GMX_DD_NST_SORT_CHARGE_GROUPS", 1);
6660     comm->nstDDDump     = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
6661     comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
6662     comm->DD_debug      = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
6663
6664     dd->pme_recv_f_alloc = 0;
6665     dd->pme_recv_f_buf   = NULL;
6666
6667     if (dd->bSendRecv2 && fplog)
6668     {
6669         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6670     }
6671     if (comm->eFlop)
6672     {
6673         if (fplog)
6674         {
6675             fprintf(fplog, "Will load balance based on FLOP count\n");
6676         }
6677         if (comm->eFlop > 1)
6678         {
6679             srand(1+cr->nodeid);
6680         }
6681         comm->bRecordLoad = TRUE;
6682     }
6683     else
6684     {
6685         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6686
6687     }
6688
6689     /* Initialize to GPU share count to 0, might change later */
6690     comm->nrank_gpu_shared = 0;
6691
6692     comm->eDLB        = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6693     comm->bDLB_locked = FALSE;
6694
6695     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6696     if (fplog)
6697     {
6698         fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
6699     }
6700     dd->bGridJump              = comm->bDynLoadBal;
6701     comm->bPMELoadBalDLBLimits = FALSE;
6702
6703     if (comm->nstSortCG)
6704     {
6705         if (fplog)
6706         {
6707             if (comm->nstSortCG == 1)
6708             {
6709                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6710             }
6711             else
6712             {
6713                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6714                         comm->nstSortCG);
6715             }
6716         }
6717         snew(comm->sort, 1);
6718     }
6719     else
6720     {
6721         if (fplog)
6722         {
6723             fprintf(fplog, "Will not sort the charge groups\n");
6724         }
6725     }
6726
6727     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6728
6729     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6730     if (comm->bInterCGBondeds)
6731     {
6732         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6733     }
6734     else
6735     {
6736         comm->bInterCGMultiBody = FALSE;
6737     }
6738
6739     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6740     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6741
6742     if (ir->rlistlong == 0)
6743     {
6744         /* Set the cut-off to some very large value,
6745          * so we don't need if statements everywhere in the code.
6746          * We use sqrt, since the cut-off is squared in some places.
6747          */
6748         comm->cutoff   = GMX_CUTOFF_INF;
6749     }
6750     else
6751     {
6752         comm->cutoff   = ir->rlistlong;
6753     }
6754     comm->cutoff_mbody = 0;
6755
6756     comm->cellsize_limit = 0;
6757     comm->bBondComm      = FALSE;
6758
6759     /* Atoms should be able to move by up to half the list buffer size (if > 0)
6760      * within nstlist steps. Since boundaries are allowed to displace by half
6761      * a cell size, DD cells should be at least the size of the list buffer.
6762      */
6763     comm->cellsize_limit = std::max(comm->cellsize_limit,
6764                                     ir->rlistlong - std::max(ir->rvdw, ir->rcoulomb));
6765
6766     if (comm->bInterCGBondeds)
6767     {
6768         if (comm_distance_min > 0)
6769         {
6770             comm->cutoff_mbody = comm_distance_min;
6771             if (Flags & MD_DDBONDCOMM)
6772             {
6773                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6774             }
6775             else
6776             {
6777                 comm->cutoff = std::max(comm->cutoff, comm->cutoff_mbody);
6778             }
6779             r_bonded_limit = comm->cutoff_mbody;
6780         }
6781         else if (ir->bPeriodicMols)
6782         {
6783             /* Can not easily determine the required cut-off */
6784             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6785             comm->cutoff_mbody = comm->cutoff/2;
6786             r_bonded_limit     = comm->cutoff_mbody;
6787         }
6788         else
6789         {
6790             if (MASTER(cr))
6791             {
6792                 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6793                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6794             }
6795             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6796             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6797
6798             /* We use an initial margin of 10% for the minimum cell size,
6799              * except when we are just below the non-bonded cut-off.
6800              */
6801             if (Flags & MD_DDBONDCOMM)
6802             {
6803                 if (std::max(r_2b, r_mb) > comm->cutoff)
6804                 {
6805                     r_bonded        = std::max(r_2b, r_mb);
6806                     r_bonded_limit  = tenPercentMargin*r_bonded;
6807                     comm->bBondComm = TRUE;
6808                 }
6809                 else
6810                 {
6811                     r_bonded       = r_mb;
6812                     r_bonded_limit = std::min(tenPercentMargin*r_bonded, comm->cutoff);
6813                 }
6814                 /* We determine cutoff_mbody later */
6815             }
6816             else
6817             {
6818                 /* No special bonded communication,
6819                  * simply increase the DD cut-off.
6820                  */
6821                 r_bonded_limit     = tenPercentMargin*std::max(r_2b, r_mb);
6822                 comm->cutoff_mbody = r_bonded_limit;
6823                 comm->cutoff       = std::max(comm->cutoff, comm->cutoff_mbody);
6824             }
6825         }
6826         if (fplog)
6827         {
6828             fprintf(fplog,
6829                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6830                     r_bonded_limit);
6831         }
6832         comm->cellsize_limit = std::max(comm->cellsize_limit, r_bonded_limit);
6833     }
6834
6835     if (dd->bInterCGcons && rconstr <= 0)
6836     {
6837         /* There is a cell size limit due to the constraints (P-LINCS) */
6838         rconstr = constr_r_max(fplog, mtop, ir);
6839         if (fplog)
6840         {
6841             fprintf(fplog,
6842                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6843                     rconstr);
6844             if (rconstr > comm->cellsize_limit)
6845             {
6846                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6847             }
6848         }
6849     }
6850     else if (rconstr > 0 && fplog)
6851     {
6852         /* Here we do not check for dd->bInterCGcons,
6853          * because one can also set a cell size limit for virtual sites only
6854          * and at this point we don't know yet if there are intercg v-sites.
6855          */
6856         fprintf(fplog,
6857                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6858                 rconstr);
6859     }
6860     comm->cellsize_limit = std::max(comm->cellsize_limit, rconstr);
6861
6862     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6863
6864     if (nc[XX] > 0)
6865     {
6866         copy_ivec(nc, dd->nc);
6867         set_dd_dim(fplog, dd);
6868         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6869
6870         if (cr->npmenodes == -1)
6871         {
6872             cr->npmenodes = 0;
6873         }
6874         acs = average_cellsize_min(dd, ddbox);
6875         if (acs < comm->cellsize_limit)
6876         {
6877             if (fplog)
6878             {
6879                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6880             }
6881             gmx_fatal_collective(FARGS, cr, NULL,
6882                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6883                                  acs, comm->cellsize_limit);
6884         }
6885     }
6886     else
6887     {
6888         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6889
6890         /* We need to choose the optimal DD grid and possibly PME nodes */
6891         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6892                                comm->eDLB != edlbNO, dlb_scale,
6893                                comm->cellsize_limit, comm->cutoff,
6894                                comm->bInterCGBondeds);
6895
6896         if (dd->nc[XX] == 0)
6897         {
6898             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6899             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6900                     !bC ? "-rdd" : "-rcon",
6901                     comm->eDLB != edlbNO ? " or -dds" : "",
6902                     bC ? " or your LINCS settings" : "");
6903
6904             gmx_fatal_collective(FARGS, cr, NULL,
6905                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6906                                  "%s\n"
6907                                  "Look in the log file for details on the domain decomposition",
6908                                  cr->nnodes-cr->npmenodes, limit, buf);
6909         }
6910         set_dd_dim(fplog, dd);
6911     }
6912
6913     if (fplog)
6914     {
6915         fprintf(fplog,
6916                 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6917                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6918     }
6919
6920     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6921     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6922     {
6923         gmx_fatal_collective(FARGS, cr, NULL,
6924                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6925                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6926     }
6927     if (cr->npmenodes > dd->nnodes)
6928     {
6929         gmx_fatal_collective(FARGS, cr, NULL,
6930                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6931     }
6932     if (cr->npmenodes > 0)
6933     {
6934         comm->npmenodes = cr->npmenodes;
6935     }
6936     else
6937     {
6938         comm->npmenodes = dd->nnodes;
6939     }
6940
6941     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6942     {
6943         /* The following choices should match those
6944          * in comm_cost_est in domdec_setup.c.
6945          * Note that here the checks have to take into account
6946          * that the decomposition might occur in a different order than xyz
6947          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6948          * in which case they will not match those in comm_cost_est,
6949          * but since that is mainly for testing purposes that's fine.
6950          */
6951         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6952             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6953             getenv("GMX_PMEONEDD") == NULL)
6954         {
6955             comm->npmedecompdim = 2;
6956             comm->npmenodes_x   = dd->nc[XX];
6957             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6958         }
6959         else
6960         {
6961             /* In case nc is 1 in both x and y we could still choose to
6962              * decompose pme in y instead of x, but we use x for simplicity.
6963              */
6964             comm->npmedecompdim = 1;
6965             if (dd->dim[0] == YY)
6966             {
6967                 comm->npmenodes_x = 1;
6968                 comm->npmenodes_y = comm->npmenodes;
6969             }
6970             else
6971             {
6972                 comm->npmenodes_x = comm->npmenodes;
6973                 comm->npmenodes_y = 1;
6974             }
6975         }
6976         if (fplog)
6977         {
6978             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6979                     comm->npmenodes_x, comm->npmenodes_y, 1);
6980         }
6981     }
6982     else
6983     {
6984         comm->npmedecompdim = 0;
6985         comm->npmenodes_x   = 0;
6986         comm->npmenodes_y   = 0;
6987     }
6988
6989     /* Technically we don't need both of these,
6990      * but it simplifies code not having to recalculate it.
6991      */
6992     *npme_x = comm->npmenodes_x;
6993     *npme_y = comm->npmenodes_y;
6994
6995     snew(comm->slb_frac, DIM);
6996     if (comm->eDLB == edlbNO)
6997     {
6998         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6999         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
7000         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
7001     }
7002
7003     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
7004     {
7005         if (comm->bBondComm || comm->eDLB != edlbNO)
7006         {
7007             /* Set the bonded communication distance to halfway
7008              * the minimum and the maximum,
7009              * since the extra communication cost is nearly zero.
7010              */
7011             acs                = average_cellsize_min(dd, ddbox);
7012             comm->cutoff_mbody = 0.5*(r_bonded + acs);
7013             if (comm->eDLB != edlbNO)
7014             {
7015                 /* Check if this does not limit the scaling */
7016                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, dlb_scale*acs);
7017             }
7018             if (!comm->bBondComm)
7019             {
7020                 /* Without bBondComm do not go beyond the n.b. cut-off */
7021                 comm->cutoff_mbody = std::min(comm->cutoff_mbody, comm->cutoff);
7022                 if (comm->cellsize_limit >= comm->cutoff)
7023                 {
7024                     /* We don't loose a lot of efficieny
7025                      * when increasing it to the n.b. cut-off.
7026                      * It can even be slightly faster, because we need
7027                      * less checks for the communication setup.
7028                      */
7029                     comm->cutoff_mbody = comm->cutoff;
7030                 }
7031             }
7032             /* Check if we did not end up below our original limit */
7033             comm->cutoff_mbody = std::max(comm->cutoff_mbody, r_bonded_limit);
7034
7035             if (comm->cutoff_mbody > comm->cellsize_limit)
7036             {
7037                 comm->cellsize_limit = comm->cutoff_mbody;
7038             }
7039         }
7040         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
7041     }
7042
7043     if (debug)
7044     {
7045         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
7046                 "cellsize limit %f\n",
7047                 comm->bBondComm, comm->cellsize_limit);
7048     }
7049
7050     if (MASTER(cr))
7051     {
7052         check_dd_restrictions(cr, dd, ir, fplog);
7053     }
7054
7055     comm->partition_step = INT_MIN;
7056     dd->ddp_count        = 0;
7057
7058     clear_dd_cycle_counts(dd);
7059
7060     return dd;
7061 }
7062
7063 static void set_dlb_limits(gmx_domdec_t *dd)
7064
7065 {
7066     int d;
7067
7068     for (d = 0; d < dd->ndim; d++)
7069     {
7070         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
7071         dd->comm->cellsize_min[dd->dim[d]] =
7072             dd->comm->cellsize_min_dlb[dd->dim[d]];
7073     }
7074 }
7075
7076
7077 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
7078 {
7079     gmx_domdec_t      *dd;
7080     gmx_domdec_comm_t *comm;
7081     real               cellsize_min;
7082     int                d, nc, i;
7083     char               buf[STRLEN];
7084
7085     dd   = cr->dd;
7086     comm = dd->comm;
7087
7088     if (fplog)
7089     {
7090         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
7091     }
7092
7093     cellsize_min = comm->cellsize_min[dd->dim[0]];
7094     for (d = 1; d < dd->ndim; d++)
7095     {
7096         cellsize_min = std::min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
7097     }
7098
7099     if (cellsize_min < comm->cellsize_limit*1.05)
7100     {
7101         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
7102
7103         /* Change DLB from "auto" to "no". */
7104         comm->eDLB = edlbNO;
7105
7106         return;
7107     }
7108
7109     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
7110     comm->bDynLoadBal = TRUE;
7111     dd->bGridJump     = TRUE;
7112
7113     set_dlb_limits(dd);
7114
7115     /* We can set the required cell size info here,
7116      * so we do not need to communicate this.
7117      * The grid is completely uniform.
7118      */
7119     for (d = 0; d < dd->ndim; d++)
7120     {
7121         if (comm->root[d])
7122         {
7123             comm->load[d].sum_m = comm->load[d].sum;
7124
7125             nc = dd->nc[dd->dim[d]];
7126             for (i = 0; i < nc; i++)
7127             {
7128                 comm->root[d]->cell_f[i]    = i/(real)nc;
7129                 if (d > 0)
7130                 {
7131                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
7132                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
7133                 }
7134             }
7135             comm->root[d]->cell_f[nc] = 1.0;
7136         }
7137     }
7138 }
7139
7140 static char *init_bLocalCG(gmx_mtop_t *mtop)
7141 {
7142     int   ncg, cg;
7143     char *bLocalCG;
7144
7145     ncg = ncg_mtop(mtop);
7146     snew(bLocalCG, ncg);
7147     for (cg = 0; cg < ncg; cg++)
7148     {
7149         bLocalCG[cg] = FALSE;
7150     }
7151
7152     return bLocalCG;
7153 }
7154
7155 void dd_init_bondeds(FILE *fplog,
7156                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
7157                      gmx_vsite_t *vsite,
7158                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
7159 {
7160     gmx_domdec_comm_t *comm;
7161
7162     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
7163
7164     comm = dd->comm;
7165
7166     if (comm->bBondComm)
7167     {
7168         /* Communicate atoms beyond the cut-off for bonded interactions */
7169         comm = dd->comm;
7170
7171         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
7172
7173         comm->bLocalCG = init_bLocalCG(mtop);
7174     }
7175     else
7176     {
7177         /* Only communicate atoms based on cut-off */
7178         comm->cglink   = NULL;
7179         comm->bLocalCG = NULL;
7180     }
7181 }
7182
7183 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
7184                               t_inputrec *ir,
7185                               gmx_bool bDynLoadBal, real dlb_scale,
7186                               gmx_ddbox_t *ddbox)
7187 {
7188     gmx_domdec_comm_t *comm;
7189     int                d;
7190     ivec               np;
7191     real               limit, shrink;
7192     char               buf[64];
7193
7194     if (fplog == NULL)
7195     {
7196         return;
7197     }
7198
7199     comm = dd->comm;
7200
7201     if (bDynLoadBal)
7202     {
7203         fprintf(fplog, "The maximum number of communication pulses is:");
7204         for (d = 0; d < dd->ndim; d++)
7205         {
7206             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
7207         }
7208         fprintf(fplog, "\n");
7209         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
7210         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
7211         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
7212         for (d = 0; d < DIM; d++)
7213         {
7214             if (dd->nc[d] > 1)
7215             {
7216                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7217                 {
7218                     shrink = 0;
7219                 }
7220                 else
7221                 {
7222                     shrink =
7223                         comm->cellsize_min_dlb[d]/
7224                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7225                 }
7226                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
7227             }
7228         }
7229         fprintf(fplog, "\n");
7230     }
7231     else
7232     {
7233         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
7234         fprintf(fplog, "The initial number of communication pulses is:");
7235         for (d = 0; d < dd->ndim; d++)
7236         {
7237             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
7238         }
7239         fprintf(fplog, "\n");
7240         fprintf(fplog, "The initial domain decomposition cell size is:");
7241         for (d = 0; d < DIM; d++)
7242         {
7243             if (dd->nc[d] > 1)
7244             {
7245                 fprintf(fplog, " %c %.2f nm",
7246                         dim2char(d), dd->comm->cellsize_min[d]);
7247             }
7248         }
7249         fprintf(fplog, "\n\n");
7250     }
7251
7252     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7253     {
7254         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7255         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7256                 "non-bonded interactions", "", comm->cutoff);
7257
7258         if (bDynLoadBal)
7259         {
7260             limit = dd->comm->cellsize_limit;
7261         }
7262         else
7263         {
7264             if (dynamic_dd_box(ddbox, ir))
7265             {
7266                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7267             }
7268             limit = dd->comm->cellsize_min[XX];
7269             for (d = 1; d < DIM; d++)
7270             {
7271                 limit = std::min(limit, dd->comm->cellsize_min[d]);
7272             }
7273         }
7274
7275         if (comm->bInterCGBondeds)
7276         {
7277             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7278                     "two-body bonded interactions", "(-rdd)",
7279                     std::max(comm->cutoff, comm->cutoff_mbody));
7280             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7281                     "multi-body bonded interactions", "(-rdd)",
7282                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : std::min(comm->cutoff, limit));
7283         }
7284         if (dd->vsite_comm)
7285         {
7286             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7287                     "virtual site constructions", "(-rcon)", limit);
7288         }
7289         if (dd->constraint_comm)
7290         {
7291             sprintf(buf, "atoms separated by up to %d constraints",
7292                     1+ir->nProjOrder);
7293             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7294                     buf, "(-rcon)", limit);
7295         }
7296         fprintf(fplog, "\n");
7297     }
7298
7299     fflush(fplog);
7300 }
7301
7302 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7303                                 real               dlb_scale,
7304                                 const t_inputrec  *ir,
7305                                 const gmx_ddbox_t *ddbox)
7306 {
7307     gmx_domdec_comm_t *comm;
7308     int                d, dim, npulse, npulse_d_max, npulse_d;
7309     gmx_bool           bNoCutOff;
7310
7311     comm = dd->comm;
7312
7313     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7314
7315     /* Determine the maximum number of comm. pulses in one dimension */
7316
7317     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7318
7319     /* Determine the maximum required number of grid pulses */
7320     if (comm->cellsize_limit >= comm->cutoff)
7321     {
7322         /* Only a single pulse is required */
7323         npulse = 1;
7324     }
7325     else if (!bNoCutOff && comm->cellsize_limit > 0)
7326     {
7327         /* We round down slightly here to avoid overhead due to the latency
7328          * of extra communication calls when the cut-off
7329          * would be only slightly longer than the cell size.
7330          * Later cellsize_limit is redetermined,
7331          * so we can not miss interactions due to this rounding.
7332          */
7333         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7334     }
7335     else
7336     {
7337         /* There is no cell size limit */
7338         npulse = std::max(dd->nc[XX]-1, std::max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7339     }
7340
7341     if (!bNoCutOff && npulse > 1)
7342     {
7343         /* See if we can do with less pulses, based on dlb_scale */
7344         npulse_d_max = 0;
7345         for (d = 0; d < dd->ndim; d++)
7346         {
7347             dim      = dd->dim[d];
7348             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7349                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7350             npulse_d_max = std::max(npulse_d_max, npulse_d);
7351         }
7352         npulse = std::min(npulse, npulse_d_max);
7353     }
7354
7355     /* This env var can override npulse */
7356     d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7357     if (d > 0)
7358     {
7359         npulse = d;
7360     }
7361
7362     comm->maxpulse       = 1;
7363     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7364     for (d = 0; d < dd->ndim; d++)
7365     {
7366         comm->cd[d].np_dlb    = std::min(npulse, dd->nc[dd->dim[d]]-1);
7367         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7368         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7369         comm->maxpulse = std::max(comm->maxpulse, comm->cd[d].np_dlb);
7370         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7371         {
7372             comm->bVacDLBNoLimit = FALSE;
7373         }
7374     }
7375
7376     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7377     if (!comm->bVacDLBNoLimit)
7378     {
7379         comm->cellsize_limit = std::max(comm->cellsize_limit,
7380                                         comm->cutoff/comm->maxpulse);
7381     }
7382     comm->cellsize_limit = std::max(comm->cellsize_limit, comm->cutoff_mbody);
7383     /* Set the minimum cell size for each DD dimension */
7384     for (d = 0; d < dd->ndim; d++)
7385     {
7386         if (comm->bVacDLBNoLimit ||
7387             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7388         {
7389             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7390         }
7391         else
7392         {
7393             comm->cellsize_min_dlb[dd->dim[d]] =
7394                 comm->cutoff/comm->cd[d].np_dlb;
7395         }
7396     }
7397     if (comm->cutoff_mbody <= 0)
7398     {
7399         comm->cutoff_mbody = std::min(comm->cutoff, comm->cellsize_limit);
7400     }
7401     if (comm->bDynLoadBal)
7402     {
7403         set_dlb_limits(dd);
7404     }
7405 }
7406
7407 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7408 {
7409     /* If each molecule is a single charge group
7410      * or we use domain decomposition for each periodic dimension,
7411      * we do not need to take pbc into account for the bonded interactions.
7412      */
7413     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7414             !(dd->nc[XX] > 1 &&
7415               dd->nc[YY] > 1 &&
7416               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7417 }
7418
7419 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7420                        t_inputrec *ir, gmx_ddbox_t *ddbox)
7421 {
7422     gmx_domdec_comm_t *comm;
7423     int                natoms_tot;
7424     real               vol_frac;
7425
7426     comm = dd->comm;
7427
7428     /* Initialize the thread data.
7429      * This can not be done in init_domain_decomposition,
7430      * as the numbers of threads is determined later.
7431      */
7432     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7433     if (comm->nth > 1)
7434     {
7435         snew(comm->dth, comm->nth);
7436     }
7437
7438     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7439     {
7440         init_ddpme(dd, &comm->ddpme[0], 0);
7441         if (comm->npmedecompdim >= 2)
7442         {
7443             init_ddpme(dd, &comm->ddpme[1], 1);
7444         }
7445     }
7446     else
7447     {
7448         comm->npmenodes = 0;
7449         if (dd->pme_nodeid >= 0)
7450         {
7451             gmx_fatal_collective(FARGS, NULL, dd,
7452                                  "Can not have separate PME ranks without PME electrostatics");
7453         }
7454     }
7455
7456     if (debug)
7457     {
7458         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7459     }
7460     if (comm->eDLB != edlbNO)
7461     {
7462         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7463     }
7464
7465     print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
7466     if (comm->eDLB == edlbAUTO)
7467     {
7468         if (fplog)
7469         {
7470             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7471         }
7472         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7473     }
7474
7475     if (ir->ePBC == epbcNONE)
7476     {
7477         vol_frac = 1 - 1/(double)dd->nnodes;
7478     }
7479     else
7480     {
7481         vol_frac =
7482             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7483     }
7484     if (debug)
7485     {
7486         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7487     }
7488     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7489
7490     dd->ga2la = ga2la_init(natoms_tot, static_cast<int>(vol_frac*natoms_tot));
7491 }
7492
7493 static gmx_bool test_dd_cutoff(t_commrec *cr,
7494                                t_state *state, t_inputrec *ir,
7495                                real cutoff_req)
7496 {
7497     gmx_domdec_t *dd;
7498     gmx_ddbox_t   ddbox;
7499     int           d, dim, np;
7500     real          inv_cell_size;
7501     int           LocallyLimited;
7502
7503     dd = cr->dd;
7504
7505     set_ddbox(dd, FALSE, cr, ir, state->box,
7506               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7507
7508     LocallyLimited = 0;
7509
7510     for (d = 0; d < dd->ndim; d++)
7511     {
7512         dim = dd->dim[d];
7513
7514         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7515         if (dynamic_dd_box(&ddbox, ir))
7516         {
7517             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7518         }
7519
7520         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7521
7522         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7523             dd->comm->cd[d].np_dlb > 0)
7524         {
7525             if (np > dd->comm->cd[d].np_dlb)
7526             {
7527                 return FALSE;
7528             }
7529
7530             /* If a current local cell size is smaller than the requested
7531              * cut-off, we could still fix it, but this gets very complicated.
7532              * Without fixing here, we might actually need more checks.
7533              */
7534             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7535             {
7536                 LocallyLimited = 1;
7537             }
7538         }
7539     }
7540
7541     if (dd->comm->eDLB != edlbNO)
7542     {
7543         /* If DLB is not active yet, we don't need to check the grid jumps.
7544          * Actually we shouldn't, because then the grid jump data is not set.
7545          */
7546         if (dd->comm->bDynLoadBal &&
7547             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7548         {
7549             LocallyLimited = 1;
7550         }
7551
7552         gmx_sumi(1, &LocallyLimited, cr);
7553
7554         if (LocallyLimited > 0)
7555         {
7556             return FALSE;
7557         }
7558     }
7559
7560     return TRUE;
7561 }
7562
7563 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7564                           real cutoff_req)
7565 {
7566     gmx_bool bCutoffAllowed;
7567
7568     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7569
7570     if (bCutoffAllowed)
7571     {
7572         cr->dd->comm->cutoff = cutoff_req;
7573     }
7574
7575     return bCutoffAllowed;
7576 }
7577
7578 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7579 {
7580     gmx_domdec_comm_t *comm;
7581
7582     comm = cr->dd->comm;
7583
7584     /* Turn on the DLB limiting (might have been on already) */
7585     comm->bPMELoadBalDLBLimits = TRUE;
7586
7587     /* Change the cut-off limit */
7588     comm->PMELoadBal_max_cutoff = comm->cutoff;
7589 }
7590
7591 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7592 {
7593     return dd->comm->bDLB_locked;
7594 }
7595
7596 void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue)
7597 {
7598     /* We can only lock the DLB when it is set to auto, otherwise don't lock */
7599     if (dd->comm->eDLB == edlbAUTO)
7600     {
7601         dd->comm->bDLB_locked = bValue;
7602     }
7603 }
7604
7605 static void merge_cg_buffers(int ncell,
7606                              gmx_domdec_comm_dim_t *cd, int pulse,
7607                              int  *ncg_cell,
7608                              int  *index_gl, int  *recv_i,
7609                              rvec *cg_cm,    rvec *recv_vr,
7610                              int *cgindex,
7611                              cginfo_mb_t *cginfo_mb, int *cginfo)
7612 {
7613     gmx_domdec_ind_t *ind, *ind_p;
7614     int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
7615     int               shift, shift_at;
7616
7617     ind = &cd->ind[pulse];
7618
7619     /* First correct the already stored data */
7620     shift = ind->nrecv[ncell];
7621     for (cell = ncell-1; cell >= 0; cell--)
7622     {
7623         shift -= ind->nrecv[cell];
7624         if (shift > 0)
7625         {
7626             /* Move the cg's present from previous grid pulses */
7627             cg0                = ncg_cell[ncell+cell];
7628             cg1                = ncg_cell[ncell+cell+1];
7629             cgindex[cg1+shift] = cgindex[cg1];
7630             for (cg = cg1-1; cg >= cg0; cg--)
7631             {
7632                 index_gl[cg+shift] = index_gl[cg];
7633                 copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
7634                 cgindex[cg+shift] = cgindex[cg];
7635                 cginfo[cg+shift]  = cginfo[cg];
7636             }
7637             /* Correct the already stored send indices for the shift */
7638             for (p = 1; p <= pulse; p++)
7639             {
7640                 ind_p = &cd->ind[p];
7641                 cg0   = 0;
7642                 for (c = 0; c < cell; c++)
7643                 {
7644                     cg0 += ind_p->nsend[c];
7645                 }
7646                 cg1 = cg0 + ind_p->nsend[cell];
7647                 for (cg = cg0; cg < cg1; cg++)
7648                 {
7649                     ind_p->index[cg] += shift;
7650                 }
7651             }
7652         }
7653     }
7654
7655     /* Merge in the communicated buffers */
7656     shift    = 0;
7657     shift_at = 0;
7658     cg0      = 0;
7659     for (cell = 0; cell < ncell; cell++)
7660     {
7661         cg1 = ncg_cell[ncell+cell+1] + shift;
7662         if (shift_at > 0)
7663         {
7664             /* Correct the old cg indices */
7665             for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
7666             {
7667                 cgindex[cg+1] += shift_at;
7668             }
7669         }
7670         for (cg = 0; cg < ind->nrecv[cell]; cg++)
7671         {
7672             /* Copy this charge group from the buffer */
7673             index_gl[cg1] = recv_i[cg0];
7674             copy_rvec(recv_vr[cg0], cg_cm[cg1]);
7675             /* Add it to the cgindex */
7676             cg_gl          = index_gl[cg1];
7677             cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
7678             nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
7679             cgindex[cg1+1] = cgindex[cg1] + nat;
7680             cg0++;
7681             cg1++;
7682             shift_at += nat;
7683         }
7684         shift                 += ind->nrecv[cell];
7685         ncg_cell[ncell+cell+1] = cg1;
7686     }
7687 }
7688
7689 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7690                                int nzone, int cg0, const int *cgindex)
7691 {
7692     int cg, zone, p;
7693
7694     /* Store the atom block boundaries for easy copying of communication buffers
7695      */
7696     cg = cg0;
7697     for (zone = 0; zone < nzone; zone++)
7698     {
7699         for (p = 0; p < cd->np; p++)
7700         {
7701             cd->ind[p].cell2at0[zone] = cgindex[cg];
7702             cg += cd->ind[p].nrecv[zone];
7703             cd->ind[p].cell2at1[zone] = cgindex[cg];
7704         }
7705     }
7706 }
7707
7708 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7709 {
7710     int      i;
7711     gmx_bool bMiss;
7712
7713     bMiss = FALSE;
7714     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7715     {
7716         if (!bLocalCG[link->a[i]])
7717         {
7718             bMiss = TRUE;
7719         }
7720     }
7721
7722     return bMiss;
7723 }
7724
/* Domain corners for communication, a maximum of 4 i-zones see a j domain.
 * The first index of c and the index of bc is the DD dimension index
 * (0, 1 or 2), not the Cartesian dimension.
 */
typedef struct {
    real c[DIM][4]; /* the corners for the non-bonded communication */
    real cr0;       /* corner for rounding */
    real cr1[4];    /* corners for rounding */
    real bc[DIM];   /* corners for bonded communication */
    real bcr1;      /* corner for rounding for bonded communication */
} dd_corners_t;
7733
7734 /* Determine the corners of the domain(s) we are communicating with */
7735 static void
7736 set_dd_corners(const gmx_domdec_t *dd,
7737                int dim0, int dim1, int dim2,
7738                gmx_bool bDistMB,
7739                dd_corners_t *c)
7740 {
7741     const gmx_domdec_comm_t  *comm;
7742     const gmx_domdec_zones_t *zones;
7743     int i, j;
7744
7745     comm = dd->comm;
7746
7747     zones = &comm->zones;
7748
7749     /* Keep the compiler happy */
7750     c->cr0  = 0;
7751     c->bcr1 = 0;
7752
7753     /* The first dimension is equal for all cells */
7754     c->c[0][0] = comm->cell_x0[dim0];
7755     if (bDistMB)
7756     {
7757         c->bc[0] = c->c[0][0];
7758     }
7759     if (dd->ndim >= 2)
7760     {
7761         dim1 = dd->dim[1];
7762         /* This cell row is only seen from the first row */
7763         c->c[1][0] = comm->cell_x0[dim1];
7764         /* All rows can see this row */
7765         c->c[1][1] = comm->cell_x0[dim1];
7766         if (dd->bGridJump)
7767         {
7768             c->c[1][1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
7769             if (bDistMB)
7770             {
7771                 /* For the multi-body distance we need the maximum */
7772                 c->bc[1] = std::max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
7773             }
7774         }
7775         /* Set the upper-right corner for rounding */
7776         c->cr0 = comm->cell_x1[dim0];
7777
7778         if (dd->ndim >= 3)
7779         {
7780             dim2 = dd->dim[2];
7781             for (j = 0; j < 4; j++)
7782             {
7783                 c->c[2][j] = comm->cell_x0[dim2];
7784             }
7785             if (dd->bGridJump)
7786             {
7787                 /* Use the maximum of the i-cells that see a j-cell */
7788                 for (i = 0; i < zones->nizone; i++)
7789                 {
7790                     for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
7791                     {
7792                         if (j >= 4)
7793                         {
7794                             c->c[2][j-4] =
7795                                 std::max(c->c[2][j-4],
7796                                          comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
7797                         }
7798                     }
7799                 }
7800                 if (bDistMB)
7801                 {
7802                     /* For the multi-body distance we need the maximum */
7803                     c->bc[2] = comm->cell_x0[dim2];
7804                     for (i = 0; i < 2; i++)
7805                     {
7806                         for (j = 0; j < 2; j++)
7807                         {
7808                             c->bc[2] = std::max(c->bc[2], comm->zone_d2[i][j].p1_0);
7809                         }
7810                     }
7811                 }
7812             }
7813
7814             /* Set the upper-right corner for rounding */
7815             /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
7816              * Only cell (0,0,0) can see cell 7 (1,1,1)
7817              */
7818             c->cr1[0] = comm->cell_x1[dim1];
7819             c->cr1[3] = comm->cell_x1[dim1];
7820             if (dd->bGridJump)
7821             {
7822                 c->cr1[0] = std::max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
7823                 if (bDistMB)
7824                 {
7825                     /* For the multi-body distance we need the maximum */
7826                     c->bcr1 = std::max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
7827                 }
7828             }
7829         }
7830     }
7831 }
7832
7833 /* Determine which cg's we need to send in this pulse from this zone */
7834 static void
7835 get_zone_pulse_cgs(gmx_domdec_t *dd,
7836                    int zonei, int zone,
7837                    int cg0, int cg1,
7838                    const int *index_gl,
7839                    const int *cgindex,
7840                    int dim, int dim_ind,
7841                    int dim0, int dim1, int dim2,
7842                    real r_comm2, real r_bcomm2,
7843                    matrix box,
7844                    ivec tric_dist,
7845                    rvec *normal,
7846                    real skew_fac2_d, real skew_fac_01,
7847                    rvec *v_d, rvec *v_0, rvec *v_1,
7848                    const dd_corners_t *c,
7849                    rvec sf2_round,
7850                    gmx_bool bDistBonded,
7851                    gmx_bool bBondComm,
7852                    gmx_bool bDist2B,
7853                    gmx_bool bDistMB,
7854                    rvec *cg_cm,
7855                    int *cginfo,
7856                    gmx_domdec_ind_t *ind,
7857                    int **ibuf, int *ibuf_nalloc,
7858                    vec_rvec_t *vbuf,
7859                    int *nsend_ptr,
7860                    int *nat_ptr,
7861                    int *nsend_z_ptr)
7862 {
7863     gmx_domdec_comm_t *comm;
7864     gmx_bool           bScrew;
7865     gmx_bool           bDistMB_pulse;
7866     int                cg, i;
7867     real               r2, rb2, r, tric_sh;
7868     rvec               rn, rb;
7869     int                dimd;
7870     int                nsend_z, nsend, nat;
7871
7872     comm = dd->comm;
7873
7874     bScrew = (dd->bScrewPBC && dim == XX);
7875
7876     bDistMB_pulse = (bDistMB && bDistBonded);
7877
7878     nsend_z = 0;
7879     nsend   = *nsend_ptr;
7880     nat     = *nat_ptr;
7881
7882     for (cg = cg0; cg < cg1; cg++)
7883     {
7884         r2  = 0;
7885         rb2 = 0;
7886         if (tric_dist[dim_ind] == 0)
7887         {
7888             /* Rectangular direction, easy */
7889             r = cg_cm[cg][dim] - c->c[dim_ind][zone];
7890             if (r > 0)
7891             {
7892                 r2 += r*r;
7893             }
7894             if (bDistMB_pulse)
7895             {
7896                 r = cg_cm[cg][dim] - c->bc[dim_ind];
7897                 if (r > 0)
7898                 {
7899                     rb2 += r*r;
7900                 }
7901             }
7902             /* Rounding gives at most a 16% reduction
7903              * in communicated atoms
7904              */
7905             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7906             {
7907                 r = cg_cm[cg][dim0] - c->cr0;
7908                 /* This is the first dimension, so always r >= 0 */
7909                 r2 += r*r;
7910                 if (bDistMB_pulse)
7911                 {
7912                     rb2 += r*r;
7913                 }
7914             }
7915             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7916             {
7917                 r = cg_cm[cg][dim1] - c->cr1[zone];
7918                 if (r > 0)
7919                 {
7920                     r2 += r*r;
7921                 }
7922                 if (bDistMB_pulse)
7923                 {
7924                     r = cg_cm[cg][dim1] - c->bcr1;
7925                     if (r > 0)
7926                     {
7927                         rb2 += r*r;
7928                     }
7929                 }
7930             }
7931         }
7932         else
7933         {
7934             /* Triclinic direction, more complicated */
7935             clear_rvec(rn);
7936             clear_rvec(rb);
7937             /* Rounding, conservative as the skew_fac multiplication
7938              * will slightly underestimate the distance.
7939              */
7940             if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
7941             {
7942                 rn[dim0] = cg_cm[cg][dim0] - c->cr0;
7943                 for (i = dim0+1; i < DIM; i++)
7944                 {
7945                     rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
7946                 }
7947                 r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
7948                 if (bDistMB_pulse)
7949                 {
7950                     rb[dim0] = rn[dim0];
7951                     rb2      = r2;
7952                 }
7953                 /* Take care that the cell planes along dim0 might not
7954                  * be orthogonal to those along dim1 and dim2.
7955                  */
7956                 for (i = 1; i <= dim_ind; i++)
7957                 {
7958                     dimd = dd->dim[i];
7959                     if (normal[dim0][dimd] > 0)
7960                     {
7961                         rn[dimd] -= rn[dim0]*normal[dim0][dimd];
7962                         if (bDistMB_pulse)
7963                         {
7964                             rb[dimd] -= rb[dim0]*normal[dim0][dimd];
7965                         }
7966                     }
7967                 }
7968             }
7969             if (dim_ind == 2 && (zonei == 2 || zonei == 3))
7970             {
7971                 rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
7972                 tric_sh   = 0;
7973                 for (i = dim1+1; i < DIM; i++)
7974                 {
7975                     tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
7976                 }
7977                 rn[dim1] += tric_sh;
7978                 if (rn[dim1] > 0)
7979                 {
7980                     r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
7981                     /* Take care of coupling of the distances
7982                      * to the planes along dim0 and dim1 through dim2.
7983                      */
7984                     r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
7985                     /* Take care that the cell planes along dim1
7986                      * might not be orthogonal to that along dim2.
7987                      */
7988                     if (normal[dim1][dim2] > 0)
7989                     {
7990                         rn[dim2] -= rn[dim1]*normal[dim1][dim2];
7991                     }
7992                 }
7993                 if (bDistMB_pulse)
7994                 {
7995                     rb[dim1] +=
7996                         cg_cm[cg][dim1] - c->bcr1 + tric_sh;
7997                     if (rb[dim1] > 0)
7998                     {
7999                         rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
8000                         /* Take care of coupling of the distances
8001                          * to the planes along dim0 and dim1 through dim2.
8002                          */
8003                         rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
8004                         /* Take care that the cell planes along dim1
8005                          * might not be orthogonal to that along dim2.
8006                          */
8007                         if (normal[dim1][dim2] > 0)
8008                         {
8009                             rb[dim2] -= rb[dim1]*normal[dim1][dim2];
8010                         }
8011                     }
8012                 }
8013             }
8014             /* The distance along the communication direction */
8015             rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
8016             tric_sh  = 0;
8017             for (i = dim+1; i < DIM; i++)
8018             {
8019                 tric_sh -= cg_cm[cg][i]*v_d[i][dim];
8020             }
8021             rn[dim] += tric_sh;
8022             if (rn[dim] > 0)
8023             {
8024                 r2 += rn[dim]*rn[dim]*skew_fac2_d;
8025                 /* Take care of coupling of the distances
8026                  * to the planes along dim0 and dim1 through dim2.
8027                  */
8028                 if (dim_ind == 1 && zonei == 1)
8029                 {
8030                     r2 -= rn[dim0]*rn[dim]*skew_fac_01;
8031                 }
8032             }
8033             if (bDistMB_pulse)
8034             {
8035                 clear_rvec(rb);
8036                 rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
8037                 if (rb[dim] > 0)
8038                 {
8039                     rb2 += rb[dim]*rb[dim]*skew_fac2_d;
8040                     /* Take care of coupling of the distances
8041                      * to the planes along dim0 and dim1 through dim2.
8042                      */
8043                     if (dim_ind == 1 && zonei == 1)
8044                     {
8045                         rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
8046                     }
8047                 }
8048             }
8049         }
8050
8051         if (r2 < r_comm2 ||
8052             (bDistBonded &&
8053              ((bDistMB && rb2 < r_bcomm2) ||
8054               (bDist2B && r2  < r_bcomm2)) &&
8055              (!bBondComm ||
8056               (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
8057                missing_link(comm->cglink, index_gl[cg],
8058                             comm->bLocalCG)))))
8059         {
8060             /* Make an index to the local charge groups */
8061             if (nsend+1 > ind->nalloc)
8062             {
8063                 ind->nalloc = over_alloc_large(nsend+1);
8064                 srenew(ind->index, ind->nalloc);
8065             }
8066             if (nsend+1 > *ibuf_nalloc)
8067             {
8068                 *ibuf_nalloc = over_alloc_large(nsend+1);
8069                 srenew(*ibuf, *ibuf_nalloc);
8070             }
8071             ind->index[nsend] = cg;
8072             (*ibuf)[nsend]    = index_gl[cg];
8073             nsend_z++;
8074             vec_rvec_check_alloc(vbuf, nsend+1);
8075
8076             if (dd->ci[dim] == 0)
8077             {
8078                 /* Correct cg_cm for pbc */
8079                 rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
8080                 if (bScrew)
8081                 {
8082                     vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
8083                     vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
8084                 }
8085             }
8086             else
8087             {
8088                 copy_rvec(cg_cm[cg], vbuf->v[nsend]);
8089             }
8090             nsend++;
8091             nat += cgindex[cg+1] - cgindex[cg];
8092         }
8093     }
8094
8095     *nsend_ptr   = nsend;
8096     *nat_ptr     = nat;
8097     *nsend_z_ptr = nsend_z;
8098 }
8099
/* Set up the zone (halo) communication for the current domain decomposition.
 *
 * For every decomposition dimension (dd->ndim of them) and every pulse
 * (cd->np per dimension) this routine:
 *  - selects, with get_zone_pulse_cgs(), the charge groups that have to be
 *    sent; the work is split over comm->nth OpenMP threads and the results of
 *    threads >= 1 are appended to the buffers filled by thread 0,
 *  - exchanges the send/receive counts, the global charge-group indices and
 *    the cg_cm positions with dd_sendrecv_int()/dd_sendrecv_rvec() in
 *    direction dddirBackward,
 *  - stores the received charge groups behind the ones already present,
 *    either directly in place or through merge_cg_buffers().
 *
 * dd     domain decomposition data; index_gl, cgindex, ncg_tot, nat_tot,
 *        the zone charge-group ranges and comm->nat[] are updated
 * box    box matrix, passed on to get_zone_pulse_cgs()
 * ddbox  supplies tric_dir, normal, v, skew_fac and npbcdim
 * fr     supplies cutoff_scheme, cg_cm, cginfo and cginfo_mb;
 *        fr->cginfo is filled for the received charge groups
 * state  state->x is used as the position array with ecutsVERLET
 * f      passed on to dd_check_alloc_ncg()
 */
8100 static void setup_dd_communication(gmx_domdec_t *dd,
8101                                    matrix box, gmx_ddbox_t *ddbox,
8102                                    t_forcerec *fr, t_state *state, rvec **f)
8103 {
8104     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
8105     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
8106     int                    c, i, cg, cg_gl, nrcg;
8107     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
8108     gmx_domdec_comm_t     *comm;
8109     gmx_domdec_zones_t    *zones;
8110     gmx_domdec_comm_dim_t *cd;
8111     gmx_domdec_ind_t      *ind;
8112     cginfo_mb_t           *cginfo_mb;
8113     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
8114     real                   r_comm2, r_bcomm2;
8115     dd_corners_t           corners;
8116     ivec                   tric_dist;
8117     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
8118     real                   skew_fac2_d, skew_fac_01;
8119     rvec                   sf2_round;
8120     int                    nsend, nat;
8121     int                    th;
8122
8123     if (debug)
8124     {
8125         fprintf(debug, "Setting up DD communication\n");
8126     }
8127
8128     comm  = dd->comm;
8129
         /* Pick the position array that matches the cut-off scheme */
8130     switch (fr->cutoff_scheme)
8131     {
8132         case ecutsGROUP:
8133             cg_cm = fr->cg_cm;
8134             break;
8135         case ecutsVERLET:
8136             cg_cm = state->x;
8137             break;
8138         default:
8139             gmx_incons("unimplemented");
8140             cg_cm = NULL;
8141     }
8142
8143     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8144     {
8145         /* Check if we need to use triclinic distances */
8146         tric_dist[dim_ind] = 0;
8147         for (i = 0; i <= dim_ind; i++)
8148         {
8149             if (ddbox->tric_dir[dd->dim[i]])
8150             {
8151                 tric_dist[dim_ind] = 1;
8152             }
8153         }
8154     }
8155
8156     bBondComm = comm->bBondComm;
8157
8158     /* Do we need to determine extra distances for multi-body bondeds? */
8159     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8160
8161     /* Do we need to determine extra distances for only two-body bondeds? */
8162     bDist2B = (bBondComm && !bDistMB);
8163
         /* Squared cut-offs, compared against squared distances */
8164     r_comm2  = sqr(comm->cutoff);
8165     r_bcomm2 = sqr(comm->cutoff_mbody);
8166
8167     if (debug)
8168     {
8169         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
8170     }
8171
8172     zones = &comm->zones;
8173
         /* Decomposition dimensions; -1 marks a dimension that is not decomposed */
8174     dim0 = dd->dim[0];
8175     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8176     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8177
8178     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8179
8180     /* Triclinic stuff */
8181     normal      = ddbox->normal;
8182     skew_fac_01 = 0;
8183     if (dd->ndim >= 2)
8184     {
8185         v_0 = ddbox->v[dim0];
8186         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8187         {
8188             /* Determine the coupling coefficient for the distances
8189              * to the cell planes along dim0 and dim1 through dim2.
8190              * This is required for correct rounding.
8191              */
8192             skew_fac_01 =
8193                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8194             if (debug)
8195             {
8196                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8197             }
8198         }
8199     }
8200     if (dd->ndim >= 3)
8201     {
8202         v_1 = ddbox->v[dim1];
8203     }
8204
         /* Work on local copies of the index pointers: they can be reallocated
          * below and are stored back into dd after the dimension loop.
          */
8205     zone_cg_range = zones->cg_range;
8206     index_gl      = dd->index_gl;
8207     cgindex       = dd->cgindex;
8208     cginfo_mb     = fr->cginfo_mb;
8209
         /* Zone 0 holds the home charge groups; received ones are appended */
8210     zone_cg_range[0]   = 0;
8211     zone_cg_range[1]   = dd->ncg_home;
8212     comm->zone_ncg1[0] = dd->ncg_home;
8213     pos_cg             = dd->ncg_home;
8214
8215     nat_tot = dd->nat_home;
8216     nzone   = 1;
8217     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8218     {
8219         dim = dd->dim[dim_ind];
8220         cd  = &comm->cd[dim_ind];
8221
8222         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8223         {
8224             /* No pbc in this dimension, the first node should not comm. */
8225             nzone_send = 0;
8226         }
8227         else
8228         {
8229             nzone_send = nzone;
8230         }
8231
8232         v_d         = ddbox->v[dim];
8233         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8234
             /* Start out receiving in place; a pulse p > 0 can switch this off */
8235         cd->bInPlace = TRUE;
8236         for (p = 0; p < cd->np; p++)
8237         {
8238             /* Only atoms communicated in the first pulse are used
8239              * for multi-body bonded interactions or for bBondComm.
8240              */
8241             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8242
8243             ind   = &cd->ind[p];
8244             nsend = 0;
8245             nat   = 0;
8246             for (zone = 0; zone < nzone_send; zone++)
8247             {
8248                 if (tric_dist[dim_ind] && dim_ind > 0)
8249                 {
8250                     /* Determine slightly more optimized skew_fac's
8251                      * for rounding.
8252                      * This reduces the number of communicated atoms
8253                      * by about 10% for 3D DD of rhombic dodecahedra.
8254                      */
8255                     for (dimd = 0; dimd < dim; dimd++)
8256                     {
8257                         sf2_round[dimd] = 1;
8258                         if (ddbox->tric_dir[dimd])
8259                         {
8260                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8261                             {
8262                                 /* If we are shifted in dimension i
8263                                  * and the cell plane is tilted forward
8264                                  * in dimension i, skip this coupling.
8265                                  */
8266                                 if (!(zones->shift[nzone+zone][i] &&
8267                                       ddbox->v[dimd][i][dimd] >= 0))
8268                                 {
8269                                     sf2_round[dimd] +=
8270                                         sqr(ddbox->v[dimd][i][dimd]);
8271                                 }
8272                             }
8273                             sf2_round[dimd] = 1/sf2_round[dimd];
8274                         }
8275                     }
8276                 }
8277
8278                 zonei = zone_perm[dim_ind][zone];
8279                 if (p == 0)
8280                 {
8281                     /* Here we permutate the zones to obtain a convenient order
8282                      * for neighbor searching
8283                      */
8284                     cg0 = zone_cg_range[zonei];
8285                     cg1 = zone_cg_range[zonei+1];
8286                 }
8287                 else
8288                 {
8289                     /* Look only at the cg's received in the previous grid pulse
8290                      */
8291                     cg1 = zone_cg_range[nzone+zone+1];
8292                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8293                 }
8294
                     /* One iteration per thread: each handles its own slice
                      * of the charge-group range [cg0, cg1).
                      */
8295 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8296                 for (th = 0; th < comm->nth; th++)
8297                 {
8298                     gmx_domdec_ind_t *ind_p;
8299                     int             **ibuf_p, *ibuf_nalloc_p;
8300                     vec_rvec_t       *vbuf_p;
8301                     int              *nsend_p, *nat_p;
8302                     int              *nsend_zone_p;
8303                     int               cg0_th, cg1_th;
8304
8305                     if (th == 0)
8306                     {
8307                         /* Thread 0 writes in the comm buffers */
8308                         ind_p         = ind;
8309                         ibuf_p        = &comm->buf_int;
8310                         ibuf_nalloc_p = &comm->nalloc_int;
8311                         vbuf_p        = &comm->vbuf;
8312                         nsend_p       = &nsend;
8313                         nat_p         = &nat;
8314                         nsend_zone_p  = &ind->nsend[zone];
8315                     }
8316                     else
8317                     {
8318                         /* Other threads write into temp buffers */
8319                         ind_p         = &comm->dth[th].ind;
8320                         ibuf_p        = &comm->dth[th].ibuf;
8321                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8322                         vbuf_p        = &comm->dth[th].vbuf;
8323                         nsend_p       = &comm->dth[th].nsend;
8324                         nat_p         = &comm->dth[th].nat;
8325                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8326
8327                         comm->dth[th].nsend      = 0;
8328                         comm->dth[th].nat        = 0;
8329                         comm->dth[th].nsend_zone = 0;
8330                     }
8331
                         /* Integer split of [cg0, cg1) into comm->nth
                          * consecutive slices.
                          */
8332                     if (comm->nth == 1)
8333                     {
8334                         cg0_th = cg0;
8335                         cg1_th = cg1;
8336                     }
8337                     else
8338                     {
8339                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8340                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8341                     }
8342
8343                     /* Get the cg's for this pulse in this zone */
8344                     get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8345                                        index_gl, cgindex,
8346                                        dim, dim_ind, dim0, dim1, dim2,
8347                                        r_comm2, r_bcomm2,
8348                                        box, tric_dist,
8349                                        normal, skew_fac2_d, skew_fac_01,
8350                                        v_d, v_0, v_1, &corners, sf2_round,
8351                                        bDistBonded, bBondComm,
8352                                        bDist2B, bDistMB,
8353                                        cg_cm, fr->cginfo,
8354                                        ind_p,
8355                                        ibuf_p, ibuf_nalloc_p,
8356                                        vbuf_p,
8357                                        nsend_p, nat_p,
8358                                        nsend_zone_p);
8359                 }
8360
8361                 /* Append data of threads>=1 to the communication buffers */
8362                 for (th = 1; th < comm->nth; th++)
8363                 {
8364                     dd_comm_setup_work_t *dth;
8365                     int                   i, ns1;
8366
8367                     dth = &comm->dth[th];
8368
                         /* Grow the shared buffers so this thread's entries fit */
8369                     ns1 = nsend + dth->nsend_zone;
8370                     if (ns1 > ind->nalloc)
8371                     {
8372                         ind->nalloc = over_alloc_dd(ns1);
8373                         srenew(ind->index, ind->nalloc);
8374                     }
8375                     if (ns1 > comm->nalloc_int)
8376                     {
8377                         comm->nalloc_int = over_alloc_dd(ns1);
8378                         srenew(comm->buf_int, comm->nalloc_int);
8379                     }
8380                     if (ns1 > comm->vbuf.nalloc)
8381                     {
8382                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8383                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8384                     }
8385
8386                     for (i = 0; i < dth->nsend_zone; i++)
8387                     {
8388                         ind->index[nsend]    = dth->ind.index[i];
8389                         comm->buf_int[nsend] = dth->ibuf[i];
8390                         copy_rvec(dth->vbuf.v[i],
8391                                   comm->vbuf.v[nsend]);
8392                         nsend++;
8393                     }
8394                     nat              += dth->nat;
8395                     ind->nsend[zone] += dth->nsend_zone;
8396                 }
8397             }
8398             /* Clear the counts in case we do not have pbc */
8399             for (zone = nzone_send; zone < nzone; zone++)
8400             {
8401                 ind->nsend[zone] = 0;
8402             }
                 /* The two entries after the per-zone counts hold the totals:
                  * the number of charge groups and the number of atoms.
                  */
8403             ind->nsend[nzone]   = nsend;
8404             ind->nsend[nzone+1] = nat;
8405             /* Communicate the number of cg's and atoms to receive */
8406             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8407                             ind->nsend, nzone+2,
8408                             ind->nrecv, nzone+2);
8409
8410             /* The rvec buffer is also required for atom buffers of size nsend
8411              * in dd_move_x and dd_move_f.
8412              */
8413             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8414
8415             if (p > 0)
8416             {
8417                 /* We can receive in place if only the last zone is not empty */
8418                 for (zone = 0; zone < nzone-1; zone++)
8419                 {
8420                     if (ind->nrecv[zone] > 0)
8421                     {
8422                         cd->bInPlace = FALSE;
8423                     }
8424                 }
8425                 if (!cd->bInPlace)
8426                 {
8427                     /* The int buffer is only required here for the cg indices */
8428                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8429                     {
8430                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8431                         srenew(comm->buf_int2, comm->nalloc_int2);
8432                     }
8433                     /* The rvec buffer is also required for atom buffers
8434                      * of size nrecv in dd_move_x and dd_move_f.
8435                      */
8436                     i = std::max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8437                     vec_rvec_check_alloc(&comm->vbuf2, i);
8438                 }
8439             }
8440
8441             /* Make space for the global cg indices */
8442             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8443                 || dd->cg_nalloc == 0)
8444             {
8445                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8446                 srenew(index_gl, dd->cg_nalloc);
8447                 srenew(cgindex, dd->cg_nalloc+1);
8448             }
8449             /* Communicate the global cg indices */
8450             if (cd->bInPlace)
8451             {
8452                 recv_i = index_gl + pos_cg;
8453             }
8454             else
8455             {
8456                 recv_i = comm->buf_int2;
8457             }
8458             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8459                             comm->buf_int, nsend,
8460                             recv_i,        ind->nrecv[nzone]);
8461
8462             /* Make space for cg_cm */
8463             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
                 /* Re-read the position pointer; presumably dd_check_alloc_ncg
                  * can reallocate fr->cg_cm / state->x (confirm in its definition).
                  */
8464             if (fr->cutoff_scheme == ecutsGROUP)
8465             {
8466                 cg_cm = fr->cg_cm;
8467             }
8468             else
8469             {
8470                 cg_cm = state->x;
8471             }
8472             /* Communicate cg_cm */
8473             if (cd->bInPlace)
8474             {
8475                 recv_vr = cg_cm + pos_cg;
8476             }
8477             else
8478             {
8479                 recv_vr = comm->vbuf2.v;
8480             }
8481             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8482                              comm->vbuf.v, nsend,
8483                              recv_vr,      ind->nrecv[nzone]);
8484
8485             /* Make the charge group index */
8486             if (cd->bInPlace)
8487             {
                     /* The first pulse fills all new zones; later in-place
                      * pulses only add to the last zone.
                      */
8488                 zone = (p == 0 ? 0 : nzone - 1);
8489                 while (zone < nzone)
8490                 {
8491                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8492                     {
8493                         cg_gl              = index_gl[pos_cg];
8494                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8495                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8496                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8497                         if (bBondComm)
8498                         {
8499                             /* Update the charge group presence,
8500                              * so we can use it in the next pass of the loop.
8501                              */
8502                             comm->bLocalCG[cg_gl] = TRUE;
8503                         }
8504                         pos_cg++;
8505                     }
8506                     if (p == 0)
8507                     {
8508                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8509                     }
8510                     zone++;
8511                     zone_cg_range[nzone+zone] = pos_cg;
8512                 }
8513             }
8514             else
8515             {
8516                 /* This part of the code is never executed with bBondComm. */
8517                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8518                                  index_gl, recv_i, cg_cm, recv_vr,
8519                                  cgindex, fr->cginfo_mb, fr->cginfo);
8520                 pos_cg += ind->nrecv[nzone];
8521             }
8522             nat_tot += ind->nrecv[nzone+1];
8523         }
8524         if (!cd->bInPlace)
8525         {
8526             /* Store the atom block for easy copying of communication buffers */
8527             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8528         }
             /* Each decomposition dimension doubles the number of zones */
8529         nzone += nzone;
8530     }
         /* Store the (possibly reallocated) index arrays back */
8531     dd->index_gl = index_gl;
8532     dd->cgindex  = cgindex;
8533
8534     dd->ncg_tot          = zone_cg_range[zones->n];
8535     dd->nat_tot          = nat_tot;
8536     comm->nat[ddnatHOME] = dd->nat_home;
8537     for (i = ddnatZONE; i < ddnatNR; i++)
8538     {
8539         comm->nat[i] = dd->nat_tot;
8540     }
8541
8542     if (!bBondComm)
8543     {
8544         /* We don't need to update cginfo, since that was already done above.
8545          * So we pass NULL for the forcerec.
8546          */
8547         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8548                       NULL, comm->bLocalCG);
8549     }
8550
8551     if (debug)
8552     {
8553         fprintf(debug, "Finished setting up DD communication, zones:");
8554         for (c = 0; c < zones->n; c++)
8555         {
8556             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8557         }
8558         fprintf(debug, "\n");
8559     }
8560 }
8561
8562 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8563 {
8564     int c;
8565
8566     for (c = 0; c < zones->nizone; c++)
8567     {
8568         zones->izone[c].cg1  = zones->cg_range[c+1];
8569         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8570         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8571     }
8572 }
8573
/* Determine the spatial limits and bounding boxes of the zones
 * zone_start <= z < zone_end.
 *
 * Steps, in order:
 *  - x0/x1 of the zones in the range are initialised from the home cell
 *    limits comm->cell_x0/comm->cell_x1.
 *  - With dd->bGridJump set, the limits of all zones (0..zones->n-1) along
 *    non-shifted decomposition dimensions with index 1 and 2 are taken from
 *    comm->zone_d1/comm->zone_d2.
 *  - Along shifted dimensions the lower limit is set and the upper limit is
 *    extended by the cut-off (comm->cutoff, or comm->cutoff_mbody when
 *    multi-body distances are needed); both cut-offs are divided by
 *    ddbox->skew_fac[dim] for triclinic dimensions.
 *  - bb_x0/bb_x1 are computed from the extreme zone corners after applying
 *    the triclinic couplings of box.
 *  - When zone_start == 0, zones->dens_zone0 is set to the number of charge
 *    groups in zone 0 divided by the volume of zone 0.
 * When the debug file is open the resulting limits are printed to it.
 */
8574 static void set_zones_size(gmx_domdec_t *dd,
8575                            matrix box, const gmx_ddbox_t *ddbox,
8576                            int zone_start, int zone_end)
8577 {
8578     gmx_domdec_comm_t  *comm;
8579     gmx_domdec_zones_t *zones;
8580     gmx_bool            bDistMB;
8581     int                 z, zi, d, dim;
8582     real                rcs, rcmbs;
8583     int                 i, j;
8584     real                vol;
8585
8586     comm = dd->comm;
8587
8588     zones = &comm->zones;
8589
8590     /* Do we need to determine extra distances for multi-body bondeds? */
8591     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8592
8593     for (z = zone_start; z < zone_end; z++)
8594     {
8595         /* Copy cell limits to zone limits.
8596          * Valid for non-DD dims and non-shifted dims.
8597          */
8598         copy_rvec(comm->cell_x0, zones->size[z].x0);
8599         copy_rvec(comm->cell_x1, zones->size[z].x1);
8600     }
8601
8602     for (d = 0; d < dd->ndim; d++)
8603     {
8604         dim = dd->dim[d];
8605
             /* Note: this loop covers all zones, not only zone_start..zone_end-1 */
8606         for (z = 0; z < zones->n; z++)
8607         {
8608             /* With a staggered grid we have different sizes
8609              * for non-shifted dimensions.
8610              */
8611             if (dd->bGridJump && zones->shift[z][dim] == 0)
8612             {
8613                 if (d == 1)
8614                 {
8615                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8616                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8617                 }
8618                 else if (d == 2)
8619                 {
8620                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8621                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8622                 }
8623             }
8624         }
8625
             /* Cut-off distances along dim, scaled up for triclinic dimensions */
8626         rcs   = comm->cutoff;
8627         rcmbs = comm->cutoff_mbody;
8628         if (ddbox->tric_dir[dim])
8629         {
8630             rcs   /= ddbox->skew_fac[dim];
8631             rcmbs /= ddbox->skew_fac[dim];
8632         }
8633
8634         /* Set the lower limit for the shifted zone dimensions */
8635         for (z = zone_start; z < zone_end; z++)
8636         {
8637             if (zones->shift[z][dim] > 0)
8638             {
                     /* Redundant: dim already equals dd->dim[d] at this point */
8639                 dim = dd->dim[d];
8640                 if (!dd->bGridJump || d == 0)
8641                 {
8642                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8643                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8644                 }
8645                 else
8646                 {
8647                     /* Here we take the lower limit of the zone from
8648                      * the lowest domain of the zone below.
8649                      */
8650                     if (z < 4)
8651                     {
8652                         zones->size[z].x0[dim] =
8653                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8654                     }
8655                     else
8656                     {
8657                         if (d == 1)
8658                         {
8659                             zones->size[z].x0[dim] =
8660                                 zones->size[zone_perm[2][z-4]].x0[dim];
8661                         }
8662                         else
8663                         {
8664                             zones->size[z].x0[dim] =
8665                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8666                         }
8667                     }
8668                     /* A temporary limit, is updated below */
8669                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8670
8671                     if (bDistMB)
8672                     {
8673                         for (zi = 0; zi < zones->nizone; zi++)
8674                         {
8675                             if (zones->shift[zi][dim] == 0)
8676                             {
8677                                 /* This takes the whole zone into account.
8678                                  * With multiple pulses this will lead
8679                                  * to a larger zone than strictly necessary.
8680                                  */
8681                                 zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8682                                                                   zones->size[zi].x1[dim]+rcmbs);
8683                             }
8684                         }
8685                     }
8686                 }
8687             }
8688         }
8689
8690         /* Loop over the i-zones to set the upper limit of each
8691          * j-zone they see.
8692          */
8693         for (zi = 0; zi < zones->nizone; zi++)
8694         {
8695             if (zones->shift[zi][dim] == 0)
8696             {
8697                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8698                 {
8699                     if (zones->shift[z][dim] > 0)
8700                     {
8701                         zones->size[z].x1[dim] = std::max(zones->size[z].x1[dim],
8702                                                           zones->size[zi].x1[dim]+rcs);
8703                     }
8704                 }
8705             }
8706         }
8707     }
8708
8709     for (z = zone_start; z < zone_end; z++)
8710     {
8711         /* Initialization only required to keep the compiler happy */
8712         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8713         int  nc, c;
8714
8715         /* To determine the bounding box for a zone we need to find
8716          * the extreme corners of 4, 2 or 1 corners.
8717          */
             /* NOTE(review): assumes ddbox->nboundeddim >= 1; with 0 this
              * would be a shift by -1 - confirm this cannot occur here.
              */
8718         nc = 1 << (ddbox->nboundeddim - 1);
8719
8720         for (c = 0; c < nc; c++)
8721         {
8722             /* Set up a zone corner at x=0, ignoring triclinic couplings */
8723             corner[XX] = 0;
                 /* Bit 0 of c selects the lower/upper limit along y, bit 1 along z */
8724             if ((c & 1) == 0)
8725             {
8726                 corner[YY] = zones->size[z].x0[YY];
8727             }
8728             else
8729             {
8730                 corner[YY] = zones->size[z].x1[YY];
8731             }
8732             if ((c & 2) == 0)
8733             {
8734                 corner[ZZ] = zones->size[z].x0[ZZ];
8735             }
8736             else
8737             {
8738                 corner[ZZ] = zones->size[z].x1[ZZ];
8739             }
8740             if (dd->ndim == 1 && dd->dim[0] < ZZ && ZZ < dd->npbcdim &&
8741                 box[ZZ][1 - dd->dim[0]] != 0)
8742             {
8743                 /* With 1D domain decomposition the cg's are not in
8744                  * the triclinic box, but triclinic x-y and rectangular y/x-z.
8745                  * Shift the corner of the z-vector back to along the box
8746                  * vector of dimension d, so it will later end up at 0 along d.
8747                  * This can affect the location of this corner along dd->dim[0]
8748                  * through the matrix operation below if box[d][dd->dim[0]]!=0.
8749                  */
                     /* Note: this d shadows the function-level loop variable d */
8750                 int d = 1 - dd->dim[0];
8751
8752                 corner[d] -= corner[ZZ]*box[ZZ][d]/box[ZZ][ZZ];
8753             }
8754             /* Apply the triclinic couplings */
8755             assert(ddbox->npbcdim <= DIM);
8756             for (i = YY; i < ddbox->npbcdim; i++)
8757             {
8758                 for (j = XX; j < i; j++)
8759                 {
8760                     corner[j] += corner[i]*box[i][j]/box[i][i];
8761                 }
8762             }
8763             if (c == 0)
8764             {
8765                 copy_rvec(corner, corner_min);
8766                 copy_rvec(corner, corner_max);
8767             }
8768             else
8769             {
8770                 for (i = 0; i < DIM; i++)
8771                 {
8772                     corner_min[i] = std::min(corner_min[i], corner[i]);
8773                     corner_max[i] = std::max(corner_max[i], corner[i]);
8774                 }
8775             }
8776         }
8777         /* Copy the extreme corners without offset along x */
8778         for (i = 0; i < DIM; i++)
8779         {
8780             zones->size[z].bb_x0[i] = corner_min[i];
8781             zones->size[z].bb_x1[i] = corner_max[i];
8782         }
8783         /* Add the offset along x */
8784         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8785         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8786     }
8787
8788     if (zone_start == 0)
8789     {
             /* Charge-group density of zone 0: count divided by the product
              * of the zone extents along all DIM dimensions.
              */
8790         vol = 1;
8791         for (dim = 0; dim < DIM; dim++)
8792         {
8793             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8794         }
8795         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8796     }
8797
8798     if (debug)
8799     {
8800         for (z = zone_start; z < zone_end; z++)
8801         {
8802             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8803                     z,
8804                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8805                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8806                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8807             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8808                     z,
8809                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8810                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8811                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8812         }
8813     }
8814 }
8815
8816 static int comp_cgsort(const void *a, const void *b)
8817 {
8818     int           comp;
8819
8820     gmx_cgsort_t *cga, *cgb;
8821     cga = (gmx_cgsort_t *)a;
8822     cgb = (gmx_cgsort_t *)b;
8823
8824     comp = cga->nsc - cgb->nsc;
8825     if (comp == 0)
8826     {
8827         comp = cga->ind_gl - cgb->ind_gl;
8828     }
8829
8830     return comp;
8831 }
8832
8833 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8834                          int *a, int *buf)
8835 {
8836     int i;
8837
8838     /* Order the data */
8839     for (i = 0; i < n; i++)
8840     {
8841         buf[i] = a[sort[i].ind];
8842     }
8843
8844     /* Copy back to the original array */
8845     for (i = 0; i < n; i++)
8846     {
8847         a[i] = buf[i];
8848     }
8849 }
8850
8851 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8852                          rvec *v, rvec *buf)
8853 {
8854     int i;
8855
8856     /* Order the data */
8857     for (i = 0; i < n; i++)
8858     {
8859         copy_rvec(v[sort[i].ind], buf[i]);
8860     }
8861
8862     /* Copy back to the original array */
8863     for (i = 0; i < n; i++)
8864     {
8865         copy_rvec(buf[i], v[i]);
8866     }
8867 }
8868
8869 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8870                            rvec *v, rvec *buf)
8871 {
8872     int a, atot, cg, cg0, cg1, i;
8873
8874     if (cgindex == NULL)
8875     {
8876         /* Avoid the useless loop of the atoms within a cg */
8877         order_vec_cg(ncg, sort, v, buf);
8878
8879         return;
8880     }
8881
8882     /* Order the data */
8883     a = 0;
8884     for (cg = 0; cg < ncg; cg++)
8885     {
8886         cg0 = cgindex[sort[cg].ind];
8887         cg1 = cgindex[sort[cg].ind+1];
8888         for (i = cg0; i < cg1; i++)
8889         {
8890             copy_rvec(v[i], buf[a]);
8891             a++;
8892         }
8893     }
8894     atot = a;
8895
8896     /* Copy back to the original array */
8897     for (a = 0; a < atot; a++)
8898     {
8899         copy_rvec(buf[a], v[a]);
8900     }
8901 }
8902
8903 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8904                          int nsort_new, gmx_cgsort_t *sort_new,
8905                          gmx_cgsort_t *sort1)
8906 {
8907     int i1, i2, i_new;
8908
8909     /* The new indices are not very ordered, so we qsort them */
8910     gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8911
8912     /* sort2 is already ordered, so now we can merge the two arrays */
8913     i1    = 0;
8914     i2    = 0;
8915     i_new = 0;
8916     while (i2 < nsort2 || i_new < nsort_new)
8917     {
8918         if (i2 == nsort2)
8919         {
8920             sort1[i1++] = sort_new[i_new++];
8921         }
8922         else if (i_new == nsort_new)
8923         {
8924             sort1[i1++] = sort2[i2++];
8925         }
8926         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8927                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8928                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8929         {
8930             sort1[i1++] = sort2[i2++];
8931         }
8932         else
8933         {
8934             sort1[i1++] = sort_new[i_new++];
8935         }
8936     }
8937 }
8938
8939 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8940 {
8941     gmx_domdec_sort_t *sort;
8942     gmx_cgsort_t      *cgsort, *sort_i;
8943     int                ncg_new, nsort2, nsort_new, i, *a, moved;
8944
8945     sort = dd->comm->sort;
8946
8947     a = fr->ns.grid->cell_index;
8948
8949     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8950
8951     if (ncg_home_old >= 0)
8952     {
8953         /* The charge groups that remained in the same ns grid cell
8954          * are completely ordered. So we can sort efficiently by sorting
8955          * the charge groups that did move into the stationary list.
8956          */
8957         ncg_new   = 0;
8958         nsort2    = 0;
8959         nsort_new = 0;
8960         for (i = 0; i < dd->ncg_home; i++)
8961         {
8962             /* Check if this cg did not move to another node */
8963             if (a[i] < moved)
8964             {
8965                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8966                 {
8967                     /* This cg is new on this node or moved ns grid cell */
8968                     if (nsort_new >= sort->sort_new_nalloc)
8969                     {
8970                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8971                         srenew(sort->sort_new, sort->sort_new_nalloc);
8972                     }
8973                     sort_i = &(sort->sort_new[nsort_new++]);
8974                 }
8975                 else
8976                 {
8977                     /* This cg did not move */
8978                     sort_i = &(sort->sort2[nsort2++]);
8979                 }
8980                 /* Sort on the ns grid cell indices
8981                  * and the global topology index.
8982                  * index_gl is irrelevant with cell ns,
8983                  * but we set it here anyhow to avoid a conditional.
8984                  */
8985                 sort_i->nsc    = a[i];
8986                 sort_i->ind_gl = dd->index_gl[i];
8987                 sort_i->ind    = i;
8988                 ncg_new++;
8989             }
8990         }
8991         if (debug)
8992         {
8993             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8994                     nsort2, nsort_new);
8995         }
8996         /* Sort efficiently */
8997         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8998                      sort->sort);
8999     }
9000     else
9001     {
9002         cgsort  = sort->sort;
9003         ncg_new = 0;
9004         for (i = 0; i < dd->ncg_home; i++)
9005         {
9006             /* Sort on the ns grid cell indices
9007              * and the global topology index
9008              */
9009             cgsort[i].nsc    = a[i];
9010             cgsort[i].ind_gl = dd->index_gl[i];
9011             cgsort[i].ind    = i;
9012             if (cgsort[i].nsc < moved)
9013             {
9014                 ncg_new++;
9015             }
9016         }
9017         if (debug)
9018         {
9019             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
9020         }
9021         /* Determine the order of the charge groups using qsort */
9022         gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
9023     }
9024
9025     return ncg_new;
9026 }
9027
9028 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
9029 {
9030     gmx_cgsort_t *sort;
9031     int           ncg_new, i, *a, na;
9032
9033     sort = dd->comm->sort->sort;
9034
9035     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
9036
9037     ncg_new = 0;
9038     for (i = 0; i < na; i++)
9039     {
9040         if (a[i] >= 0)
9041         {
9042             sort[ncg_new].ind = a[i];
9043             ncg_new++;
9044         }
9045     }
9046
9047     return ncg_new;
9048 }
9049
9050 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
9051                           int ncg_home_old)
9052 {
9053     gmx_domdec_sort_t *sort;
9054     gmx_cgsort_t      *cgsort;
9055     int               *cgindex;
9056     int                ncg_new, i, *ibuf, cgsize;
9057     rvec              *vbuf;
9058
9059     sort = dd->comm->sort;
9060
9061     if (dd->ncg_home > sort->sort_nalloc)
9062     {
9063         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
9064         srenew(sort->sort, sort->sort_nalloc);
9065         srenew(sort->sort2, sort->sort_nalloc);
9066     }
9067     cgsort = sort->sort;
9068
9069     switch (fr->cutoff_scheme)
9070     {
9071         case ecutsGROUP:
9072             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
9073             break;
9074         case ecutsVERLET:
9075             ncg_new = dd_sort_order_nbnxn(dd, fr);
9076             break;
9077         default:
9078             gmx_incons("unimplemented");
9079             ncg_new = 0;
9080     }
9081
9082     /* We alloc with the old size, since cgindex is still old */
9083     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
9084     vbuf = dd->comm->vbuf.v;
9085
9086     if (dd->comm->bCGs)
9087     {
9088         cgindex = dd->cgindex;
9089     }
9090     else
9091     {
9092         cgindex = NULL;
9093     }
9094
9095     /* Remove the charge groups which are no longer at home here */
9096     dd->ncg_home = ncg_new;
9097     if (debug)
9098     {
9099         fprintf(debug, "Set the new home charge group count to %d\n",
9100                 dd->ncg_home);
9101     }
9102
9103     /* Reorder the state */
9104     for (i = 0; i < estNR; i++)
9105     {
9106         if (EST_DISTR(i) && (state->flags & (1<<i)))
9107         {
9108             switch (i)
9109             {
9110                 case estX:
9111                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
9112                     break;
9113                 case estV:
9114                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
9115                     break;
9116                 case estSDX:
9117                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
9118                     break;
9119                 case estCGP:
9120                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
9121                     break;
9122                 case estLD_RNG:
9123                 case estLD_RNGI:
9124                 case estDISRE_INITF:
9125                 case estDISRE_RM3TAV:
9126                 case estORIRE_INITF:
9127                 case estORIRE_DTAV:
9128                     /* No ordering required */
9129                     break;
9130                 default:
9131                     gmx_incons("Unknown state entry encountered in dd_sort_state");
9132                     break;
9133             }
9134         }
9135     }
9136     if (fr->cutoff_scheme == ecutsGROUP)
9137     {
9138         /* Reorder cgcm */
9139         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9140     }
9141
9142     if (dd->ncg_home+1 > sort->ibuf_nalloc)
9143     {
9144         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9145         srenew(sort->ibuf, sort->ibuf_nalloc);
9146     }
9147     ibuf = sort->ibuf;
9148     /* Reorder the global cg index */
9149     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9150     /* Reorder the cginfo */
9151     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9152     /* Rebuild the local cg index */
9153     if (dd->comm->bCGs)
9154     {
9155         ibuf[0] = 0;
9156         for (i = 0; i < dd->ncg_home; i++)
9157         {
9158             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9159             ibuf[i+1] = ibuf[i] + cgsize;
9160         }
9161         for (i = 0; i < dd->ncg_home+1; i++)
9162         {
9163             dd->cgindex[i] = ibuf[i];
9164         }
9165     }
9166     else
9167     {
9168         for (i = 0; i < dd->ncg_home+1; i++)
9169         {
9170             dd->cgindex[i] = i;
9171         }
9172     }
9173     /* Set the home atom number */
9174     dd->nat_home = dd->cgindex[dd->ncg_home];
9175
9176     if (fr->cutoff_scheme == ecutsVERLET)
9177     {
9178         /* The atoms are now exactly in grid order, update the grid order */
9179         nbnxn_set_atomorder(fr->nbv->nbs);
9180     }
9181     else
9182     {
9183         /* Copy the sorted ns cell indices back to the ns grid struct */
9184         for (i = 0; i < dd->ncg_home; i++)
9185         {
9186             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9187         }
9188         fr->ns.grid->nr = dd->ncg_home;
9189     }
9190 }
9191
9192 static void add_dd_statistics(gmx_domdec_t *dd)
9193 {
9194     gmx_domdec_comm_t *comm;
9195     int                ddnat;
9196
9197     comm = dd->comm;
9198
9199     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9200     {
9201         comm->sum_nat[ddnat-ddnatZONE] +=
9202             comm->nat[ddnat] - comm->nat[ddnat-1];
9203     }
9204     comm->ndecomp++;
9205 }
9206
9207 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9208 {
9209     gmx_domdec_comm_t *comm;
9210     int                ddnat;
9211
9212     comm = dd->comm;
9213
9214     /* Reset all the statistics and counters for total run counting */
9215     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9216     {
9217         comm->sum_nat[ddnat-ddnatZONE] = 0;
9218     }
9219     comm->ndecomp   = 0;
9220     comm->nload     = 0;
9221     comm->load_step = 0;
9222     comm->load_sum  = 0;
9223     comm->load_max  = 0;
9224     clear_ivec(comm->load_lim);
9225     comm->load_mdf = 0;
9226     comm->load_pme = 0;
9227 }
9228
9229 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9230 {
9231     gmx_domdec_comm_t *comm;
9232     int                ddnat;
9233     double             av;
9234
9235     comm = cr->dd->comm;
9236
9237     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9238
9239     if (fplog == NULL)
9240     {
9241         return;
9242     }
9243
9244     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9245
9246     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9247     {
9248         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9249         switch (ddnat)
9250         {
9251             case ddnatZONE:
9252                 fprintf(fplog,
9253                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9254                         2, av);
9255                 break;
9256             case ddnatVSITE:
9257                 if (cr->dd->vsite_comm)
9258                 {
9259                     fprintf(fplog,
9260                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9261                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9262                             av);
9263                 }
9264                 break;
9265             case ddnatCON:
9266                 if (cr->dd->constraint_comm)
9267                 {
9268                     fprintf(fplog,
9269                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9270                             1 + ir->nLincsIter, av);
9271                 }
9272                 break;
9273             default:
9274                 gmx_incons(" Unknown type for DD statistics");
9275         }
9276     }
9277     fprintf(fplog, "\n");
9278
9279     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9280     {
9281         print_dd_load_av(fplog, cr->dd);
9282     }
9283 }
9284
9285 void dd_partition_system(FILE                *fplog,
9286                          gmx_int64_t          step,
9287                          t_commrec           *cr,
9288                          gmx_bool             bMasterState,
9289                          int                  nstglobalcomm,
9290                          t_state             *state_global,
9291                          gmx_mtop_t          *top_global,
9292                          t_inputrec          *ir,
9293                          t_state             *state_local,
9294                          rvec               **f,
9295                          t_mdatoms           *mdatoms,
9296                          gmx_localtop_t      *top_local,
9297                          t_forcerec          *fr,
9298                          gmx_vsite_t         *vsite,
9299                          gmx_shellfc_t        shellfc,
9300                          gmx_constr_t         constr,
9301                          t_nrnb              *nrnb,
9302                          gmx_wallcycle_t      wcycle,
9303                          gmx_bool             bVerbose)
9304 {
9305     gmx_domdec_t      *dd;
9306     gmx_domdec_comm_t *comm;
9307     gmx_ddbox_t        ddbox = {0};
9308     t_block           *cgs_gl;
9309     gmx_int64_t        step_pcoupl;
9310     rvec               cell_ns_x0, cell_ns_x1;
9311     int                i, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9312     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
9313     gmx_bool           bRedist, bSortCG, bResortAll;
9314     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9315     real               grid_density;
9316     char               sbuf[22];
9317
9318     dd   = cr->dd;
9319     comm = dd->comm;
9320
9321     bBoxChanged = (bMasterState || DEFORM(*ir));
9322     if (ir->epc != epcNO)
9323     {
9324         /* With nstpcouple > 1 pressure coupling happens.
9325          * one step after calculating the pressure.
9326          * Box scaling happens at the end of the MD step,
9327          * after the DD partitioning.
9328          * We therefore have to do DLB in the first partitioning
9329          * after an MD step where P-coupling occured.
9330          * We need to determine the last step in which p-coupling occurred.
9331          * MRS -- need to validate this for vv?
9332          */
9333         n = ir->nstpcouple;
9334         if (n == 1)
9335         {
9336             step_pcoupl = step - 1;
9337         }
9338         else
9339         {
9340             step_pcoupl = ((step - 1)/n)*n + 1;
9341         }
9342         if (step_pcoupl >= comm->partition_step)
9343         {
9344             bBoxChanged = TRUE;
9345         }
9346     }
9347
9348     bNStGlobalComm = (step % nstglobalcomm == 0);
9349
9350     if (!comm->bDynLoadBal)
9351     {
9352         bDoDLB = FALSE;
9353     }
9354     else
9355     {
9356         /* Should we do dynamic load balacing this step?
9357          * Since it requires (possibly expensive) global communication,
9358          * we might want to do DLB less frequently.
9359          */
9360         if (bBoxChanged || ir->epc != epcNO)
9361         {
9362             bDoDLB = bBoxChanged;
9363         }
9364         else
9365         {
9366             bDoDLB = bNStGlobalComm;
9367         }
9368     }
9369
9370     /* Check if we have recorded loads on the nodes */
9371     if (comm->bRecordLoad && dd_load_count(comm) > 0)
9372     {
9373         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd))
9374         {
9375             /* Check if we should use DLB at the second partitioning
9376              * and every 100 partitionings,
9377              * so the extra communication cost is negligible.
9378              */
9379             const int nddp_chk_dlb = 100;
9380             bCheckDLB = (comm->n_load_collect == 0 ||
9381                          comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
9382         }
9383         else
9384         {
9385             bCheckDLB = FALSE;
9386         }
9387
9388         /* Print load every nstlog, first and last step to the log file */
9389         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9390                     comm->n_load_collect == 0 ||
9391                     (ir->nsteps >= 0 &&
9392                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9393
9394         /* Avoid extra communication due to verbose screen output
9395          * when nstglobalcomm is set.
9396          */
9397         if (bDoDLB || bLogLoad || bCheckDLB ||
9398             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9399         {
9400             get_load_distribution(dd, wcycle);
9401             if (DDMASTER(dd))
9402             {
9403                 if (bLogLoad)
9404                 {
9405                     dd_print_load(fplog, dd, step-1);
9406                 }
9407                 if (bVerbose)
9408                 {
9409                     dd_print_load_verbose(dd);
9410                 }
9411             }
9412             comm->n_load_collect++;
9413
9414             if (bCheckDLB)
9415             {
9416                 /* Since the timings are node dependent, the master decides */
9417                 if (DDMASTER(dd))
9418                 {
9419                     /* Here we check if the max PME rank load is more than 0.98
9420                      * the max PP force load. If so, PP DLB will not help,
9421                      * since we are (almost) limited by PME. Furthermore,
9422                      * DLB will cause a significant extra x/f redistribution
9423                      * cost on the PME ranks, which will then surely result
9424                      * in lower total performance.
9425                      * This check might be fragile, since one measurement
9426                      * below 0.98 (although only done once every 100 DD part.)
9427                      * could turn on DLB for the rest of the run.
9428                      */
9429                     if (cr->npmenodes > 0 &&
9430                         dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9431                     {
9432                         bTurnOnDLB = FALSE;
9433                     }
9434                     else
9435                     {
9436                         bTurnOnDLB =
9437                             (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9438                     }
9439                     if (debug)
9440                     {
9441                         fprintf(debug, "step %s, imb loss %f\n",
9442                                 gmx_step_str(step, sbuf),
9443                                 dd_force_imb_perf_loss(dd));
9444                     }
9445                 }
9446                 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
9447                 if (bTurnOnDLB)
9448                 {
9449                     turn_on_dlb(fplog, cr, step);
9450                     bDoDLB = TRUE;
9451                 }
9452             }
9453         }
9454         comm->n_load_have++;
9455     }
9456
9457     cgs_gl = &comm->cgs_gl;
9458
9459     bRedist = FALSE;
9460     if (bMasterState)
9461     {
9462         /* Clear the old state */
9463         clear_dd_indices(dd, 0, 0);
9464         ncgindex_set = 0;
9465
9466         set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9467                   TRUE, cgs_gl, state_global->x, &ddbox);
9468
9469         get_cg_distribution(fplog, step, dd, cgs_gl,
9470                             state_global->box, &ddbox, state_global->x);
9471
9472         dd_distribute_state(dd, cgs_gl,
9473                             state_global, state_local, f);
9474
9475         dd_make_local_cgs(dd, &top_local->cgs);
9476
9477         /* Ensure that we have space for the new distribution */
9478         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9479
9480         if (fr->cutoff_scheme == ecutsGROUP)
9481         {
9482             calc_cgcm(fplog, 0, dd->ncg_home,
9483                       &top_local->cgs, state_local->x, fr->cg_cm);
9484         }
9485
9486         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9487
9488         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9489     }
9490     else if (state_local->ddp_count != dd->ddp_count)
9491     {
9492         if (state_local->ddp_count > dd->ddp_count)
9493         {
9494             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9495         }
9496
9497         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9498         {
9499             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9500         }
9501
9502         /* Clear the old state */
9503         clear_dd_indices(dd, 0, 0);
9504
9505         /* Build the new indices */
9506         rebuild_cgindex(dd, cgs_gl->index, state_local);
9507         make_dd_indices(dd, cgs_gl->index, 0);
9508         ncgindex_set = dd->ncg_home;
9509
9510         if (fr->cutoff_scheme == ecutsGROUP)
9511         {
9512             /* Redetermine the cg COMs */
9513             calc_cgcm(fplog, 0, dd->ncg_home,
9514                       &top_local->cgs, state_local->x, fr->cg_cm);
9515         }
9516
9517         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9518
9519         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9520
9521         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9522                   TRUE, &top_local->cgs, state_local->x, &ddbox);
9523
9524         bRedist = comm->bDynLoadBal;
9525     }
9526     else
9527     {
9528         /* We have the full state, only redistribute the cgs */
9529
9530         /* Clear the non-home indices */
9531         clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9532         ncgindex_set = 0;
9533
9534         /* Avoid global communication for dim's without pbc and -gcom */
9535         if (!bNStGlobalComm)
9536         {
9537             copy_rvec(comm->box0, ddbox.box0    );
9538             copy_rvec(comm->box_size, ddbox.box_size);
9539         }
9540         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9541                   bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
9542
9543         bBoxChanged = TRUE;
9544         bRedist     = TRUE;
9545     }
9546     /* For dim's without pbc and -gcom */
9547     copy_rvec(ddbox.box0, comm->box0    );
9548     copy_rvec(ddbox.box_size, comm->box_size);
9549
9550     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9551                       step, wcycle);
9552
9553     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9554     {
9555         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9556     }
9557
9558     /* Check if we should sort the charge groups */
9559     if (comm->nstSortCG > 0)
9560     {
9561         bSortCG = (bMasterState ||
9562                    (bRedist && (step % comm->nstSortCG == 0)));
9563     }
9564     else
9565     {
9566         bSortCG = FALSE;
9567     }
9568
9569     ncg_home_old = dd->ncg_home;
9570
9571     ncg_moved = 0;
9572     if (bRedist)
9573     {
9574         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9575
9576         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9577                            state_local, f, fr,
9578                            !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9579
9580         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9581     }
9582
9583     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9584                           dd, &ddbox,
9585                           &comm->cell_x0, &comm->cell_x1,
9586                           dd->ncg_home, fr->cg_cm,
9587                           cell_ns_x0, cell_ns_x1, &grid_density);
9588
9589     if (bBoxChanged)
9590     {
9591         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9592     }
9593
9594     switch (fr->cutoff_scheme)
9595     {
9596         case ecutsGROUP:
9597             copy_ivec(fr->ns.grid->n, ncells_old);
9598             grid_first(fplog, fr->ns.grid, dd, &ddbox,
9599                        state_local->box, cell_ns_x0, cell_ns_x1,
9600                        fr->rlistlong, grid_density);
9601             break;
9602         case ecutsVERLET:
9603             nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9604             break;
9605         default:
9606             gmx_incons("unimplemented");
9607     }
9608     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9609     copy_ivec(ddbox.tric_dir, comm->tric_dir);
9610
9611     if (bSortCG)
9612     {
9613         wallcycle_sub_start(wcycle, ewcsDD_GRID);
9614
9615         /* Sort the state on charge group position.
9616          * This enables exact restarts from this step.
9617          * It also improves performance by about 15% with larger numbers
9618          * of atoms per node.
9619          */
9620
9621         /* Fill the ns grid with the home cell,
9622          * so we can sort with the indices.
9623          */
9624         set_zones_ncg_home(dd);
9625
9626         switch (fr->cutoff_scheme)
9627         {
9628             case ecutsVERLET:
9629                 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9630
9631                 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9632                                   0,
9633                                   comm->zones.size[0].bb_x0,
9634                                   comm->zones.size[0].bb_x1,
9635                                   0, dd->ncg_home,
9636                                   comm->zones.dens_zone0,
9637                                   fr->cginfo,
9638                                   state_local->x,
9639                                   ncg_moved, bRedist ? comm->moved : NULL,
9640                                   fr->nbv->grp[eintLocal].kernel_type,
9641                                   fr->nbv->grp[eintLocal].nbat);
9642
9643                 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9644                 break;
9645             case ecutsGROUP:
9646                 fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
9647                           0, dd->ncg_home, fr->cg_cm);
9648
9649                 copy_ivec(fr->ns.grid->n, ncells_new);
9650                 break;
9651             default:
9652                 gmx_incons("unimplemented");
9653         }
9654
9655         bResortAll = bMasterState;
9656
9657         /* Check if we can user the old order and ns grid cell indices
9658          * of the charge groups to sort the charge groups efficiently.
9659          */
9660         if (ncells_new[XX] != ncells_old[XX] ||
9661             ncells_new[YY] != ncells_old[YY] ||
9662             ncells_new[ZZ] != ncells_old[ZZ])
9663         {
9664             bResortAll = TRUE;
9665         }
9666
9667         if (debug)
9668         {
9669             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9670                     gmx_step_str(step, sbuf), dd->ncg_home);
9671         }
9672         dd_sort_state(dd, fr->cg_cm, fr, state_local,
9673                       bResortAll ? -1 : ncg_home_old);
9674         /* Rebuild all the indices */
9675         ga2la_clear(dd->ga2la);
9676         ncgindex_set = 0;
9677
9678         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9679     }
9680
9681     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9682
9683     /* Setup up the communication and communicate the coordinates */
9684     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9685
9686     /* Set the indices */
9687     make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9688
9689     /* Set the charge group boundaries for neighbor searching */
9690     set_cg_boundaries(&comm->zones);
9691
9692     if (fr->cutoff_scheme == ecutsVERLET)
9693     {
9694         set_zones_size(dd, state_local->box, &ddbox,
9695                        bSortCG ? 1 : 0, comm->zones.n);
9696     }
9697
9698     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9699
9700     /*
9701        write_dd_pdb("dd_home",step,"dump",top_global,cr,
9702                  -1,state_local->x,state_local->box);
9703      */
9704
9705     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9706
9707     /* Extract a local topology from the global topology */
9708     for (i = 0; i < dd->ndim; i++)
9709     {
9710         np[dd->dim[i]] = comm->cd[i].np;
9711     }
9712     dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9713                       comm->cellsize_min, np,
9714                       fr,
9715                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
9716                       vsite, top_global, top_local);
9717
9718     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9719
9720     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9721
9722     /* Set up the special atom communication */
9723     n = comm->nat[ddnatZONE];
9724     for (i = ddnatZONE+1; i < ddnatNR; i++)
9725     {
9726         switch (i)
9727         {
9728             case ddnatVSITE:
9729                 if (vsite && vsite->n_intercg_vsite)
9730                 {
9731                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
9732                 }
9733                 break;
9734             case ddnatCON:
9735                 if (dd->bInterCGcons || dd->bInterCGsettles)
9736                 {
9737                     /* Only for inter-cg constraints we need special code */
9738                     n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9739                                                   constr, ir->nProjOrder,
9740                                                   top_local->idef.il);
9741                 }
9742                 break;
9743             default:
9744                 gmx_incons("Unknown special atom type setup");
9745         }
9746         comm->nat[i] = n;
9747     }
9748
9749     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9750
9751     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9752
9753     /* Make space for the extra coordinates for virtual site
9754      * or constraint communication.
9755      */
9756     state_local->natoms = comm->nat[ddnatNR-1];
9757     if (state_local->natoms > state_local->nalloc)
9758     {
9759         dd_realloc_state(state_local, f, state_local->natoms);
9760     }
9761
9762     if (fr->bF_NoVirSum)
9763     {
9764         if (vsite && vsite->n_intercg_vsite)
9765         {
9766             nat_f_novirsum = comm->nat[ddnatVSITE];
9767         }
9768         else
9769         {
9770             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9771             {
9772                 nat_f_novirsum = dd->nat_tot;
9773             }
9774             else
9775             {
9776                 nat_f_novirsum = dd->nat_home;
9777             }
9778         }
9779     }
9780     else
9781     {
9782         nat_f_novirsum = 0;
9783     }
9784
9785     /* Set the number of atoms required for the force calculation.
9786      * Forces need to be constrained when using a twin-range setup
9787      * or with energy minimization. For simple simulations we could
9788      * avoid some allocation, zeroing and copying, but this is
9789      * probably not worth the complications and checking.
9790      */
9791     forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9792                         dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9793
9794     /* We make all the mdatoms up to nat_tot_con.
9795      * We could save some work by only setting invmass
9796      * between nat_tot and nat_tot_con.
9797      */
9798     /* This call also sets the new number of home particles to dd->nat_home */
9799     atoms2md(top_global, ir,
9800              comm->nat[ddnatCON], dd->gatindex, dd->nat_home, mdatoms);
9801
9802     /* Now we have the charges we can sort the FE interactions */
9803     dd_sort_local_top(dd, mdatoms, top_local);
9804
9805     if (vsite != NULL)
9806     {
9807         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9808         split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
9809                                   mdatoms, FALSE, vsite);
9810     }
9811
9812     if (shellfc)
9813     {
9814         /* Make the local shell stuff, currently no communication is done */
9815         make_local_shells(cr, mdatoms, shellfc);
9816     }
9817
9818     if (ir->implicit_solvent)
9819     {
9820         make_local_gb(cr, fr->born, ir->gb_algorithm);
9821     }
9822
9823     setup_bonded_threading(fr, &top_local->idef);
9824
9825     if (!(cr->duty & DUTY_PME))
9826     {
9827         /* Send the charges and/or c6/sigmas to our PME only node */
9828         gmx_pme_send_parameters(cr,
9829                                 fr->ic,
9830                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
9831                                 mdatoms->chargeA, mdatoms->chargeB,
9832                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
9833                                 mdatoms->sigmaA, mdatoms->sigmaB,
9834                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9835     }
9836
9837     if (constr)
9838     {
9839         set_constraints(constr, top_local, ir, mdatoms, cr);
9840     }
9841
9842     if (ir->ePull != epullNO)
9843     {
9844         /* Update the local pull groups */
9845         dd_make_local_pull_groups(dd, ir->pull, mdatoms);
9846     }
9847
9848     if (ir->bRot)
9849     {
9850         /* Update the local rotation groups */
9851         dd_make_local_rotation_groups(dd, ir->rot);
9852     }
9853
9854     if (ir->eSwapCoords != eswapNO)
9855     {
9856         /* Update the local groups needed for ion swapping */
9857         dd_make_local_swap_groups(dd, ir->swap);
9858     }
9859
9860     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
9861     dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
9862
9863     add_dd_statistics(dd);
9864
9865     /* Make sure we only count the cycles for this DD partitioning */
9866     clear_dd_cycle_counts(dd);
9867
9868     /* Because the order of the atoms might have changed since
9869      * the last vsite construction, we need to communicate the constructing
9870      * atom coordinates again (for spreading the forces this MD step).
9871      */
9872     dd_move_x_vsites(dd, state_local->box, state_local->x);
9873
9874     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9875
9876     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9877     {
9878         dd_move_x(dd, state_local->box, state_local->x);
9879         write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9880                      -1, state_local->x, state_local->box);
9881     }
9882
9883     /* Store the partitioning step */
9884     comm->partition_step = step;
9885
9886     /* Increase the DD partitioning counter */
9887     dd->ddp_count++;
9888     /* The state currently matches this DD partitioning count, store it */
9889     state_local->ddp_count = dd->ddp_count;
9890     if (bMasterState)
9891     {
9892         /* The DD master node knows the complete cg distribution,
9893          * store the count so we can possibly skip the cg info communication.
9894          */
9895         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9896     }
9897
9898     if (comm->DD_debug > 0)
9899     {
9900         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9901         check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9902                                 "after partitioning");
9903     }
9904 }