src/gromacs/mdlib/domdec.c

   1 /*
   2  * This file is part of the GROMACS molecular simulation package.
   3  *
   4  * Copyright (c) 2005,2006,2007,2008,2009,2010,2011,2012,2013,2014, by the GROMACS development team, led by
   5  * Mark Abraham, David van der Spoel, Berk Hess, and Erik Lindahl,
   6  * and including many others, as listed in the AUTHORS file in the
   7  * top-level source directory and at http://www.gromacs.org.
   8  *
   9  * GROMACS is free software; you can redistribute it and/or
  10  * modify it under the terms of the GNU Lesser General Public License
  11  * as published by the Free Software Foundation; either version 2.1
  12  * of the License, or (at your option) any later version.
  13  *
  14  * GROMACS is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  17  * Lesser General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU Lesser General Public
  20  * License along with GROMACS; if not, see
  21  * http://www.gnu.org/licenses, or write to the Free Software Foundation,
  22  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA.
  23  *
  24  * If you want to redistribute modifications to GROMACS, please
  25  * consider that scientific software is very special. Version
  26  * control is crucial - bugs must be traceable. We will be happy to
  27  * consider code for inclusion in the official distribution, but
  28  * derived work must not be called official GROMACS. Details are found
  29  * in the README & COPYING files - if they are missing, get the
  30  * official version at http://www.gromacs.org.
  31  *
  32  * To help us fund GROMACS development, we humbly ask that you cite
  33  * the research papers on the package. Check out http://www.gromacs.org.
  34  */
  35
  36 #ifdef HAVE_CONFIG_H
  37 #include <config.h>
  38 #endif
  39
  40 #include <stdio.h>
  41 #include <time.h>
  42 #include <math.h>
  43 #include <string.h>
  44 #include <stdlib.h>
  45 #include <assert.h>
  46
  47 #include "typedefs.h"
  48 #include "gromacs/utility/smalloc.h"
  49 #include "gmx_fatal.h"
  50 #include "gmx_fatal_collective.h"
  51 #include "vec.h"
  52 #include "domdec.h"
  53 #include "domdec_network.h"
  54 #include "nrnb.h"
  55 #include "pbc.h"
  56 #include "chargegroup.h"
  57 #include "constr.h"
  58 #include "mdatoms.h"
  59 #include "names.h"
  60 #include "force.h"
  61 #include "pme.h"
  62 #include "mdrun.h"
  63 #include "nsgrid.h"
  64 #include "shellfc.h"
  65 #include "mtop_util.h"
  66 #include "gmx_ga2la.h"
  67 #include "macros.h"
  68 #include "nbnxn_search.h"
  69 #include "bondf.h"
  70 #include "gmx_omp_nthreads.h"
  71 #include "gpu_utils.h"
  72
  73 #include "gromacs/fileio/futil.h"
  74 #include "gromacs/fileio/gmxfio.h"
  75 #include "gromacs/fileio/pdbio.h"
  76 #include "gromacs/timing/wallcycle.h"
  77 #include "gromacs/utility/gmxmpi.h"
  78 #include "gromacs/swap/swapcoords.h"
  79 #include "gromacs/utility/qsort_threadsafe.h"
  80 #include "gromacs/pulling/pull.h"
  81 #include "gromacs/pulling/pull_rotation.h"
  82 #include "gromacs/imd/imd.h"
  83
  84 #define DDRANK(dd, rank)    (rank)
  85 #define DDMASTERRANK(dd)   (dd->masterrank)
  86
/* Global view of the charge group distribution over the nodes, together
 * with the buffers used when scattering and gathering the state.
 * NOTE(review): presumably only allocated on the master rank - confirm.
 */
typedef struct gmx_domdec_master
{
    /* The cell boundaries */
    real **cell_x;
    /* The global charge group division */
    int   *ncg;    /* Number of home charge groups for each node */
    int   *index;  /* Index of nnodes+1 into cg */
    int   *cg;     /* Global charge group index */
    int   *nat;    /* Number of home atoms for each node. */
    int   *ibuf;   /* Buffer for communication */
    rvec  *vbuf;   /* Buffer for state scattering and gathering */
} gmx_domdec_master_t;
  99
/* The send/receive bookkeeping for a single communication pulse */
typedef struct
{
    /* The numbers of charge groups to send and receive for each cell
     * that requires communication, the last entry contains the total
     * number of atoms that needs to be communicated.
     * The communication routines in this file loop over nsend[nzone]
     * charge groups and pass nsend[nzone+1] and nrecv[nzone+1] as
     * the element counts to the send/receive calls.
     */
    int  nsend[DD_MAXIZONE+2];
    int  nrecv[DD_MAXIZONE+2];
    /* The charge groups to send */
    int *index;
    /* NOTE(review): presumably the allocation size of index - confirm */
    int  nalloc;
    /* The atom range for non-in-place communication */
    int  cell2at0[DD_MAXIZONE];
    int  cell2at1[DD_MAXIZONE];
} gmx_domdec_ind_t;
 115
/* The communication setup for one decomposition dimension,
 * stored as the array cd[DIM] in gmx_domdec_comm_t.
 */
typedef struct
{
    int               np;       /* Number of grid pulses in this dimension */
    int               np_dlb;   /* For dlb, for use with edlbAUTO          */
    gmx_domdec_ind_t *ind;      /* The indices to communicate, size np     */
    int               np_nalloc;
    gmx_bool          bInPlace; /* Can we communicate in place?            */
} gmx_domdec_comm_dim_t;
 124
/* Cell boundary data for dynamic load balancing, referenced through
 * the root field of gmx_domdec_comm_t.
 * "State var." members carry information between calls,
 * "Temp. var." members are scratch space.
 */
typedef struct
{
    gmx_bool *bCellMin;    /* Temp. var.: is this cell size at the limit     */
    real     *cell_f;      /* State var.: cell boundaries, box relative      */
    real     *old_cell_f;  /* Temp. var.: old cell size                      */
    real     *cell_f_max0; /* State var.: max lower boundary, incl neighbors */
    real     *cell_f_min1; /* State var.: min upper boundary, incl neighbors */
    real     *bound_min;   /* Temp. var.: lower limit for cell boundary      */
    real     *bound_max;   /* Temp. var.: upper limit for cell boundary      */
    gmx_bool  bLimited;    /* State var.: is DLB limited in this dim and row */
    real     *buf_ncd;     /* Temp. var.                                     */
} gmx_domdec_root_t;
 137
/* NOTE(review): presumably the maximum number of entries in the
 * load array of gmx_domdec_load_t - confirm against the users.
 */
#define DD_NLOAD_MAX 9

/* Load measurement data, referenced through the load field
 * of gmx_domdec_comm_t.
 * Here floats are accurate enough, since these variables
 * only influence the load balancing, not the actual MD results.
 */
typedef struct
{
    int    nload;
    float *load;
    float  sum;
    float  max;
    float  sum_m;
    float  cvol_min;
    float  mdf;
    float  pme;
    int    flags;
} gmx_domdec_load_t;
 155
/* One entry in the charge group sorting arrays of gmx_domdec_sort_t.
 * NOTE(review): presumably nsc is the sort key, ind_gl the global and
 * ind the local charge group index - confirm against the sorting code.
 */
typedef struct
{
    int  nsc;
    int  ind_gl;
    int  ind;
} gmx_cgsort_t;
 162
/* Work arrays for charge group sorting, referenced through
 * the sort field of gmx_domdec_comm_t.
 * NOTE(review): each *_nalloc member presumably holds the allocation
 * size of the array(s) declared directly above it - confirm.
 */
typedef struct
{
    gmx_cgsort_t *sort;
    gmx_cgsort_t *sort2;
    int           sort_nalloc;
    gmx_cgsort_t *sort_new;
    int           sort_new_nalloc;
    int          *ibuf;
    int           ibuf_nalloc;
} gmx_domdec_sort_t;
 173
/* A growable rvec buffer: set up with vec_rvec_init() and
 * enlarged on demand with vec_rvec_check_alloc().
 */
typedef struct
{
    rvec *v;      /* The buffer, NULL after vec_rvec_init()        */
    int   nalloc; /* The number of rvec elements allocated for v   */
} vec_rvec_t;
 179
/* This enum determines the order of the coordinates.
 * ddnatHOME and ddnatZONE should be first and second,
 * the others can be ordered as wanted.
 * The values index the nat array in gmx_domdec_comm_t.
 */
enum {
    ddnatHOME, ddnatZONE, ddnatVSITE, ddnatCON, ddnatNR
};

/* The dynamic load balancing (DLB) options, stored in eDLB
 * in gmx_domdec_comm_t.
 */
enum {
    edlbAUTO, edlbNO, edlbYES, edlbNR
};
/* The names of the DLB options, in the order of the enum above */
const char *edlb_names[edlbNR] = { "auto", "no", "yes" };
 192
/* PME slab setup for one PME decomposition dimension,
 * stored as ddpme[2] in gmx_domdec_comm_t.
 */
typedef struct
{
    int      dim;       /* The dimension                                          */
    gmx_bool dim_match; /* Tells if DD and PME dims match                         */
    int      nslab;     /* The number of PME slabs in this dimension              */
    real    *slb_dim_f; /* Cell sizes for determining the PME comm. with SLB      */
    int     *pp_min;    /* The minimum pp node location, size nslab               */
    int     *pp_max;    /* The maximum pp node location, size nslab               */
    int      maxshift;  /* The maximum shift for coordinate redistribution in PME */
} gmx_ddpme_t;
 203
/* Zone limits, stored as zone_d1 and zone_d2 in gmx_domdec_comm_t */
typedef struct
{
    real min0;    /* The minimum bottom of this zone                        */
    real max1;    /* The maximum top of this zone                           */
    real min1;    /* The minimum top of this zone                           */
    real mch0;    /* The maximum bottom communication height for this zone  */
    real mch1;    /* The maximum top communication height for this zone     */
    real p1_0;    /* The bottom value of the first cell in this zone        */
    real p1_1;    /* The top value of the first cell in this zone           */
} gmx_ddzone_t;
 214
/* Temporary storage for the thread parallel communication setup,
 * referenced through the dth field of gmx_domdec_comm_t.
 */
typedef struct
{
    gmx_domdec_ind_t ind;
    int             *ibuf;
    int              ibuf_nalloc;
    vec_rvec_t       vbuf;
    int              nsend;
    int              nat;
    int              nsend_zone;
} dd_comm_setup_work_t;
 225
/* The communication setup, buffers and bookkeeping of the domain
 * decomposition. The routines in this file reach it through dd->comm.
 */
typedef struct gmx_domdec_comm
{
    /* All arrays are indexed with 0 to dd->ndim (not Cartesian indexing),
     * unless stated otherwise.
     */

    /* The number of decomposition dimensions for PME, 0: no PME */
    int         npmedecompdim;
    /* The number of nodes doing PME (PP/PME or only PME) */
    int         npmenodes;
    int         npmenodes_x;
    int         npmenodes_y;
    /* The communication setup including the PME only nodes */
    gmx_bool    bCartesianPP_PME;
    ivec        ntot;
    int         cartpmedim;
    int        *pmenodes;          /* size npmenodes                         */
    int        *ddindex2simnodeid; /* size npmenodes, only with bCartesianPP
                                    * but with bCartesianPP_PME              */
    gmx_ddpme_t ddpme[2];

    /* The DD particle-particle nodes only */
    gmx_bool bCartesianPP;
    int     *ddindex2ddnodeid; /* size npmenode, only with bCartesianPP_PME */

    /* The global charge groups */
    t_block cgs_gl;

    /* Should we sort the cgs */
    int                nstSortCG;
    gmx_domdec_sort_t *sort;

    /* Are there charge groups? */
    gmx_bool bCGs;

    /* Are there bonded and multi-body interactions between charge groups? */
    gmx_bool bInterCGBondeds;
    gmx_bool bInterCGMultiBody;

    /* Data for the optional bonded interaction atom communication range */
    gmx_bool  bBondComm;
    t_blocka *cglink;
    char     *bLocalCG;

    /* The DLB option */
    int      eDLB;
    /* Is eDLB=edlbAUTO locked such that we currently can't turn it on? */
    gmx_bool bDLB_locked;
    /* Are we actually using DLB? */
    gmx_bool bDynLoadBal;

    /* Cell sizes for static load balancing, first index cartesian */
    real **slb_frac;

    /* The width of the communicated boundaries */
    real     cutoff_mbody;
    real     cutoff;
    /* The minimum cell size (including triclinic correction) */
    rvec     cellsize_min;
    /* For dlb, for use with edlbAUTO */
    rvec     cellsize_min_dlb;
    /* The lower limit for the DD cell size with DLB */
    real     cellsize_limit;
    /* Effectively no NB cut-off limit with DLB for systems without PBC? */
    gmx_bool bVacDLBNoLimit;

    /* With PME load balancing we set limits on DLB */
    gmx_bool bPMELoadBalDLBLimits;
    /* DLB needs to take into account that we want to allow this maximum
     * cut-off (for PME load balancing), this could limit cell boundaries.
     */
    real PMELoadBal_max_cutoff;

    /* tric_dir is only stored here because dd_get_ns_ranges needs it */
    ivec tric_dir;
    /* box0 and box_size are required with dim's without pbc and -gcom */
    rvec box0;
    rvec box_size;

    /* The cell boundaries */
    rvec cell_x0;
    rvec cell_x1;

    /* The old location of the cell boundaries, to check cg displacements */
    rvec old_cell_x0;
    rvec old_cell_x1;

    /* The communication setup and charge group boundaries for the zones */
    gmx_domdec_zones_t zones;

    /* The zone limits for DD dimensions 1 and 2 (not 0), determined from
     * cell boundaries of neighboring cells for dynamic load balancing.
     */
    gmx_ddzone_t zone_d1[2];
    gmx_ddzone_t zone_d2[2][2];

    /* The coordinate/force communication setup and indices */
    gmx_domdec_comm_dim_t cd[DIM];
    /* The maximum number of cells to communicate with in one dimension */
    int                   maxpulse;

    /* Which cg distribution is stored on the master node */
    int master_cg_ddp_count;

    /* The number of cg's received from the direct neighbors */
    int  zone_ncg1[DD_MAXZONE];

    /* The atom counts, the range for each type t is nat[t-1] <= at < nat[t] */
    int  nat[ddnatNR];

    /* Array for signalling if atoms have moved to another domain */
    int  *moved;
    int   moved_nalloc;

    /* Communication buffer for general use */
    int  *buf_int;
    int   nalloc_int;

    /* Communication buffer for general use */
    vec_rvec_t vbuf;

    /* Temporary storage for thread parallel communication setup */
    int                   nth;
    dd_comm_setup_work_t *dth;

    /* Communication buffers only used with multiple grid pulses */
    int       *buf_int2;
    int        nalloc_int2;
    vec_rvec_t vbuf2;

    /* Communication buffers for local redistribution */
    int  **cggl_flag;
    int    cggl_flag_nalloc[DIM*2];
    rvec **cgcm_state;
    int    cgcm_state_nalloc[DIM*2];

    /* Cell sizes for dynamic load balancing */
    gmx_domdec_root_t **root;
    real               *cell_f_row;
    real                cell_f0[DIM];
    real                cell_f1[DIM];
    real                cell_f_max0[DIM];
    real                cell_f_min1[DIM];

    /* Stuff for load communication */
    gmx_bool           bRecordLoad;
    gmx_domdec_load_t *load;
    int                nrank_gpu_shared;
#ifdef GMX_MPI
    MPI_Comm          *mpi_comm_load;
    MPI_Comm           mpi_comm_gpu_shared;
#endif

    /* Maximum DLB scaling per load balancing step in percent */
    int dlb_scale_lim;

    /* Cycle counters */
    float  cycl[ddCyclNr];
    int    cycl_n[ddCyclNr];
    float  cycl_max[ddCyclNr];
    /* Flop counter (0=no, 1=yes, 2=with (eFlop-1)*5% noise) */
    int    eFlop;
    double flop;
    int    flop_n;
    /* How many times did we have load measurements */
    int    n_load_have;
    /* How many times have we collected the load measurements */
    int    n_load_collect;

    /* Statistics */
    double sum_nat[ddnatNR-ddnatZONE];
    int    ndecomp;
    int    nload;
    double load_step;
    double load_sum;
    double load_max;
    ivec   load_lim;
    double load_mdf;
    double load_pme;

    /* The last partition step */
    gmx_int64_t partition_step;

    /* Debugging */
    int  nstDDDump;
    int  nstDDDumpGrid;
    int  DD_debug;
} gmx_domdec_comm_t;
 414
 415 /* The size per charge group of the cggl_flag buffer in gmx_domdec_comm_t */
 416 #define DD_CGIBS 2
 417
 418 /* The flags for the cggl_flag buffer in gmx_domdec_comm_t */
 419 #define DD_FLAG_NRCG  65535
 420 #define DD_FLAG_FW(d) (1<<(16+(d)*2))
 421 #define DD_FLAG_BW(d) (1<<(16+(d)*2+1))
 422
/* Zone permutation required to obtain consecutive charge groups
 * for neighbor searching.
 * NOTE(review): the first index is presumably the number of DD
 * dimensions minus one - confirm against the users of zone_perm.
 */
static const int zone_perm[3][4] = { {0, 0, 0, 0}, {1, 0, 0, 0}, {3, 0, 1, 2} };

/* dd_zo and dd_zp3/dd_zp2 are set up such that i zones with non-zero
 * components see only j zones with that component 0.
 */

/* The DD zone order */
static const ivec dd_zo[DD_MAXZONE] =
{{0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, {0, 1, 1}, {0, 0, 1}, {1, 0, 1}, {1, 1, 1}};

/* In the dd_zp* tables below each entry presumably holds
 * {i-zone, first j-zone, end of the j-zone range} - TODO confirm.
 */

/* The 3D setup */
#define dd_z3n  8
#define dd_zp3n 4
static const ivec dd_zp3[dd_zp3n] = {{0, 0, 8}, {1, 3, 6}, {2, 5, 6}, {3, 5, 7}};

/* The 2D setup */
#define dd_z2n  4
#define dd_zp2n 2
static const ivec dd_zp2[dd_zp2n] = {{0, 0, 4}, {1, 3, 4}};

/* The 1D setup */
#define dd_z1n  2
#define dd_zp1n 1
static const ivec dd_zp1[dd_zp1n] = {{0, 0, 2}};
 450
/* Factors used to avoid problems due to rounding issues */
#define DD_CELL_MARGIN       1.0001
#define DD_CELL_MARGIN2      1.00005
/* Factor to account for pressure scaling during nstlist steps */
#define DD_PRES_SCALE_MARGIN 1.02

/* Turn on DLB when the load imbalance causes this amount of total loss.
 * There is a bit of overhead with DLB and it's difficult to achieve
 * a load imbalance of less than 2% with DLB.
 */
#define DD_PERF_LOSS_DLB_ON  0.02

/* Warn about imbalance due to PP or PP/PME load imbalance at this loss */
#define DD_PERF_LOSS_WARN    0.05

/* Evaluates to nc[dim[di]] + 2 + 3*di for DD dimension index di.
 * NOTE(review): presumably the number of reals in a cell boundary
 * row buffer for dynamic load balancing - confirm against the users.
 */
#define DD_CELL_F_SIZE(dd, di) ((dd)->nc[(dd)->dim[(di)]]+1+(di)*2+1+(di))

/* Use separate MPI send and receive commands
 * when nnodes <= GMX_DD_NNODES_SENDRECV.
 * This saves memory (and some copying for small nnodes).
 * For high parallelization scatter and gather calls are used.
 */
#define GMX_DD_NNODES_SENDRECV 4
 474
 475
 476 /*
 477    #define dd_index(n,i) ((((i)[ZZ]*(n)[YY] + (i)[YY])*(n)[XX]) + (i)[XX])
 478
 479    static void index2xyz(ivec nc,int ind,ivec xyz)
 480    {
 481    xyz[XX] = ind % nc[XX];
 482    xyz[YY] = (ind / nc[XX]) % nc[YY];
 483    xyz[ZZ] = ind / (nc[YY]*nc[XX]);
 484    }
 485  */
 486
 487 /* This order is required to minimize the coordinate communication in PME
 488  * which uses decomposition in the x direction.
 489  */
 490 #define dd_index(n, i) ((((i)[XX]*(n)[YY] + (i)[YY])*(n)[ZZ]) + (i)[ZZ])
 491
 492 static void ddindex2xyz(ivec nc, int ind, ivec xyz)
 493 {
 494     xyz[XX] = ind / (nc[YY]*nc[ZZ]);
 495     xyz[YY] = (ind / nc[ZZ]) % nc[YY];
 496     xyz[ZZ] = ind % nc[ZZ];
 497 }
 498
 499 static int ddcoord2ddnodeid(gmx_domdec_t *dd, ivec c)
 500 {
 501     int ddindex;
 502     int ddnodeid = -1;
 503
 504     ddindex = dd_index(dd->nc, c);
 505     if (dd->comm->bCartesianPP_PME)
 506     {
 507         ddnodeid = dd->comm->ddindex2ddnodeid[ddindex];
 508     }
 509     else if (dd->comm->bCartesianPP)
 510     {
 511 #ifdef GMX_MPI
 512         MPI_Cart_rank(dd->mpi_comm_all, c, &ddnodeid);
 513 #endif
 514     }
 515     else
 516     {
 517         ddnodeid = ddindex;
 518     }
 519
 520     return ddnodeid;
 521 }
 522
 523 static gmx_bool dynamic_dd_box(gmx_ddbox_t *ddbox, t_inputrec *ir)
 524 {
 525     return (ddbox->nboundeddim < DIM || DYNAMIC_BOX(*ir));
 526 }
 527
 528 int ddglatnr(gmx_domdec_t *dd, int i)
 529 {
 530     int atnr;
 531
 532     if (dd == NULL)
 533     {
 534         atnr = i + 1;
 535     }
 536     else
 537     {
 538         if (i >= dd->comm->nat[ddnatNR-1])
 539         {
 540             gmx_fatal(FARGS, "glatnr called with %d, which is larger than the local number of atoms (%d)", i, dd->comm->nat[ddnatNR-1]);
 541         }
 542         atnr = dd->gatindex[i] + 1;
 543     }
 544
 545     return atnr;
 546 }
 547
 548 t_block *dd_charge_groups_global(gmx_domdec_t *dd)
 549 {
 550     return &dd->comm->cgs_gl;
 551 }
 552
 553 static void vec_rvec_init(vec_rvec_t *v)
 554 {
 555     v->nalloc = 0;
 556     v->v      = NULL;
 557 }
 558
 559 static void vec_rvec_check_alloc(vec_rvec_t *v, int n)
 560 {
 561     if (n > v->nalloc)
 562     {
 563         v->nalloc = over_alloc_dd(n);
 564         srenew(v->v, v->nalloc);
 565     }
 566 }
 567
 568 void dd_store_state(gmx_domdec_t *dd, t_state *state)
 569 {
 570     int i;
 571
 572     if (state->ddp_count != dd->ddp_count)
 573     {
 574         gmx_incons("The state does not the domain decomposition state");
 575     }
 576
 577     state->ncg_gl = dd->ncg_home;
 578     if (state->ncg_gl > state->cg_gl_nalloc)
 579     {
 580         state->cg_gl_nalloc = over_alloc_dd(state->ncg_gl);
 581         srenew(state->cg_gl, state->cg_gl_nalloc);
 582     }
 583     for (i = 0; i < state->ncg_gl; i++)
 584     {
 585         state->cg_gl[i] = dd->index_gl[i];
 586     }
 587
 588     state->ddp_count_cg_gl = dd->ddp_count;
 589 }
 590
 591 gmx_domdec_zones_t *domdec_zones(gmx_domdec_t *dd)
 592 {
 593     return &dd->comm->zones;
 594 }
 595
 596 void dd_get_ns_ranges(gmx_domdec_t *dd, int icg,
 597                       int *jcg0, int *jcg1, ivec shift0, ivec shift1)
 598 {
 599     gmx_domdec_zones_t *zones;
 600     int                 izone, d, dim;
 601
 602     zones = &dd->comm->zones;
 603
 604     izone = 0;
 605     while (icg >= zones->izone[izone].cg1)
 606     {
 607         izone++;
 608     }
 609
 610     if (izone == 0)
 611     {
 612         *jcg0 = icg;
 613     }
 614     else if (izone < zones->nizone)
 615     {
 616         *jcg0 = zones->izone[izone].jcg0;
 617     }
 618     else
 619     {
 620         gmx_fatal(FARGS, "DD icg %d out of range: izone (%d) >= nizone (%d)",
 621                   icg, izone, zones->nizone);
 622     }
 623
 624     *jcg1 = zones->izone[izone].jcg1;
 625
 626     for (d = 0; d < dd->ndim; d++)
 627     {
 628         dim         = dd->dim[d];
 629         shift0[dim] = zones->izone[izone].shift0[dim];
 630         shift1[dim] = zones->izone[izone].shift1[dim];
 631         if (dd->comm->tric_dir[dim] || (dd->bGridJump && d > 0))
 632         {
 633             /* A conservative approach, this can be optimized */
 634             shift0[dim] -= 1;
 635             shift1[dim] += 1;
 636         }
 637     }
 638 }
 639
 640 int dd_natoms_vsite(gmx_domdec_t *dd)
 641 {
 642     return dd->comm->nat[ddnatVSITE];
 643 }
 644
 645 void dd_get_constraint_range(gmx_domdec_t *dd, int *at_start, int *at_end)
 646 {
 647     *at_start = dd->comm->nat[ddnatCON-1];
 648     *at_end   = dd->comm->nat[ddnatCON];
 649 }
 650
 651 void dd_move_x(gmx_domdec_t *dd, matrix box, rvec x[])
 652 {
 653     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 654     int                   *index, *cgindex;
 655     gmx_domdec_comm_t     *comm;
 656     gmx_domdec_comm_dim_t *cd;
 657     gmx_domdec_ind_t      *ind;
 658     rvec                   shift = {0, 0, 0}, *buf, *rbuf;
 659     gmx_bool               bPBC, bScrew;
 660
 661     comm = dd->comm;
 662
 663     cgindex = dd->cgindex;
 664
 665     buf = comm->vbuf.v;
 666
 667     nzone   = 1;
 668     nat_tot = dd->nat_home;
 669     for (d = 0; d < dd->ndim; d++)
 670     {
 671         bPBC   = (dd->ci[dd->dim[d]] == 0);
 672         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 673         if (bPBC)
 674         {
 675             copy_rvec(box[dd->dim[d]], shift);
 676         }
 677         cd = &comm->cd[d];
 678         for (p = 0; p < cd->np; p++)
 679         {
 680             ind   = &cd->ind[p];
 681             index = ind->index;
 682             n     = 0;
 683             if (!bPBC)
 684             {
 685                 for (i = 0; i < ind->nsend[nzone]; i++)
 686                 {
 687                     at0 = cgindex[index[i]];
 688                     at1 = cgindex[index[i]+1];
 689                     for (j = at0; j < at1; j++)
 690                     {
 691                         copy_rvec(x[j], buf[n]);
 692                         n++;
 693                     }
 694                 }
 695             }
 696             else if (!bScrew)
 697             {
 698                 for (i = 0; i < ind->nsend[nzone]; i++)
 699                 {
 700                     at0 = cgindex[index[i]];
 701                     at1 = cgindex[index[i]+1];
 702                     for (j = at0; j < at1; j++)
 703                     {
 704                         /* We need to shift the coordinates */
 705                         rvec_add(x[j], shift, buf[n]);
 706                         n++;
 707                     }
 708                 }
 709             }
 710             else
 711             {
 712                 for (i = 0; i < ind->nsend[nzone]; i++)
 713                 {
 714                     at0 = cgindex[index[i]];
 715                     at1 = cgindex[index[i]+1];
 716                     for (j = at0; j < at1; j++)
 717                     {
 718                         /* Shift x */
 719                         buf[n][XX] = x[j][XX] + shift[XX];
 720                         /* Rotate y and z.
 721                          * This operation requires a special shift force
 722                          * treatment, which is performed in calc_vir.
 723                          */
 724                         buf[n][YY] = box[YY][YY] - x[j][YY];
 725                         buf[n][ZZ] = box[ZZ][ZZ] - x[j][ZZ];
 726                         n++;
 727                     }
 728                 }
 729             }
 730
 731             if (cd->bInPlace)
 732             {
 733                 rbuf = x + nat_tot;
 734             }
 735             else
 736             {
 737                 rbuf = comm->vbuf2.v;
 738             }
 739             /* Send and receive the coordinates */
 740             dd_sendrecv_rvec(dd, d, dddirBackward,
 741                              buf,  ind->nsend[nzone+1],
 742                              rbuf, ind->nrecv[nzone+1]);
 743             if (!cd->bInPlace)
 744             {
 745                 j = 0;
 746                 for (zone = 0; zone < nzone; zone++)
 747                 {
 748                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 749                     {
 750                         copy_rvec(rbuf[j], x[i]);
 751                         j++;
 752                     }
 753                 }
 754             }
 755             nat_tot += ind->nrecv[nzone+1];
 756         }
 757         nzone += nzone;
 758     }
 759 }
 760
 761 void dd_move_f(gmx_domdec_t *dd, rvec f[], rvec *fshift)
 762 {
 763     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 764     int                   *index, *cgindex;
 765     gmx_domdec_comm_t     *comm;
 766     gmx_domdec_comm_dim_t *cd;
 767     gmx_domdec_ind_t      *ind;
 768     rvec                  *buf, *sbuf;
 769     ivec                   vis;
 770     int                    is;
 771     gmx_bool               bPBC, bScrew;
 772
 773     comm = dd->comm;
 774
 775     cgindex = dd->cgindex;
 776
 777     buf = comm->vbuf.v;
 778
 779     n       = 0;
 780     nzone   = comm->zones.n/2;
 781     nat_tot = dd->nat_tot;
 782     for (d = dd->ndim-1; d >= 0; d--)
 783     {
 784         bPBC   = (dd->ci[dd->dim[d]] == 0);
 785         bScrew = (bPBC && dd->bScrewPBC && dd->dim[d] == XX);
 786         if (fshift == NULL && !bScrew)
 787         {
 788             bPBC = FALSE;
 789         }
 790         /* Determine which shift vector we need */
 791         clear_ivec(vis);
 792         vis[dd->dim[d]] = 1;
 793         is              = IVEC2IS(vis);
 794
 795         cd = &comm->cd[d];
 796         for (p = cd->np-1; p >= 0; p--)
 797         {
 798             ind      = &cd->ind[p];
 799             nat_tot -= ind->nrecv[nzone+1];
 800             if (cd->bInPlace)
 801             {
 802                 sbuf = f + nat_tot;
 803             }
 804             else
 805             {
 806                 sbuf = comm->vbuf2.v;
 807                 j    = 0;
 808                 for (zone = 0; zone < nzone; zone++)
 809                 {
 810                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 811                     {
 812                         copy_rvec(f[i], sbuf[j]);
 813                         j++;
 814                     }
 815                 }
 816             }
 817             /* Communicate the forces */
 818             dd_sendrecv_rvec(dd, d, dddirForward,
 819                              sbuf, ind->nrecv[nzone+1],
 820                              buf,  ind->nsend[nzone+1]);
 821             index = ind->index;
 822             /* Add the received forces */
 823             n = 0;
 824             if (!bPBC)
 825             {
 826                 for (i = 0; i < ind->nsend[nzone]; i++)
 827                 {
 828                     at0 = cgindex[index[i]];
 829                     at1 = cgindex[index[i]+1];
 830                     for (j = at0; j < at1; j++)
 831                     {
 832                         rvec_inc(f[j], buf[n]);
 833                         n++;
 834                     }
 835                 }
 836             }
 837             else if (!bScrew)
 838             {
 839                 for (i = 0; i < ind->nsend[nzone]; i++)
 840                 {
 841                     at0 = cgindex[index[i]];
 842                     at1 = cgindex[index[i]+1];
 843                     for (j = at0; j < at1; j++)
 844                     {
 845                         rvec_inc(f[j], buf[n]);
 846                         /* Add this force to the shift force */
 847                         rvec_inc(fshift[is], buf[n]);
 848                         n++;
 849                     }
 850                 }
 851             }
 852             else
 853             {
 854                 for (i = 0; i < ind->nsend[nzone]; i++)
 855                 {
 856                     at0 = cgindex[index[i]];
 857                     at1 = cgindex[index[i]+1];
 858                     for (j = at0; j < at1; j++)
 859                     {
 860                         /* Rotate the force */
 861                         f[j][XX] += buf[n][XX];
 862                         f[j][YY] -= buf[n][YY];
 863                         f[j][ZZ] -= buf[n][ZZ];
 864                         if (fshift)
 865                         {
 866                             /* Add this force to the shift force */
 867                             rvec_inc(fshift[is], buf[n]);
 868                         }
 869                         n++;
 870                     }
 871                 }
 872             }
 873         }
 874         nzone /= 2;
 875     }
 876 }
 877
 878 void dd_atom_spread_real(gmx_domdec_t *dd, real v[])
 879 {
 880     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 881     int                   *index, *cgindex;
 882     gmx_domdec_comm_t     *comm;
 883     gmx_domdec_comm_dim_t *cd;
 884     gmx_domdec_ind_t      *ind;
 885     real                  *buf, *rbuf;
 886
 887     comm = dd->comm;
 888
 889     cgindex = dd->cgindex;
 890
 891     buf = &comm->vbuf.v[0][0];
 892
 893     nzone   = 1;
 894     nat_tot = dd->nat_home;
 895     for (d = 0; d < dd->ndim; d++)
 896     {
 897         cd = &comm->cd[d];
 898         for (p = 0; p < cd->np; p++)
 899         {
 900             ind   = &cd->ind[p];
 901             index = ind->index;
 902             n     = 0;
 903             for (i = 0; i < ind->nsend[nzone]; i++)
 904             {
 905                 at0 = cgindex[index[i]];
 906                 at1 = cgindex[index[i]+1];
 907                 for (j = at0; j < at1; j++)
 908                 {
 909                     buf[n] = v[j];
 910                     n++;
 911                 }
 912             }
 913
 914             if (cd->bInPlace)
 915             {
 916                 rbuf = v + nat_tot;
 917             }
 918             else
 919             {
 920                 rbuf = &comm->vbuf2.v[0][0];
 921             }
 922             /* Send and receive the coordinates */
 923             dd_sendrecv_real(dd, d, dddirBackward,
 924                              buf,  ind->nsend[nzone+1],
 925                              rbuf, ind->nrecv[nzone+1]);
 926             if (!cd->bInPlace)
 927             {
 928                 j = 0;
 929                 for (zone = 0; zone < nzone; zone++)
 930                 {
 931                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 932                     {
 933                         v[i] = rbuf[j];
 934                         j++;
 935                     }
 936                 }
 937             }
 938             nat_tot += ind->nrecv[nzone+1];
 939         }
 940         nzone += nzone;
 941     }
 942 }
 943
 944 void dd_atom_sum_real(gmx_domdec_t *dd, real v[])
 945 {
 946     int                    nzone, nat_tot, n, d, p, i, j, at0, at1, zone;
 947     int                   *index, *cgindex;
 948     gmx_domdec_comm_t     *comm;
 949     gmx_domdec_comm_dim_t *cd;
 950     gmx_domdec_ind_t      *ind;
 951     real                  *buf, *sbuf;
 952
 953     comm = dd->comm;
 954
 955     cgindex = dd->cgindex;
 956
 957     buf = &comm->vbuf.v[0][0];
 958
 959     n       = 0;
 960     nzone   = comm->zones.n/2;
 961     nat_tot = dd->nat_tot;
 962     for (d = dd->ndim-1; d >= 0; d--)
 963     {
 964         cd = &comm->cd[d];
 965         for (p = cd->np-1; p >= 0; p--)
 966         {
 967             ind      = &cd->ind[p];
 968             nat_tot -= ind->nrecv[nzone+1];
 969             if (cd->bInPlace)
 970             {
 971                 sbuf = v + nat_tot;
 972             }
 973             else
 974             {
 975                 sbuf = &comm->vbuf2.v[0][0];
 976                 j    = 0;
 977                 for (zone = 0; zone < nzone; zone++)
 978                 {
 979                     for (i = ind->cell2at0[zone]; i < ind->cell2at1[zone]; i++)
 980                     {
 981                         sbuf[j] = v[i];
 982                         j++;
 983                     }
 984                 }
 985             }
 986             /* Communicate the forces */
 987             dd_sendrecv_real(dd, d, dddirForward,
 988                              sbuf, ind->nrecv[nzone+1],
 989                              buf,  ind->nsend[nzone+1]);
 990             index = ind->index;
 991             /* Add the received forces */
 992             n = 0;
 993             for (i = 0; i < ind->nsend[nzone]; i++)
 994             {
 995                 at0 = cgindex[index[i]];
 996                 at1 = cgindex[index[i]+1];
 997                 for (j = at0; j < at1; j++)
 998                 {
 999                     v[j] += buf[n];
1000                     n++;
1001                 }
1002             }
1003         }
1004         nzone /= 2;
1005     }
1006 }
1007
1008 static void print_ddzone(FILE *fp, int d, int i, int j, gmx_ddzone_t *zone)
1009 {
1010     fprintf(fp, "zone d0 %d d1 %d d2 %d  min0 %6.3f max1 %6.3f mch0 %6.3f mch1 %6.3f p1_0 %6.3f p1_1 %6.3f\n",
1011             d, i, j,
1012             zone->min0, zone->max1,
1013             zone->mch0, zone->mch0,
1014             zone->p1_0, zone->p1_1);
1015 }
1016
1017
/* Number of gmx_ddzone_t elements in the zone communication buffers
 * declared in dd_sendrecv_ddzone() and dd_move_cellx().
 */
#define DDZONECOMM_MAXZONE  5
/* Number of rvec elements used to pack one gmx_ddzone_t for communication */
#define DDZONECOMM_BUFSIZE  3
1020
1021 static void dd_sendrecv_ddzone(const gmx_domdec_t *dd,
1022                                int ddimind, int direction,
1023                                gmx_ddzone_t *buf_s, int n_s,
1024                                gmx_ddzone_t *buf_r, int n_r)
1025 {
1026 #define ZBS  DDZONECOMM_BUFSIZE
1027     rvec vbuf_s[DDZONECOMM_MAXZONE*ZBS];
1028     rvec vbuf_r[DDZONECOMM_MAXZONE*ZBS];
1029     int  i;
1030
1031     for (i = 0; i < n_s; i++)
1032     {
1033         vbuf_s[i*ZBS  ][0] = buf_s[i].min0;
1034         vbuf_s[i*ZBS  ][1] = buf_s[i].max1;
1035         vbuf_s[i*ZBS  ][2] = buf_s[i].min1;
1036         vbuf_s[i*ZBS+1][0] = buf_s[i].mch0;
1037         vbuf_s[i*ZBS+1][1] = buf_s[i].mch1;
1038         vbuf_s[i*ZBS+1][2] = 0;
1039         vbuf_s[i*ZBS+2][0] = buf_s[i].p1_0;
1040         vbuf_s[i*ZBS+2][1] = buf_s[i].p1_1;
1041         vbuf_s[i*ZBS+2][2] = 0;
1042     }
1043
1044     dd_sendrecv_rvec(dd, ddimind, direction,
1045                      vbuf_s, n_s*ZBS,
1046                      vbuf_r, n_r*ZBS);
1047
1048     for (i = 0; i < n_r; i++)
1049     {
1050         buf_r[i].min0 = vbuf_r[i*ZBS  ][0];
1051         buf_r[i].max1 = vbuf_r[i*ZBS  ][1];
1052         buf_r[i].min1 = vbuf_r[i*ZBS  ][2];
1053         buf_r[i].mch0 = vbuf_r[i*ZBS+1][0];
1054         buf_r[i].mch1 = vbuf_r[i*ZBS+1][1];
1055         buf_r[i].p1_0 = vbuf_r[i*ZBS+2][0];
1056         buf_r[i].p1_1 = vbuf_r[i*ZBS+2][1];
1057     }
1058
1059 #undef ZBS
1060 }
1061
/* Exchange cell boundary and zone limit information using
 * dd_sendrecv_rvec() and dd_sendrecv_ddzone().
 *
 * For every decomposition dimension except the last one, starting with
 * the second to last, the extremes of the cell fractions (extr_s) are
 * communicated in the forward direction and the zone limits
 * (gmx_ddzone_t) are communicated in the backward direction over the
 * communication pulses of that dimension. The results are stored in
 * comm->zone_d1, comm->zone_d2, comm->cell_f_max0 and comm->cell_f_min1.
 * Finally cell_ns_x0 and cell_ns_x1 are extended so that they include
 * the min0/max1 limits of all collected zones.
 *
 * dd          domain decomposition data; dd->comm is updated
 * ddbox       box information; npbcdim, tric_dir and v are read
 * cell_ns_x0  lower cell limits, read and updated
 * cell_ns_x1  upper cell limits, read and updated
 *
 * NOTE(review): the locals dim1 and k are declared but never used here.
 */
static void dd_move_cellx(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
                          rvec cell_ns_x0, rvec cell_ns_x1)
{
    int                d, d1, dim, dim1, pos, buf_size, i, j, k, p, npulse, npulse_min;
    gmx_ddzone_t      *zp;
    gmx_ddzone_t       buf_s[DDZONECOMM_MAXZONE];
    gmx_ddzone_t       buf_r[DDZONECOMM_MAXZONE];
    gmx_ddzone_t       buf_e[DDZONECOMM_MAXZONE];
    rvec               extr_s[2], extr_r[2];
    rvec               dh;
    real               dist_d, c = 0, det;
    gmx_domdec_comm_t *comm;
    gmx_bool           bPBC, bUse;

    comm = dd->comm;

    /* Initialize the first zone entry of dimension 1 (zone_d1[0]) and
     * dimension 2 (zone_d2[0][0]) with the limits passed in by the caller.
     */
    for (d = 1; d < dd->ndim; d++)
    {
        dim      = dd->dim[d];
        zp       = (d == 1) ? &comm->zone_d1[0] : &comm->zone_d2[0][0];
        zp->min0 = cell_ns_x0[dim];
        zp->max1 = cell_ns_x1[dim];
        zp->min1 = cell_ns_x1[dim];
        zp->mch0 = cell_ns_x0[dim];
        zp->mch1 = cell_ns_x1[dim];
        zp->p1_0 = cell_ns_x0[dim];
        zp->p1_1 = cell_ns_x1[dim];
    }

    /* Communicate along all dimensions but the last, in reverse order,
     * so that extr_s[d1] for d1 > d has already been set when it is
     * packed for dimension d.
     */
    for (d = dd->ndim-2; d >= 0; d--)
    {
        dim  = dd->dim[d];
        bPBC = (dim < ddbox->npbcdim);

        /* Use an rvec to store the three reals: f0, f1 and f1 again */
        extr_s[d][0] = comm->cell_f0[d+1];
        extr_s[d][1] = comm->cell_f1[d+1];
        extr_s[d][2] = comm->cell_f1[d+1];

        pos = 0;
        /* Store the extremes in the backward sending buffer,
         * so they get updated separately from the forward communication.
         */
        for (d1 = d; d1 < dd->ndim-1; d1++)
        {
            /* We invert the order to be able to use the same loop for buf_e */
            buf_s[pos].min0 = extr_s[d1][1];
            buf_s[pos].max1 = extr_s[d1][0];
            buf_s[pos].min1 = extr_s[d1][2];
            buf_s[pos].mch0 = 0;
            buf_s[pos].mch1 = 0;
            /* Store the cell corner of the dimension we communicate along */
            buf_s[pos].p1_0 = comm->cell_x0[dim];
            buf_s[pos].p1_1 = 0;
            pos++;
        }

        /* Append the zone limits of the last decomposition dimension */
        buf_s[pos] = (dd->ndim == 2) ? comm->zone_d1[0] : comm->zone_d2[0][0];
        pos++;

        /* With three dimensions, the pulses along the first dimension also
         * carry zone_d2[0][1] and zone_d1[0].
         */
        if (dd->ndim == 3 && d == 0)
        {
            buf_s[pos] = comm->zone_d2[0][1];
            pos++;
            buf_s[pos] = comm->zone_d1[0];
            pos++;
        }

        /* We only need to communicate the extremes
         * in the forward direction
         */
        npulse = comm->cd[d].np;
        if (bPBC)
        {
            /* Take the minimum to avoid double communication */
            npulse_min = min(npulse, dd->nc[dim]-1-npulse);
        }
        else
        {
            /* Without PBC we should really not communicate over
             * the boundaries, but implementing that complicates
             * the communication setup and therefore we simply
             * do all communication, but ignore some data.
             */
            npulse_min = npulse;
        }
        for (p = 0; p < npulse_min; p++)
        {
            /* Communicate the extremes forward */
            bUse = (bPBC || dd->ci[dim] > 0);

            dd_sendrecv_rvec(dd, d, dddirForward,
                             extr_s+d, dd->ndim-d-1,
                             extr_r+d, dd->ndim-d-1);

            /* Merge the received extremes, unless the data is to be
             * ignored (no PBC and cell index 0 along this dimension).
             */
            if (bUse)
            {
                for (d1 = d; d1 < dd->ndim-1; d1++)
                {
                    extr_s[d1][0] = max(extr_s[d1][0], extr_r[d1][0]);
                    extr_s[d1][1] = min(extr_s[d1][1], extr_r[d1][1]);
                    extr_s[d1][2] = min(extr_s[d1][2], extr_r[d1][2]);
                }
            }
        }

        /* The number of zone entries packed above for this dimension */
        buf_size = pos;
        for (p = 0; p < npulse; p++)
        {
            /* Communicate all the zone information backward */
            bUse = (bPBC || dd->ci[dim] < dd->nc[dim] - 1);

            dd_sendrecv_ddzone(dd, d, dddirBackward,
                               buf_s, buf_size,
                               buf_r, buf_size);

            /* dh stays zero for the first pulse */
            clear_rvec(dh);
            if (p > 0)
            {
                for (d1 = d+1; d1 < dd->ndim; d1++)
                {
                    /* Determine the decrease of maximum required
                     * communication height along d1 due to the distance along d,
                     * this avoids a lot of useless atom communication.
                     */
                    dist_d = comm->cell_x1[dim] - buf_r[0].p1_0;

                    if (ddbox->tric_dir[dim])
                    {
                        /* c is the off-diagonal coupling between the cell planes
                         * along directions d and d1.
                         */
                        c = ddbox->v[dim][dd->dim[d1]][dim];
                    }
                    else
                    {
                        c = 0;
                    }
                    det = (1 + c*c)*comm->cutoff*comm->cutoff - dist_d*dist_d;
                    if (det > 0)
                    {
                        dh[d1] = comm->cutoff - (c*dist_d + sqrt(det))/(1 + c*c);
                    }
                    else
                    {
                        /* A negative value signals out of range */
                        dh[d1] = -1;
                    }
                }
            }

            /* Accumulate the extremes over all pulses */
            for (i = 0; i < buf_size; i++)
            {
                if (p == 0)
                {
                    /* The first pulse initializes the accumulator */
                    buf_e[i] = buf_r[i];
                }
                else
                {
                    if (bUse)
                    {
                        buf_e[i].min0 = min(buf_e[i].min0, buf_r[i].min0);
                        buf_e[i].max1 = max(buf_e[i].max1, buf_r[i].max1);
                        buf_e[i].min1 = min(buf_e[i].min1, buf_r[i].min1);
                    }

                    /* Select which component of dh applies to this entry:
                     * the last entry sent along d == 0 with three
                     * dimensions is zone_d1[0] and uses index 1, all
                     * other entries use index d + 1.
                     */
                    if (dd->ndim == 3 && d == 0 && i == buf_size - 1)
                    {
                        d1 = 1;
                    }
                    else
                    {
                        d1 = d + 1;
                    }
                    if (bUse && dh[d1] >= 0)
                    {
                        buf_e[i].mch0 = max(buf_e[i].mch0, buf_r[i].mch0-dh[d1]);
                        buf_e[i].mch1 = max(buf_e[i].mch1, buf_r[i].mch1-dh[d1]);
                    }
                }
                /* Copy the received buffer to the send buffer,
                 * to pass the data through with the next pulse.
                 */
                buf_s[i] = buf_r[i];
            }
            /* Store the accumulated data once: at the last pulse, or,
             * without PBC, at the pulse for which ci+1+p equals nc-1.
             */
            if (((bPBC || dd->ci[dim]+npulse < dd->nc[dim]) && p == npulse-1) ||
                (!bPBC && dd->ci[dim]+1+p == dd->nc[dim]-1))
            {
                /* Store the extremes */
                pos = 0;

                /* Unpack in the same order as the entries were packed */
                for (d1 = d; d1 < dd->ndim-1; d1++)
                {
                    extr_s[d1][1] = min(extr_s[d1][1], buf_e[pos].min0);
                    extr_s[d1][0] = max(extr_s[d1][0], buf_e[pos].max1);
                    extr_s[d1][2] = min(extr_s[d1][2], buf_e[pos].min1);
                    pos++;
                }

                if (d == 1 || (d == 0 && dd->ndim == 3))
                {
                    /* d == 1 fills zone_d2[0][1];
                     * d == 0 fills zone_d2[1][0] and zone_d2[1][1].
                     */
                    for (i = d; i < 2; i++)
                    {
                        comm->zone_d2[1-d][i] = buf_e[pos];
                        pos++;
                    }
                }
                if (d == 0)
                {
                    comm->zone_d1[1] = buf_e[pos];
                    pos++;
                }
            }
        }
    }

    /* Extend the caller's limits along the second decomposition dimension
     * with the limits of both zone_d1 entries.
     */
    if (dd->ndim >= 2)
    {
        dim = dd->dim[1];
        for (i = 0; i < 2; i++)
        {
            if (debug)
            {
                print_ddzone(debug, 1, i, 0, &comm->zone_d1[i]);
            }
            cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d1[i].min0);
            cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d1[i].max1);
        }
    }
    /* Do the same along the third decomposition dimension with all four
     * zone_d2 entries.
     */
    if (dd->ndim >= 3)
    {
        dim = dd->dim[2];
        for (i = 0; i < 2; i++)
        {
            for (j = 0; j < 2; j++)
            {
                if (debug)
                {
                    print_ddzone(debug, 2, i, j, &comm->zone_d2[i][j]);
                }
                cell_ns_x0[dim] = min(cell_ns_x0[dim], comm->zone_d2[i][j].min0);
                cell_ns_x1[dim] = max(cell_ns_x1[dim], comm->zone_d2[i][j].max1);
            }
        }
    }
    /* Store the accumulated cell fraction extremes */
    for (d = 1; d < dd->ndim; d++)
    {
        comm->cell_f_max0[d] = extr_s[d-1][0];
        comm->cell_f_min1[d] = extr_s[d-1][1];
        if (debug)
        {
            fprintf(debug, "Cell fraction d %d, max0 %f, min1 %f\n",
                    d, comm->cell_f_max0[d], comm->cell_f_min1[d]);
        }
    }
}
1319
1320 static void dd_collect_cg(gmx_domdec_t *dd,
1321                           t_state      *state_local)
1322 {
1323     gmx_domdec_master_t *ma = NULL;
1324     int                  buf2[2], *ibuf, i, ncg_home = 0, *cg = NULL, nat_home = 0;
1325
1326     if (state_local->ddp_count == dd->comm->master_cg_ddp_count)
1327     {
1328         /* The master has the correct distribution */
1329         return;
1330     }
1331
1332     if (state_local->ddp_count == dd->ddp_count)
1333     {
1334         /* The local state and DD are in sync, use the DD indices */
1335         ncg_home = dd->ncg_home;
1336         cg       = dd->index_gl;
1337         nat_home = dd->nat_home;
1338     }
1339     else if (state_local->ddp_count_cg_gl == state_local->ddp_count)
1340     {
1341         /* The DD is out of sync with the local state, but we have stored
1342          * the cg indices with the local state, so we can use those.
1343          */
1344         t_block *cgs_gl;
1345
1346         cgs_gl = &dd->comm->cgs_gl;
1347
1348         ncg_home = state_local->ncg_gl;
1349         cg       = state_local->cg_gl;
1350         nat_home = 0;
1351         for (i = 0; i < ncg_home; i++)
1352         {
1353             nat_home += cgs_gl->index[cg[i]+1] - cgs_gl->index[cg[i]];
1354         }
1355     }
1356     else
1357     {
1358         gmx_incons("Attempted to collect a vector for a state for which the charge group distribution is unknown");
1359     }
1360
1361     buf2[0] = ncg_home;
1362     buf2[1] = nat_home;
1363     if (DDMASTER(dd))
1364     {
1365         ma   = dd->ma;
1366         ibuf = ma->ibuf;
1367     }
1368     else
1369     {
1370         ibuf = NULL;
1371     }
1372     /* Collect the charge group and atom counts on the master */
1373     dd_gather(dd, 2*sizeof(int), buf2, ibuf);
1374
1375     if (DDMASTER(dd))
1376     {
1377         ma->index[0] = 0;
1378         for (i = 0; i < dd->nnodes; i++)
1379         {
1380             ma->ncg[i]     = ma->ibuf[2*i];
1381             ma->nat[i]     = ma->ibuf[2*i+1];
1382             ma->index[i+1] = ma->index[i] + ma->ncg[i];
1383
1384         }
1385         /* Make byte counts and indices */
1386         for (i = 0; i < dd->nnodes; i++)
1387         {
1388             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
1389             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
1390         }
1391         if (debug)
1392         {
1393             fprintf(debug, "Initial charge group distribution: ");
1394             for (i = 0; i < dd->nnodes; i++)
1395             {
1396                 fprintf(debug, " %d", ma->ncg[i]);
1397             }
1398             fprintf(debug, "\n");
1399         }
1400     }
1401
1402     /* Collect the charge group indices on the master */
1403     dd_gatherv(dd,
1404                ncg_home*sizeof(int), cg,
1405                DDMASTER(dd) ? ma->ibuf : NULL,
1406                DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
1407                DDMASTER(dd) ? ma->cg : NULL);
1408
1409     dd->comm->master_cg_ddp_count = state_local->ddp_count;
1410 }
1411
1412 static void dd_collect_vec_sendrecv(gmx_domdec_t *dd,
1413                                     rvec *lv, rvec *v)
1414 {
1415     gmx_domdec_master_t *ma;
1416     int                  n, i, c, a, nalloc = 0;
1417     rvec                *buf = NULL;
1418     t_block             *cgs_gl;
1419
1420     ma = dd->ma;
1421
1422     if (!DDMASTER(dd))
1423     {
1424 #ifdef GMX_MPI
1425         MPI_Send(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1426                  dd->rank, dd->mpi_comm_all);
1427 #endif
1428     }
1429     else
1430     {
1431         /* Copy the master coordinates to the global array */
1432         cgs_gl = &dd->comm->cgs_gl;
1433
1434         n = DDMASTERRANK(dd);
1435         a = 0;
1436         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1437         {
1438             for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1439             {
1440                 copy_rvec(lv[a++], v[c]);
1441             }
1442         }
1443
1444         for (n = 0; n < dd->nnodes; n++)
1445         {
1446             if (n != dd->rank)
1447             {
1448                 if (ma->nat[n] > nalloc)
1449                 {
1450                     nalloc = over_alloc_dd(ma->nat[n]);
1451                     srenew(buf, nalloc);
1452                 }
1453 #ifdef GMX_MPI
1454                 MPI_Recv(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE, DDRANK(dd, n),
1455                          n, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1456 #endif
1457                 a = 0;
1458                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1459                 {
1460                     for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1461                     {
1462                         copy_rvec(buf[a++], v[c]);
1463                     }
1464                 }
1465             }
1466         }
1467         sfree(buf);
1468     }
1469 }
1470
1471 static void get_commbuffer_counts(gmx_domdec_t *dd,
1472                                   int **counts, int **disps)
1473 {
1474     gmx_domdec_master_t *ma;
1475     int                  n;
1476
1477     ma = dd->ma;
1478
1479     /* Make the rvec count and displacment arrays */
1480     *counts  = ma->ibuf;
1481     *disps   = ma->ibuf + dd->nnodes;
1482     for (n = 0; n < dd->nnodes; n++)
1483     {
1484         (*counts)[n] = ma->nat[n]*sizeof(rvec);
1485         (*disps)[n]  = (n == 0 ? 0 : (*disps)[n-1] + (*counts)[n-1]);
1486     }
1487 }
1488
1489 static void dd_collect_vec_gatherv(gmx_domdec_t *dd,
1490                                    rvec *lv, rvec *v)
1491 {
1492     gmx_domdec_master_t *ma;
1493     int                 *rcounts = NULL, *disps = NULL;
1494     int                  n, i, c, a;
1495     rvec                *buf = NULL;
1496     t_block             *cgs_gl;
1497
1498     ma = dd->ma;
1499
1500     if (DDMASTER(dd))
1501     {
1502         get_commbuffer_counts(dd, &rcounts, &disps);
1503
1504         buf = ma->vbuf;
1505     }
1506
1507     dd_gatherv(dd, dd->nat_home*sizeof(rvec), lv, rcounts, disps, buf);
1508
1509     if (DDMASTER(dd))
1510     {
1511         cgs_gl = &dd->comm->cgs_gl;
1512
1513         a = 0;
1514         for (n = 0; n < dd->nnodes; n++)
1515         {
1516             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1517             {
1518                 for (c = cgs_gl->index[ma->cg[i]]; c < cgs_gl->index[ma->cg[i]+1]; c++)
1519                 {
1520                     copy_rvec(buf[a++], v[c]);
1521                 }
1522             }
1523         }
1524     }
1525 }
1526
1527 void dd_collect_vec(gmx_domdec_t *dd,
1528                     t_state *state_local, rvec *lv, rvec *v)
1529 {
1530     gmx_domdec_master_t *ma;
1531     int                  n, i, c, a, nalloc = 0;
1532     rvec                *buf = NULL;
1533
1534     dd_collect_cg(dd, state_local);
1535
1536     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1537     {
1538         dd_collect_vec_sendrecv(dd, lv, v);
1539     }
1540     else
1541     {
1542         dd_collect_vec_gatherv(dd, lv, v);
1543     }
1544 }
1545
1546
1547 void dd_collect_state(gmx_domdec_t *dd,
1548                       t_state *state_local, t_state *state)
1549 {
1550     int est, i, j, nh;
1551
1552     nh = state->nhchainlength;
1553
1554     if (DDMASTER(dd))
1555     {
1556         for (i = 0; i < efptNR; i++)
1557         {
1558             state->lambda[i] = state_local->lambda[i];
1559         }
1560         state->fep_state = state_local->fep_state;
1561         state->veta      = state_local->veta;
1562         state->vol0      = state_local->vol0;
1563         copy_mat(state_local->box, state->box);
1564         copy_mat(state_local->boxv, state->boxv);
1565         copy_mat(state_local->svir_prev, state->svir_prev);
1566         copy_mat(state_local->fvir_prev, state->fvir_prev);
1567         copy_mat(state_local->pres_prev, state->pres_prev);
1568
1569         for (i = 0; i < state_local->ngtc; i++)
1570         {
1571             for (j = 0; j < nh; j++)
1572             {
1573                 state->nosehoover_xi[i*nh+j]        = state_local->nosehoover_xi[i*nh+j];
1574                 state->nosehoover_vxi[i*nh+j]       = state_local->nosehoover_vxi[i*nh+j];
1575             }
1576             state->therm_integral[i] = state_local->therm_integral[i];
1577         }
1578         for (i = 0; i < state_local->nnhpres; i++)
1579         {
1580             for (j = 0; j < nh; j++)
1581             {
1582                 state->nhpres_xi[i*nh+j]        = state_local->nhpres_xi[i*nh+j];
1583                 state->nhpres_vxi[i*nh+j]       = state_local->nhpres_vxi[i*nh+j];
1584             }
1585         }
1586     }
1587     for (est = 0; est < estNR; est++)
1588     {
1589         if (EST_DISTR(est) && (state_local->flags & (1<<est)))
1590         {
1591             switch (est)
1592             {
1593                 case estX:
1594                     dd_collect_vec(dd, state_local, state_local->x, state->x);
1595                     break;
1596                 case estV:
1597                     dd_collect_vec(dd, state_local, state_local->v, state->v);
1598                     break;
1599                 case estSDX:
1600                     dd_collect_vec(dd, state_local, state_local->sd_X, state->sd_X);
1601                     break;
1602                 case estCGP:
1603                     dd_collect_vec(dd, state_local, state_local->cg_p, state->cg_p);
1604                     break;
1605                 case estDISRE_INITF:
1606                 case estDISRE_RM3TAV:
1607                 case estORIRE_INITF:
1608                 case estORIRE_DTAV:
1609                     break;
1610                 default:
1611                     gmx_incons("Unknown state entry encountered in dd_collect_state");
1612             }
1613         }
1614     }
1615 }
1616
1617 static void dd_realloc_state(t_state *state, rvec **f, int nalloc)
1618 {
1619     int est;
1620
1621     if (debug)
1622     {
1623         fprintf(debug, "Reallocating state: currently %d, required %d, allocating %d\n", state->nalloc, nalloc, over_alloc_dd(nalloc));
1624     }
1625
1626     state->nalloc = over_alloc_dd(nalloc);
1627
1628     for (est = 0; est < estNR; est++)
1629     {
1630         if (EST_DISTR(est) && (state->flags & (1<<est)))
1631         {
1632             switch (est)
1633             {
1634                 case estX:
1635                     srenew(state->x, state->nalloc);
1636                     break;
1637                 case estV:
1638                     srenew(state->v, state->nalloc);
1639                     break;
1640                 case estSDX:
1641                     srenew(state->sd_X, state->nalloc);
1642                     break;
1643                 case estCGP:
1644                     srenew(state->cg_p, state->nalloc);
1645                     break;
1646                 case estDISRE_INITF:
1647                 case estDISRE_RM3TAV:
1648                 case estORIRE_INITF:
1649                 case estORIRE_DTAV:
1650                     /* No reallocation required */
1651                     break;
1652                 default:
1653                     gmx_incons("Unknown state entry encountered in dd_realloc_state");
1654             }
1655         }
1656     }
1657
1658     if (f != NULL)
1659     {
1660         srenew(*f, state->nalloc);
1661     }
1662 }
1663
1664 static void dd_check_alloc_ncg(t_forcerec *fr, t_state *state, rvec **f,
1665                                int nalloc)
1666 {
1667     if (nalloc > fr->cg_nalloc)
1668     {
1669         if (debug)
1670         {
1671             fprintf(debug, "Reallocating forcerec: currently %d, required %d, allocating %d\n", fr->cg_nalloc, nalloc, over_alloc_dd(nalloc));
1672         }
1673         fr->cg_nalloc = over_alloc_dd(nalloc);
1674         srenew(fr->cginfo, fr->cg_nalloc);
1675         if (fr->cutoff_scheme == ecutsGROUP)
1676         {
1677             srenew(fr->cg_cm, fr->cg_nalloc);
1678         }
1679     }
1680     if (fr->cutoff_scheme == ecutsVERLET && nalloc > state->nalloc)
1681     {
1682         /* We don't use charge groups, we use x in state to set up
1683          * the atom communication.
1684          */
1685         dd_realloc_state(state, f, nalloc);
1686     }
1687 }
1688
1689 static void dd_distribute_vec_sendrecv(gmx_domdec_t *dd, t_block *cgs,
1690                                        rvec *v, rvec *lv)
1691 {
1692     gmx_domdec_master_t *ma;
1693     int                  n, i, c, a, nalloc = 0;
1694     rvec                *buf = NULL;
1695
1696     if (DDMASTER(dd))
1697     {
1698         ma  = dd->ma;
1699
1700         for (n = 0; n < dd->nnodes; n++)
1701         {
1702             if (n != dd->rank)
1703             {
1704                 if (ma->nat[n] > nalloc)
1705                 {
1706                     nalloc = over_alloc_dd(ma->nat[n]);
1707                     srenew(buf, nalloc);
1708                 }
1709                 /* Use lv as a temporary buffer */
1710                 a = 0;
1711                 for (i = ma->index[n]; i < ma->index[n+1]; i++)
1712                 {
1713                     for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1714                     {
1715                         copy_rvec(v[c], buf[a++]);
1716                     }
1717                 }
1718                 if (a != ma->nat[n])
1719                 {
1720                     gmx_fatal(FARGS, "Internal error a (%d) != nat (%d)",
1721                               a, ma->nat[n]);
1722                 }
1723
1724 #ifdef GMX_MPI
1725                 MPI_Send(buf, ma->nat[n]*sizeof(rvec), MPI_BYTE,
1726                          DDRANK(dd, n), n, dd->mpi_comm_all);
1727 #endif
1728             }
1729         }
1730         sfree(buf);
1731         n = DDMASTERRANK(dd);
1732         a = 0;
1733         for (i = ma->index[n]; i < ma->index[n+1]; i++)
1734         {
1735             for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1736             {
1737                 copy_rvec(v[c], lv[a++]);
1738             }
1739         }
1740     }
1741     else
1742     {
1743 #ifdef GMX_MPI
1744         MPI_Recv(lv, dd->nat_home*sizeof(rvec), MPI_BYTE, DDMASTERRANK(dd),
1745                  MPI_ANY_TAG, dd->mpi_comm_all, MPI_STATUS_IGNORE);
1746 #endif
1747     }
1748 }
1749
1750 static void dd_distribute_vec_scatterv(gmx_domdec_t *dd, t_block *cgs,
1751                                        rvec *v, rvec *lv)
1752 {
1753     gmx_domdec_master_t *ma;
1754     int                 *scounts = NULL, *disps = NULL;
1755     int                  n, i, c, a, nalloc = 0;
1756     rvec                *buf = NULL;
1757
1758     if (DDMASTER(dd))
1759     {
1760         ma  = dd->ma;
1761
1762         get_commbuffer_counts(dd, &scounts, &disps);
1763
1764         buf = ma->vbuf;
1765         a   = 0;
1766         for (n = 0; n < dd->nnodes; n++)
1767         {
1768             for (i = ma->index[n]; i < ma->index[n+1]; i++)
1769             {
1770                 for (c = cgs->index[ma->cg[i]]; c < cgs->index[ma->cg[i]+1]; c++)
1771                 {
1772                     copy_rvec(v[c], buf[a++]);
1773                 }
1774             }
1775         }
1776     }
1777
1778     dd_scatterv(dd, scounts, disps, buf, dd->nat_home*sizeof(rvec), lv);
1779 }
1780
1781 static void dd_distribute_vec(gmx_domdec_t *dd, t_block *cgs, rvec *v, rvec *lv)
1782 {
1783     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
1784     {
1785         dd_distribute_vec_sendrecv(dd, cgs, v, lv);
1786     }
1787     else
1788     {
1789         dd_distribute_vec_scatterv(dd, cgs, v, lv);
1790     }
1791 }
1792
1793 static void dd_distribute_dfhist(gmx_domdec_t *dd, df_history_t *dfhist)
1794 {
1795     int i;
1796     dd_bcast(dd, sizeof(int), &dfhist->bEquil);
1797     dd_bcast(dd, sizeof(int), &dfhist->nlambda);
1798     dd_bcast(dd, sizeof(real), &dfhist->wl_delta);
1799
1800     if (dfhist->nlambda > 0)
1801     {
1802         int nlam = dfhist->nlambda;
1803         dd_bcast(dd, sizeof(int)*nlam, dfhist->n_at_lam);
1804         dd_bcast(dd, sizeof(real)*nlam, dfhist->wl_histo);
1805         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_weights);
1806         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_dg);
1807         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_minvar);
1808         dd_bcast(dd, sizeof(real)*nlam, dfhist->sum_variance);
1809
1810         for (i = 0; i < nlam; i++)
1811         {
1812             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p[i]);
1813             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m[i]);
1814             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_p2[i]);
1815             dd_bcast(dd, sizeof(real)*nlam, dfhist->accum_m2[i]);
1816             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij[i]);
1817             dd_bcast(dd, sizeof(real)*nlam, dfhist->Tij_empirical[i]);
1818         }
1819     }
1820 }
1821
1822 static void dd_distribute_state(gmx_domdec_t *dd, t_block *cgs,
1823                                 t_state *state, t_state *state_local,
1824                                 rvec **f)
1825 {
1826     int  i, j, nh;
1827
1828     nh = state->nhchainlength;
1829
1830     if (DDMASTER(dd))
1831     {
1832         for (i = 0; i < efptNR; i++)
1833         {
1834             state_local->lambda[i] = state->lambda[i];
1835         }
1836         state_local->fep_state = state->fep_state;
1837         state_local->veta      = state->veta;
1838         state_local->vol0      = state->vol0;
1839         copy_mat(state->box, state_local->box);
1840         copy_mat(state->box_rel, state_local->box_rel);
1841         copy_mat(state->boxv, state_local->boxv);
1842         copy_mat(state->svir_prev, state_local->svir_prev);
1843         copy_mat(state->fvir_prev, state_local->fvir_prev);
1844         copy_df_history(&state_local->dfhist, &state->dfhist);
1845         for (i = 0; i < state_local->ngtc; i++)
1846         {
1847             for (j = 0; j < nh; j++)
1848             {
1849                 state_local->nosehoover_xi[i*nh+j]        = state->nosehoover_xi[i*nh+j];
1850                 state_local->nosehoover_vxi[i*nh+j]       = state->nosehoover_vxi[i*nh+j];
1851             }
1852             state_local->therm_integral[i] = state->therm_integral[i];
1853         }
1854         for (i = 0; i < state_local->nnhpres; i++)
1855         {
1856             for (j = 0; j < nh; j++)
1857             {
1858                 state_local->nhpres_xi[i*nh+j]        = state->nhpres_xi[i*nh+j];
1859                 state_local->nhpres_vxi[i*nh+j]       = state->nhpres_vxi[i*nh+j];
1860             }
1861         }
1862     }
1863     dd_bcast(dd, ((efptNR)*sizeof(real)), state_local->lambda);
1864     dd_bcast(dd, sizeof(int), &state_local->fep_state);
1865     dd_bcast(dd, sizeof(real), &state_local->veta);
1866     dd_bcast(dd, sizeof(real), &state_local->vol0);
1867     dd_bcast(dd, sizeof(state_local->box), state_local->box);
1868     dd_bcast(dd, sizeof(state_local->box_rel), state_local->box_rel);
1869     dd_bcast(dd, sizeof(state_local->boxv), state_local->boxv);
1870     dd_bcast(dd, sizeof(state_local->svir_prev), state_local->svir_prev);
1871     dd_bcast(dd, sizeof(state_local->fvir_prev), state_local->fvir_prev);
1872     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_xi);
1873     dd_bcast(dd, ((state_local->ngtc*nh)*sizeof(double)), state_local->nosehoover_vxi);
1874     dd_bcast(dd, state_local->ngtc*sizeof(double), state_local->therm_integral);
1875     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_xi);
1876     dd_bcast(dd, ((state_local->nnhpres*nh)*sizeof(double)), state_local->nhpres_vxi);
1877
1878     /* communicate df_history -- required for restarting from checkpoint */
1879     dd_distribute_dfhist(dd, &state_local->dfhist);
1880
1881     if (dd->nat_home > state_local->nalloc)
1882     {
1883         dd_realloc_state(state_local, f, dd->nat_home);
1884     }
1885     for (i = 0; i < estNR; i++)
1886     {
1887         if (EST_DISTR(i) && (state_local->flags & (1<<i)))
1888         {
1889             switch (i)
1890             {
1891                 case estX:
1892                     dd_distribute_vec(dd, cgs, state->x, state_local->x);
1893                     break;
1894                 case estV:
1895                     dd_distribute_vec(dd, cgs, state->v, state_local->v);
1896                     break;
1897                 case estSDX:
1898                     dd_distribute_vec(dd, cgs, state->sd_X, state_local->sd_X);
1899                     break;
1900                 case estCGP:
1901                     dd_distribute_vec(dd, cgs, state->cg_p, state_local->cg_p);
1902                     break;
1903                 case estDISRE_INITF:
1904                 case estDISRE_RM3TAV:
1905                 case estORIRE_INITF:
1906                 case estORIRE_DTAV:
1907                     /* Not implemented yet */
1908                     break;
1909                 default:
1910                     gmx_incons("Unknown state entry encountered in dd_distribute_state");
1911             }
1912         }
1913     }
1914 }
1915
1916 static char dim2char(int dim)
1917 {
1918     char c = '?';
1919
1920     switch (dim)
1921     {
1922         case XX: c = 'X'; break;
1923         case YY: c = 'Y'; break;
1924         case ZZ: c = 'Z'; break;
1925         default: gmx_fatal(FARGS, "Unknown dim %d", dim);
1926     }
1927
1928     return c;
1929 }
1930
1931 static void write_dd_grid_pdb(const char *fn, gmx_int64_t step,
1932                               gmx_domdec_t *dd, matrix box, gmx_ddbox_t *ddbox)
1933 {
1934     rvec   grid_s[2], *grid_r = NULL, cx, r;
1935     char   fname[STRLEN], buf[22];
1936     FILE  *out;
1937     int    a, i, d, z, y, x;
1938     matrix tric;
1939     real   vol;
1940
1941     copy_rvec(dd->comm->cell_x0, grid_s[0]);
1942     copy_rvec(dd->comm->cell_x1, grid_s[1]);
1943
1944     if (DDMASTER(dd))
1945     {
1946         snew(grid_r, 2*dd->nnodes);
1947     }
1948
1949     dd_gather(dd, 2*sizeof(rvec), grid_s[0], DDMASTER(dd) ? grid_r[0] : NULL);
1950
1951     if (DDMASTER(dd))
1952     {
1953         for (d = 0; d < DIM; d++)
1954         {
1955             for (i = 0; i < DIM; i++)
1956             {
1957                 if (d == i)
1958                 {
1959                     tric[d][i] = 1;
1960                 }
1961                 else
1962                 {
1963                     if (d < ddbox->npbcdim && dd->nc[d] > 1)
1964                     {
1965                         tric[d][i] = box[i][d]/box[i][i];
1966                     }
1967                     else
1968                     {
1969                         tric[d][i] = 0;
1970                     }
1971                 }
1972             }
1973         }
1974         sprintf(fname, "%s_%s.pdb", fn, gmx_step_str(step, buf));
1975         out = gmx_fio_fopen(fname, "w");
1976         gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
1977         a = 1;
1978         for (i = 0; i < dd->nnodes; i++)
1979         {
1980             vol = dd->nnodes/(box[XX][XX]*box[YY][YY]*box[ZZ][ZZ]);
1981             for (d = 0; d < DIM; d++)
1982             {
1983                 vol *= grid_r[i*2+1][d] - grid_r[i*2][d];
1984             }
1985             for (z = 0; z < 2; z++)
1986             {
1987                 for (y = 0; y < 2; y++)
1988                 {
1989                     for (x = 0; x < 2; x++)
1990                     {
1991                         cx[XX] = grid_r[i*2+x][XX];
1992                         cx[YY] = grid_r[i*2+y][YY];
1993                         cx[ZZ] = grid_r[i*2+z][ZZ];
1994                         mvmul(tric, cx, r);
1995                         gmx_fprintf_pdb_atomline(out, epdbATOM, a++, "CA", ' ', "GLY", ' ', i+1, ' ',
1996                                                  10*r[XX], 10*r[YY], 10*r[ZZ], 1.0, vol, "");
1997                     }
1998                 }
1999             }
2000             for (d = 0; d < DIM; d++)
2001             {
2002                 for (x = 0; x < 4; x++)
2003                 {
2004                     switch (d)
2005                     {
2006                         case 0: y = 1 + i*8 + 2*x; break;
2007                         case 1: y = 1 + i*8 + 2*x - (x % 2); break;
2008                         case 2: y = 1 + i*8 + x; break;
2009                     }
2010                     fprintf(out, "%6s%5d%5d\n", "CONECT", y, y+(1<<d));
2011                 }
2012             }
2013         }
2014         gmx_fio_fclose(out);
2015         sfree(grid_r);
2016     }
2017 }
2018
2019 void write_dd_pdb(const char *fn, gmx_int64_t step, const char *title,
2020                   gmx_mtop_t *mtop, t_commrec *cr,
2021                   int natoms, rvec x[], matrix box)
2022 {
2023     char          fname[STRLEN], buf[22];
2024     FILE         *out;
2025     int           i, ii, resnr, c;
2026     char         *atomname, *resname;
2027     real          b;
2028     gmx_domdec_t *dd;
2029
2030     dd = cr->dd;
2031     if (natoms == -1)
2032     {
2033         natoms = dd->comm->nat[ddnatVSITE];
2034     }
2035
2036     sprintf(fname, "%s_%s_n%d.pdb", fn, gmx_step_str(step, buf), cr->sim_nodeid);
2037
2038     out = gmx_fio_fopen(fname, "w");
2039
2040     fprintf(out, "TITLE     %s\n", title);
2041     gmx_write_pdb_box(out, dd->bScrewPBC ? epbcSCREW : epbcXYZ, box);
2042     for (i = 0; i < natoms; i++)
2043     {
2044         ii = dd->gatindex[i];
2045         gmx_mtop_atominfo_global(mtop, ii, &atomname, &resnr, &resname);
2046         if (i < dd->comm->nat[ddnatZONE])
2047         {
2048             c = 0;
2049             while (i >= dd->cgindex[dd->comm->zones.cg_range[c+1]])
2050             {
2051                 c++;
2052             }
2053             b = c;
2054         }
2055         else if (i < dd->comm->nat[ddnatVSITE])
2056         {
2057             b = dd->comm->zones.n;
2058         }
2059         else
2060         {
2061             b = dd->comm->zones.n + 1;
2062         }
2063         gmx_fprintf_pdb_atomline(out, epdbATOM, ii+1, atomname, ' ', resname, ' ', resnr, ' ',
2064                                  10*x[i][XX], 10*x[i][YY], 10*x[i][ZZ], 1.0, b, "");
2065     }
2066     fprintf(out, "TER\n");
2067
2068     gmx_fio_fclose(out);
2069 }
2070
2071 real dd_cutoff_mbody(gmx_domdec_t *dd)
2072 {
2073     gmx_domdec_comm_t *comm;
2074     int                di;
2075     real               r;
2076
2077     comm = dd->comm;
2078
2079     r = -1;
2080     if (comm->bInterCGBondeds)
2081     {
2082         if (comm->cutoff_mbody > 0)
2083         {
2084             r = comm->cutoff_mbody;
2085         }
2086         else
2087         {
2088             /* cutoff_mbody=0 means we do not have DLB */
2089             r = comm->cellsize_min[dd->dim[0]];
2090             for (di = 1; di < dd->ndim; di++)
2091             {
2092                 r = min(r, comm->cellsize_min[dd->dim[di]]);
2093             }
2094             if (comm->bBondComm)
2095             {
2096                 r = max(r, comm->cutoff_mbody);
2097             }
2098             else
2099             {
2100                 r = min(r, comm->cutoff);
2101             }
2102         }
2103     }
2104
2105     return r;
2106 }
2107
2108 real dd_cutoff_twobody(gmx_domdec_t *dd)
2109 {
2110     real r_mb;
2111
2112     r_mb = dd_cutoff_mbody(dd);
2113
2114     return max(dd->comm->cutoff, r_mb);
2115 }
2116
2117
2118 static void dd_cart_coord2pmecoord(gmx_domdec_t *dd, ivec coord, ivec coord_pme)
2119 {
2120     int nc, ntot;
2121
2122     nc   = dd->nc[dd->comm->cartpmedim];
2123     ntot = dd->comm->ntot[dd->comm->cartpmedim];
2124     copy_ivec(coord, coord_pme);
2125     coord_pme[dd->comm->cartpmedim] =
2126         nc + (coord[dd->comm->cartpmedim]*(ntot - nc) + (ntot - nc)/2)/nc;
2127 }
2128
/* Return the index of the PME node that DD node ddindex communicates
 * with, for ndd DD nodes and npme PME nodes.
 */
static int low_ddindex2pmeindex(int ndd, int npme, int ddindex)
{
    /* The PME node is assigned by assuming that the major index of
     * both the DD and the PME nodes is x.
     * Adding npme/2 before the division gives an even distribution.
     */
    int scaled = ddindex*npme + npme/2;

    return scaled/ndd;
}
2137
2138 static int ddindex2pmeindex(const gmx_domdec_t *dd, int ddindex)
2139 {
2140     return low_ddindex2pmeindex(dd->nnodes, dd->comm->npmenodes, ddindex);
2141 }
2142
2143 static int cr_ddindex2pmeindex(const t_commrec *cr, int ddindex)
2144 {
2145     return low_ddindex2pmeindex(cr->dd->nnodes, cr->npmenodes, ddindex);
2146 }
2147
2148 static int *dd_pmenodes(t_commrec *cr)
2149 {
2150     int *pmenodes;
2151     int  n, i, p0, p1;
2152
2153     snew(pmenodes, cr->npmenodes);
2154     n = 0;
2155     for (i = 0; i < cr->dd->nnodes; i++)
2156     {
2157         p0 = cr_ddindex2pmeindex(cr, i);
2158         p1 = cr_ddindex2pmeindex(cr, i+1);
2159         if (i+1 == cr->dd->nnodes || p1 > p0)
2160         {
2161             if (debug)
2162             {
2163                 fprintf(debug, "pmenode[%d] = %d\n", n, i+1+n);
2164             }
2165             pmenodes[n] = i + 1 + n;
2166             n++;
2167         }
2168     }
2169
2170     return pmenodes;
2171 }
2172
2173 static int gmx_ddcoord2pmeindex(t_commrec *cr, int x, int y, int z)
2174 {
2175     gmx_domdec_t *dd;
2176     ivec          coords, coords_pme, nc;
2177     int           slab;
2178
2179     dd = cr->dd;
2180     /*
2181        if (dd->comm->bCartesian) {
2182        gmx_ddindex2xyz(dd->nc,ddindex,coords);
2183        dd_coords2pmecoords(dd,coords,coords_pme);
2184        copy_ivec(dd->ntot,nc);
2185        nc[dd->cartpmedim]         -= dd->nc[dd->cartpmedim];
2186        coords_pme[dd->cartpmedim] -= dd->nc[dd->cartpmedim];
2187
2188        slab = (coords_pme[XX]*nc[YY] + coords_pme[YY])*nc[ZZ] + coords_pme[ZZ];
2189        } else {
2190        slab = (ddindex*cr->npmenodes + cr->npmenodes/2)/dd->nnodes;
2191        }
2192      */
2193     coords[XX] = x;
2194     coords[YY] = y;
2195     coords[ZZ] = z;
2196     slab       = ddindex2pmeindex(dd, dd_index(dd->nc, coords));
2197
2198     return slab;
2199 }
2200
2201 static int ddcoord2simnodeid(t_commrec *cr, int x, int y, int z)
2202 {
2203     gmx_domdec_comm_t *comm;
2204     ivec               coords;
2205     int                ddindex, nodeid = -1;
2206
2207     comm = cr->dd->comm;
2208
2209     coords[XX] = x;
2210     coords[YY] = y;
2211     coords[ZZ] = z;
2212     if (comm->bCartesianPP_PME)
2213     {
2214 #ifdef GMX_MPI
2215         MPI_Cart_rank(cr->mpi_comm_mysim, coords, &nodeid);
2216 #endif
2217     }
2218     else
2219     {
2220         ddindex = dd_index(cr->dd->nc, coords);
2221         if (comm->bCartesianPP)
2222         {
2223             nodeid = comm->ddindex2simnodeid[ddindex];
2224         }
2225         else
2226         {
2227             if (comm->pmenodes)
2228             {
2229                 nodeid = ddindex + gmx_ddcoord2pmeindex(cr, x, y, z);
2230             }
2231             else
2232             {
2233                 nodeid = ddindex;
2234             }
2235         }
2236     }
2237
2238     return nodeid;
2239 }
2240
2241 static int dd_simnode2pmenode(t_commrec *cr, int sim_nodeid)
2242 {
2243     gmx_domdec_t      *dd;
2244     gmx_domdec_comm_t *comm;
2245     ivec               coord, coord_pme;
2246     int                i;
2247     int                pmenode = -1;
2248
2249     dd   = cr->dd;
2250     comm = dd->comm;
2251
2252     /* This assumes a uniform x domain decomposition grid cell size */
2253     if (comm->bCartesianPP_PME)
2254     {
2255 #ifdef GMX_MPI
2256         MPI_Cart_coords(cr->mpi_comm_mysim, sim_nodeid, DIM, coord);
2257         if (coord[comm->cartpmedim] < dd->nc[comm->cartpmedim])
2258         {
2259             /* This is a PP node */
2260             dd_cart_coord2pmecoord(dd, coord, coord_pme);
2261             MPI_Cart_rank(cr->mpi_comm_mysim, coord_pme, &pmenode);
2262         }
2263 #endif
2264     }
2265     else if (comm->bCartesianPP)
2266     {
2267         if (sim_nodeid < dd->nnodes)
2268         {
2269             pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2270         }
2271     }
2272     else
2273     {
2274         /* This assumes DD cells with identical x coordinates
2275          * are numbered sequentially.
2276          */
2277         if (dd->comm->pmenodes == NULL)
2278         {
2279             if (sim_nodeid < dd->nnodes)
2280             {
2281                 /* The DD index equals the nodeid */
2282                 pmenode = dd->nnodes + ddindex2pmeindex(dd, sim_nodeid);
2283             }
2284         }
2285         else
2286         {
2287             i = 0;
2288             while (sim_nodeid > dd->comm->pmenodes[i])
2289             {
2290                 i++;
2291             }
2292             if (sim_nodeid < dd->comm->pmenodes[i])
2293             {
2294                 pmenode = dd->comm->pmenodes[i];
2295             }
2296         }
2297     }
2298
2299     return pmenode;
2300 }
2301
2302 void get_pme_nnodes(const gmx_domdec_t *dd,
2303                     int *npmenodes_x, int *npmenodes_y)
2304 {
2305     if (dd != NULL)
2306     {
2307         *npmenodes_x = dd->comm->npmenodes_x;
2308         *npmenodes_y = dd->comm->npmenodes_y;
2309     }
2310     else
2311     {
2312         *npmenodes_x = 1;
2313         *npmenodes_y = 1;
2314     }
2315 }
2316
2317 gmx_bool gmx_pmeonlynode(t_commrec *cr, int sim_nodeid)
2318 {
2319     gmx_bool bPMEOnlyNode;
2320
2321     if (DOMAINDECOMP(cr))
2322     {
2323         bPMEOnlyNode = (dd_simnode2pmenode(cr, sim_nodeid) == -1);
2324     }
2325     else
2326     {
2327         bPMEOnlyNode = FALSE;
2328     }
2329
2330     return bPMEOnlyNode;
2331 }
2332
2333 void get_pme_ddnodes(t_commrec *cr, int pmenodeid,
2334                      int *nmy_ddnodes, int **my_ddnodes, int *node_peer)
2335 {
2336     gmx_domdec_t *dd;
2337     int           x, y, z;
2338     ivec          coord, coord_pme;
2339
2340     dd = cr->dd;
2341
2342     snew(*my_ddnodes, (dd->nnodes+cr->npmenodes-1)/cr->npmenodes);
2343
2344     *nmy_ddnodes = 0;
2345     for (x = 0; x < dd->nc[XX]; x++)
2346     {
2347         for (y = 0; y < dd->nc[YY]; y++)
2348         {
2349             for (z = 0; z < dd->nc[ZZ]; z++)
2350             {
2351                 if (dd->comm->bCartesianPP_PME)
2352                 {
2353                     coord[XX] = x;
2354                     coord[YY] = y;
2355                     coord[ZZ] = z;
2356                     dd_cart_coord2pmecoord(dd, coord, coord_pme);
2357                     if (dd->ci[XX] == coord_pme[XX] &&
2358                         dd->ci[YY] == coord_pme[YY] &&
2359                         dd->ci[ZZ] == coord_pme[ZZ])
2360                     {
2361                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2362                     }
2363                 }
2364                 else
2365                 {
2366                     /* The slab corresponds to the nodeid in the PME group */
2367                     if (gmx_ddcoord2pmeindex(cr, x, y, z) == pmenodeid)
2368                     {
2369                         (*my_ddnodes)[(*nmy_ddnodes)++] = ddcoord2simnodeid(cr, x, y, z);
2370                     }
2371                 }
2372             }
2373         }
2374     }
2375
2376     /* The last PP-only node is the peer node */
2377     *node_peer = (*my_ddnodes)[*nmy_ddnodes-1];
2378
2379     if (debug)
2380     {
2381         fprintf(debug, "Receive coordinates from PP ranks:");
2382         for (x = 0; x < *nmy_ddnodes; x++)
2383         {
2384             fprintf(debug, " %d", (*my_ddnodes)[x]);
2385         }
2386         fprintf(debug, "\n");
2387     }
2388 }
2389
2390 static gmx_bool receive_vir_ener(t_commrec *cr)
2391 {
2392     gmx_domdec_comm_t *comm;
2393     int                pmenode, coords[DIM], rank;
2394     gmx_bool           bReceive;
2395
2396     bReceive = TRUE;
2397     if (cr->npmenodes < cr->dd->nnodes)
2398     {
2399         comm = cr->dd->comm;
2400         if (comm->bCartesianPP_PME)
2401         {
2402             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2403 #ifdef GMX_MPI
2404             MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, coords);
2405             coords[comm->cartpmedim]++;
2406             if (coords[comm->cartpmedim] < cr->dd->nc[comm->cartpmedim])
2407             {
2408                 MPI_Cart_rank(cr->mpi_comm_mysim, coords, &rank);
2409                 if (dd_simnode2pmenode(cr, rank) == pmenode)
2410                 {
2411                     /* This is not the last PP node for pmenode */
2412                     bReceive = FALSE;
2413                 }
2414             }
2415 #endif
2416         }
2417         else
2418         {
2419             pmenode = dd_simnode2pmenode(cr, cr->sim_nodeid);
2420             if (cr->sim_nodeid+1 < cr->nnodes &&
2421                 dd_simnode2pmenode(cr, cr->sim_nodeid+1) == pmenode)
2422             {
2423                 /* This is not the last PP node for pmenode */
2424                 bReceive = FALSE;
2425             }
2426         }
2427     }
2428
2429     return bReceive;
2430 }
2431
2432 static void set_zones_ncg_home(gmx_domdec_t *dd)
2433 {
2434     gmx_domdec_zones_t *zones;
2435     int                 i;
2436
2437     zones = &dd->comm->zones;
2438
2439     zones->cg_range[0] = 0;
2440     for (i = 1; i < zones->n+1; i++)
2441     {
2442         zones->cg_range[i] = dd->ncg_home;
2443     }
2444     /* zone_ncg1[0] should always be equal to ncg_home */
2445     dd->comm->zone_ncg1[0] = dd->ncg_home;
2446 }
2447
2448 static void rebuild_cgindex(gmx_domdec_t *dd,
2449                             const int *gcgs_index, t_state *state)
2450 {
2451     int nat, i, *ind, *dd_cg_gl, *cgindex, cg_gl;
2452
2453     ind        = state->cg_gl;
2454     dd_cg_gl   = dd->index_gl;
2455     cgindex    = dd->cgindex;
2456     nat        = 0;
2457     cgindex[0] = nat;
2458     for (i = 0; i < state->ncg_gl; i++)
2459     {
2460         cgindex[i]  = nat;
2461         cg_gl       = ind[i];
2462         dd_cg_gl[i] = cg_gl;
2463         nat        += gcgs_index[cg_gl+1] - gcgs_index[cg_gl];
2464     }
2465     cgindex[i] = nat;
2466
2467     dd->ncg_home = state->ncg_gl;
2468     dd->nat_home = nat;
2469
2470     set_zones_ncg_home(dd);
2471 }
2472
2473 static int ddcginfo(const cginfo_mb_t *cginfo_mb, int cg)
2474 {
2475     while (cg >= cginfo_mb->cg_end)
2476     {
2477         cginfo_mb++;
2478     }
2479
2480     return cginfo_mb->cginfo[(cg - cginfo_mb->cg_start) % cginfo_mb->cg_mod];
2481 }
2482
2483 static void dd_set_cginfo(int *index_gl, int cg0, int cg1,
2484                           t_forcerec *fr, char *bLocalCG)
2485 {
2486     cginfo_mb_t *cginfo_mb;
2487     int         *cginfo;
2488     int          cg;
2489
2490     if (fr != NULL)
2491     {
2492         cginfo_mb = fr->cginfo_mb;
2493         cginfo    = fr->cginfo;
2494
2495         for (cg = cg0; cg < cg1; cg++)
2496         {
2497             cginfo[cg] = ddcginfo(cginfo_mb, index_gl[cg]);
2498         }
2499     }
2500
2501     if (bLocalCG != NULL)
2502     {
2503         for (cg = cg0; cg < cg1; cg++)
2504         {
2505             bLocalCG[index_gl[cg]] = TRUE;
2506         }
2507     }
2508 }
2509
2510 static void make_dd_indices(gmx_domdec_t *dd,
2511                             const int *gcgs_index, int cg_start)
2512 {
2513     int          nzone, zone, zone1, cg0, cg1, cg1_p1, cg, cg_gl, a, a_gl;
2514     int         *zone2cg, *zone_ncg1, *index_gl, *gatindex;
2515     gmx_ga2la_t *ga2la;
2516     char        *bLocalCG;
2517     gmx_bool     bCGs;
2518
2519     bLocalCG = dd->comm->bLocalCG;
2520
2521     if (dd->nat_tot > dd->gatindex_nalloc)
2522     {
2523         dd->gatindex_nalloc = over_alloc_dd(dd->nat_tot);
2524         srenew(dd->gatindex, dd->gatindex_nalloc);
2525     }
2526
2527     nzone      = dd->comm->zones.n;
2528     zone2cg    = dd->comm->zones.cg_range;
2529     zone_ncg1  = dd->comm->zone_ncg1;
2530     index_gl   = dd->index_gl;
2531     gatindex   = dd->gatindex;
2532     bCGs       = dd->comm->bCGs;
2533
2534     if (zone2cg[1] != dd->ncg_home)
2535     {
2536         gmx_incons("dd->ncg_zone is not up to date");
2537     }
2538
2539     /* Make the local to global and global to local atom index */
2540     a = dd->cgindex[cg_start];
2541     for (zone = 0; zone < nzone; zone++)
2542     {
2543         if (zone == 0)
2544         {
2545             cg0 = cg_start;
2546         }
2547         else
2548         {
2549             cg0 = zone2cg[zone];
2550         }
2551         cg1    = zone2cg[zone+1];
2552         cg1_p1 = cg0 + zone_ncg1[zone];
2553
2554         for (cg = cg0; cg < cg1; cg++)
2555         {
2556             zone1 = zone;
2557             if (cg >= cg1_p1)
2558             {
2559                 /* Signal that this cg is from more than one pulse away */
2560                 zone1 += nzone;
2561             }
2562             cg_gl = index_gl[cg];
2563             if (bCGs)
2564             {
2565                 for (a_gl = gcgs_index[cg_gl]; a_gl < gcgs_index[cg_gl+1]; a_gl++)
2566                 {
2567                     gatindex[a] = a_gl;
2568                     ga2la_set(dd->ga2la, a_gl, a, zone1);
2569                     a++;
2570                 }
2571             }
2572             else
2573             {
2574                 gatindex[a] = cg_gl;
2575                 ga2la_set(dd->ga2la, cg_gl, a, zone1);
2576                 a++;
2577             }
2578         }
2579     }
2580 }
2581
2582 static int check_bLocalCG(gmx_domdec_t *dd, int ncg_sys, const char *bLocalCG,
2583                           const char *where)
2584 {
2585     int ncg, i, ngl, nerr;
2586
2587     nerr = 0;
2588     if (bLocalCG == NULL)
2589     {
2590         return nerr;
2591     }
2592     for (i = 0; i < dd->ncg_tot; i++)
2593     {
2594         if (!bLocalCG[dd->index_gl[i]])
2595         {
2596             fprintf(stderr,
2597                     "DD rank %d, %s: cg %d, global cg %d is not marked in bLocalCG (ncg_home %d)\n", dd->rank, where, i+1, dd->index_gl[i]+1, dd->ncg_home);
2598             nerr++;
2599         }
2600     }
2601     ngl = 0;
2602     for (i = 0; i < ncg_sys; i++)
2603     {
2604         if (bLocalCG[i])
2605         {
2606             ngl++;
2607         }
2608     }
2609     if (ngl != dd->ncg_tot)
2610     {
2611         fprintf(stderr, "DD rank %d, %s: In bLocalCG %d cgs are marked as local, whereas there are %d\n", dd->rank, where, ngl, dd->ncg_tot);
2612         nerr++;
2613     }
2614
2615     return nerr;
2616 }
2617
2618 static void check_index_consistency(gmx_domdec_t *dd,
2619                                     int natoms_sys, int ncg_sys,
2620                                     const char *where)
2621 {
2622     int   nerr, ngl, i, a, cell;
2623     int  *have;
2624
2625     nerr = 0;
2626
2627     if (dd->comm->DD_debug > 1)
2628     {
2629         snew(have, natoms_sys);
2630         for (a = 0; a < dd->nat_tot; a++)
2631         {
2632             if (have[dd->gatindex[a]] > 0)
2633             {
2634                 fprintf(stderr, "DD rank %d: global atom %d occurs twice: index %d and %d\n", dd->rank, dd->gatindex[a]+1, have[dd->gatindex[a]], a+1);
2635             }
2636             else
2637             {
2638                 have[dd->gatindex[a]] = a + 1;
2639             }
2640         }
2641         sfree(have);
2642     }
2643
2644     snew(have, dd->nat_tot);
2645
2646     ngl  = 0;
2647     for (i = 0; i < natoms_sys; i++)
2648     {
2649         if (ga2la_get(dd->ga2la, i, &a, &cell))
2650         {
2651             if (a >= dd->nat_tot)
2652             {
2653                 fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which is larger than nat_tot (%d)\n", dd->rank, i+1, a+1, dd->nat_tot);
2654                 nerr++;
2655             }
2656             else
2657             {
2658                 have[a] = 1;
2659                 if (dd->gatindex[a] != i)
2660                 {
2661                     fprintf(stderr, "DD rank %d: global atom %d marked as local atom %d, which has global atom index %d\n", dd->rank, i+1, a+1, dd->gatindex[a]+1);
2662                     nerr++;
2663                 }
2664             }
2665             ngl++;
2666         }
2667     }
2668     if (ngl != dd->nat_tot)
2669     {
2670         fprintf(stderr,
2671                 "DD rank %d, %s: %d global atom indices, %d local atoms\n",
2672                 dd->rank, where, ngl, dd->nat_tot);
2673     }
2674     for (a = 0; a < dd->nat_tot; a++)
2675     {
2676         if (have[a] == 0)
2677         {
2678             fprintf(stderr,
2679                     "DD rank %d, %s: local atom %d, global %d has no global index\n",
2680                     dd->rank, where, a+1, dd->gatindex[a]+1);
2681         }
2682     }
2683     sfree(have);
2684
2685     nerr += check_bLocalCG(dd, ncg_sys, dd->comm->bLocalCG, where);
2686
2687     if (nerr > 0)
2688     {
2689         gmx_fatal(FARGS, "DD rank %d, %s: %d atom/cg index inconsistencies",
2690                   dd->rank, where, nerr);
2691     }
2692 }
2693
2694 static void clear_dd_indices(gmx_domdec_t *dd, int cg_start, int a_start)
2695 {
2696     int   i;
2697     char *bLocalCG;
2698
2699     if (a_start == 0)
2700     {
2701         /* Clear the whole list without searching */
2702         ga2la_clear(dd->ga2la);
2703     }
2704     else
2705     {
2706         for (i = a_start; i < dd->nat_tot; i++)
2707         {
2708             ga2la_del(dd->ga2la, dd->gatindex[i]);
2709         }
2710     }
2711
2712     bLocalCG = dd->comm->bLocalCG;
2713     if (bLocalCG)
2714     {
2715         for (i = cg_start; i < dd->ncg_tot; i++)
2716         {
2717             bLocalCG[dd->index_gl[i]] = FALSE;
2718         }
2719     }
2720
2721     dd_clear_local_vsite_indices(dd);
2722
2723     if (dd->constraints)
2724     {
2725         dd_clear_local_constraint_indices(dd);
2726     }
2727 }
2728
2729 /* This function should be used for moving the domain boudaries during DLB,
2730  * for obtaining the minimum cell size. It checks the initially set limit
2731  * comm->cellsize_min, for bonded and initial non-bonded cut-offs,
2732  * and, possibly, a longer cut-off limit set for PME load balancing.
2733  */
2734 static real cellsize_min_dlb(gmx_domdec_comm_t *comm, int dim_ind, int dim)
2735 {
2736     real cellsize_min;
2737
2738     cellsize_min = comm->cellsize_min[dim];
2739
2740     if (!comm->bVacDLBNoLimit)
2741     {
2742         /* The cut-off might have changed, e.g. by PME load balacning,
2743          * from the value used to set comm->cellsize_min, so check it.
2744          */
2745         cellsize_min = max(cellsize_min, comm->cutoff/comm->cd[dim_ind].np_dlb);
2746
2747         if (comm->bPMELoadBalDLBLimits)
2748         {
2749             /* Check for the cut-off limit set by the PME load balancing */
2750             cellsize_min = max(cellsize_min, comm->PMELoadBal_max_cutoff/comm->cd[dim_ind].np_dlb);
2751         }
2752     }
2753
2754     return cellsize_min;
2755 }
2756
2757 static real grid_jump_limit(gmx_domdec_comm_t *comm, real cutoff,
2758                             int dim_ind)
2759 {
2760     real grid_jump_limit;
2761
2762     /* The distance between the boundaries of cells at distance
2763      * x+-1,y+-1 or y+-1,z+-1 is limited by the cut-off restrictions
2764      * and by the fact that cells should not be shifted by more than
2765      * half their size, such that cg's only shift by one cell
2766      * at redecomposition.
2767      */
2768     grid_jump_limit = comm->cellsize_limit;
2769     if (!comm->bVacDLBNoLimit)
2770     {
2771         if (comm->bPMELoadBalDLBLimits)
2772         {
2773             cutoff = max(cutoff, comm->PMELoadBal_max_cutoff);
2774         }
2775         grid_jump_limit = max(grid_jump_limit,
2776                               cutoff/comm->cd[dim_ind].np);
2777     }
2778
2779     return grid_jump_limit;
2780 }
2781
2782 static gmx_bool check_grid_jump(gmx_int64_t     step,
2783                                 gmx_domdec_t   *dd,
2784                                 real            cutoff,
2785                                 gmx_ddbox_t    *ddbox,
2786                                 gmx_bool        bFatal)
2787 {
2788     gmx_domdec_comm_t *comm;
2789     int                d, dim;
2790     real               limit, bfac;
2791     gmx_bool           bInvalid;
2792
2793     bInvalid = FALSE;
2794
2795     comm = dd->comm;
2796
2797     for (d = 1; d < dd->ndim; d++)
2798     {
2799         dim   = dd->dim[d];
2800         limit = grid_jump_limit(comm, cutoff, d);
2801         bfac  = ddbox->box_size[dim];
2802         if (ddbox->tric_dir[dim])
2803         {
2804             bfac *= ddbox->skew_fac[dim];
2805         }
2806         if ((comm->cell_f1[d] - comm->cell_f_max0[d])*bfac <  limit ||
2807                                                               (comm->cell_f0[d] - comm->cell_f_min1[d])*bfac > -limit)
2808         {
2809             bInvalid = TRUE;
2810
2811             if (bFatal)
2812             {
2813                 char buf[22];
2814
2815                 /* This error should never be triggered under normal
2816                  * circumstances, but you never know ...
2817                  */
2818                 gmx_fatal(FARGS, "Step %s: The domain decomposition grid has shifted too much in the %c-direction around cell %d %d %d. This should not have happened. Running with fewer ranks might avoid this issue.",
2819                           gmx_step_str(step, buf),
2820                           dim2char(dim), dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
2821             }
2822         }
2823     }
2824
2825     return bInvalid;
2826 }
2827
2828 static int dd_load_count(gmx_domdec_comm_t *comm)
2829 {
2830     return (comm->eFlop ? comm->flop_n : comm->cycl_n[ddCyclF]);
2831 }
2832
2833 static float dd_force_load(gmx_domdec_comm_t *comm)
2834 {
2835     float load;
2836
2837     if (comm->eFlop)
2838     {
2839         load = comm->flop;
2840         if (comm->eFlop > 1)
2841         {
2842             load *= 1.0 + (comm->eFlop - 1)*(0.1*rand()/RAND_MAX - 0.05);
2843         }
2844     }
2845     else
2846     {
2847         load = comm->cycl[ddCyclF];
2848         if (comm->cycl_n[ddCyclF] > 1)
2849         {
2850             /* Subtract the maximum of the last n cycle counts
2851              * to get rid of possible high counts due to other sources,
2852              * for instance system activity, that would otherwise
2853              * affect the dynamic load balancing.
2854              */
2855             load -= comm->cycl_max[ddCyclF];
2856         }
2857
2858 #ifdef GMX_MPI
2859         if (comm->cycl_n[ddCyclWaitGPU] && comm->nrank_gpu_shared > 1)
2860         {
2861             float gpu_wait, gpu_wait_sum;
2862
2863             gpu_wait = comm->cycl[ddCyclWaitGPU];
2864             if (comm->cycl_n[ddCyclF] > 1)
2865             {
2866                 /* We should remove the WaitGPU time of the same MD step
2867                  * as the one with the maximum F time, since the F time
2868                  * and the wait time are not independent.
2869                  * Furthermore, the step for the max F time should be chosen
2870                  * the same on all ranks that share the same GPU.
2871                  * But to keep the code simple, we remove the average instead.
2872                  * The main reason for artificially long times at some steps
2873                  * is spurious CPU activity or MPI time, so we don't expect
2874                  * that changes in the GPU wait time matter a lot here.
2875                  */
2876                 gpu_wait *= (comm->cycl_n[ddCyclF] - 1)/(float)comm->cycl_n[ddCyclF];
2877             }
2878             /* Sum the wait times over the ranks that share the same GPU */
2879             MPI_Allreduce(&gpu_wait, &gpu_wait_sum, 1, MPI_FLOAT, MPI_SUM,
2880                           comm->mpi_comm_gpu_shared);
2881             /* Replace the wait time by the average over the ranks */
2882             load += -gpu_wait + gpu_wait_sum/comm->nrank_gpu_shared;
2883         }
2884 #endif
2885     }
2886
2887     return load;
2888 }
2889
2890 static void set_slb_pme_dim_f(gmx_domdec_t *dd, int dim, real **dim_f)
2891 {
2892     gmx_domdec_comm_t *comm;
2893     int                i;
2894
2895     comm = dd->comm;
2896
2897     snew(*dim_f, dd->nc[dim]+1);
2898     (*dim_f)[0] = 0;
2899     for (i = 1; i < dd->nc[dim]; i++)
2900     {
2901         if (comm->slb_frac[dim])
2902         {
2903             (*dim_f)[i] = (*dim_f)[i-1] + comm->slb_frac[dim][i-1];
2904         }
2905         else
2906         {
2907             (*dim_f)[i] = (real)i/(real)dd->nc[dim];
2908         }
2909     }
2910     (*dim_f)[dd->nc[dim]] = 1;
2911 }
2912
2913 static void init_ddpme(gmx_domdec_t *dd, gmx_ddpme_t *ddpme, int dimind)
2914 {
2915     int  pmeindex, slab, nso, i;
2916     ivec xyz;
2917
2918     if (dimind == 0 && dd->dim[0] == YY && dd->comm->npmenodes_x == 1)
2919     {
2920         ddpme->dim = YY;
2921     }
2922     else
2923     {
2924         ddpme->dim = dimind;
2925     }
2926     ddpme->dim_match = (ddpme->dim == dd->dim[dimind]);
2927
2928     ddpme->nslab = (ddpme->dim == 0 ?
2929                     dd->comm->npmenodes_x :
2930                     dd->comm->npmenodes_y);
2931
2932     if (ddpme->nslab <= 1)
2933     {
2934         return;
2935     }
2936
2937     nso = dd->comm->npmenodes/ddpme->nslab;
2938     /* Determine for each PME slab the PP location range for dimension dim */
2939     snew(ddpme->pp_min, ddpme->nslab);
2940     snew(ddpme->pp_max, ddpme->nslab);
2941     for (slab = 0; slab < ddpme->nslab; slab++)
2942     {
2943         ddpme->pp_min[slab] = dd->nc[dd->dim[dimind]] - 1;
2944         ddpme->pp_max[slab] = 0;
2945     }
2946     for (i = 0; i < dd->nnodes; i++)
2947     {
2948         ddindex2xyz(dd->nc, i, xyz);
2949         /* For y only use our y/z slab.
2950          * This assumes that the PME x grid size matches the DD grid size.
2951          */
2952         if (dimind == 0 || xyz[XX] == dd->ci[XX])
2953         {
2954             pmeindex = ddindex2pmeindex(dd, i);
2955             if (dimind == 0)
2956             {
2957                 slab = pmeindex/nso;
2958             }
2959             else
2960             {
2961                 slab = pmeindex % ddpme->nslab;
2962             }
2963             ddpme->pp_min[slab] = min(ddpme->pp_min[slab], xyz[dimind]);
2964             ddpme->pp_max[slab] = max(ddpme->pp_max[slab], xyz[dimind]);
2965         }
2966     }
2967
2968     set_slb_pme_dim_f(dd, ddpme->dim, &ddpme->slb_dim_f);
2969 }
2970
2971 int dd_pme_maxshift_x(gmx_domdec_t *dd)
2972 {
2973     if (dd->comm->ddpme[0].dim == XX)
2974     {
2975         return dd->comm->ddpme[0].maxshift;
2976     }
2977     else
2978     {
2979         return 0;
2980     }
2981 }
2982
2983 int dd_pme_maxshift_y(gmx_domdec_t *dd)
2984 {
2985     if (dd->comm->ddpme[0].dim == YY)
2986     {
2987         return dd->comm->ddpme[0].maxshift;
2988     }
2989     else if (dd->comm->npmedecompdim >= 2 && dd->comm->ddpme[1].dim == YY)
2990     {
2991         return dd->comm->ddpme[1].maxshift;
2992     }
2993     else
2994     {
2995         return 0;
2996     }
2997 }
2998
2999 static void set_pme_maxshift(gmx_domdec_t *dd, gmx_ddpme_t *ddpme,
3000                              gmx_bool bUniform, gmx_ddbox_t *ddbox, real *cell_f)
3001 {
3002     gmx_domdec_comm_t *comm;
3003     int                nc, ns, s;
3004     int               *xmin, *xmax;
3005     real               range, pme_boundary;
3006     int                sh;
3007
3008     comm = dd->comm;
3009     nc   = dd->nc[ddpme->dim];
3010     ns   = ddpme->nslab;
3011
3012     if (!ddpme->dim_match)
3013     {
3014         /* PP decomposition is not along dim: the worst situation */
3015         sh = ns/2;
3016     }
3017     else if (ns <= 3 || (bUniform && ns == nc))
3018     {
3019         /* The optimal situation */
3020         sh = 1;
3021     }
3022     else
3023     {
3024         /* We need to check for all pme nodes which nodes they
3025          * could possibly need to communicate with.
3026          */
3027         xmin = ddpme->pp_min;
3028         xmax = ddpme->pp_max;
3029         /* Allow for atoms to be maximally 2/3 times the cut-off
3030          * out of their DD cell. This is a reasonable balance between
3031          * between performance and support for most charge-group/cut-off
3032          * combinations.
3033          */
3034         range  = 2.0/3.0*comm->cutoff/ddbox->box_size[ddpme->dim];
3035         /* Avoid extra communication when we are exactly at a boundary */
3036         range *= 0.999;
3037
3038         sh = 1;
3039         for (s = 0; s < ns; s++)
3040         {
3041             /* PME slab s spreads atoms between box frac. s/ns and (s+1)/ns */
3042             pme_boundary = (real)s/ns;
3043             while (sh+1 < ns &&
3044                    ((s-(sh+1) >= 0 &&
3045                      cell_f[xmax[s-(sh+1)   ]+1]     + range > pme_boundary) ||
3046                     (s-(sh+1) <  0 &&
3047                      cell_f[xmax[s-(sh+1)+ns]+1] - 1 + range > pme_boundary)))
3048             {
3049                 sh++;
3050             }
3051             pme_boundary = (real)(s+1)/ns;
3052             while (sh+1 < ns &&
3053                    ((s+(sh+1) <  ns &&
3054                      cell_f[xmin[s+(sh+1)   ]  ]     - range < pme_boundary) ||
3055                     (s+(sh+1) >= ns &&
3056                      cell_f[xmin[s+(sh+1)-ns]  ] + 1 - range < pme_boundary)))
3057             {
3058                 sh++;
3059             }
3060         }
3061     }
3062
3063     ddpme->maxshift = sh;
3064
3065     if (debug)
3066     {
3067         fprintf(debug, "PME slab communication range for dim %d is %d\n",
3068                 ddpme->dim, ddpme->maxshift);
3069     }
3070 }
3071
3072 static void check_box_size(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3073 {
3074     int d, dim;
3075
3076     for (d = 0; d < dd->ndim; d++)
3077     {
3078         dim = dd->dim[d];
3079         if (dim < ddbox->nboundeddim &&
3080             ddbox->box_size[dim]*ddbox->skew_fac[dim] <
3081             dd->nc[dim]*dd->comm->cellsize_limit*DD_CELL_MARGIN)
3082         {
3083             gmx_fatal(FARGS, "The %c-size of the box (%f) times the triclinic skew factor (%f) is smaller than the number of DD cells (%d) times the smallest allowed cell size (%f)\n",
3084                       dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
3085                       dd->nc[dim], dd->comm->cellsize_limit);
3086         }
3087     }
3088 }
3089
/* Values for the setmode argument of set_dd_cell_sizes_slb() */
enum {
    /* Store this rank's bounds in comm->cell_x0 and comm->cell_x1 */
    setcellsizeslbLOCAL      = 0,
    /* Store all cell boundaries in dd->ma->cell_x */
    setcellsizeslbMASTER     = 1,
    /* Do not store any cell boundaries */
    setcellsizeslbPULSE_ONLY = 2
};
3093
3094 /* Set the domain boundaries. Use for static (or no) load balancing,
3095  * and also for the starting state for dynamic load balancing.
3096  * setmode determine if and where the boundaries are stored, use enum above.
3097  * Returns the number communication pulses in npulse.
3098  */
3099 static void set_dd_cell_sizes_slb(gmx_domdec_t *dd, gmx_ddbox_t *ddbox,
3100                                   int setmode, ivec npulse)
3101 {
3102     gmx_domdec_comm_t *comm;
3103     int                d, j;
3104     rvec               cellsize_min;
3105     real              *cell_x, cell_dx, cellsize;
3106
3107     comm = dd->comm;
3108
3109     for (d = 0; d < DIM; d++)
3110     {
3111         cellsize_min[d] = ddbox->box_size[d]*ddbox->skew_fac[d];
3112         npulse[d]       = 1;
3113         if (dd->nc[d] == 1 || comm->slb_frac[d] == NULL)
3114         {
3115             /* Uniform grid */
3116             cell_dx = ddbox->box_size[d]/dd->nc[d];
3117             switch (setmode)
3118             {
3119                 case setcellsizeslbMASTER:
3120                     for (j = 0; j < dd->nc[d]+1; j++)
3121                     {
3122                         dd->ma->cell_x[d][j] = ddbox->box0[d] + j*cell_dx;
3123                     }
3124                     break;
3125                 case setcellsizeslbLOCAL:
3126                     comm->cell_x0[d] = ddbox->box0[d] + (dd->ci[d]  )*cell_dx;
3127                     comm->cell_x1[d] = ddbox->box0[d] + (dd->ci[d]+1)*cell_dx;
3128                     break;
3129                 default:
3130                     break;
3131             }
3132             cellsize = cell_dx*ddbox->skew_fac[d];
3133             while (cellsize*npulse[d] < comm->cutoff)
3134             {
3135                 npulse[d]++;
3136             }
3137             cellsize_min[d] = cellsize;
3138         }
3139         else
3140         {
3141             /* Statically load balanced grid */
3142             /* Also when we are not doing a master distribution we determine
3143              * all cell borders in a loop to obtain identical values
3144              * to the master distribution case and to determine npulse.
3145              */
3146             if (setmode == setcellsizeslbMASTER)
3147             {
3148                 cell_x = dd->ma->cell_x[d];
3149             }
3150             else
3151             {
3152                 snew(cell_x, dd->nc[d]+1);
3153             }
3154             cell_x[0] = ddbox->box0[d];
3155             for (j = 0; j < dd->nc[d]; j++)
3156             {
3157                 cell_dx     = ddbox->box_size[d]*comm->slb_frac[d][j];
3158                 cell_x[j+1] = cell_x[j] + cell_dx;
3159                 cellsize    = cell_dx*ddbox->skew_fac[d];
3160                 while (cellsize*npulse[d] < comm->cutoff &&
3161                        npulse[d] < dd->nc[d]-1)
3162                 {
3163                     npulse[d]++;
3164                 }
3165                 cellsize_min[d] = min(cellsize_min[d], cellsize);
3166             }
3167             if (setmode == setcellsizeslbLOCAL)
3168             {
3169                 comm->cell_x0[d] = cell_x[dd->ci[d]];
3170                 comm->cell_x1[d] = cell_x[dd->ci[d]+1];
3171             }
3172             if (setmode != setcellsizeslbMASTER)
3173             {
3174                 sfree(cell_x);
3175             }
3176         }
3177         /* The following limitation is to avoid that a cell would receive
3178          * some of its own home charge groups back over the periodic boundary.
3179          * Double charge groups cause trouble with the global indices.
3180          */
3181         if (d < ddbox->npbcdim &&
3182             dd->nc[d] > 1 && npulse[d] >= dd->nc[d])
3183         {
3184             char error_string[STRLEN];
3185
3186             sprintf(error_string,
3187                     "The box size in direction %c (%f) times the triclinic skew factor (%f) is too small for a cut-off of %f with %d domain decomposition cells, use 1 or more than %d %s or increase the box size in this direction",
3188                     dim2char(d), ddbox->box_size[d], ddbox->skew_fac[d],
3189                     comm->cutoff,
3190                     dd->nc[d], dd->nc[d],
3191                     dd->nnodes > dd->nc[d] ? "cells" : "ranks");
3192
3193             if (setmode == setcellsizeslbLOCAL)
3194             {
3195                 gmx_fatal_collective(FARGS, NULL, dd, error_string);
3196             }
3197             else
3198             {
3199                 gmx_fatal(FARGS, error_string);
3200             }
3201         }
3202     }
3203
3204     if (!comm->bDynLoadBal)
3205     {
3206         copy_rvec(cellsize_min, comm->cellsize_min);
3207     }
3208
3209     for (d = 0; d < comm->npmedecompdim; d++)
3210     {
3211         set_pme_maxshift(dd, &comm->ddpme[d],
3212                          comm->slb_frac[dd->dim[d]] == NULL, ddbox,
3213                          comm->ddpme[d].slb_dim_f);
3214     }
3215 }
3216
3217
/* Enforces the minimum cell size and, for d > 0, the staggering bounds on
 * the relative cell boundaries root->cell_f for the cells with index in
 * [range[0], range[1]) along Cartesian dimension dim (DD dimension index d).
 *
 * On entry root->buf_ncd holds the desired relative cell sizes, and
 * root->cell_f[range[0]] and root->cell_f[range[1]] delimit the region.
 * The sizes are rescaled to fill the region, cells that would become
 * smaller than cellsize_limit_f are set to that limit, and the boundaries
 * root->cell_f[range[0]+1..range[1]] are rebuilt from the sizes.
 * With !bUniform, boundaries are additionally kept from moving more than
 * halfway into the old neighboring cells (root->old_cell_f).
 * For d > 0 the staggering data is handled: for a uniform grid
 * root->cell_f_max0 and root->cell_f_min1 are set, otherwise boundaries that
 * violate root->bound_min / root->bound_max are set to their bound and the
 * function calls itself recursively on the resulting sub-ranges.
 *
 * Sets root->bLimited. A fatal error (reporting step) is issued when, with
 * PBC along dim, the last cell of the range ends up below the size limit.
 */
static void dd_cell_sizes_dlb_root_enforce_limits(gmx_domdec_t *dd,
                                                  int d, int dim, gmx_domdec_root_t *root,
                                                  gmx_ddbox_t *ddbox,
                                                  gmx_bool bUniform, gmx_int64_t step, real cellsize_limit_f, int range[])
{
    gmx_domdec_comm_t *comm;
    int                ncd, i, j, nmin, nmin_old;
    gmx_bool           bLimLo, bLimHi;
    real              *cell_size;
    real               fac, halfway, cellsize_limit_f_i, region_size;
    gmx_bool           bPBC, bLastHi = FALSE;
    int                nrange[] = {range[0], range[1]};

    /* The box fraction available to the cells in this range */
    region_size = root->cell_f[range[1]]-root->cell_f[range[0]];

    comm = dd->comm;

    ncd = dd->nc[dim];

    bPBC = (dim < ddbox->npbcdim);

    cell_size = root->buf_ncd;

    if (debug)
    {
        fprintf(debug, "enforce_limits: %d %d\n", range[0], range[1]);
    }

    /* First we need to check if the scaling does not make cells
     * smaller than the smallest allowed size.
     * We need to do this iteratively, since if a cell is too small,
     * it needs to be enlarged, which makes all the other cells smaller,
     * which could in turn make another cell smaller than allowed.
     */
    for (i = range[0]; i < range[1]; i++)
    {
        root->bCellMin[i] = FALSE;
    }
    nmin = 0;
    do
    {
        nmin_old = nmin;
        /* We need the total for normalization */
        fac = 0;
        for (i = range[0]; i < range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                fac += cell_size[i];
            }
        }
        fac = ( region_size - nmin*cellsize_limit_f)/fac; /* subtracting cells already set to cellsize_limit_f */
        /* Determine the cell boundaries */
        for (i = range[0]; i < range[1]; i++)
        {
            if (root->bCellMin[i] == FALSE)
            {
                cell_size[i] *= fac;
                /* Without PBC the first and last cell of the whole row
                 * have no minimum size.
                 */
                if (!bPBC && (i == 0 || i == dd->nc[dim] -1))
                {
                    cellsize_limit_f_i = 0;
                }
                else
                {
                    cellsize_limit_f_i = cellsize_limit_f;
                }
                if (cell_size[i] < cellsize_limit_f_i)
                {
                    root->bCellMin[i] = TRUE;
                    cell_size[i]      = cellsize_limit_f_i;
                    nmin++;
                }
            }
            root->cell_f[i+1] = root->cell_f[i] + cell_size[i];
        }
    }
    while (nmin > nmin_old); /* repeat until no additional cell hit the limit */

    i            = range[1]-1;
    cell_size[i] = root->cell_f[i+1] - root->cell_f[i];
    /* For this check we should not use DD_CELL_MARGIN,
     * but a slightly smaller factor,
     * since rounding could get us below the limit.
     */
    if (bPBC && cell_size[i] < cellsize_limit_f*DD_CELL_MARGIN2/DD_CELL_MARGIN)
    {
        char buf[22];
        gmx_fatal(FARGS, "Step %s: the dynamic load balancing could not balance dimension %c: box size %f, triclinic skew factor %f, #cells %d, minimum cell size %f\n",
                  gmx_step_str(step, buf),
                  dim2char(dim), ddbox->box_size[dim], ddbox->skew_fac[dim],
                  ncd, comm->cellsize_min[dim]);
    }

    /* Limited when a cell hit the minimum size or when only a sub-range
     * of the row was processed.
     */
    root->bLimited = (nmin > 0) || (range[0] > 0) || (range[1] < ncd);

    if (!bUniform)
    {
        /* Check if the boundary did not displace more than halfway
         * each of the cells it bounds, as this could cause problems,
         * especially when the differences between cell sizes are large.
         * If changes are applied, they will not make cells smaller
         * than the cut-off, as we check all the boundaries which
         * might be affected by a change and if the old state was ok,
         * the cells will at most be shrunk back to their old size.
         */
        for (i = range[0]+1; i < range[1]; i++)
        {
            /* Boundary i should not move below the middle of old cell i-1 */
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i-1]);
            if (root->cell_f[i] < halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the next boundaries */
                for (j = i+1; j < range[1]; j++)
                {
                    if (root->cell_f[j] < root->cell_f[j-1] + cellsize_limit_f)
                    {
                        root->cell_f[j] =  root->cell_f[j-1] + cellsize_limit_f;
                    }
                }
            }
            /* Boundary i should not move above the middle of old cell i */
            halfway = 0.5*(root->old_cell_f[i] + root->old_cell_f[i+1]);
            if (root->cell_f[i] > halfway)
            {
                root->cell_f[i] = halfway;
                /* Check if the change also causes shifts of the previous boundaries */
                for (j = i-1; j >= range[0]+1; j--)
                {
                    if (root->cell_f[j] > root->cell_f[j+1] - cellsize_limit_f)
                    {
                        root->cell_f[j] = root->cell_f[j+1] - cellsize_limit_f;
                    }
                }
            }
        }
    }

    /* nrange is defined as [lower, upper) range for new call to enforce_limits */
    /* Find the highest violation of LimLo (a) and the following violation of LimHi (thus the lowest following) (b),
     * then call enforce_limits for (oldb,a), (a,b). In the next step: (b,nexta). oldb and nexta can be the boundaries.
     * For a and b nrange is used. */
    if (d > 0)
    {
        /* Take care of the staggering of the cell boundaries */
        if (bUniform)
        {
            for (i = range[0]; i < range[1]; i++)
            {
                root->cell_f_max0[i] = root->cell_f[i];
                root->cell_f_min1[i] = root->cell_f[i+1];
            }
        }
        else
        {
            for (i = range[0]+1; i < range[1]; i++)
            {
                bLimLo = (root->cell_f[i] < root->bound_min[i]);
                bLimHi = (root->cell_f[i] > root->bound_max[i]);
                if (bLimLo && bLimHi)
                {
                    /* Both limits violated, try the best we can */
                    /* For this case we split the original range (range) in two parts and care about the other limitations in the next iteration. */
                    root->cell_f[i] = 0.5*(root->bound_min[i] + root->bound_max[i]);
                    nrange[0]       = range[0];
                    nrange[1]       = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    nrange[0] = i;
                    nrange[1] = range[1];
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);

                    return;
                }
                else if (bLimLo)
                {
                    /* root->cell_f[i] = root->bound_min[i]; */
                    nrange[1] = i;  /* only store violation location. There could be a LimLo violation following with a higher index */
                    bLastHi   = FALSE;
                }
                else if (bLimHi && !bLastHi)
                {
                    bLastHi = TRUE;
                    if (nrange[1] < range[1])   /* found a LimLo before */
                    {
                        root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                        dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                        nrange[0] = nrange[1];
                    }
                    root->cell_f[i] = root->bound_max[i];
                    nrange[1]       = i;
                    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                    /* Continue with the remainder of the range */
                    nrange[0] = i;
                    nrange[1] = range[1];
                }
            }
            if (nrange[1] < range[1])   /* the last violation found was a LimLo */
            {
                root->cell_f[nrange[1]] = root->bound_min[nrange[1]];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
                nrange[0] = nrange[1];
                nrange[1] = range[1];
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
            else if (nrange[0] > range[0]) /* found at least one LimHi */
            {
                dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, nrange);
            }
        }
    }
}
3427
3428
/* Determines the new relative cell boundaries root->cell_f for one row of
 * cells along Cartesian dimension dim (DD dimension index d).
 *
 * The current boundaries are saved in root->old_cell_f. The desired cell
 * sizes (root->buf_ncd) are either uniform (bUniform) or, when load data is
 * available, the current sizes scaled by the relative load imbalance with
 * underrelaxation, where the largest change is limited to
 * comm->dlb_scale_lim percent. The minimum cell size and the staggering
 * limits are then enforced by dd_cell_sizes_dlb_root_enforce_limits().
 *
 * After the ncd+1 boundaries, root->cell_f is filled with the cell bounds
 * of the lower dimensions and the PME maximum shifts.
 * bDynamicBox adds an extra margin to the staggering distance for d > 0.
 * step is only used in messages.
 */
static void set_dd_cell_sizes_dlb_root(gmx_domdec_t *dd,
                                       int d, int dim, gmx_domdec_root_t *root,
                                       gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
                                       gmx_bool bUniform, gmx_int64_t step)
{
    gmx_domdec_comm_t *comm;
    int                ncd, d1, i, j, pos;
    real              *cell_size;
    real               load_aver, load_i, imbalance, change, change_max, sc;
    real               cellsize_limit_f, dist_min_f, dist_min_f_hard, space;
    real               change_limit;
    real               relax = 0.5;
    gmx_bool           bPBC;
    int                range[] = { 0, 0 };

    comm = dd->comm;

    /* Convert the maximum change from the input percentage to a fraction */
    change_limit = comm->dlb_scale_lim*0.01;

    ncd = dd->nc[dim];

    bPBC = (dim < ddbox->npbcdim);

    cell_size = root->buf_ncd;

    /* Store the original boundaries */
    for (i = 0; i < ncd+1; i++)
    {
        root->old_cell_f[i] = root->cell_f[i];
    }
    /* NOTE(review): when bUniform is FALSE and dd_load_count() returns 0,
     * neither branch below writes cell_size; it then keeps the contents
     * root->buf_ncd had on entry - confirm this is intended.
     */
    if (bUniform)
    {
        for (i = 0; i < ncd; i++)
        {
            cell_size[i] = 1.0/ncd;
        }
    }
    else if (dd_load_count(comm) > 0)
    {
        load_aver  = comm->load[d].sum_m/ncd;
        change_max = 0;
        /* First pass: determine the largest absolute change */
        for (i = 0; i < ncd; i++)
        {
            /* Determine the relative imbalance of cell i */
            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
            /* Determine the change of the cell size using underrelaxation */
            change     = -relax*imbalance;
            change_max = max(change_max, max(change, -change));
        }
        /* Limit the amount of scaling.
         * We need to use the same rescaling for all cells in one row,
         * otherwise the load balancing might not converge.
         */
        sc = relax;
        if (change_max > change_limit)
        {
            sc *= change_limit/change_max;
        }
        /* Second pass: apply the (possibly reduced) scaling */
        for (i = 0; i < ncd; i++)
        {
            /* Determine the relative imbalance of cell i */
            load_i    = comm->load[d].load[i*comm->load[d].nload+2];
            imbalance = (load_i - load_aver)/(load_aver > 0 ? load_aver : 1);
            /* Determine the change of the cell size using underrelaxation */
            change       = -sc*imbalance;
            cell_size[i] = (root->cell_f[i+1]-root->cell_f[i])*(1 + change);
        }
    }

    /* Convert the size and distance limits to box fractions */
    cellsize_limit_f  = cellsize_min_dlb(comm, d, dim)/ddbox->box_size[dim];
    cellsize_limit_f *= DD_CELL_MARGIN;
    dist_min_f_hard   = grid_jump_limit(comm, comm->cutoff, d)/ddbox->box_size[dim];
    dist_min_f        = dist_min_f_hard * DD_CELL_MARGIN;
    if (ddbox->tric_dir[dim])
    {
        cellsize_limit_f /= ddbox->skew_fac[dim];
        dist_min_f       /= ddbox->skew_fac[dim];
    }
    if (bDynamicBox && d > 0)
    {
        dist_min_f *= DD_PRES_SCALE_MARGIN;
    }
    if (d > 0 && !bUniform)
    {
        /* Make sure that the grid is not shifted too much */
        for (i = 1; i < ncd; i++)
        {
            if (root->cell_f_min1[i] - root->cell_f_max0[i-1] < 2 * dist_min_f_hard)
            {
                gmx_incons("Inconsistent DD boundary staggering limits!");
            }
            /* The lower bound lies halfway between the hard limit and
             * the current boundary, when the boundary is above the limit.
             */
            root->bound_min[i] = root->cell_f_max0[i-1] + dist_min_f;
            space              = root->cell_f[i] - (root->cell_f_max0[i-1] + dist_min_f);
            if (space > 0)
            {
                root->bound_min[i] += 0.5*space;
            }
            /* The upper bound is set analogously from the other side */
            root->bound_max[i] = root->cell_f_min1[i] - dist_min_f;
            space              = root->cell_f[i] - (root->cell_f_min1[i] - dist_min_f);
            if (space < 0)
            {
                root->bound_max[i] += 0.5*space;
            }
            if (debug)
            {
                fprintf(debug,
                        "dim %d boundary %d %.3f < %.3f < %.3f < %.3f < %.3f\n",
                        d, i,
                        root->cell_f_max0[i-1] + dist_min_f,
                        root->bound_min[i], root->cell_f[i], root->bound_max[i],
                        root->cell_f_min1[i] - dist_min_f);
            }
        }
    }
    /* Enforce the limits on the complete row: cells [0, ncd) */
    range[1]          = ncd;
    root->cell_f[0]   = 0;
    root->cell_f[ncd] = 1;
    dd_cell_sizes_dlb_root_enforce_limits(dd, d, dim, root, ddbox, bUniform, step, cellsize_limit_f, range);


    /* After the checks above, the cells should obey the cut-off
     * restrictions, but it does not hurt to check.
     */
    for (i = 0; i < ncd; i++)
    {
        if (debug)
        {
            fprintf(debug, "Relative bounds dim %d  cell %d: %f %f\n",
                    dim, i, root->cell_f[i], root->cell_f[i+1]);
        }

        /* Without PBC the first and last cell are exempt from the check */
        if ((bPBC || (i != 0 && i != dd->nc[dim]-1)) &&
            root->cell_f[i+1] - root->cell_f[i] <
            cellsize_limit_f/DD_CELL_MARGIN)
        {
            char buf[22];
            fprintf(stderr,
                    "\nWARNING step %s: direction %c, cell %d too small: %f\n",
                    gmx_step_str(step, buf), dim2char(dim), i,
                    (root->cell_f[i+1] - root->cell_f[i])
                    *ddbox->box_size[dim]*ddbox->skew_fac[dim]);
        }
    }

    pos = ncd + 1;
    /* Store the cell boundaries of the lower dimensions at the end */
    for (d1 = 0; d1 < d; d1++)
    {
        root->cell_f[pos++] = comm->cell_f0[d1];
        root->cell_f[pos++] = comm->cell_f1[d1];
    }

    if (d < comm->npmedecompdim)
    {
        /* The master determines the maximum shift for
         * the coordinate communication between separate PME nodes.
         */
        set_pme_maxshift(dd, &comm->ddpme[d], bUniform, ddbox, root->cell_f);
    }
    /* Append the maximum shifts, stored as real values, to the buffer */
    root->cell_f[pos++] = comm->ddpme[0].maxshift;
    if (d >= 1)
    {
        root->cell_f[pos++] = comm->ddpme[1].maxshift;
    }
}
3596
3597 static void relative_to_absolute_cell_bounds(gmx_domdec_t *dd,
3598                                              gmx_ddbox_t *ddbox, int dimind)
3599 {
3600     gmx_domdec_comm_t *comm;
3601     int                dim;
3602
3603     comm = dd->comm;
3604
3605     /* Set the cell dimensions */
3606     dim                = dd->dim[dimind];
3607     comm->cell_x0[dim] = comm->cell_f0[dimind]*ddbox->box_size[dim];
3608     comm->cell_x1[dim] = comm->cell_f1[dimind]*ddbox->box_size[dim];
3609     if (dim >= ddbox->nboundeddim)
3610     {
3611         comm->cell_x0[dim] += ddbox->box0[dim];
3612         comm->cell_x1[dim] += ddbox->box0[dim];
3613     }
3614 }
3615
3616 static void distribute_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3617                                          int d, int dim, real *cell_f_row,
3618                                          gmx_ddbox_t *ddbox)
3619 {
3620     gmx_domdec_comm_t *comm;
3621     int                d1, dim1, pos;
3622
3623     comm = dd->comm;
3624
3625 #ifdef GMX_MPI
3626     /* Each node would only need to know two fractions,
3627      * but it is probably cheaper to broadcast the whole array.
3628      */
3629     MPI_Bcast(cell_f_row, DD_CELL_F_SIZE(dd, d)*sizeof(real), MPI_BYTE,
3630               0, comm->mpi_comm_load[d]);
3631 #endif
3632     /* Copy the fractions for this dimension from the buffer */
3633     comm->cell_f0[d] = cell_f_row[dd->ci[dim]  ];
3634     comm->cell_f1[d] = cell_f_row[dd->ci[dim]+1];
3635     /* The whole array was communicated, so set the buffer position */
3636     pos = dd->nc[dim] + 1;
3637     for (d1 = 0; d1 <= d; d1++)
3638     {
3639         if (d1 < d)
3640         {
3641             /* Copy the cell fractions of the lower dimensions */
3642             comm->cell_f0[d1] = cell_f_row[pos++];
3643             comm->cell_f1[d1] = cell_f_row[pos++];
3644         }
3645         relative_to_absolute_cell_bounds(dd, ddbox, d1);
3646     }
3647     /* Convert the communicated shift from float to int */
3648     comm->ddpme[0].maxshift = (int)(cell_f_row[pos++] + 0.5);
3649     if (d >= 1)
3650     {
3651         comm->ddpme[1].maxshift = (int)(cell_f_row[pos++] + 0.5);
3652     }
3653 }
3654
3655 static void set_dd_cell_sizes_dlb_change(gmx_domdec_t *dd,
3656                                          gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3657                                          gmx_bool bUniform, gmx_int64_t step)
3658 {
3659     gmx_domdec_comm_t *comm;
3660     int                d, dim, d1;
3661     gmx_bool           bRowMember, bRowRoot;
3662     real              *cell_f_row;
3663
3664     comm = dd->comm;
3665
3666     for (d = 0; d < dd->ndim; d++)
3667     {
3668         dim        = dd->dim[d];
3669         bRowMember = TRUE;
3670         bRowRoot   = TRUE;
3671         for (d1 = d; d1 < dd->ndim; d1++)
3672         {
3673             if (dd->ci[dd->dim[d1]] > 0)
3674             {
3675                 if (d1 != d)
3676                 {
3677                     bRowMember = FALSE;
3678                 }
3679                 bRowRoot = FALSE;
3680             }
3681         }
3682         if (bRowMember)
3683         {
3684             if (bRowRoot)
3685             {
3686                 set_dd_cell_sizes_dlb_root(dd, d, dim, comm->root[d],
3687                                            ddbox, bDynamicBox, bUniform, step);
3688                 cell_f_row = comm->root[d]->cell_f;
3689             }
3690             else
3691             {
3692                 cell_f_row = comm->cell_f_row;
3693             }
3694             distribute_dd_cell_sizes_dlb(dd, d, dim, cell_f_row, ddbox);
3695         }
3696     }
3697 }
3698
3699 static void set_dd_cell_sizes_dlb_nochange(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
3700 {
3701     int d;
3702
3703     /* This function assumes the box is static and should therefore
3704      * not be called when the box has changed since the last
3705      * call to dd_partition_system.
3706      */
3707     for (d = 0; d < dd->ndim; d++)
3708     {
3709         relative_to_absolute_cell_bounds(dd, ddbox, d);
3710     }
3711 }
3712
3713
3714
3715 static void set_dd_cell_sizes_dlb(gmx_domdec_t *dd,
3716                                   gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3717                                   gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3718                                   gmx_wallcycle_t wcycle)
3719 {
3720     gmx_domdec_comm_t *comm;
3721     int                dim;
3722
3723     comm = dd->comm;
3724
3725     if (bDoDLB)
3726     {
3727         wallcycle_start(wcycle, ewcDDCOMMBOUND);
3728         set_dd_cell_sizes_dlb_change(dd, ddbox, bDynamicBox, bUniform, step);
3729         wallcycle_stop(wcycle, ewcDDCOMMBOUND);
3730     }
3731     else if (bDynamicBox)
3732     {
3733         set_dd_cell_sizes_dlb_nochange(dd, ddbox);
3734     }
3735
3736     /* Set the dimensions for which no DD is used */
3737     for (dim = 0; dim < DIM; dim++)
3738     {
3739         if (dd->nc[dim] == 1)
3740         {
3741             comm->cell_x0[dim] = 0;
3742             comm->cell_x1[dim] = ddbox->box_size[dim];
3743             if (dim >= ddbox->nboundeddim)
3744             {
3745                 comm->cell_x0[dim] += ddbox->box0[dim];
3746                 comm->cell_x1[dim] += ddbox->box0[dim];
3747             }
3748         }
3749     }
3750 }
3751
3752 static void realloc_comm_ind(gmx_domdec_t *dd, ivec npulse)
3753 {
3754     int                    d, np, i;
3755     gmx_domdec_comm_dim_t *cd;
3756
3757     for (d = 0; d < dd->ndim; d++)
3758     {
3759         cd = &dd->comm->cd[d];
3760         np = npulse[dd->dim[d]];
3761         if (np > cd->np_nalloc)
3762         {
3763             if (debug)
3764             {
3765                 fprintf(debug, "(Re)allocing cd for %c to %d pulses\n",
3766                         dim2char(dd->dim[d]), np);
3767             }
3768             if (DDMASTER(dd) && cd->np_nalloc > 0)
3769             {
3770                 fprintf(stderr, "\nIncreasing the number of cell to communicate in dimension %c to %d for the first time\n", dim2char(dd->dim[d]), np);
3771             }
3772             srenew(cd->ind, np);
3773             for (i = cd->np_nalloc; i < np; i++)
3774             {
3775                 cd->ind[i].index  = NULL;
3776                 cd->ind[i].nalloc = 0;
3777             }
3778             cd->np_nalloc = np;
3779         }
3780         cd->np = np;
3781     }
3782 }
3783
3784
3785 static void set_dd_cell_sizes(gmx_domdec_t *dd,
3786                               gmx_ddbox_t *ddbox, gmx_bool bDynamicBox,
3787                               gmx_bool bUniform, gmx_bool bDoDLB, gmx_int64_t step,
3788                               gmx_wallcycle_t wcycle)
3789 {
3790     gmx_domdec_comm_t *comm;
3791     int                d;
3792     ivec               npulse;
3793
3794     comm = dd->comm;
3795
3796     /* Copy the old cell boundaries for the cg displacement check */
3797     copy_rvec(comm->cell_x0, comm->old_cell_x0);
3798     copy_rvec(comm->cell_x1, comm->old_cell_x1);
3799
3800     if (comm->bDynLoadBal)
3801     {
3802         if (DDMASTER(dd))
3803         {
3804             check_box_size(dd, ddbox);
3805         }
3806         set_dd_cell_sizes_dlb(dd, ddbox, bDynamicBox, bUniform, bDoDLB, step, wcycle);
3807     }
3808     else
3809     {
3810         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbLOCAL, npulse);
3811         realloc_comm_ind(dd, npulse);
3812     }
3813
3814     if (debug)
3815     {
3816         for (d = 0; d < DIM; d++)
3817         {
3818             fprintf(debug, "cell_x[%d] %f - %f skew_fac %f\n",
3819                     d, comm->cell_x0[d], comm->cell_x1[d], ddbox->skew_fac[d]);
3820         }
3821     }
3822 }
3823
3824 static void comm_dd_ns_cell_sizes(gmx_domdec_t *dd,
3825                                   gmx_ddbox_t *ddbox,
3826                                   rvec cell_ns_x0, rvec cell_ns_x1,
3827                                   gmx_int64_t step)
3828 {
3829     gmx_domdec_comm_t *comm;
3830     int                dim_ind, dim;
3831
3832     comm = dd->comm;
3833
3834     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
3835     {
3836         dim = dd->dim[dim_ind];
3837
3838         /* Without PBC we don't have restrictions on the outer cells */
3839         if (!(dim >= ddbox->npbcdim &&
3840               (dd->ci[dim] == 0 || dd->ci[dim] == dd->nc[dim] - 1)) &&
3841             comm->bDynLoadBal &&
3842             (comm->cell_x1[dim] - comm->cell_x0[dim])*ddbox->skew_fac[dim] <
3843             comm->cellsize_min[dim])
3844         {
3845             char buf[22];
3846             gmx_fatal(FARGS, "Step %s: The %c-size (%f) times the triclinic skew factor (%f) is smaller than the smallest allowed cell size (%f) for domain decomposition grid cell %d %d %d",
3847                       gmx_step_str(step, buf), dim2char(dim),
3848                       comm->cell_x1[dim] - comm->cell_x0[dim],
3849                       ddbox->skew_fac[dim],
3850                       dd->comm->cellsize_min[dim],
3851                       dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
3852         }
3853     }
3854
3855     if ((dd->bGridJump && dd->ndim > 1) || ddbox->nboundeddim < DIM)
3856     {
3857         /* Communicate the boundaries and update cell_ns_x0/1 */
3858         dd_move_cellx(dd, ddbox, cell_ns_x0, cell_ns_x1);
3859         if (dd->bGridJump && dd->ndim > 1)
3860         {
3861             check_grid_jump(step, dd, dd->comm->cutoff, ddbox, TRUE);
3862         }
3863     }
3864 }
3865
3866 static void make_tric_corr_matrix(int npbcdim, matrix box, matrix tcm)
3867 {
3868     if (YY < npbcdim)
3869     {
3870         tcm[YY][XX] = -box[YY][XX]/box[YY][YY];
3871     }
3872     else
3873     {
3874         tcm[YY][XX] = 0;
3875     }
3876     if (ZZ < npbcdim)
3877     {
3878         tcm[ZZ][XX] = -(box[ZZ][YY]*tcm[YY][XX] + box[ZZ][XX])/box[ZZ][ZZ];
3879         tcm[ZZ][YY] = -box[ZZ][YY]/box[ZZ][ZZ];
3880     }
3881     else
3882     {
3883         tcm[ZZ][XX] = 0;
3884         tcm[ZZ][YY] = 0;
3885     }
3886 }
3887
3888 static void check_screw_box(matrix box)
3889 {
3890     /* Mathematical limitation */
3891     if (box[YY][XX] != 0 || box[ZZ][XX] != 0)
3892     {
3893         gmx_fatal(FARGS, "With screw pbc the unit cell can not have non-zero off-diagonal x-components");
3894     }
3895
3896     /* Limitation due to the asymmetry of the eighth shell method */
3897     if (box[ZZ][YY] != 0)
3898     {
3899         gmx_fatal(FARGS, "pbc=screw with non-zero box_zy is not supported");
3900     }
3901 }
3902
3903 static void distribute_cg(FILE *fplog, gmx_int64_t step,
3904                           matrix box, ivec tric_dir, t_block *cgs, rvec pos[],
3905                           gmx_domdec_t *dd)
3906 {
3907     gmx_domdec_master_t *ma;
3908     int                **tmp_ind = NULL, *tmp_nalloc = NULL;
3909     int                  i, icg, j, k, k0, k1, d, npbcdim;
3910     matrix               tcm;
3911     rvec                 box_size, cg_cm;
3912     ivec                 ind;
3913     real                 nrcg, inv_ncg, pos_d;
3914     atom_id             *cgindex;
3915     gmx_bool             bUnbounded, bScrew;
3916
3917     ma = dd->ma;
3918
3919     if (tmp_ind == NULL)
3920     {
3921         snew(tmp_nalloc, dd->nnodes);
3922         snew(tmp_ind, dd->nnodes);
3923         for (i = 0; i < dd->nnodes; i++)
3924         {
3925             tmp_nalloc[i] = over_alloc_large(cgs->nr/dd->nnodes+1);
3926             snew(tmp_ind[i], tmp_nalloc[i]);
3927         }
3928     }
3929
3930     /* Clear the count */
3931     for (i = 0; i < dd->nnodes; i++)
3932     {
3933         ma->ncg[i] = 0;
3934         ma->nat[i] = 0;
3935     }
3936
3937     make_tric_corr_matrix(dd->npbcdim, box, tcm);
3938
3939     cgindex = cgs->index;
3940
3941     /* Compute the center of geometry for all charge groups */
3942     for (icg = 0; icg < cgs->nr; icg++)
3943     {
3944         k0      = cgindex[icg];
3945         k1      = cgindex[icg+1];
3946         nrcg    = k1 - k0;
3947         if (nrcg == 1)
3948         {
3949             copy_rvec(pos[k0], cg_cm);
3950         }
3951         else
3952         {
3953             inv_ncg = 1.0/nrcg;
3954
3955             clear_rvec(cg_cm);
3956             for (k = k0; (k < k1); k++)
3957             {
3958                 rvec_inc(cg_cm, pos[k]);
3959             }
3960             for (d = 0; (d < DIM); d++)
3961             {
3962                 cg_cm[d] *= inv_ncg;
3963             }
3964         }
3965         /* Put the charge group in the box and determine the cell index */
3966         for (d = DIM-1; d >= 0; d--)
3967         {
3968             pos_d = cg_cm[d];
3969             if (d < dd->npbcdim)
3970             {
3971                 bScrew = (dd->bScrewPBC && d == XX);
3972                 if (tric_dir[d] && dd->nc[d] > 1)
3973                 {
3974                     /* Use triclinic coordintates for this dimension */
3975                     for (j = d+1; j < DIM; j++)
3976                     {
3977                         pos_d += cg_cm[j]*tcm[j][d];
3978                     }
3979                 }
3980                 while (pos_d >= box[d][d])
3981                 {
3982                     pos_d -= box[d][d];
3983                     rvec_dec(cg_cm, box[d]);
3984                     if (bScrew)
3985                     {
3986                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
3987                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
3988                     }
3989                     for (k = k0; (k < k1); k++)
3990                     {
3991                         rvec_dec(pos[k], box[d]);
3992                         if (bScrew)
3993                         {
3994                             pos[k][YY] = box[YY][YY] - pos[k][YY];
3995                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
3996                         }
3997                     }
3998                 }
3999                 while (pos_d < 0)
4000                 {
4001                     pos_d += box[d][d];
4002                     rvec_inc(cg_cm, box[d]);
4003                     if (bScrew)
4004                     {
4005                         cg_cm[YY] = box[YY][YY] - cg_cm[YY];
4006                         cg_cm[ZZ] = box[ZZ][ZZ] - cg_cm[ZZ];
4007                     }
4008                     for (k = k0; (k < k1); k++)
4009                     {
4010                         rvec_inc(pos[k], box[d]);
4011                         if (bScrew)
4012                         {
4013                             pos[k][YY] = box[YY][YY] - pos[k][YY];
4014                             pos[k][ZZ] = box[ZZ][ZZ] - pos[k][ZZ];
4015                         }
4016                     }
4017                 }
4018             }
4019             /* This could be done more efficiently */
4020             ind[d] = 0;
4021             while (ind[d]+1 < dd->nc[d] && pos_d >= ma->cell_x[d][ind[d]+1])
4022             {
4023                 ind[d]++;
4024             }
4025         }
4026         i = dd_index(dd->nc, ind);
4027         if (ma->ncg[i] == tmp_nalloc[i])
4028         {
4029             tmp_nalloc[i] = over_alloc_large(ma->ncg[i]+1);
4030             srenew(tmp_ind[i], tmp_nalloc[i]);
4031         }
4032         tmp_ind[i][ma->ncg[i]] = icg;
4033         ma->ncg[i]++;
4034         ma->nat[i] += cgindex[icg+1] - cgindex[icg];
4035     }
4036
4037     k1 = 0;
4038     for (i = 0; i < dd->nnodes; i++)
4039     {
4040         ma->index[i] = k1;
4041         for (k = 0; k < ma->ncg[i]; k++)
4042         {
4043             ma->cg[k1++] = tmp_ind[i][k];
4044         }
4045     }
4046     ma->index[dd->nnodes] = k1;
4047
4048     for (i = 0; i < dd->nnodes; i++)
4049     {
4050         sfree(tmp_ind[i]);
4051     }
4052     sfree(tmp_ind);
4053     sfree(tmp_nalloc);
4054
4055     if (fplog)
4056     {
4057         char buf[22];
4058         fprintf(fplog, "Charge group distribution at step %s:",
4059                 gmx_step_str(step, buf));
4060         for (i = 0; i < dd->nnodes; i++)
4061         {
4062             fprintf(fplog, " %d", ma->ncg[i]);
4063         }
4064         fprintf(fplog, "\n");
4065     }
4066 }
4067
4068 static void get_cg_distribution(FILE *fplog, gmx_int64_t step, gmx_domdec_t *dd,
4069                                 t_block *cgs, matrix box, gmx_ddbox_t *ddbox,
4070                                 rvec pos[])
4071 {
4072     gmx_domdec_master_t *ma = NULL;
4073     ivec                 npulse;
4074     int                  i, cg_gl;
4075     int                 *ibuf, buf2[2] = { 0, 0 };
4076     gmx_bool             bMaster = DDMASTER(dd);
4077
4078     if (bMaster)
4079     {
4080         ma = dd->ma;
4081
4082         if (dd->bScrewPBC)
4083         {
4084             check_screw_box(box);
4085         }
4086
4087         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbMASTER, npulse);
4088
4089         distribute_cg(fplog, step, box, ddbox->tric_dir, cgs, pos, dd);
4090         for (i = 0; i < dd->nnodes; i++)
4091         {
4092             ma->ibuf[2*i]   = ma->ncg[i];
4093             ma->ibuf[2*i+1] = ma->nat[i];
4094         }
4095         ibuf = ma->ibuf;
4096     }
4097     else
4098     {
4099         ibuf = NULL;
4100     }
4101     dd_scatter(dd, 2*sizeof(int), ibuf, buf2);
4102
4103     dd->ncg_home = buf2[0];
4104     dd->nat_home = buf2[1];
4105     dd->ncg_tot  = dd->ncg_home;
4106     dd->nat_tot  = dd->nat_home;
4107     if (dd->ncg_home > dd->cg_nalloc || dd->cg_nalloc == 0)
4108     {
4109         dd->cg_nalloc = over_alloc_dd(dd->ncg_home);
4110         srenew(dd->index_gl, dd->cg_nalloc);
4111         srenew(dd->cgindex, dd->cg_nalloc+1);
4112     }
4113     if (bMaster)
4114     {
4115         for (i = 0; i < dd->nnodes; i++)
4116         {
4117             ma->ibuf[i]            = ma->ncg[i]*sizeof(int);
4118             ma->ibuf[dd->nnodes+i] = ma->index[i]*sizeof(int);
4119         }
4120     }
4121
4122     dd_scatterv(dd,
4123                 DDMASTER(dd) ? ma->ibuf : NULL,
4124                 DDMASTER(dd) ? ma->ibuf+dd->nnodes : NULL,
4125                 DDMASTER(dd) ? ma->cg : NULL,
4126                 dd->ncg_home*sizeof(int), dd->index_gl);
4127
4128     /* Determine the home charge group sizes */
4129     dd->cgindex[0] = 0;
4130     for (i = 0; i < dd->ncg_home; i++)
4131     {
4132         cg_gl            = dd->index_gl[i];
4133         dd->cgindex[i+1] =
4134             dd->cgindex[i] + cgs->index[cg_gl+1] - cgs->index[cg_gl];
4135     }
4136
4137     if (debug)
4138     {
4139         fprintf(debug, "Home charge groups:\n");
4140         for (i = 0; i < dd->ncg_home; i++)
4141         {
4142             fprintf(debug, " %d", dd->index_gl[i]);
4143             if (i % 10 == 9)
4144             {
4145                 fprintf(debug, "\n");
4146             }
4147         }
4148         fprintf(debug, "\n");
4149     }
4150 }
4151
4152 static int compact_and_copy_vec_at(int ncg, int *move,
4153                                    int *cgindex,
4154                                    int nvec, int vec,
4155                                    rvec *src, gmx_domdec_comm_t *comm,
4156                                    gmx_bool bCompact)
4157 {
4158     int m, icg, i, i0, i1, nrcg;
4159     int home_pos;
4160     int pos_vec[DIM*2];
4161
4162     home_pos = 0;
4163
4164     for (m = 0; m < DIM*2; m++)
4165     {
4166         pos_vec[m] = 0;
4167     }
4168
4169     i0 = 0;
4170     for (icg = 0; icg < ncg; icg++)
4171     {
4172         i1 = cgindex[icg+1];
4173         m  = move[icg];
4174         if (m == -1)
4175         {
4176             if (bCompact)
4177             {
4178                 /* Compact the home array in place */
4179                 for (i = i0; i < i1; i++)
4180                 {
4181                     copy_rvec(src[i], src[home_pos++]);
4182                 }
4183             }
4184         }
4185         else
4186         {
4187             /* Copy to the communication buffer */
4188             nrcg        = i1 - i0;
4189             pos_vec[m] += 1 + vec*nrcg;
4190             for (i = i0; i < i1; i++)
4191             {
4192                 copy_rvec(src[i], comm->cgcm_state[m][pos_vec[m]++]);
4193             }
4194             pos_vec[m] += (nvec - vec - 1)*nrcg;
4195         }
4196         if (!bCompact)
4197         {
4198             home_pos += i1 - i0;
4199         }
4200         i0 = i1;
4201     }
4202
4203     return home_pos;
4204 }
4205
4206 static int compact_and_copy_vec_cg(int ncg, int *move,
4207                                    int *cgindex,
4208                                    int nvec, rvec *src, gmx_domdec_comm_t *comm,
4209                                    gmx_bool bCompact)
4210 {
4211     int m, icg, i0, i1, nrcg;
4212     int home_pos;
4213     int pos_vec[DIM*2];
4214
4215     home_pos = 0;
4216
4217     for (m = 0; m < DIM*2; m++)
4218     {
4219         pos_vec[m] = 0;
4220     }
4221
4222     i0 = 0;
4223     for (icg = 0; icg < ncg; icg++)
4224     {
4225         i1 = cgindex[icg+1];
4226         m  = move[icg];
4227         if (m == -1)
4228         {
4229             if (bCompact)
4230             {
4231                 /* Compact the home array in place */
4232                 copy_rvec(src[icg], src[home_pos++]);
4233             }
4234         }
4235         else
4236         {
4237             nrcg = i1 - i0;
4238             /* Copy to the communication buffer */
4239             copy_rvec(src[icg], comm->cgcm_state[m][pos_vec[m]]);
4240             pos_vec[m] += 1 + nrcg*nvec;
4241         }
4242         i0 = i1;
4243     }
4244     if (!bCompact)
4245     {
4246         home_pos = ncg;
4247     }
4248
4249     return home_pos;
4250 }
4251
4252 static int compact_ind(int ncg, int *move,
4253                        int *index_gl, int *cgindex,
4254                        int *gatindex,
4255                        gmx_ga2la_t ga2la, char *bLocalCG,
4256                        int *cginfo)
4257 {
4258     int cg, nat, a0, a1, a, a_gl;
4259     int home_pos;
4260
4261     home_pos = 0;
4262     nat      = 0;
4263     for (cg = 0; cg < ncg; cg++)
4264     {
4265         a0 = cgindex[cg];
4266         a1 = cgindex[cg+1];
4267         if (move[cg] == -1)
4268         {
4269             /* Compact the home arrays in place.
4270              * Anything that can be done here avoids access to global arrays.
4271              */
4272             cgindex[home_pos] = nat;
4273             for (a = a0; a < a1; a++)
4274             {
4275                 a_gl          = gatindex[a];
4276                 gatindex[nat] = a_gl;
4277                 /* The cell number stays 0, so we don't need to set it */
4278                 ga2la_change_la(ga2la, a_gl, nat);
4279                 nat++;
4280             }
4281             index_gl[home_pos] = index_gl[cg];
4282             cginfo[home_pos]   = cginfo[cg];
4283             /* The charge group remains local, so bLocalCG does not change */
4284             home_pos++;
4285         }
4286         else
4287         {
4288             /* Clear the global indices */
4289             for (a = a0; a < a1; a++)
4290             {
4291                 ga2la_del(ga2la, gatindex[a]);
4292             }
4293             if (bLocalCG)
4294             {
4295                 bLocalCG[index_gl[cg]] = FALSE;
4296             }
4297         }
4298     }
4299     cgindex[home_pos] = nat;
4300
4301     return home_pos;
4302 }
4303
4304 static void clear_and_mark_ind(int ncg, int *move,
4305                                int *index_gl, int *cgindex, int *gatindex,
4306                                gmx_ga2la_t ga2la, char *bLocalCG,
4307                                int *cell_index)
4308 {
4309     int cg, a0, a1, a;
4310
4311     for (cg = 0; cg < ncg; cg++)
4312     {
4313         if (move[cg] >= 0)
4314         {
4315             a0 = cgindex[cg];
4316             a1 = cgindex[cg+1];
4317             /* Clear the global indices */
4318             for (a = a0; a < a1; a++)
4319             {
4320                 ga2la_del(ga2la, gatindex[a]);
4321             }
4322             if (bLocalCG)
4323             {
4324                 bLocalCG[index_gl[cg]] = FALSE;
4325             }
4326             /* Signal that this cg has moved using the ns cell index.
4327              * Here we set it to -1. fill_grid will change it
4328              * from -1 to NSGRID_SIGNAL_MOVED_FAC*grid->ncells.
4329              */
4330             cell_index[cg] = -1;
4331         }
4332     }
4333 }
4334
4335 static void print_cg_move(FILE *fplog,
4336                           gmx_domdec_t *dd,
4337                           gmx_int64_t step, int cg, int dim, int dir,
4338                           gmx_bool bHaveCgcmOld, real limitd,
4339                           rvec cm_old, rvec cm_new, real pos_d)
4340 {
4341     gmx_domdec_comm_t *comm;
4342     char               buf[22];
4343
4344     comm = dd->comm;
4345
4346     fprintf(fplog, "\nStep %s:\n", gmx_step_str(step, buf));
4347     if (limitd > 0)
4348     {
4349         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition (%f) in direction %c\n",
4350                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4351                 ddglatnr(dd, dd->cgindex[cg]), limitd, dim2char(dim));
4352     }
4353     else
4354     {
4355         /* We don't have a limiting distance available: don't print it */
4356         fprintf(fplog, "%s %d moved more than the distance allowed by the domain decomposition in direction %c\n",
4357                 dd->comm->bCGs ? "The charge group starting at atom" : "Atom",
4358                 ddglatnr(dd, dd->cgindex[cg]), dim2char(dim));
4359     }
4360     fprintf(fplog, "distance out of cell %f\n",
4361             dir == 1 ? pos_d - comm->cell_x1[dim] : pos_d - comm->cell_x0[dim]);
4362     if (bHaveCgcmOld)
4363     {
4364         fprintf(fplog, "Old coordinates: %8.3f %8.3f %8.3f\n",
4365                 cm_old[XX], cm_old[YY], cm_old[ZZ]);
4366     }
4367     fprintf(fplog, "New coordinates: %8.3f %8.3f %8.3f\n",
4368             cm_new[XX], cm_new[YY], cm_new[ZZ]);
4369     fprintf(fplog, "Old cell boundaries in direction %c: %8.3f %8.3f\n",
4370             dim2char(dim),
4371             comm->old_cell_x0[dim], comm->old_cell_x1[dim]);
4372     fprintf(fplog, "New cell boundaries in direction %c: %8.3f %8.3f\n",
4373             dim2char(dim),
4374             comm->cell_x0[dim], comm->cell_x1[dim]);
4375 }
4376
4377 static void cg_move_error(FILE *fplog,
4378                           gmx_domdec_t *dd,
4379                           gmx_int64_t step, int cg, int dim, int dir,
4380                           gmx_bool bHaveCgcmOld, real limitd,
4381                           rvec cm_old, rvec cm_new, real pos_d)
4382 {
4383     if (fplog)
4384     {
4385         print_cg_move(fplog, dd, step, cg, dim, dir,
4386                       bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4387     }
4388     print_cg_move(stderr, dd, step, cg, dim, dir,
4389                   bHaveCgcmOld, limitd, cm_old, cm_new, pos_d);
4390     gmx_fatal(FARGS,
4391               "%s moved too far between two domain decomposition steps\n"
4392               "This usually means that your system is not well equilibrated",
4393               dd->comm->bCGs ? "A charge group" : "An atom");
4394 }
4395
4396 static void rotate_state_atom(t_state *state, int a)
4397 {
4398     int est;
4399
4400     for (est = 0; est < estNR; est++)
4401     {
4402         if (EST_DISTR(est) && (state->flags & (1<<est)))
4403         {
4404             switch (est)
4405             {
4406                 case estX:
4407                     /* Rotate the complete state; for a rectangular box only */
4408                     state->x[a][YY] = state->box[YY][YY] - state->x[a][YY];
4409                     state->x[a][ZZ] = state->box[ZZ][ZZ] - state->x[a][ZZ];
4410                     break;
4411                 case estV:
4412                     state->v[a][YY] = -state->v[a][YY];
4413                     state->v[a][ZZ] = -state->v[a][ZZ];
4414                     break;
4415                 case estSDX:
4416                     state->sd_X[a][YY] = -state->sd_X[a][YY];
4417                     state->sd_X[a][ZZ] = -state->sd_X[a][ZZ];
4418                     break;
4419                 case estCGP:
4420                     state->cg_p[a][YY] = -state->cg_p[a][YY];
4421                     state->cg_p[a][ZZ] = -state->cg_p[a][ZZ];
4422                     break;
4423                 case estDISRE_INITF:
4424                 case estDISRE_RM3TAV:
4425                 case estORIRE_INITF:
4426                 case estORIRE_DTAV:
4427                     /* These are distances, so not affected by rotation */
4428                     break;
4429                 default:
4430                     gmx_incons("Unknown state entry encountered in rotate_state_atom");
4431             }
4432         }
4433     }
4434 }
4435
4436 static int *get_moved(gmx_domdec_comm_t *comm, int natoms)
4437 {
4438     if (natoms > comm->moved_nalloc)
4439     {
4440         /* Contents should be preserved here */
4441         comm->moved_nalloc = over_alloc_dd(natoms);
4442         srenew(comm->moved, comm->moved_nalloc);
4443     }
4444
4445     return comm->moved;
4446 }
4447
4448 static void calc_cg_move(FILE *fplog, gmx_int64_t step,
4449                          gmx_domdec_t *dd,
4450                          t_state *state,
4451                          ivec tric_dir, matrix tcm,
4452                          rvec cell_x0, rvec cell_x1,
4453                          rvec limitd, rvec limit0, rvec limit1,
4454                          const int *cgindex,
4455                          int cg_start, int cg_end,
4456                          rvec *cg_cm,
4457                          int *move)
4458 {
4459     int      npbcdim;
4460     int      c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4461     int      mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4462     int      flag;
4463     gmx_bool bScrew;
4464     ivec     dev;
4465     real     inv_ncg, pos_d;
4466     rvec     cm_new;
4467
4468     npbcdim = dd->npbcdim;
4469
4470     for (cg = cg_start; cg < cg_end; cg++)
4471     {
4472         k0   = cgindex[cg];
4473         k1   = cgindex[cg+1];
4474         nrcg = k1 - k0;
4475         if (nrcg == 1)
4476         {
4477             copy_rvec(state->x[k0], cm_new);
4478         }
4479         else
4480         {
4481             inv_ncg = 1.0/nrcg;
4482
4483             clear_rvec(cm_new);
4484             for (k = k0; (k < k1); k++)
4485             {
4486                 rvec_inc(cm_new, state->x[k]);
4487             }
4488             for (d = 0; (d < DIM); d++)
4489             {
4490                 cm_new[d] = inv_ncg*cm_new[d];
4491             }
4492         }
4493
4494         clear_ivec(dev);
4495         /* Do pbc and check DD cell boundary crossings */
4496         for (d = DIM-1; d >= 0; d--)
4497         {
4498             if (dd->nc[d] > 1)
4499             {
4500                 bScrew = (dd->bScrewPBC && d == XX);
4501                 /* Determine the location of this cg in lattice coordinates */
4502                 pos_d = cm_new[d];
4503                 if (tric_dir[d])
4504                 {
4505                     for (d2 = d+1; d2 < DIM; d2++)
4506                     {
4507                         pos_d += cm_new[d2]*tcm[d2][d];
4508                     }
4509                 }
4510                 /* Put the charge group in the triclinic unit-cell */
4511                 if (pos_d >= cell_x1[d])
4512                 {
4513                     if (pos_d >= limit1[d])
4514                     {
4515                         cg_move_error(fplog, dd, step, cg, d, 1,
4516                                       cg_cm != state->x, limitd[d],
4517                                       cg_cm[cg], cm_new, pos_d);
4518                     }
4519                     dev[d] = 1;
4520                     if (dd->ci[d] == dd->nc[d] - 1)
4521                     {
4522                         rvec_dec(cm_new, state->box[d]);
4523                         if (bScrew)
4524                         {
4525                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4526                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4527                         }
4528                         for (k = k0; (k < k1); k++)
4529                         {
4530                             rvec_dec(state->x[k], state->box[d]);
4531                             if (bScrew)
4532                             {
4533                                 rotate_state_atom(state, k);
4534                             }
4535                         }
4536                     }
4537                 }
4538                 else if (pos_d < cell_x0[d])
4539                 {
4540                     if (pos_d < limit0[d])
4541                     {
4542                         cg_move_error(fplog, dd, step, cg, d, -1,
4543                                       cg_cm != state->x, limitd[d],
4544                                       cg_cm[cg], cm_new, pos_d);
4545                     }
4546                     dev[d] = -1;
4547                     if (dd->ci[d] == 0)
4548                     {
4549                         rvec_inc(cm_new, state->box[d]);
4550                         if (bScrew)
4551                         {
4552                             cm_new[YY] = state->box[YY][YY] - cm_new[YY];
4553                             cm_new[ZZ] = state->box[ZZ][ZZ] - cm_new[ZZ];
4554                         }
4555                         for (k = k0; (k < k1); k++)
4556                         {
4557                             rvec_inc(state->x[k], state->box[d]);
4558                             if (bScrew)
4559                             {
4560                                 rotate_state_atom(state, k);
4561                             }
4562                         }
4563                     }
4564                 }
4565             }
4566             else if (d < npbcdim)
4567             {
4568                 /* Put the charge group in the rectangular unit-cell */
4569                 while (cm_new[d] >= state->box[d][d])
4570                 {
4571                     rvec_dec(cm_new, state->box[d]);
4572                     for (k = k0; (k < k1); k++)
4573                     {
4574                         rvec_dec(state->x[k], state->box[d]);
4575                     }
4576                 }
4577                 while (cm_new[d] < 0)
4578                 {
4579                     rvec_inc(cm_new, state->box[d]);
4580                     for (k = k0; (k < k1); k++)
4581                     {
4582                         rvec_inc(state->x[k], state->box[d]);
4583                     }
4584                 }
4585             }
4586         }
4587
4588         copy_rvec(cm_new, cg_cm[cg]);
4589
4590         /* Determine where this cg should go */
4591         flag = 0;
4592         mc   = -1;
4593         for (d = 0; d < dd->ndim; d++)
4594         {
4595             dim = dd->dim[d];
4596             if (dev[dim] == 1)
4597             {
4598                 flag |= DD_FLAG_FW(d);
4599                 if (mc == -1)
4600                 {
4601                     mc = d*2;
4602                 }
4603             }
4604             else if (dev[dim] == -1)
4605             {
4606                 flag |= DD_FLAG_BW(d);
4607                 if (mc == -1)
4608                 {
4609                     if (dd->nc[dim] > 2)
4610                     {
4611                         mc = d*2 + 1;
4612                     }
4613                     else
4614                     {
4615                         mc = d*2;
4616                     }
4617                 }
4618             }
4619         }
4620         /* Temporarily store the flag in move */
4621         move[cg] = mc + flag;
4622     }
4623 }
4624
4625 static void dd_redistribute_cg(FILE *fplog, gmx_int64_t step,
4626                                gmx_domdec_t *dd, ivec tric_dir,
4627                                t_state *state, rvec **f,
4628                                t_forcerec *fr,
4629                                gmx_bool bCompact,
4630                                t_nrnb *nrnb,
4631                                int *ncg_stay_home,
4632                                int *ncg_moved)
4633 {
4634     int               *move;
4635     int                npbcdim;
4636     int                ncg[DIM*2], nat[DIM*2];
4637     int                c, i, cg, k, k0, k1, d, dim, dim2, dir, d2, d3, d4, cell_d;
4638     int                mc, cdd, nrcg, ncg_recv, nat_recv, nvs, nvr, nvec, vec;
4639     int                sbuf[2], rbuf[2];
4640     int                home_pos_cg, home_pos_at, buf_pos;
4641     int                flag;
4642     gmx_bool           bV = FALSE, bSDX = FALSE, bCGP = FALSE;
4643     gmx_bool           bScrew;
4644     ivec               dev;
4645     real               inv_ncg, pos_d;
4646     matrix             tcm;
4647     rvec              *cg_cm = NULL, cell_x0, cell_x1, limitd, limit0, limit1, cm_new;
4648     atom_id           *cgindex;
4649     cginfo_mb_t       *cginfo_mb;
4650     gmx_domdec_comm_t *comm;
4651     int               *moved;
4652     int                nthread, thread;
4653
4654     if (dd->bScrewPBC)
4655     {
4656         check_screw_box(state->box);
4657     }
4658
4659     comm  = dd->comm;
4660     if (fr->cutoff_scheme == ecutsGROUP)
4661     {
4662         cg_cm = fr->cg_cm;
4663     }
4664
4665     for (i = 0; i < estNR; i++)
4666     {
4667         if (EST_DISTR(i))
4668         {
4669             switch (i)
4670             {
4671                 case estX: /* Always present */ break;
4672                 case estV:   bV   = (state->flags & (1<<i)); break;
4673                 case estSDX: bSDX = (state->flags & (1<<i)); break;
4674                 case estCGP: bCGP = (state->flags & (1<<i)); break;
4675                 case estLD_RNG:
4676                 case estLD_RNGI:
4677                 case estDISRE_INITF:
4678                 case estDISRE_RM3TAV:
4679                 case estORIRE_INITF:
4680                 case estORIRE_DTAV:
4681                     /* No processing required */
4682                     break;
4683                 default:
4684                     gmx_incons("Unknown state entry encountered in dd_redistribute_cg");
4685             }
4686         }
4687     }
4688
4689     if (dd->ncg_tot > comm->nalloc_int)
4690     {
4691         comm->nalloc_int = over_alloc_dd(dd->ncg_tot);
4692         srenew(comm->buf_int, comm->nalloc_int);
4693     }
4694     move = comm->buf_int;
4695
4696     /* Clear the count */
4697     for (c = 0; c < dd->ndim*2; c++)
4698     {
4699         ncg[c] = 0;
4700         nat[c] = 0;
4701     }
4702
4703     npbcdim = dd->npbcdim;
4704
4705     for (d = 0; (d < DIM); d++)
4706     {
4707         limitd[d] = dd->comm->cellsize_min[d];
4708         if (d >= npbcdim && dd->ci[d] == 0)
4709         {
4710             cell_x0[d] = -GMX_FLOAT_MAX;
4711         }
4712         else
4713         {
4714             cell_x0[d] = comm->cell_x0[d];
4715         }
4716         if (d >= npbcdim && dd->ci[d] == dd->nc[d] - 1)
4717         {
4718             cell_x1[d] = GMX_FLOAT_MAX;
4719         }
4720         else
4721         {
4722             cell_x1[d] = comm->cell_x1[d];
4723         }
4724         if (d < npbcdim)
4725         {
4726             limit0[d] = comm->old_cell_x0[d] - limitd[d];
4727             limit1[d] = comm->old_cell_x1[d] + limitd[d];
4728         }
4729         else
4730         {
4731             /* We check after communication if a charge group moved
4732              * more than one cell. Set the pre-comm check limit to float_max.
4733              */
4734             limit0[d] = -GMX_FLOAT_MAX;
4735             limit1[d] =  GMX_FLOAT_MAX;
4736         }
4737     }
4738
4739     make_tric_corr_matrix(npbcdim, state->box, tcm);
4740
4741     cgindex = dd->cgindex;
4742
4743     nthread = gmx_omp_nthreads_get(emntDomdec);
4744
4745     /* Compute the center of geometry for all home charge groups
4746      * and put them in the box and determine where they should go.
4747      */
4748 #pragma omp parallel for num_threads(nthread) schedule(static)
4749     for (thread = 0; thread < nthread; thread++)
4750     {
4751         calc_cg_move(fplog, step, dd, state, tric_dir, tcm,
4752                      cell_x0, cell_x1, limitd, limit0, limit1,
4753                      cgindex,
4754                      ( thread   *dd->ncg_home)/nthread,
4755                      ((thread+1)*dd->ncg_home)/nthread,
4756                      fr->cutoff_scheme == ecutsGROUP ? cg_cm : state->x,
4757                      move);
4758     }
4759
4760     for (cg = 0; cg < dd->ncg_home; cg++)
4761     {
4762         if (move[cg] >= 0)
4763         {
4764             mc       = move[cg];
4765             flag     = mc & ~DD_FLAG_NRCG;
4766             mc       = mc & DD_FLAG_NRCG;
4767             move[cg] = mc;
4768
4769             if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
4770             {
4771                 comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
4772                 srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
4773             }
4774             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS  ] = dd->index_gl[cg];
4775             /* We store the cg size in the lower 16 bits
4776              * and the place where the charge group should go
4777              * in the next 6 bits. This saves some communication volume.
4778              */
4779             nrcg = cgindex[cg+1] - cgindex[cg];
4780             comm->cggl_flag[mc][ncg[mc]*DD_CGIBS+1] = nrcg | flag;
4781             ncg[mc] += 1;
4782             nat[mc] += nrcg;
4783         }
4784     }
4785
4786     inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
4787     inc_nrnb(nrnb, eNR_RESETX, dd->ncg_home);
4788
4789     *ncg_moved = 0;
4790     for (i = 0; i < dd->ndim*2; i++)
4791     {
4792         *ncg_moved += ncg[i];
4793     }
4794
4795     nvec = 1;
4796     if (bV)
4797     {
4798         nvec++;
4799     }
4800     if (bSDX)
4801     {
4802         nvec++;
4803     }
4804     if (bCGP)
4805     {
4806         nvec++;
4807     }
4808
4809     /* Make sure the communication buffers are large enough */
4810     for (mc = 0; mc < dd->ndim*2; mc++)
4811     {
4812         nvr = ncg[mc] + nat[mc]*nvec;
4813         if (nvr > comm->cgcm_state_nalloc[mc])
4814         {
4815             comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr);
4816             srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
4817         }
4818     }
4819
4820     switch (fr->cutoff_scheme)
4821     {
4822         case ecutsGROUP:
4823             /* Recalculating cg_cm might be cheaper than communicating,
4824              * but that could give rise to rounding issues.
4825              */
4826             home_pos_cg =
4827                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4828                                         nvec, cg_cm, comm, bCompact);
4829             break;
4830         case ecutsVERLET:
4831             /* Without charge groups we send the moved atom coordinates
4832              * over twice. This is so the code below can be used without
4833              * many conditionals for both for with and without charge groups.
4834              */
4835             home_pos_cg =
4836                 compact_and_copy_vec_cg(dd->ncg_home, move, cgindex,
4837                                         nvec, state->x, comm, FALSE);
4838             if (bCompact)
4839             {
4840                 home_pos_cg -= *ncg_moved;
4841             }
4842             break;
4843         default:
4844             gmx_incons("unimplemented");
4845             home_pos_cg = 0;
4846     }
4847
4848     vec         = 0;
4849     home_pos_at =
4850         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4851                                 nvec, vec++, state->x, comm, bCompact);
4852     if (bV)
4853     {
4854         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4855                                 nvec, vec++, state->v, comm, bCompact);
4856     }
4857     if (bSDX)
4858     {
4859         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4860                                 nvec, vec++, state->sd_X, comm, bCompact);
4861     }
4862     if (bCGP)
4863     {
4864         compact_and_copy_vec_at(dd->ncg_home, move, cgindex,
4865                                 nvec, vec++, state->cg_p, comm, bCompact);
4866     }
4867
4868     if (bCompact)
4869     {
4870         compact_ind(dd->ncg_home, move,
4871                     dd->index_gl, dd->cgindex, dd->gatindex,
4872                     dd->ga2la, comm->bLocalCG,
4873                     fr->cginfo);
4874     }
4875     else
4876     {
4877         if (fr->cutoff_scheme == ecutsVERLET)
4878         {
4879             moved = get_moved(comm, dd->ncg_home);
4880
4881             for (k = 0; k < dd->ncg_home; k++)
4882             {
4883                 moved[k] = 0;
4884             }
4885         }
4886         else
4887         {
4888             moved = fr->ns.grid->cell_index;
4889         }
4890
4891         clear_and_mark_ind(dd->ncg_home, move,
4892                            dd->index_gl, dd->cgindex, dd->gatindex,
4893                            dd->ga2la, comm->bLocalCG,
4894                            moved);
4895     }
4896
4897     cginfo_mb = fr->cginfo_mb;
4898
4899     *ncg_stay_home = home_pos_cg;
4900     for (d = 0; d < dd->ndim; d++)
4901     {
4902         dim      = dd->dim[d];
4903         ncg_recv = 0;
4904         nat_recv = 0;
4905         nvr      = 0;
4906         for (dir = 0; dir < (dd->nc[dim] == 2 ? 1 : 2); dir++)
4907         {
4908             cdd = d*2 + dir;
4909             /* Communicate the cg and atom counts */
4910             sbuf[0] = ncg[cdd];
4911             sbuf[1] = nat[cdd];
4912             if (debug)
4913             {
4914                 fprintf(debug, "Sending ddim %d dir %d: ncg %d nat %d\n",
4915                         d, dir, sbuf[0], sbuf[1]);
4916             }
4917             dd_sendrecv_int(dd, d, dir, sbuf, 2, rbuf, 2);
4918
4919             if ((ncg_recv+rbuf[0])*DD_CGIBS > comm->nalloc_int)
4920             {
4921                 comm->nalloc_int = over_alloc_dd((ncg_recv+rbuf[0])*DD_CGIBS);
4922                 srenew(comm->buf_int, comm->nalloc_int);
4923             }
4924
4925             /* Communicate the charge group indices, sizes and flags */
4926             dd_sendrecv_int(dd, d, dir,
4927                             comm->cggl_flag[cdd], sbuf[0]*DD_CGIBS,
4928                             comm->buf_int+ncg_recv*DD_CGIBS, rbuf[0]*DD_CGIBS);
4929
4930             nvs = ncg[cdd] + nat[cdd]*nvec;
4931             i   = rbuf[0]  + rbuf[1] *nvec;
4932             vec_rvec_check_alloc(&comm->vbuf, nvr+i);
4933
4934             /* Communicate cgcm and state */
4935             dd_sendrecv_rvec(dd, d, dir,
4936                              comm->cgcm_state[cdd], nvs,
4937                              comm->vbuf.v+nvr, i);
4938             ncg_recv += rbuf[0];
4939             nat_recv += rbuf[1];
4940             nvr      += i;
4941         }
4942
4943         /* Process the received charge groups */
4944         buf_pos = 0;
4945         for (cg = 0; cg < ncg_recv; cg++)
4946         {
4947             flag = comm->buf_int[cg*DD_CGIBS+1];
4948
4949             if (dim >= npbcdim && dd->nc[dim] > 2)
4950             {
4951                 /* No pbc in this dim and more than one domain boundary.
4952                  * We do a separate check if a charge group didn't move too far.
4953                  */
4954                 if (((flag & DD_FLAG_FW(d)) &&
4955                      comm->vbuf.v[buf_pos][dim] > cell_x1[dim]) ||
4956                     ((flag & DD_FLAG_BW(d)) &&
4957                      comm->vbuf.v[buf_pos][dim] < cell_x0[dim]))
4958                 {
4959                     cg_move_error(fplog, dd, step, cg, dim,
4960                                   (flag & DD_FLAG_FW(d)) ? 1 : 0,
4961                                   fr->cutoff_scheme == ecutsGROUP, 0,
4962                                   comm->vbuf.v[buf_pos],
4963                                   comm->vbuf.v[buf_pos],
4964                                   comm->vbuf.v[buf_pos][dim]);
4965                 }
4966             }
4967
4968             mc = -1;
4969             if (d < dd->ndim-1)
4970             {
4971                 /* Check which direction this cg should go */
4972                 for (d2 = d+1; (d2 < dd->ndim && mc == -1); d2++)
4973                 {
4974                     if (dd->bGridJump)
4975                     {
4976                         /* The cell boundaries for dimension d2 are not equal
4977                          * for each cell row of the lower dimension(s),
4978                          * therefore we might need to redetermine where
4979                          * this cg should go.
4980                          */
4981                         dim2 = dd->dim[d2];
4982                         /* If this cg crosses the box boundary in dimension d2
4983                          * we can use the communicated flag, so we do not
4984                          * have to worry about pbc.
4985                          */
4986                         if (!((dd->ci[dim2] == dd->nc[dim2]-1 &&
4987                                (flag & DD_FLAG_FW(d2))) ||
4988                               (dd->ci[dim2] == 0 &&
4989                                (flag & DD_FLAG_BW(d2)))))
4990                         {
4991                             /* Clear the two flags for this dimension */
4992                             flag &= ~(DD_FLAG_FW(d2) | DD_FLAG_BW(d2));
4993                             /* Determine the location of this cg
4994                              * in lattice coordinates
4995                              */
4996                             pos_d = comm->vbuf.v[buf_pos][dim2];
4997                             if (tric_dir[dim2])
4998                             {
4999                                 for (d3 = dim2+1; d3 < DIM; d3++)
5000                                 {
5001                                     pos_d +=
5002                                         comm->vbuf.v[buf_pos][d3]*tcm[d3][dim2];
5003                                 }
5004                             }
5005                             /* Check of we are not at the box edge.
5006                              * pbc is only handled in the first step above,
5007                              * but this check could move over pbc while
5008                              * the first step did not due to different rounding.
5009                              */
5010                             if (pos_d >= cell_x1[dim2] &&
5011                                 dd->ci[dim2] != dd->nc[dim2]-1)
5012                             {
5013                                 flag |= DD_FLAG_FW(d2);
5014                             }
5015                             else if (pos_d < cell_x0[dim2] &&
5016                                      dd->ci[dim2] != 0)
5017                             {
5018                                 flag |= DD_FLAG_BW(d2);
5019                             }
5020                             comm->buf_int[cg*DD_CGIBS+1] = flag;
5021                         }
5022                     }
5023                     /* Set to which neighboring cell this cg should go */
5024                     if (flag & DD_FLAG_FW(d2))
5025                     {
5026                         mc = d2*2;
5027                     }
5028                     else if (flag & DD_FLAG_BW(d2))
5029                     {
5030                         if (dd->nc[dd->dim[d2]] > 2)
5031                         {
5032                             mc = d2*2+1;
5033                         }
5034                         else
5035                         {
5036                             mc = d2*2;
5037                         }
5038                     }
5039                 }
5040             }
5041
5042             nrcg = flag & DD_FLAG_NRCG;
5043             if (mc == -1)
5044             {
5045                 if (home_pos_cg+1 > dd->cg_nalloc)
5046                 {
5047                     dd->cg_nalloc = over_alloc_dd(home_pos_cg+1);
5048                     srenew(dd->index_gl, dd->cg_nalloc);
5049                     srenew(dd->cgindex, dd->cg_nalloc+1);
5050                 }
5051                 /* Set the global charge group index and size */
5052                 dd->index_gl[home_pos_cg]  = comm->buf_int[cg*DD_CGIBS];
5053                 dd->cgindex[home_pos_cg+1] = dd->cgindex[home_pos_cg] + nrcg;
5054                 /* Copy the state from the buffer */
5055                 dd_check_alloc_ncg(fr, state, f, home_pos_cg+1);
5056                 if (fr->cutoff_scheme == ecutsGROUP)
5057                 {
5058                     cg_cm = fr->cg_cm;
5059                     copy_rvec(comm->vbuf.v[buf_pos], cg_cm[home_pos_cg]);
5060                 }
5061                 buf_pos++;
5062
5063                 /* Set the cginfo */
5064                 fr->cginfo[home_pos_cg] = ddcginfo(cginfo_mb,
5065                                                    dd->index_gl[home_pos_cg]);
5066                 if (comm->bLocalCG)
5067                 {
5068                     comm->bLocalCG[dd->index_gl[home_pos_cg]] = TRUE;
5069                 }
5070
5071                 if (home_pos_at+nrcg > state->nalloc)
5072                 {
5073                     dd_realloc_state(state, f, home_pos_at+nrcg);
5074                 }
5075                 for (i = 0; i < nrcg; i++)
5076                 {
5077                     copy_rvec(comm->vbuf.v[buf_pos++],
5078                               state->x[home_pos_at+i]);
5079                 }
5080                 if (bV)
5081                 {
5082                     for (i = 0; i < nrcg; i++)
5083                     {
5084                         copy_rvec(comm->vbuf.v[buf_pos++],
5085                                   state->v[home_pos_at+i]);
5086                     }
5087                 }
5088                 if (bSDX)
5089                 {
5090                     for (i = 0; i < nrcg; i++)
5091                     {
5092                         copy_rvec(comm->vbuf.v[buf_pos++],
5093                                   state->sd_X[home_pos_at+i]);
5094                     }
5095                 }
5096                 if (bCGP)
5097                 {
5098                     for (i = 0; i < nrcg; i++)
5099                     {
5100                         copy_rvec(comm->vbuf.v[buf_pos++],
5101                                   state->cg_p[home_pos_at+i]);
5102                     }
5103                 }
5104                 home_pos_cg += 1;
5105                 home_pos_at += nrcg;
5106             }
5107             else
5108             {
5109                 /* Reallocate the buffers if necessary  */
5110                 if (ncg[mc]+1 > comm->cggl_flag_nalloc[mc])
5111                 {
5112                     comm->cggl_flag_nalloc[mc] = over_alloc_dd(ncg[mc]+1);
5113                     srenew(comm->cggl_flag[mc], comm->cggl_flag_nalloc[mc]*DD_CGIBS);
5114                 }
5115                 nvr = ncg[mc] + nat[mc]*nvec;
5116                 if (nvr + 1 + nrcg*nvec > comm->cgcm_state_nalloc[mc])
5117                 {
5118                     comm->cgcm_state_nalloc[mc] = over_alloc_dd(nvr + 1 + nrcg*nvec);
5119                     srenew(comm->cgcm_state[mc], comm->cgcm_state_nalloc[mc]);
5120                 }
5121                 /* Copy from the receive to the send buffers */
5122                 memcpy(comm->cggl_flag[mc] + ncg[mc]*DD_CGIBS,
5123                        comm->buf_int + cg*DD_CGIBS,
5124                        DD_CGIBS*sizeof(int));
5125                 memcpy(comm->cgcm_state[mc][nvr],
5126                        comm->vbuf.v[buf_pos],
5127                        (1+nrcg*nvec)*sizeof(rvec));
5128                 buf_pos += 1 + nrcg*nvec;
5129                 ncg[mc] += 1;
5130                 nat[mc] += nrcg;
5131             }
5132         }
5133     }
5134
5135     /* With sorting (!bCompact) the indices are now only partially up to date
5136      * and ncg_home and nat_home are not the real count, since there are
5137      * "holes" in the arrays for the charge groups that moved to neighbors.
5138      */
5139     if (fr->cutoff_scheme == ecutsVERLET)
5140     {
5141         moved = get_moved(comm, home_pos_cg);
5142
5143         for (i = dd->ncg_home; i < home_pos_cg; i++)
5144         {
5145             moved[i] = 0;
5146         }
5147     }
5148     dd->ncg_home = home_pos_cg;
5149     dd->nat_home = home_pos_at;
5150
5151     if (debug)
5152     {
5153         fprintf(debug,
5154                 "Finished repartitioning: cgs moved out %d, new home %d\n",
5155                 *ncg_moved, dd->ncg_home-*ncg_moved);
5156
5157     }
5158 }
5159
5160 void dd_cycles_add(gmx_domdec_t *dd, float cycles, int ddCycl)
5161 {
5162     dd->comm->cycl[ddCycl] += cycles;
5163     dd->comm->cycl_n[ddCycl]++;
5164     if (cycles > dd->comm->cycl_max[ddCycl])
5165     {
5166         dd->comm->cycl_max[ddCycl] = cycles;
5167     }
5168 }
5169
5170 static double force_flop_count(t_nrnb *nrnb)
5171 {
5172     int         i;
5173     double      sum;
5174     const char *name;
5175
5176     sum = 0;
5177     for (i = 0; i < eNR_NBKERNEL_FREE_ENERGY; i++)
5178     {
5179         /* To get closer to the real timings, we half the count
5180          * for the normal loops and again half it for water loops.
5181          */
5182         name = nrnb_str(i);
5183         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5184         {
5185             sum += nrnb->n[i]*0.25*cost_nrnb(i);
5186         }
5187         else
5188         {
5189             sum += nrnb->n[i]*0.50*cost_nrnb(i);
5190         }
5191     }
5192     for (i = eNR_NBKERNEL_FREE_ENERGY; i <= eNR_NB14; i++)
5193     {
5194         name = nrnb_str(i);
5195         if (strstr(name, "W3") != NULL || strstr(name, "W4") != NULL)
5196         {
5197             sum += nrnb->n[i]*cost_nrnb(i);
5198         }
5199     }
5200     for (i = eNR_BONDS; i <= eNR_WALLS; i++)
5201     {
5202         sum += nrnb->n[i]*cost_nrnb(i);
5203     }
5204
5205     return sum;
5206 }
5207
5208 void dd_force_flop_start(gmx_domdec_t *dd, t_nrnb *nrnb)
5209 {
5210     if (dd->comm->eFlop)
5211     {
5212         dd->comm->flop -= force_flop_count(nrnb);
5213     }
5214 }
5215 void dd_force_flop_stop(gmx_domdec_t *dd, t_nrnb *nrnb)
5216 {
5217     if (dd->comm->eFlop)
5218     {
5219         dd->comm->flop += force_flop_count(nrnb);
5220         dd->comm->flop_n++;
5221     }
5222 }
5223
5224 static void clear_dd_cycle_counts(gmx_domdec_t *dd)
5225 {
5226     int i;
5227
5228     for (i = 0; i < ddCyclNr; i++)
5229     {
5230         dd->comm->cycl[i]     = 0;
5231         dd->comm->cycl_n[i]   = 0;
5232         dd->comm->cycl_max[i] = 0;
5233     }
5234     dd->comm->flop   = 0;
5235     dd->comm->flop_n = 0;
5236 }
5237
5238 static void get_load_distribution(gmx_domdec_t *dd, gmx_wallcycle_t wcycle)
5239 {
5240     gmx_domdec_comm_t *comm;
5241     gmx_domdec_load_t *load;
5242     gmx_domdec_root_t *root = NULL;
5243     int                d, dim, cid, i, pos;
5244     float              cell_frac = 0, sbuf[DD_NLOAD_MAX];
5245     gmx_bool           bSepPME;
5246
5247     if (debug)
5248     {
5249         fprintf(debug, "get_load_distribution start\n");
5250     }
5251
5252     wallcycle_start(wcycle, ewcDDCOMMLOAD);
5253
5254     comm = dd->comm;
5255
5256     bSepPME = (dd->pme_nodeid >= 0);
5257
5258     for (d = dd->ndim-1; d >= 0; d--)
5259     {
5260         dim = dd->dim[d];
5261         /* Check if we participate in the communication in this dimension */
5262         if (d == dd->ndim-1 ||
5263             (dd->ci[dd->dim[d+1]] == 0 && dd->ci[dd->dim[dd->ndim-1]] == 0))
5264         {
5265             load = &comm->load[d];
5266             if (dd->bGridJump)
5267             {
5268                 cell_frac = comm->cell_f1[d] - comm->cell_f0[d];
5269             }
5270             pos = 0;
5271             if (d == dd->ndim-1)
5272             {
5273                 sbuf[pos++] = dd_force_load(comm);
5274                 sbuf[pos++] = sbuf[0];
5275                 if (dd->bGridJump)
5276                 {
5277                     sbuf[pos++] = sbuf[0];
5278                     sbuf[pos++] = cell_frac;
5279                     if (d > 0)
5280                     {
5281                         sbuf[pos++] = comm->cell_f_max0[d];
5282                         sbuf[pos++] = comm->cell_f_min1[d];
5283                     }
5284                 }
5285                 if (bSepPME)
5286                 {
5287                     sbuf[pos++] = comm->cycl[ddCyclPPduringPME];
5288                     sbuf[pos++] = comm->cycl[ddCyclPME];
5289                 }
5290             }
5291             else
5292             {
5293                 sbuf[pos++] = comm->load[d+1].sum;
5294                 sbuf[pos++] = comm->load[d+1].max;
5295                 if (dd->bGridJump)
5296                 {
5297                     sbuf[pos++] = comm->load[d+1].sum_m;
5298                     sbuf[pos++] = comm->load[d+1].cvol_min*cell_frac;
5299                     sbuf[pos++] = comm->load[d+1].flags;
5300                     if (d > 0)
5301                     {
5302                         sbuf[pos++] = comm->cell_f_max0[d];
5303                         sbuf[pos++] = comm->cell_f_min1[d];
5304                     }
5305                 }
5306                 if (bSepPME)
5307                 {
5308                     sbuf[pos++] = comm->load[d+1].mdf;
5309                     sbuf[pos++] = comm->load[d+1].pme;
5310                 }
5311             }
5312             load->nload = pos;
5313             /* Communicate a row in DD direction d.
5314              * The communicators are setup such that the root always has rank 0.
5315              */
5316 #ifdef GMX_MPI
5317             MPI_Gather(sbuf, load->nload*sizeof(float), MPI_BYTE,
5318                        load->load, load->nload*sizeof(float), MPI_BYTE,
5319                        0, comm->mpi_comm_load[d]);
5320 #endif
5321             if (dd->ci[dim] == dd->master_ci[dim])
5322             {
5323                 /* We are the root, process this row */
5324                 if (comm->bDynLoadBal)
5325                 {
5326                     root = comm->root[d];
5327                 }
5328                 load->sum      = 0;
5329                 load->max      = 0;
5330                 load->sum_m    = 0;
5331                 load->cvol_min = 1;
5332                 load->flags    = 0;
5333                 load->mdf      = 0;
5334                 load->pme      = 0;
5335                 pos            = 0;
5336                 for (i = 0; i < dd->nc[dim]; i++)
5337                 {
5338                     load->sum += load->load[pos++];
5339                     load->max  = max(load->max, load->load[pos]);
5340                     pos++;
5341                     if (dd->bGridJump)
5342                     {
5343                         if (root->bLimited)
5344                         {
5345                             /* This direction could not be load balanced properly,
5346                              * therefore we need to use the maximum iso the average load.
5347                              */
5348                             load->sum_m = max(load->sum_m, load->load[pos]);
5349                         }
5350                         else
5351                         {
5352                             load->sum_m += load->load[pos];
5353                         }
5354                         pos++;
5355                         load->cvol_min = min(load->cvol_min, load->load[pos]);
5356                         pos++;
5357                         if (d < dd->ndim-1)
5358                         {
5359                             load->flags = (int)(load->load[pos++] + 0.5);
5360                         }
5361                         if (d > 0)
5362                         {
5363                             root->cell_f_max0[i] = load->load[pos++];
5364                             root->cell_f_min1[i] = load->load[pos++];
5365                         }
5366                     }
5367                     if (bSepPME)
5368                     {
5369                         load->mdf = max(load->mdf, load->load[pos]);
5370                         pos++;
5371                         load->pme = max(load->pme, load->load[pos]);
5372                         pos++;
5373                     }
5374                 }
5375                 if (comm->bDynLoadBal && root->bLimited)
5376                 {
5377                     load->sum_m *= dd->nc[dim];
5378                     load->flags |= (1<<d);
5379                 }
5380             }
5381         }
5382     }
5383
5384     if (DDMASTER(dd))
5385     {
5386         comm->nload      += dd_load_count(comm);
5387         comm->load_step  += comm->cycl[ddCyclStep];
5388         comm->load_sum   += comm->load[0].sum;
5389         comm->load_max   += comm->load[0].max;
5390         if (comm->bDynLoadBal)
5391         {
5392             for (d = 0; d < dd->ndim; d++)
5393             {
5394                 if (comm->load[0].flags & (1<<d))
5395                 {
5396                     comm->load_lim[d]++;
5397                 }
5398             }
5399         }
5400         if (bSepPME)
5401         {
5402             comm->load_mdf += comm->load[0].mdf;
5403             comm->load_pme += comm->load[0].pme;
5404         }
5405     }
5406
5407     wallcycle_stop(wcycle, ewcDDCOMMLOAD);
5408
5409     if (debug)
5410     {
5411         fprintf(debug, "get_load_distribution finished\n");
5412     }
5413 }
5414
5415 static float dd_force_imb_perf_loss(gmx_domdec_t *dd)
5416 {
5417     /* Return the relative performance loss on the total run time
5418      * due to the force calculation load imbalance.
5419      */
5420     if (dd->comm->nload > 0)
5421     {
5422         return
5423             (dd->comm->load_max*dd->nnodes - dd->comm->load_sum)/
5424             (dd->comm->load_step*dd->nnodes);
5425     }
5426     else
5427     {
5428         return 0;
5429     }
5430 }
5431
5432 static void print_dd_load_av(FILE *fplog, gmx_domdec_t *dd)
5433 {
5434     char               buf[STRLEN];
5435     int                npp, npme, nnodes, d, limp;
5436     float              imbal, pme_f_ratio, lossf, lossp = 0;
5437     gmx_bool           bLim;
5438     gmx_domdec_comm_t *comm;
5439
5440     comm = dd->comm;
5441     if (DDMASTER(dd) && comm->nload > 0)
5442     {
5443         npp    = dd->nnodes;
5444         npme   = (dd->pme_nodeid >= 0) ? comm->npmenodes : 0;
5445         nnodes = npp + npme;
5446         imbal  = comm->load_max*npp/comm->load_sum - 1;
5447         lossf  = dd_force_imb_perf_loss(dd);
5448         sprintf(buf, " Average load imbalance: %.1f %%\n", imbal*100);
5449         fprintf(fplog, "%s", buf);
5450         fprintf(stderr, "\n");
5451         fprintf(stderr, "%s", buf);
5452         sprintf(buf, " Part of the total run time spent waiting due to load imbalance: %.1f %%\n", lossf*100);
5453         fprintf(fplog, "%s", buf);
5454         fprintf(stderr, "%s", buf);
5455         bLim = FALSE;
5456         if (comm->bDynLoadBal)
5457         {
5458             sprintf(buf, " Steps where the load balancing was limited by -rdd, -rcon and/or -dds:");
5459             for (d = 0; d < dd->ndim; d++)
5460             {
5461                 limp = (200*comm->load_lim[d]+1)/(2*comm->nload);
5462                 sprintf(buf+strlen(buf), " %c %d %%", dim2char(dd->dim[d]), limp);
5463                 if (limp >= 50)
5464                 {
5465                     bLim = TRUE;
5466                 }
5467             }
5468             sprintf(buf+strlen(buf), "\n");
5469             fprintf(fplog, "%s", buf);
5470             fprintf(stderr, "%s", buf);
5471         }
5472         if (npme > 0)
5473         {
5474             pme_f_ratio = comm->load_pme/comm->load_mdf;
5475             lossp       = (comm->load_pme -comm->load_mdf)/comm->load_step;
5476             if (lossp <= 0)
5477             {
5478                 lossp *= (float)npme/(float)nnodes;
5479             }
5480             else
5481             {
5482                 lossp *= (float)npp/(float)nnodes;
5483             }
5484             sprintf(buf, " Average PME mesh/force load: %5.3f\n", pme_f_ratio);
5485             fprintf(fplog, "%s", buf);
5486             fprintf(stderr, "%s", buf);
5487             sprintf(buf, " Part of the total run time spent waiting due to PP/PME imbalance: %.1f %%\n", fabs(lossp)*100);
5488             fprintf(fplog, "%s", buf);
5489             fprintf(stderr, "%s", buf);
5490         }
5491         fprintf(fplog, "\n");
5492         fprintf(stderr, "\n");
5493
5494         if (lossf >= DD_PERF_LOSS_WARN)
5495         {
5496             sprintf(buf,
5497                     "NOTE: %.1f %% of the available CPU time was lost due to load imbalance\n"
5498                     "      in the domain decomposition.\n", lossf*100);
5499             if (!comm->bDynLoadBal)
5500             {
5501                 sprintf(buf+strlen(buf), "      You might want to use dynamic load balancing (option -dlb.)\n");
5502             }
5503             else if (bLim)
5504             {
5505                 sprintf(buf+strlen(buf), "      You might want to decrease the cell size limit (options -rdd, -rcon and/or -dds).\n");
5506             }
5507             fprintf(fplog, "%s\n", buf);
5508             fprintf(stderr, "%s\n", buf);
5509         }
5510         if (npme > 0 && fabs(lossp) >= DD_PERF_LOSS_WARN)
5511         {
5512             sprintf(buf,
5513                     "NOTE: %.1f %% performance was lost because the PME ranks\n"
5514                     "      had %s work to do than the PP ranks.\n"
5515                     "      You might want to %s the number of PME ranks\n"
5516                     "      or %s the cut-off and the grid spacing.\n",
5517                     fabs(lossp*100),
5518                     (lossp < 0) ? "less"     : "more",
5519                     (lossp < 0) ? "decrease" : "increase",
5520                     (lossp < 0) ? "decrease" : "increase");
5521             fprintf(fplog, "%s\n", buf);
5522             fprintf(stderr, "%s\n", buf);
5523         }
5524     }
5525 }
5526
5527 static float dd_vol_min(gmx_domdec_t *dd)
5528 {
5529     return dd->comm->load[0].cvol_min*dd->nnodes;
5530 }
5531
5532 static gmx_bool dd_load_flags(gmx_domdec_t *dd)
5533 {
5534     return dd->comm->load[0].flags;
5535 }
5536
5537 static float dd_f_imbal(gmx_domdec_t *dd)
5538 {
5539     return dd->comm->load[0].max*dd->nnodes/dd->comm->load[0].sum - 1;
5540 }
5541
5542 float dd_pme_f_ratio(gmx_domdec_t *dd)
5543 {
5544     if (dd->comm->cycl_n[ddCyclPME] > 0)
5545     {
5546         return dd->comm->load[0].pme/dd->comm->load[0].mdf;
5547     }
5548     else
5549     {
5550         return -1.0;
5551     }
5552 }
5553
5554 static void dd_print_load(FILE *fplog, gmx_domdec_t *dd, gmx_int64_t step)
5555 {
5556     int  flags, d;
5557     char buf[22];
5558
5559     flags = dd_load_flags(dd);
5560     if (flags)
5561     {
5562         fprintf(fplog,
5563                 "DD  load balancing is limited by minimum cell size in dimension");
5564         for (d = 0; d < dd->ndim; d++)
5565         {
5566             if (flags & (1<<d))
5567             {
5568                 fprintf(fplog, " %c", dim2char(dd->dim[d]));
5569             }
5570         }
5571         fprintf(fplog, "\n");
5572     }
5573     fprintf(fplog, "DD  step %s", gmx_step_str(step, buf));
5574     if (dd->comm->bDynLoadBal)
5575     {
5576         fprintf(fplog, "  vol min/aver %5.3f%c",
5577                 dd_vol_min(dd), flags ? '!' : ' ');
5578     }
5579     fprintf(fplog, " load imb.: force %4.1f%%", dd_f_imbal(dd)*100);
5580     if (dd->comm->cycl_n[ddCyclPME])
5581     {
5582         fprintf(fplog, "  pme mesh/force %5.3f", dd_pme_f_ratio(dd));
5583     }
5584     fprintf(fplog, "\n\n");
5585 }
5586
5587 static void dd_print_load_verbose(gmx_domdec_t *dd)
5588 {
5589     if (dd->comm->bDynLoadBal)
5590     {
5591         fprintf(stderr, "vol %4.2f%c ",
5592                 dd_vol_min(dd), dd_load_flags(dd) ? '!' : ' ');
5593     }
5594     fprintf(stderr, "imb F %2d%% ", (int)(dd_f_imbal(dd)*100+0.5));
5595     if (dd->comm->cycl_n[ddCyclPME])
5596     {
5597         fprintf(stderr, "pme/F %4.2f ", dd_pme_f_ratio(dd));
5598     }
5599 }
5600
#ifdef GMX_MPI
/* Create the load communicator for one row of cells along the
 * decomposition dimension with index dim_ind.
 *
 * loc holds the cell coordinates of the row; its entry for the row
 * dimension is ignored, as all nc[dim] cells along that dimension are
 * visited. MPI_Comm_split is called on dd->mpi_comm_all by every caller,
 * ranks that are not in the row pass MPI_UNDEFINED and return afterwards.
 * Ranks in the row store the communicator and allocate the buffers they
 * need: with DLB enabled the row root gets its root data, the other ranks
 * a cell_f_row buffer; the row root also gets the load collection buffer.
 */
static void make_load_communicator(gmx_domdec_t *dd, int dim_ind, ivec loc)
{
    MPI_Comm           row_comm;
    int                dim, ncell, icell;
    ivec               cell;
    gmx_domdec_root_t *root;
    gmx_bool           bInRow = FALSE, bRowRoot;

    dim   = dd->dim[dim_ind];
    ncell = dd->nc[dim];

    /* Check whether our rank is one of the cells in this row */
    copy_ivec(loc, cell);
    for (icell = 0; icell < ncell; icell++)
    {
        cell[dim] = icell;
        if (dd_index(dd->nc, cell) == dd->rank)
        {
            bInRow = TRUE;
        }
    }

    MPI_Comm_split(dd->mpi_comm_all, bInRow ? 0 : MPI_UNDEFINED, dd->rank,
                   &row_comm);
    if (!bInRow)
    {
        return;
    }

    dd->comm->mpi_comm_load[dim_ind] = row_comm;

    /* The root of a row is the rank at the master coordinate along dim */
    bRowRoot = (dd->ci[dim] == dd->master_ci[dim]);

    if (dd->comm->eDLB != edlbNO)
    {
        if (bRowRoot)
        {
            snew(dd->comm->root[dim_ind], 1);
            root = dd->comm->root[dim_ind];
            snew(root->cell_f, DD_CELL_F_SIZE(dd, dim_ind));
            snew(root->old_cell_f, ncell+1);
            snew(root->bCellMin, ncell);
            if (dim_ind > 0)
            {
                snew(root->cell_f_max0, ncell);
                snew(root->cell_f_min1, ncell);
                snew(root->bound_min, ncell);
                snew(root->bound_max, ncell);
            }
            snew(root->buf_ncd, ncell);
        }
        else
        {
            /* Not a root process: we only need to receive cell_f */
            snew(dd->comm->cell_f_row, DD_CELL_F_SIZE(dd, dim_ind));
        }
    }

    if (bRowRoot)
    {
        snew(dd->comm->load[dim_ind].load, ncell*DD_NLOAD_MAX);
    }
}
#endif
5659
5660 void dd_setup_dlb_resource_sharing(t_commrec           gmx_unused *cr,
5661                                    const gmx_hw_info_t gmx_unused *hwinfo,
5662                                    const gmx_hw_opt_t  gmx_unused *hw_opt)
5663 {
5664 #ifdef GMX_MPI
5665     int           physicalnode_id_hash;
5666     int           gpu_id;
5667     gmx_domdec_t *dd;
5668     MPI_Comm      mpi_comm_pp_physicalnode;
5669
5670     if (!(cr->duty & DUTY_PP) ||
5671         hw_opt->gpu_opt.ncuda_dev_use == 0)
5672     {
5673         /* Only PP nodes (currently) use GPUs.
5674          * If we don't have GPUs, there are no resources to share.
5675          */
5676         return;
5677     }
5678
5679     physicalnode_id_hash = gmx_physicalnode_id_hash();
5680
5681     gpu_id = get_gpu_device_id(&hwinfo->gpu_info, &hw_opt->gpu_opt, cr->rank_pp_intranode);
5682
5683     dd = cr->dd;
5684
5685     if (debug)
5686     {
5687         fprintf(debug, "dd_setup_dd_dlb_gpu_sharing:\n");
5688         fprintf(debug, "DD PP rank %d physical node hash %d gpu_id %d\n",
5689                 dd->rank, physicalnode_id_hash, gpu_id);
5690     }
5691     /* Split the PP communicator over the physical nodes */
5692     /* TODO: See if we should store this (before), as it's also used for
5693      * for the nodecomm summution.
5694      */
5695     MPI_Comm_split(dd->mpi_comm_all, physicalnode_id_hash, dd->rank,
5696                    &mpi_comm_pp_physicalnode);
5697     MPI_Comm_split(mpi_comm_pp_physicalnode, gpu_id, dd->rank,
5698                    &dd->comm->mpi_comm_gpu_shared);
5699     MPI_Comm_free(&mpi_comm_pp_physicalnode);
5700     MPI_Comm_size(dd->comm->mpi_comm_gpu_shared, &dd->comm->nrank_gpu_shared);
5701
5702     if (debug)
5703     {
5704         fprintf(debug, "nrank_gpu_shared %d\n", dd->comm->nrank_gpu_shared);
5705     }
5706
5707     /* Note that some ranks could share a GPU, while others don't */
5708
5709     if (dd->comm->nrank_gpu_shared == 1)
5710     {
5711         MPI_Comm_free(&dd->comm->mpi_comm_gpu_shared);
5712     }
5713 #endif
5714 }
5715
5716 static void make_load_communicators(gmx_domdec_t gmx_unused *dd)
5717 {
5718 #ifdef GMX_MPI
5719     int  dim0, dim1, i, j;
5720     ivec loc;
5721
5722     if (debug)
5723     {
5724         fprintf(debug, "Making load communicators\n");
5725     }
5726
5727     snew(dd->comm->load, dd->ndim);
5728     snew(dd->comm->mpi_comm_load, dd->ndim);
5729
5730     clear_ivec(loc);
5731     make_load_communicator(dd, 0, loc);
5732     if (dd->ndim > 1)
5733     {
5734         dim0 = dd->dim[0];
5735         for (i = 0; i < dd->nc[dim0]; i++)
5736         {
5737             loc[dim0] = i;
5738             make_load_communicator(dd, 1, loc);
5739         }
5740     }
5741     if (dd->ndim > 2)
5742     {
5743         dim0 = dd->dim[0];
5744         for (i = 0; i < dd->nc[dim0]; i++)
5745         {
5746             loc[dim0] = i;
5747             dim1      = dd->dim[1];
5748             for (j = 0; j < dd->nc[dim1]; j++)
5749             {
5750                 loc[dim1] = j;
5751                 make_load_communicator(dd, 2, loc);
5752             }
5753         }
5754     }
5755
5756     if (debug)
5757     {
5758         fprintf(debug, "Finished making load communicators\n");
5759     }
5760 #endif
5761 }
5762
5763 void setup_dd_grid(FILE *fplog, gmx_domdec_t *dd)
5764 {
5765     gmx_bool                bZYX;
5766     int                     d, dim, i, j, m;
5767     ivec                    tmp, s;
5768     int                     nzone, nzonep;
5769     ivec                    dd_zp[DD_MAXIZONE];
5770     gmx_domdec_zones_t     *zones;
5771     gmx_domdec_ns_ranges_t *izone;
5772
5773     for (d = 0; d < dd->ndim; d++)
5774     {
5775         dim = dd->dim[d];
5776         copy_ivec(dd->ci, tmp);
5777         tmp[dim]           = (tmp[dim] + 1) % dd->nc[dim];
5778         dd->neighbor[d][0] = ddcoord2ddnodeid(dd, tmp);
5779         copy_ivec(dd->ci, tmp);
5780         tmp[dim]           = (tmp[dim] - 1 + dd->nc[dim]) % dd->nc[dim];
5781         dd->neighbor[d][1] = ddcoord2ddnodeid(dd, tmp);
5782         if (debug)
5783         {
5784             fprintf(debug, "DD rank %d neighbor ranks in dir %d are + %d - %d\n",
5785                     dd->rank, dim,
5786                     dd->neighbor[d][0],
5787                     dd->neighbor[d][1]);
5788         }
5789     }
5790
5791     if (fplog)
5792     {
5793         fprintf(fplog, "\nMaking %dD domain decomposition grid %d x %d x %d, home cell index %d %d %d\n\n",
5794                 dd->ndim,
5795                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ],
5796                 dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
5797     }
5798     switch (dd->ndim)
5799     {
5800         case 3:
5801             nzone  = dd_z3n;
5802             nzonep = dd_zp3n;
5803             for (i = 0; i < nzonep; i++)
5804             {
5805                 copy_ivec(dd_zp3[i], dd_zp[i]);
5806             }
5807             break;
5808         case 2:
5809             nzone  = dd_z2n;
5810             nzonep = dd_zp2n;
5811             for (i = 0; i < nzonep; i++)
5812             {
5813                 copy_ivec(dd_zp2[i], dd_zp[i]);
5814             }
5815             break;
5816         case 1:
5817             nzone  = dd_z1n;
5818             nzonep = dd_zp1n;
5819             for (i = 0; i < nzonep; i++)
5820             {
5821                 copy_ivec(dd_zp1[i], dd_zp[i]);
5822             }
5823             break;
5824         default:
5825             gmx_fatal(FARGS, "Can only do 1, 2 or 3D domain decomposition");
5826             nzone  = 0;
5827             nzonep = 0;
5828     }
5829
5830     zones = &dd->comm->zones;
5831
5832     for (i = 0; i < nzone; i++)
5833     {
5834         m = 0;
5835         clear_ivec(zones->shift[i]);
5836         for (d = 0; d < dd->ndim; d++)
5837         {
5838             zones->shift[i][dd->dim[d]] = dd_zo[i][m++];
5839         }
5840     }
5841
5842     zones->n = nzone;
5843     for (i = 0; i < nzone; i++)
5844     {
5845         for (d = 0; d < DIM; d++)
5846         {
5847             s[d] = dd->ci[d] - zones->shift[i][d];
5848             if (s[d] < 0)
5849             {
5850                 s[d] += dd->nc[d];
5851             }
5852             else if (s[d] >= dd->nc[d])
5853             {
5854                 s[d] -= dd->nc[d];
5855             }
5856         }
5857     }
5858     zones->nizone = nzonep;
5859     for (i = 0; i < zones->nizone; i++)
5860     {
5861         if (dd_zp[i][0] != i)
5862         {
5863             gmx_fatal(FARGS, "Internal inconsistency in the dd grid setup");
5864         }
5865         izone     = &zones->izone[i];
5866         izone->j0 = dd_zp[i][1];
5867         izone->j1 = dd_zp[i][2];
5868         for (dim = 0; dim < DIM; dim++)
5869         {
5870             if (dd->nc[dim] == 1)
5871             {
5872                 /* All shifts should be allowed */
5873                 izone->shift0[dim] = -1;
5874                 izone->shift1[dim] = 1;
5875             }
5876             else
5877             {
5878                 /*
5879                    izone->shift0[d] = 0;
5880                    izone->shift1[d] = 0;
5881                    for(j=izone->j0; j<izone->j1; j++) {
5882                    if (dd->shift[j][d] > dd->shift[i][d])
5883                    izone->shift0[d] = -1;
5884                    if (dd->shift[j][d] < dd->shift[i][d])
5885                    izone->shift1[d] = 1;
5886                    }
5887                  */
5888
5889                 int shift_diff;
5890
5891                 /* Assume the shift are not more than 1 cell */
5892                 izone->shift0[dim] = 1;
5893                 izone->shift1[dim] = -1;
5894                 for (j = izone->j0; j < izone->j1; j++)
5895                 {
5896                     shift_diff = zones->shift[j][dim] - zones->shift[i][dim];
5897                     if (shift_diff < izone->shift0[dim])
5898                     {
5899                         izone->shift0[dim] = shift_diff;
5900                     }
5901                     if (shift_diff > izone->shift1[dim])
5902                     {
5903                         izone->shift1[dim] = shift_diff;
5904                     }
5905                 }
5906             }
5907         }
5908     }
5909
5910     if (dd->comm->eDLB != edlbNO)
5911     {
5912         snew(dd->comm->root, dd->ndim);
5913     }
5914
5915     if (dd->comm->bRecordLoad)
5916     {
5917         make_load_communicators(dd);
5918     }
5919 }
5920
5921 static void make_pp_communicator(FILE *fplog, t_commrec *cr, int gmx_unused reorder)
5922 {
5923     gmx_domdec_t      *dd;
5924     gmx_domdec_comm_t *comm;
5925     int                i, rank, *buf;
5926     ivec               periods;
5927 #ifdef GMX_MPI
5928     MPI_Comm           comm_cart;
5929 #endif
5930
5931     dd   = cr->dd;
5932     comm = dd->comm;
5933
5934 #ifdef GMX_MPI
5935     if (comm->bCartesianPP)
5936     {
5937         /* Set up cartesian communication for the particle-particle part */
5938         if (fplog)
5939         {
5940             fprintf(fplog, "Will use a Cartesian communicator: %d x %d x %d\n",
5941                     dd->nc[XX], dd->nc[YY], dd->nc[ZZ]);
5942         }
5943
5944         for (i = 0; i < DIM; i++)
5945         {
5946             periods[i] = TRUE;
5947         }
5948         MPI_Cart_create(cr->mpi_comm_mygroup, DIM, dd->nc, periods, reorder,
5949                         &comm_cart);
5950         /* We overwrite the old communicator with the new cartesian one */
5951         cr->mpi_comm_mygroup = comm_cart;
5952     }
5953
5954     dd->mpi_comm_all = cr->mpi_comm_mygroup;
5955     MPI_Comm_rank(dd->mpi_comm_all, &dd->rank);
5956
5957     if (comm->bCartesianPP_PME)
5958     {
5959         /* Since we want to use the original cartesian setup for sim,
5960          * and not the one after split, we need to make an index.
5961          */
5962         snew(comm->ddindex2ddnodeid, dd->nnodes);
5963         comm->ddindex2ddnodeid[dd_index(dd->nc, dd->ci)] = dd->rank;
5964         gmx_sumi(dd->nnodes, comm->ddindex2ddnodeid, cr);
5965         /* Get the rank of the DD master,
5966          * above we made sure that the master node is a PP node.
5967          */
5968         if (MASTER(cr))
5969         {
5970             rank = dd->rank;
5971         }
5972         else
5973         {
5974             rank = 0;
5975         }
5976         MPI_Allreduce(&rank, &dd->masterrank, 1, MPI_INT, MPI_SUM, dd->mpi_comm_all);
5977     }
5978     else if (comm->bCartesianPP)
5979     {
5980         if (cr->npmenodes == 0)
5981         {
5982             /* The PP communicator is also
5983              * the communicator for this simulation
5984              */
5985             cr->mpi_comm_mysim = cr->mpi_comm_mygroup;
5986         }
5987         cr->nodeid = dd->rank;
5988
5989         MPI_Cart_coords(dd->mpi_comm_all, dd->rank, DIM, dd->ci);
5990
5991         /* We need to make an index to go from the coordinates
5992          * to the nodeid of this simulation.
5993          */
5994         snew(comm->ddindex2simnodeid, dd->nnodes);
5995         snew(buf, dd->nnodes);
5996         if (cr->duty & DUTY_PP)
5997         {
5998             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
5999         }
6000         /* Communicate the ddindex to simulation nodeid index */
6001         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
6002                       cr->mpi_comm_mysim);
6003         sfree(buf);
6004
6005         /* Determine the master coordinates and rank.
6006          * The DD master should be the same node as the master of this sim.
6007          */
6008         for (i = 0; i < dd->nnodes; i++)
6009         {
6010             if (comm->ddindex2simnodeid[i] == 0)
6011             {
6012                 ddindex2xyz(dd->nc, i, dd->master_ci);
6013                 MPI_Cart_rank(dd->mpi_comm_all, dd->master_ci, &dd->masterrank);
6014             }
6015         }
6016         if (debug)
6017         {
6018             fprintf(debug, "The master rank is %d\n", dd->masterrank);
6019         }
6020     }
6021     else
6022     {
6023         /* No Cartesian communicators */
6024         /* We use the rank in dd->comm->all as DD index */
6025         ddindex2xyz(dd->nc, dd->rank, dd->ci);
6026         /* The simulation master nodeid is 0, so the DD master rank is also 0 */
6027         dd->masterrank = 0;
6028         clear_ivec(dd->master_ci);
6029     }
6030 #endif
6031
6032     if (fplog)
6033     {
6034         fprintf(fplog,
6035                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
6036                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6037     }
6038     if (debug)
6039     {
6040         fprintf(debug,
6041                 "Domain decomposition rank %d, coordinates %d %d %d\n\n",
6042                 dd->rank, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6043     }
6044 }
6045
6046 static void receive_ddindex2simnodeid(t_commrec *cr)
6047 {
6048     gmx_domdec_t      *dd;
6049
6050     gmx_domdec_comm_t *comm;
6051     int               *buf;
6052
6053     dd   = cr->dd;
6054     comm = dd->comm;
6055
6056 #ifdef GMX_MPI
6057     if (!comm->bCartesianPP_PME && comm->bCartesianPP)
6058     {
6059         snew(comm->ddindex2simnodeid, dd->nnodes);
6060         snew(buf, dd->nnodes);
6061         if (cr->duty & DUTY_PP)
6062         {
6063             buf[dd_index(dd->nc, dd->ci)] = cr->sim_nodeid;
6064         }
6065 #ifdef GMX_MPI
6066         /* Communicate the ddindex to simulation nodeid index */
6067         MPI_Allreduce(buf, comm->ddindex2simnodeid, dd->nnodes, MPI_INT, MPI_SUM,
6068                       cr->mpi_comm_mysim);
6069 #endif
6070         sfree(buf);
6071     }
6072 #endif
6073 }
6074
6075 static gmx_domdec_master_t *init_gmx_domdec_master_t(gmx_domdec_t *dd,
6076                                                      int ncg, int natoms)
6077 {
6078     gmx_domdec_master_t *ma;
6079     int                  i;
6080
6081     snew(ma, 1);
6082
6083     snew(ma->ncg, dd->nnodes);
6084     snew(ma->index, dd->nnodes+1);
6085     snew(ma->cg, ncg);
6086     snew(ma->nat, dd->nnodes);
6087     snew(ma->ibuf, dd->nnodes*2);
6088     snew(ma->cell_x, DIM);
6089     for (i = 0; i < DIM; i++)
6090     {
6091         snew(ma->cell_x[i], dd->nc[i]+1);
6092     }
6093
6094     if (dd->nnodes <= GMX_DD_NNODES_SENDRECV)
6095     {
6096         ma->vbuf = NULL;
6097     }
6098     else
6099     {
6100         snew(ma->vbuf, natoms);
6101     }
6102
6103     return ma;
6104 }
6105
6106 static void split_communicator(FILE *fplog, t_commrec *cr, int gmx_unused dd_node_order,
6107                                int gmx_unused reorder)
6108 {
6109     gmx_domdec_t      *dd;
6110     gmx_domdec_comm_t *comm;
6111     int                i, rank;
6112     gmx_bool           bDiv[DIM];
6113     ivec               periods;
6114 #ifdef GMX_MPI
6115     MPI_Comm           comm_cart;
6116 #endif
6117
6118     dd   = cr->dd;
6119     comm = dd->comm;
6120
6121     if (comm->bCartesianPP)
6122     {
6123         for (i = 1; i < DIM; i++)
6124         {
6125             bDiv[i] = ((cr->npmenodes*dd->nc[i]) % (dd->nnodes) == 0);
6126         }
6127         if (bDiv[YY] || bDiv[ZZ])
6128         {
6129             comm->bCartesianPP_PME = TRUE;
6130             /* If we have 2D PME decomposition, which is always in x+y,
6131              * we stack the PME only nodes in z.
6132              * Otherwise we choose the direction that provides the thinnest slab
6133              * of PME only nodes as this will have the least effect
6134              * on the PP communication.
6135              * But for the PME communication the opposite might be better.
6136              */
6137             if (bDiv[ZZ] && (comm->npmenodes_y > 1 ||
6138                              !bDiv[YY] ||
6139                              dd->nc[YY] > dd->nc[ZZ]))
6140             {
6141                 comm->cartpmedim = ZZ;
6142             }
6143             else
6144             {
6145                 comm->cartpmedim = YY;
6146             }
6147             comm->ntot[comm->cartpmedim]
6148                 += (cr->npmenodes*dd->nc[comm->cartpmedim])/dd->nnodes;
6149         }
6150         else if (fplog)
6151         {
6152             fprintf(fplog, "Number of PME-only ranks (%d) is not a multiple of nx*ny (%d*%d) or nx*nz (%d*%d)\n", cr->npmenodes, dd->nc[XX], dd->nc[YY], dd->nc[XX], dd->nc[ZZ]);
6153             fprintf(fplog,
6154                     "Will not use a Cartesian communicator for PP <-> PME\n\n");
6155         }
6156     }
6157
6158 #ifdef GMX_MPI
6159     if (comm->bCartesianPP_PME)
6160     {
6161         if (fplog)
6162         {
6163             fprintf(fplog, "Will use a Cartesian communicator for PP <-> PME: %d x %d x %d\n", comm->ntot[XX], comm->ntot[YY], comm->ntot[ZZ]);
6164         }
6165
6166         for (i = 0; i < DIM; i++)
6167         {
6168             periods[i] = TRUE;
6169         }
6170         MPI_Cart_create(cr->mpi_comm_mysim, DIM, comm->ntot, periods, reorder,
6171                         &comm_cart);
6172
6173         MPI_Comm_rank(comm_cart, &rank);
6174         if (MASTERNODE(cr) && rank != 0)
6175         {
6176             gmx_fatal(FARGS, "MPI rank 0 was renumbered by MPI_Cart_create, we do not allow this");
6177         }
6178
6179         /* With this assigment we loose the link to the original communicator
6180          * which will usually be MPI_COMM_WORLD, unless have multisim.
6181          */
6182         cr->mpi_comm_mysim = comm_cart;
6183         cr->sim_nodeid     = rank;
6184
6185         MPI_Cart_coords(cr->mpi_comm_mysim, cr->sim_nodeid, DIM, dd->ci);
6186
6187         if (fplog)
6188         {
6189             fprintf(fplog, "Cartesian rank %d, coordinates %d %d %d\n\n",
6190                     cr->sim_nodeid, dd->ci[XX], dd->ci[YY], dd->ci[ZZ]);
6191         }
6192
6193         if (dd->ci[comm->cartpmedim] < dd->nc[comm->cartpmedim])
6194         {
6195             cr->duty = DUTY_PP;
6196         }
6197         if (cr->npmenodes == 0 ||
6198             dd->ci[comm->cartpmedim] >= dd->nc[comm->cartpmedim])
6199         {
6200             cr->duty = DUTY_PME;
6201         }
6202
6203         /* Split the sim communicator into PP and PME only nodes */
6204         MPI_Comm_split(cr->mpi_comm_mysim,
6205                        cr->duty,
6206                        dd_index(comm->ntot, dd->ci),
6207                        &cr->mpi_comm_mygroup);
6208     }
6209     else
6210     {
6211         switch (dd_node_order)
6212         {
6213             case ddnoPP_PME:
6214                 if (fplog)
6215                 {
6216                     fprintf(fplog, "Order of the ranks: PP first, PME last\n");
6217                 }
6218                 break;
6219             case ddnoINTERLEAVE:
6220                 /* Interleave the PP-only and PME-only nodes,
6221                  * as on clusters with dual-core machines this will double
6222                  * the communication bandwidth of the PME processes
6223                  * and thus speed up the PP <-> PME and inter PME communication.
6224                  */
6225                 if (fplog)
6226                 {
6227                     fprintf(fplog, "Interleaving PP and PME ranks\n");
6228                 }
6229                 comm->pmenodes = dd_pmenodes(cr);
6230                 break;
6231             case ddnoCARTESIAN:
6232                 break;
6233             default:
6234                 gmx_fatal(FARGS, "Unknown dd_node_order=%d", dd_node_order);
6235         }
6236
6237         if (dd_simnode2pmenode(cr, cr->sim_nodeid) == -1)
6238         {
6239             cr->duty = DUTY_PME;
6240         }
6241         else
6242         {
6243             cr->duty = DUTY_PP;
6244         }
6245
6246         /* Split the sim communicator into PP and PME only nodes */
6247         MPI_Comm_split(cr->mpi_comm_mysim,
6248                        cr->duty,
6249                        cr->nodeid,
6250                        &cr->mpi_comm_mygroup);
6251         MPI_Comm_rank(cr->mpi_comm_mygroup, &cr->nodeid);
6252     }
6253 #endif
6254
6255     if (fplog)
6256     {
6257         fprintf(fplog, "This rank does only %s work.\n\n",
6258                 (cr->duty & DUTY_PP) ? "particle-particle" : "PME-mesh");
6259     }
6260 }
6261
6262 void make_dd_communicators(FILE *fplog, t_commrec *cr, int dd_node_order)
6263 {
6264     gmx_domdec_t      *dd;
6265     gmx_domdec_comm_t *comm;
6266     int                CartReorder;
6267
6268     dd   = cr->dd;
6269     comm = dd->comm;
6270
6271     copy_ivec(dd->nc, comm->ntot);
6272
6273     comm->bCartesianPP     = (dd_node_order == ddnoCARTESIAN);
6274     comm->bCartesianPP_PME = FALSE;
6275
6276     /* Reorder the nodes by default. This might change the MPI ranks.
6277      * Real reordering is only supported on very few architectures,
6278      * Blue Gene is one of them.
6279      */
6280     CartReorder = (getenv("GMX_NO_CART_REORDER") == NULL);
6281
6282     if (cr->npmenodes > 0)
6283     {
6284         /* Split the communicator into a PP and PME part */
6285         split_communicator(fplog, cr, dd_node_order, CartReorder);
6286         if (comm->bCartesianPP_PME)
6287         {
6288             /* We (possibly) reordered the nodes in split_communicator,
6289              * so it is no longer required in make_pp_communicator.
6290              */
6291             CartReorder = FALSE;
6292         }
6293     }
6294     else
6295     {
6296         /* All nodes do PP and PME */
6297 #ifdef GMX_MPI
6298         /* We do not require separate communicators */
6299         cr->mpi_comm_mygroup = cr->mpi_comm_mysim;
6300 #endif
6301     }
6302
6303     if (cr->duty & DUTY_PP)
6304     {
6305         /* Copy or make a new PP communicator */
6306         make_pp_communicator(fplog, cr, CartReorder);
6307     }
6308     else
6309     {
6310         receive_ddindex2simnodeid(cr);
6311     }
6312
6313     if (!(cr->duty & DUTY_PME))
6314     {
6315         /* Set up the commnuication to our PME node */
6316         dd->pme_nodeid           = dd_simnode2pmenode(cr, cr->sim_nodeid);
6317         dd->pme_receive_vir_ener = receive_vir_ener(cr);
6318         if (debug)
6319         {
6320             fprintf(debug, "My pme_nodeid %d receive ener %d\n",
6321                     dd->pme_nodeid, dd->pme_receive_vir_ener);
6322         }
6323     }
6324     else
6325     {
6326         dd->pme_nodeid = -1;
6327     }
6328
6329     if (DDMASTER(dd))
6330     {
6331         dd->ma = init_gmx_domdec_master_t(dd,
6332                                           comm->cgs_gl.nr,
6333                                           comm->cgs_gl.index[comm->cgs_gl.nr]);
6334     }
6335 }
6336
6337 static real *get_slb_frac(FILE *fplog, const char *dir, int nc, const char *size_string)
6338 {
6339     real  *slb_frac, tot;
6340     int    i, n;
6341     double dbl;
6342
6343     slb_frac = NULL;
6344     if (nc > 1 && size_string != NULL)
6345     {
6346         if (fplog)
6347         {
6348             fprintf(fplog, "Using static load balancing for the %s direction\n",
6349                     dir);
6350         }
6351         snew(slb_frac, nc);
6352         tot = 0;
6353         for (i = 0; i < nc; i++)
6354         {
6355             dbl = 0;
6356             sscanf(size_string, "%lf%n", &dbl, &n);
6357             if (dbl == 0)
6358             {
6359                 gmx_fatal(FARGS, "Incorrect or not enough DD cell size entries for direction %s: '%s'", dir, size_string);
6360             }
6361             slb_frac[i]  = dbl;
6362             size_string += n;
6363             tot         += slb_frac[i];
6364         }
6365         /* Normalize */
6366         if (fplog)
6367         {
6368             fprintf(fplog, "Relative cell sizes:");
6369         }
6370         for (i = 0; i < nc; i++)
6371         {
6372             slb_frac[i] /= tot;
6373             if (fplog)
6374             {
6375                 fprintf(fplog, " %5.3f", slb_frac[i]);
6376             }
6377         }
6378         if (fplog)
6379         {
6380             fprintf(fplog, "\n");
6381         }
6382     }
6383
6384     return slb_frac;
6385 }
6386
6387 static int multi_body_bondeds_count(gmx_mtop_t *mtop)
6388 {
6389     int                  n, nmol, ftype;
6390     gmx_mtop_ilistloop_t iloop;
6391     t_ilist             *il;
6392
6393     n     = 0;
6394     iloop = gmx_mtop_ilistloop_init(mtop);
6395     while (gmx_mtop_ilistloop_next(iloop, &il, &nmol))
6396     {
6397         for (ftype = 0; ftype < F_NRE; ftype++)
6398         {
6399             if ((interaction_function[ftype].flags & IF_BOND) &&
6400                 NRAL(ftype) >  2)
6401             {
6402                 n += nmol*il[ftype].nr/(1 + NRAL(ftype));
6403             }
6404         }
6405     }
6406
6407     return n;
6408 }
6409
/* Read an integer setting from the environment.
 *
 * fplog    log file, may be NULL
 * env_var  name of the environment variable
 * def      value returned when the variable is not set
 *
 * When the variable is set but does not start with an integer, 1 is
 * returned. Whenever the variable is set, the value used is reported
 * in the log file.
 */
static int dd_getenv(FILE *fplog, const char *env_var, int def)
{
    const char *text  = getenv(env_var);
    int         value = def;

    if (text == NULL)
    {
        return def;
    }

    if (sscanf(text, "%d", &value) <= 0)
    {
        /* Set, but without a usable number: treat as "switched on" */
        value = 1;
    }
    if (fplog)
    {
        fprintf(fplog, "Found env.var. %s = %s, using value %d\n",
                env_var, text, value);
    }

    return value;
}
6432
6433 static void dd_warning(t_commrec *cr, FILE *fplog, const char *warn_string)
6434 {
6435     if (MASTER(cr))
6436     {
6437         fprintf(stderr, "\n%s\n", warn_string);
6438     }
6439     if (fplog)
6440     {
6441         fprintf(fplog, "\n%s\n", warn_string);
6442     }
6443 }
6444
6445 static void check_dd_restrictions(t_commrec *cr, gmx_domdec_t *dd,
6446                                   t_inputrec *ir, FILE *fplog)
6447 {
6448     if (ir->ePBC == epbcSCREW &&
6449         (dd->nc[XX] == 1 || dd->nc[YY] > 1 || dd->nc[ZZ] > 1))
6450     {
6451         gmx_fatal(FARGS, "With pbc=%s can only do domain decomposition in the x-direction", epbc_names[ir->ePBC]);
6452     }
6453
6454     if (ir->ns_type == ensSIMPLE)
6455     {
6456         gmx_fatal(FARGS, "Domain decomposition does not support simple neighbor searching, use grid searching or run with one MPI rank");
6457     }
6458
6459     if (ir->nstlist == 0)
6460     {
6461         gmx_fatal(FARGS, "Domain decomposition does not work with nstlist=0");
6462     }
6463
6464     if (ir->comm_mode == ecmANGULAR && ir->ePBC != epbcNONE)
6465     {
6466         dd_warning(cr, fplog, "comm-mode angular will give incorrect results when the comm group partially crosses a periodic boundary");
6467     }
6468 }
6469
6470 static real average_cellsize_min(gmx_domdec_t *dd, gmx_ddbox_t *ddbox)
6471 {
6472     int  di, d;
6473     real r;
6474
6475     r = ddbox->box_size[XX];
6476     for (di = 0; di < dd->ndim; di++)
6477     {
6478         d = dd->dim[di];
6479         /* Check using the initial average cell size */
6480         r = min(r, ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
6481     }
6482
6483     return r;
6484 }
6485
6486 static int check_dlb_support(FILE *fplog, t_commrec *cr,
6487                              const char *dlb_opt, gmx_bool bRecordLoad,
6488                              unsigned long Flags, t_inputrec *ir)
6489 {
6490     gmx_domdec_t *dd;
6491     int           eDLB = -1;
6492     char          buf[STRLEN];
6493
6494     switch (dlb_opt[0])
6495     {
6496         case 'a': eDLB = edlbAUTO; break;
6497         case 'n': eDLB = edlbNO;   break;
6498         case 'y': eDLB = edlbYES;  break;
6499         default: gmx_incons("Unknown dlb_opt");
6500     }
6501
6502     if (Flags & MD_RERUN)
6503     {
6504         return edlbNO;
6505     }
6506
6507     if (!EI_DYNAMICS(ir->eI))
6508     {
6509         if (eDLB == edlbYES)
6510         {
6511             sprintf(buf, "NOTE: dynamic load balancing is only supported with dynamics, not with integrator '%s'\n", EI(ir->eI));
6512             dd_warning(cr, fplog, buf);
6513         }
6514
6515         return edlbNO;
6516     }
6517
6518     if (!bRecordLoad)
6519     {
6520         dd_warning(cr, fplog, "NOTE: Cycle counting is not supported on this architecture, will not use dynamic load balancing\n");
6521
6522         return edlbNO;
6523     }
6524
6525     if (Flags & MD_REPRODUCIBLE)
6526     {
6527         switch (eDLB)
6528         {
6529             case edlbNO:
6530                 break;
6531             case edlbAUTO:
6532                 dd_warning(cr, fplog, "NOTE: reproducibility requested, will not use dynamic load balancing\n");
6533                 eDLB = edlbNO;
6534                 break;
6535             case edlbYES:
6536                 dd_warning(cr, fplog, "WARNING: reproducibility requested with dynamic load balancing, the simulation will NOT be binary reproducible\n");
6537                 break;
6538             default:
6539                 gmx_fatal(FARGS, "Death horror: undefined case (%d) for load balancing choice", eDLB);
6540                 break;
6541         }
6542     }
6543
6544     return eDLB;
6545 }
6546
6547 static void set_dd_dim(FILE *fplog, gmx_domdec_t *dd)
6548 {
6549     int dim;
6550
6551     dd->ndim = 0;
6552     if (getenv("GMX_DD_ORDER_ZYX") != NULL)
6553     {
6554         /* Decomposition order z,y,x */
6555         if (fplog)
6556         {
6557             fprintf(fplog, "Using domain decomposition order z, y, x\n");
6558         }
6559         for (dim = DIM-1; dim >= 0; dim--)
6560         {
6561             if (dd->nc[dim] > 1)
6562             {
6563                 dd->dim[dd->ndim++] = dim;
6564             }
6565         }
6566     }
6567     else
6568     {
6569         /* Decomposition order x,y,z */
6570         for (dim = 0; dim < DIM; dim++)
6571         {
6572             if (dd->nc[dim] > 1)
6573             {
6574                 dd->dim[dd->ndim++] = dim;
6575             }
6576         }
6577     }
6578 }
6579
6580 static gmx_domdec_comm_t *init_dd_comm()
6581 {
6582     gmx_domdec_comm_t *comm;
6583     int                i;
6584
6585     snew(comm, 1);
6586     snew(comm->cggl_flag, DIM*2);
6587     snew(comm->cgcm_state, DIM*2);
6588     for (i = 0; i < DIM*2; i++)
6589     {
6590         comm->cggl_flag_nalloc[i]  = 0;
6591         comm->cgcm_state_nalloc[i] = 0;
6592     }
6593
6594     comm->nalloc_int = 0;
6595     comm->buf_int    = NULL;
6596
6597     vec_rvec_init(&comm->vbuf);
6598
6599     comm->n_load_have    = 0;
6600     comm->n_load_collect = 0;
6601
6602     for (i = 0; i < ddnatNR-ddnatZONE; i++)
6603     {
6604         comm->sum_nat[i] = 0;
6605     }
6606     comm->ndecomp   = 0;
6607     comm->nload     = 0;
6608     comm->load_step = 0;
6609     comm->load_sum  = 0;
6610     comm->load_max  = 0;
6611     clear_ivec(comm->load_lim);
6612     comm->load_mdf  = 0;
6613     comm->load_pme  = 0;
6614
6615     return comm;
6616 }
6617
6618 gmx_domdec_t *init_domain_decomposition(FILE *fplog, t_commrec *cr,
6619                                         unsigned long Flags,
6620                                         ivec nc,
6621                                         real comm_distance_min, real rconstr,
6622                                         const char *dlb_opt, real dlb_scale,
6623                                         const char *sizex, const char *sizey, const char *sizez,
6624                                         gmx_mtop_t *mtop, t_inputrec *ir,
6625                                         matrix box, rvec *x,
6626                                         gmx_ddbox_t *ddbox,
6627                                         int *npme_x, int *npme_y)
6628 {
6629     gmx_domdec_t      *dd;
6630     gmx_domdec_comm_t *comm;
6631     int                recload;
6632     int                d, i, j;
6633     real               r_2b, r_mb, r_bonded = -1, r_bonded_limit = -1, limit, acs;
6634     gmx_bool           bC;
6635     char               buf[STRLEN];
6636
6637     if (fplog)
6638     {
6639         fprintf(fplog,
6640                 "\nInitializing Domain Decomposition on %d ranks\n", cr->nnodes);
6641     }
6642
6643     snew(dd, 1);
6644
6645     dd->comm = init_dd_comm();
6646     comm     = dd->comm;
6647     snew(comm->cggl_flag, DIM*2);
6648     snew(comm->cgcm_state, DIM*2);
6649
6650     dd->npbcdim   = ePBC2npbcdim(ir->ePBC);
6651     dd->bScrewPBC = (ir->ePBC == epbcSCREW);
6652
6653     dd->bSendRecv2      = dd_getenv(fplog, "GMX_DD_USE_SENDRECV2", 0);
6654     comm->dlb_scale_lim = dd_getenv(fplog, "GMX_DLB_MAX_BOX_SCALING", 10);
6655     comm->eFlop         = dd_getenv(fplog, "GMX_DLB_BASED_ON_FLOPS", 0);
6656     recload             = dd_getenv(fplog, "GMX_DD_RECORD_LOAD", 1);
6657     comm->nstSortCG     = dd_getenv(fplog, "GMX_DD_NST_SORT_CHARGE_GROUPS", 1);
6658     comm->nstDDDump     = dd_getenv(fplog, "GMX_DD_NST_DUMP", 0);
6659     comm->nstDDDumpGrid = dd_getenv(fplog, "GMX_DD_NST_DUMP_GRID", 0);
6660     comm->DD_debug      = dd_getenv(fplog, "GMX_DD_DEBUG", 0);
6661
6662     dd->pme_recv_f_alloc = 0;
6663     dd->pme_recv_f_buf   = NULL;
6664
6665     if (dd->bSendRecv2 && fplog)
6666     {
6667         fprintf(fplog, "Will use two sequential MPI_Sendrecv calls instead of two simultaneous non-blocking MPI_Irecv and MPI_Isend pairs for constraint and vsite communication\n");
6668     }
6669     if (comm->eFlop)
6670     {
6671         if (fplog)
6672         {
6673             fprintf(fplog, "Will load balance based on FLOP count\n");
6674         }
6675         if (comm->eFlop > 1)
6676         {
6677             srand(1+cr->nodeid);
6678         }
6679         comm->bRecordLoad = TRUE;
6680     }
6681     else
6682     {
6683         comm->bRecordLoad = (wallcycle_have_counter() && recload > 0);
6684
6685     }
6686
6687     /* Initialize to GPU share count to 0, might change later */
6688     comm->nrank_gpu_shared = 0;
6689
6690     comm->eDLB        = check_dlb_support(fplog, cr, dlb_opt, comm->bRecordLoad, Flags, ir);
6691     comm->bDLB_locked = FALSE;
6692
6693     comm->bDynLoadBal = (comm->eDLB == edlbYES);
6694     if (fplog)
6695     {
6696         fprintf(fplog, "Dynamic load balancing: %s\n", edlb_names[comm->eDLB]);
6697     }
6698     dd->bGridJump              = comm->bDynLoadBal;
6699     comm->bPMELoadBalDLBLimits = FALSE;
6700
6701     if (comm->nstSortCG)
6702     {
6703         if (fplog)
6704         {
6705             if (comm->nstSortCG == 1)
6706             {
6707                 fprintf(fplog, "Will sort the charge groups at every domain (re)decomposition\n");
6708             }
6709             else
6710             {
6711                 fprintf(fplog, "Will sort the charge groups every %d steps\n",
6712                         comm->nstSortCG);
6713             }
6714         }
6715         snew(comm->sort, 1);
6716     }
6717     else
6718     {
6719         if (fplog)
6720         {
6721             fprintf(fplog, "Will not sort the charge groups\n");
6722         }
6723     }
6724
6725     comm->bCGs = (ncg_mtop(mtop) < mtop->natoms);
6726
6727     comm->bInterCGBondeds = (ncg_mtop(mtop) > mtop->mols.nr);
6728     if (comm->bInterCGBondeds)
6729     {
6730         comm->bInterCGMultiBody = (multi_body_bondeds_count(mtop) > 0);
6731     }
6732     else
6733     {
6734         comm->bInterCGMultiBody = FALSE;
6735     }
6736
6737     dd->bInterCGcons    = inter_charge_group_constraints(mtop);
6738     dd->bInterCGsettles = inter_charge_group_settles(mtop);
6739
6740     if (ir->rlistlong == 0)
6741     {
6742         /* Set the cut-off to some very large value,
6743          * so we don't need if statements everywhere in the code.
6744          * We use sqrt, since the cut-off is squared in some places.
6745          */
6746         comm->cutoff   = GMX_CUTOFF_INF;
6747     }
6748     else
6749     {
6750         comm->cutoff   = ir->rlistlong;
6751     }
6752     comm->cutoff_mbody = 0;
6753
6754     comm->cellsize_limit = 0;
6755     comm->bBondComm      = FALSE;
6756
6757     /* Atoms should be able to move by up to half the list buffer size (if > 0)
6758      * within nstlist steps. Since boundaries are allowed to displace by half
6759      * a cell size, DD cells should be at least the size of the list buffer.
6760      */
6761     comm->cellsize_limit = max(comm->cellsize_limit,
6762                                ir->rlistlong - max(ir->rvdw, ir->rcoulomb));
6763
6764     if (comm->bInterCGBondeds)
6765     {
6766         if (comm_distance_min > 0)
6767         {
6768             comm->cutoff_mbody = comm_distance_min;
6769             if (Flags & MD_DDBONDCOMM)
6770             {
6771                 comm->bBondComm = (comm->cutoff_mbody > comm->cutoff);
6772             }
6773             else
6774             {
6775                 comm->cutoff = max(comm->cutoff, comm->cutoff_mbody);
6776             }
6777             r_bonded_limit = comm->cutoff_mbody;
6778         }
6779         else if (ir->bPeriodicMols)
6780         {
6781             /* Can not easily determine the required cut-off */
6782             dd_warning(cr, fplog, "NOTE: Periodic molecules are present in this system. Because of this, the domain decomposition algorithm cannot easily determine the minimum cell size that it requires for treating bonded interactions. Instead, domain decomposition will assume that half the non-bonded cut-off will be a suitable lower bound.\n");
6783             comm->cutoff_mbody = comm->cutoff/2;
6784             r_bonded_limit     = comm->cutoff_mbody;
6785         }
6786         else
6787         {
6788             if (MASTER(cr))
6789             {
6790                 dd_bonded_cg_distance(fplog, mtop, ir, x, box,
6791                                       Flags & MD_DDBONDCHECK, &r_2b, &r_mb);
6792             }
6793             gmx_bcast(sizeof(r_2b), &r_2b, cr);
6794             gmx_bcast(sizeof(r_mb), &r_mb, cr);
6795
6796             /* We use an initial margin of 10% for the minimum cell size,
6797              * except when we are just below the non-bonded cut-off.
6798              */
6799             if (Flags & MD_DDBONDCOMM)
6800             {
6801                 if (max(r_2b, r_mb) > comm->cutoff)
6802                 {
6803                     r_bonded        = max(r_2b, r_mb);
6804                     r_bonded_limit  = 1.1*r_bonded;
6805                     comm->bBondComm = TRUE;
6806                 }
6807                 else
6808                 {
6809                     r_bonded       = r_mb;
6810                     r_bonded_limit = min(1.1*r_bonded, comm->cutoff);
6811                 }
6812                 /* We determine cutoff_mbody later */
6813             }
6814             else
6815             {
6816                 /* No special bonded communication,
6817                  * simply increase the DD cut-off.
6818                  */
6819                 r_bonded_limit     = 1.1*max(r_2b, r_mb);
6820                 comm->cutoff_mbody = r_bonded_limit;
6821                 comm->cutoff       = max(comm->cutoff, comm->cutoff_mbody);
6822             }
6823         }
6824         comm->cellsize_limit = max(comm->cellsize_limit, r_bonded_limit);
6825         if (fplog)
6826         {
6827             fprintf(fplog,
6828                     "Minimum cell size due to bonded interactions: %.3f nm\n",
6829                     comm->cellsize_limit);
6830         }
6831     }
6832
6833     if (dd->bInterCGcons && rconstr <= 0)
6834     {
6835         /* There is a cell size limit due to the constraints (P-LINCS) */
6836         rconstr = constr_r_max(fplog, mtop, ir);
6837         if (fplog)
6838         {
6839             fprintf(fplog,
6840                     "Estimated maximum distance required for P-LINCS: %.3f nm\n",
6841                     rconstr);
6842             if (rconstr > comm->cellsize_limit)
6843             {
6844                 fprintf(fplog, "This distance will limit the DD cell size, you can override this with -rcon\n");
6845             }
6846         }
6847     }
6848     else if (rconstr > 0 && fplog)
6849     {
6850         /* Here we do not check for dd->bInterCGcons,
6851          * because one can also set a cell size limit for virtual sites only
6852          * and at this point we don't know yet if there are intercg v-sites.
6853          */
6854         fprintf(fplog,
6855                 "User supplied maximum distance required for P-LINCS: %.3f nm\n",
6856                 rconstr);
6857     }
6858     comm->cellsize_limit = max(comm->cellsize_limit, rconstr);
6859
6860     comm->cgs_gl = gmx_mtop_global_cgs(mtop);
6861
6862     if (nc[XX] > 0)
6863     {
6864         copy_ivec(nc, dd->nc);
6865         set_dd_dim(fplog, dd);
6866         set_ddbox_cr(cr, &dd->nc, ir, box, &comm->cgs_gl, x, ddbox);
6867
6868         if (cr->npmenodes == -1)
6869         {
6870             cr->npmenodes = 0;
6871         }
6872         acs = average_cellsize_min(dd, ddbox);
6873         if (acs < comm->cellsize_limit)
6874         {
6875             if (fplog)
6876             {
6877                 fprintf(fplog, "ERROR: The initial cell size (%f) is smaller than the cell size limit (%f)\n", acs, comm->cellsize_limit);
6878             }
6879             gmx_fatal_collective(FARGS, cr, NULL,
6880                                  "The initial cell size (%f) is smaller than the cell size limit (%f), change options -dd, -rdd or -rcon, see the log file for details",
6881                                  acs, comm->cellsize_limit);
6882         }
6883     }
6884     else
6885     {
6886         set_ddbox_cr(cr, NULL, ir, box, &comm->cgs_gl, x, ddbox);
6887
6888         /* We need to choose the optimal DD grid and possibly PME nodes */
6889         limit = dd_choose_grid(fplog, cr, dd, ir, mtop, box, ddbox,
6890                                comm->eDLB != edlbNO, dlb_scale,
6891                                comm->cellsize_limit, comm->cutoff,
6892                                comm->bInterCGBondeds);
6893
6894         if (dd->nc[XX] == 0)
6895         {
6896             bC = (dd->bInterCGcons && rconstr > r_bonded_limit);
6897             sprintf(buf, "Change the number of ranks or mdrun option %s%s%s",
6898                     !bC ? "-rdd" : "-rcon",
6899                     comm->eDLB != edlbNO ? " or -dds" : "",
6900                     bC ? " or your LINCS settings" : "");
6901
6902             gmx_fatal_collective(FARGS, cr, NULL,
6903                                  "There is no domain decomposition for %d ranks that is compatible with the given box and a minimum cell size of %g nm\n"
6904                                  "%s\n"
6905                                  "Look in the log file for details on the domain decomposition",
6906                                  cr->nnodes-cr->npmenodes, limit, buf);
6907         }
6908         set_dd_dim(fplog, dd);
6909     }
6910
6911     if (fplog)
6912     {
6913         fprintf(fplog,
6914                 "Domain decomposition grid %d x %d x %d, separate PME ranks %d\n",
6915                 dd->nc[XX], dd->nc[YY], dd->nc[ZZ], cr->npmenodes);
6916     }
6917
6918     dd->nnodes = dd->nc[XX]*dd->nc[YY]*dd->nc[ZZ];
6919     if (cr->nnodes - dd->nnodes != cr->npmenodes)
6920     {
6921         gmx_fatal_collective(FARGS, cr, NULL,
6922                              "The size of the domain decomposition grid (%d) does not match the number of ranks (%d). The total number of ranks is %d",
6923                              dd->nnodes, cr->nnodes - cr->npmenodes, cr->nnodes);
6924     }
6925     if (cr->npmenodes > dd->nnodes)
6926     {
6927         gmx_fatal_collective(FARGS, cr, NULL,
6928                              "The number of separate PME ranks (%d) is larger than the number of PP ranks (%d), this is not supported.", cr->npmenodes, dd->nnodes);
6929     }
6930     if (cr->npmenodes > 0)
6931     {
6932         comm->npmenodes = cr->npmenodes;
6933     }
6934     else
6935     {
6936         comm->npmenodes = dd->nnodes;
6937     }
6938
6939     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
6940     {
6941         /* The following choices should match those
6942          * in comm_cost_est in domdec_setup.c.
6943          * Note that here the checks have to take into account
6944          * that the decomposition might occur in a different order than xyz
6945          * (for instance through the env.var. GMX_DD_ORDER_ZYX),
6946          * in which case they will not match those in comm_cost_est,
6947          * but since that is mainly for testing purposes that's fine.
6948          */
6949         if (dd->ndim >= 2 && dd->dim[0] == XX && dd->dim[1] == YY &&
6950             comm->npmenodes > dd->nc[XX] && comm->npmenodes % dd->nc[XX] == 0 &&
6951             getenv("GMX_PMEONEDD") == NULL)
6952         {
6953             comm->npmedecompdim = 2;
6954             comm->npmenodes_x   = dd->nc[XX];
6955             comm->npmenodes_y   = comm->npmenodes/comm->npmenodes_x;
6956         }
6957         else
6958         {
6959             /* In case nc is 1 in both x and y we could still choose to
6960              * decompose pme in y instead of x, but we use x for simplicity.
6961              */
6962             comm->npmedecompdim = 1;
6963             if (dd->dim[0] == YY)
6964             {
6965                 comm->npmenodes_x = 1;
6966                 comm->npmenodes_y = comm->npmenodes;
6967             }
6968             else
6969             {
6970                 comm->npmenodes_x = comm->npmenodes;
6971                 comm->npmenodes_y = 1;
6972             }
6973         }
6974         if (fplog)
6975         {
6976             fprintf(fplog, "PME domain decomposition: %d x %d x %d\n",
6977                     comm->npmenodes_x, comm->npmenodes_y, 1);
6978         }
6979     }
6980     else
6981     {
6982         comm->npmedecompdim = 0;
6983         comm->npmenodes_x   = 0;
6984         comm->npmenodes_y   = 0;
6985     }
6986
6987     /* Technically we don't need both of these,
6988      * but it simplifies code not having to recalculate it.
6989      */
6990     *npme_x = comm->npmenodes_x;
6991     *npme_y = comm->npmenodes_y;
6992
6993     snew(comm->slb_frac, DIM);
6994     if (comm->eDLB == edlbNO)
6995     {
6996         comm->slb_frac[XX] = get_slb_frac(fplog, "x", dd->nc[XX], sizex);
6997         comm->slb_frac[YY] = get_slb_frac(fplog, "y", dd->nc[YY], sizey);
6998         comm->slb_frac[ZZ] = get_slb_frac(fplog, "z", dd->nc[ZZ], sizez);
6999     }
7000
7001     if (comm->bInterCGBondeds && comm->cutoff_mbody == 0)
7002     {
7003         if (comm->bBondComm || comm->eDLB != edlbNO)
7004         {
7005             /* Set the bonded communication distance to halfway
7006              * the minimum and the maximum,
7007              * since the extra communication cost is nearly zero.
7008              */
7009             acs                = average_cellsize_min(dd, ddbox);
7010             comm->cutoff_mbody = 0.5*(r_bonded + acs);
7011             if (comm->eDLB != edlbNO)
7012             {
7013                 /* Check if this does not limit the scaling */
7014                 comm->cutoff_mbody = min(comm->cutoff_mbody, dlb_scale*acs);
7015             }
7016             if (!comm->bBondComm)
7017             {
7018                 /* Without bBondComm do not go beyond the n.b. cut-off */
7019                 comm->cutoff_mbody = min(comm->cutoff_mbody, comm->cutoff);
7020                 if (comm->cellsize_limit >= comm->cutoff)
7021                 {
7022                     /* We don't loose a lot of efficieny
7023                      * when increasing it to the n.b. cut-off.
7024                      * It can even be slightly faster, because we need
7025                      * less checks for the communication setup.
7026                      */
7027                     comm->cutoff_mbody = comm->cutoff;
7028                 }
7029             }
7030             /* Check if we did not end up below our original limit */
7031             comm->cutoff_mbody = max(comm->cutoff_mbody, r_bonded_limit);
7032
7033             if (comm->cutoff_mbody > comm->cellsize_limit)
7034             {
7035                 comm->cellsize_limit = comm->cutoff_mbody;
7036             }
7037         }
7038         /* Without DLB and cutoff_mbody<cutoff, cutoff_mbody is dynamic */
7039     }
7040
7041     if (debug)
7042     {
7043         fprintf(debug, "Bonded atom communication beyond the cut-off: %d\n"
7044                 "cellsize limit %f\n",
7045                 comm->bBondComm, comm->cellsize_limit);
7046     }
7047
7048     if (MASTER(cr))
7049     {
7050         check_dd_restrictions(cr, dd, ir, fplog);
7051     }
7052
7053     comm->partition_step = INT_MIN;
7054     dd->ddp_count        = 0;
7055
7056     clear_dd_cycle_counts(dd);
7057
7058     return dd;
7059 }
7060
7061 static void set_dlb_limits(gmx_domdec_t *dd)
7062
7063 {
7064     int d;
7065
7066     for (d = 0; d < dd->ndim; d++)
7067     {
7068         dd->comm->cd[d].np                 = dd->comm->cd[d].np_dlb;
7069         dd->comm->cellsize_min[dd->dim[d]] =
7070             dd->comm->cellsize_min_dlb[dd->dim[d]];
7071     }
7072 }
7073
7074
7075 static void turn_on_dlb(FILE *fplog, t_commrec *cr, gmx_int64_t step)
7076 {
7077     gmx_domdec_t      *dd;
7078     gmx_domdec_comm_t *comm;
7079     real               cellsize_min;
7080     int                d, nc, i;
7081     char               buf[STRLEN];
7082
7083     dd   = cr->dd;
7084     comm = dd->comm;
7085
7086     if (fplog)
7087     {
7088         fprintf(fplog, "At step %s the performance loss due to force load imbalance is %.1f %%\n", gmx_step_str(step, buf), dd_force_imb_perf_loss(dd)*100);
7089     }
7090
7091     cellsize_min = comm->cellsize_min[dd->dim[0]];
7092     for (d = 1; d < dd->ndim; d++)
7093     {
7094         cellsize_min = min(cellsize_min, comm->cellsize_min[dd->dim[d]]);
7095     }
7096
7097     if (cellsize_min < comm->cellsize_limit*1.05)
7098     {
7099         dd_warning(cr, fplog, "NOTE: the minimum cell size is smaller than 1.05 times the cell size limit, will not turn on dynamic load balancing\n");
7100
7101         /* Change DLB from "auto" to "no". */
7102         comm->eDLB = edlbNO;
7103
7104         return;
7105     }
7106
7107     dd_warning(cr, fplog, "NOTE: Turning on dynamic load balancing\n");
7108     comm->bDynLoadBal = TRUE;
7109     dd->bGridJump     = TRUE;
7110
7111     set_dlb_limits(dd);
7112
7113     /* We can set the required cell size info here,
7114      * so we do not need to communicate this.
7115      * The grid is completely uniform.
7116      */
7117     for (d = 0; d < dd->ndim; d++)
7118     {
7119         if (comm->root[d])
7120         {
7121             comm->load[d].sum_m = comm->load[d].sum;
7122
7123             nc = dd->nc[dd->dim[d]];
7124             for (i = 0; i < nc; i++)
7125             {
7126                 comm->root[d]->cell_f[i]    = i/(real)nc;
7127                 if (d > 0)
7128                 {
7129                     comm->root[d]->cell_f_max0[i] =  i   /(real)nc;
7130                     comm->root[d]->cell_f_min1[i] = (i+1)/(real)nc;
7131                 }
7132             }
7133             comm->root[d]->cell_f[nc] = 1.0;
7134         }
7135     }
7136 }
7137
7138 static char *init_bLocalCG(gmx_mtop_t *mtop)
7139 {
7140     int   ncg, cg;
7141     char *bLocalCG;
7142
7143     ncg = ncg_mtop(mtop);
7144     snew(bLocalCG, ncg);
7145     for (cg = 0; cg < ncg; cg++)
7146     {
7147         bLocalCG[cg] = FALSE;
7148     }
7149
7150     return bLocalCG;
7151 }
7152
7153 void dd_init_bondeds(FILE *fplog,
7154                      gmx_domdec_t *dd, gmx_mtop_t *mtop,
7155                      gmx_vsite_t *vsite,
7156                      t_inputrec *ir, gmx_bool bBCheck, cginfo_mb_t *cginfo_mb)
7157 {
7158     gmx_domdec_comm_t *comm;
7159     gmx_bool           bBondComm;
7160     int                d;
7161
7162     dd_make_reverse_top(fplog, dd, mtop, vsite, ir, bBCheck);
7163
7164     comm = dd->comm;
7165
7166     if (comm->bBondComm)
7167     {
7168         /* Communicate atoms beyond the cut-off for bonded interactions */
7169         comm = dd->comm;
7170
7171         comm->cglink = make_charge_group_links(mtop, dd, cginfo_mb);
7172
7173         comm->bLocalCG = init_bLocalCG(mtop);
7174     }
7175     else
7176     {
7177         /* Only communicate atoms based on cut-off */
7178         comm->cglink   = NULL;
7179         comm->bLocalCG = NULL;
7180     }
7181 }
7182
7183 static void print_dd_settings(FILE *fplog, gmx_domdec_t *dd,
7184                               t_inputrec *ir,
7185                               gmx_bool bDynLoadBal, real dlb_scale,
7186                               gmx_ddbox_t *ddbox)
7187 {
7188     gmx_domdec_comm_t *comm;
7189     int                d;
7190     ivec               np;
7191     real               limit, shrink;
7192     char               buf[64];
7193
7194     if (fplog == NULL)
7195     {
7196         return;
7197     }
7198
7199     comm = dd->comm;
7200
7201     if (bDynLoadBal)
7202     {
7203         fprintf(fplog, "The maximum number of communication pulses is:");
7204         for (d = 0; d < dd->ndim; d++)
7205         {
7206             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), comm->cd[d].np_dlb);
7207         }
7208         fprintf(fplog, "\n");
7209         fprintf(fplog, "The minimum size for domain decomposition cells is %.3f nm\n", comm->cellsize_limit);
7210         fprintf(fplog, "The requested allowed shrink of DD cells (option -dds) is: %.2f\n", dlb_scale);
7211         fprintf(fplog, "The allowed shrink of domain decomposition cells is:");
7212         for (d = 0; d < DIM; d++)
7213         {
7214             if (dd->nc[d] > 1)
7215             {
7216                 if (d >= ddbox->npbcdim && dd->nc[d] == 2)
7217                 {
7218                     shrink = 0;
7219                 }
7220                 else
7221                 {
7222                     shrink =
7223                         comm->cellsize_min_dlb[d]/
7224                         (ddbox->box_size[d]*ddbox->skew_fac[d]/dd->nc[d]);
7225                 }
7226                 fprintf(fplog, " %c %.2f", dim2char(d), shrink);
7227             }
7228         }
7229         fprintf(fplog, "\n");
7230     }
7231     else
7232     {
7233         set_dd_cell_sizes_slb(dd, ddbox, setcellsizeslbPULSE_ONLY, np);
7234         fprintf(fplog, "The initial number of communication pulses is:");
7235         for (d = 0; d < dd->ndim; d++)
7236         {
7237             fprintf(fplog, " %c %d", dim2char(dd->dim[d]), np[dd->dim[d]]);
7238         }
7239         fprintf(fplog, "\n");
7240         fprintf(fplog, "The initial domain decomposition cell size is:");
7241         for (d = 0; d < DIM; d++)
7242         {
7243             if (dd->nc[d] > 1)
7244             {
7245                 fprintf(fplog, " %c %.2f nm",
7246                         dim2char(d), dd->comm->cellsize_min[d]);
7247             }
7248         }
7249         fprintf(fplog, "\n\n");
7250     }
7251
7252     if (comm->bInterCGBondeds || dd->vsite_comm || dd->constraint_comm)
7253     {
7254         fprintf(fplog, "The maximum allowed distance for charge groups involved in interactions is:\n");
7255         fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7256                 "non-bonded interactions", "", comm->cutoff);
7257
7258         if (bDynLoadBal)
7259         {
7260             limit = dd->comm->cellsize_limit;
7261         }
7262         else
7263         {
7264             if (dynamic_dd_box(ddbox, ir))
7265             {
7266                 fprintf(fplog, "(the following are initial values, they could change due to box deformation)\n");
7267             }
7268             limit = dd->comm->cellsize_min[XX];
7269             for (d = 1; d < DIM; d++)
7270             {
7271                 limit = min(limit, dd->comm->cellsize_min[d]);
7272             }
7273         }
7274
7275         if (comm->bInterCGBondeds)
7276         {
7277             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7278                     "two-body bonded interactions", "(-rdd)",
7279                     max(comm->cutoff, comm->cutoff_mbody));
7280             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7281                     "multi-body bonded interactions", "(-rdd)",
7282                     (comm->bBondComm || dd->bGridJump) ? comm->cutoff_mbody : min(comm->cutoff, limit));
7283         }
7284         if (dd->vsite_comm)
7285         {
7286             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7287                     "virtual site constructions", "(-rcon)", limit);
7288         }
7289         if (dd->constraint_comm)
7290         {
7291             sprintf(buf, "atoms separated by up to %d constraints",
7292                     1+ir->nProjOrder);
7293             fprintf(fplog, "%40s  %-7s %6.3f nm\n",
7294                     buf, "(-rcon)", limit);
7295         }
7296         fprintf(fplog, "\n");
7297     }
7298
7299     fflush(fplog);
7300 }
7301
7302 static void set_cell_limits_dlb(gmx_domdec_t      *dd,
7303                                 real               dlb_scale,
7304                                 const t_inputrec  *ir,
7305                                 const gmx_ddbox_t *ddbox)
7306 {
7307     gmx_domdec_comm_t *comm;
7308     int                d, dim, npulse, npulse_d_max, npulse_d;
7309     gmx_bool           bNoCutOff;
7310
7311     comm = dd->comm;
7312
7313     bNoCutOff = (ir->rvdw == 0 || ir->rcoulomb == 0);
7314
7315     /* Determine the maximum number of comm. pulses in one dimension */
7316
7317     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7318
7319     /* Determine the maximum required number of grid pulses */
7320     if (comm->cellsize_limit >= comm->cutoff)
7321     {
7322         /* Only a single pulse is required */
7323         npulse = 1;
7324     }
7325     else if (!bNoCutOff && comm->cellsize_limit > 0)
7326     {
7327         /* We round down slightly here to avoid overhead due to the latency
7328          * of extra communication calls when the cut-off
7329          * would be only slightly longer than the cell size.
7330          * Later cellsize_limit is redetermined,
7331          * so we can not miss interactions due to this rounding.
7332          */
7333         npulse = (int)(0.96 + comm->cutoff/comm->cellsize_limit);
7334     }
7335     else
7336     {
7337         /* There is no cell size limit */
7338         npulse = max(dd->nc[XX]-1, max(dd->nc[YY]-1, dd->nc[ZZ]-1));
7339     }
7340
7341     if (!bNoCutOff && npulse > 1)
7342     {
7343         /* See if we can do with less pulses, based on dlb_scale */
7344         npulse_d_max = 0;
7345         for (d = 0; d < dd->ndim; d++)
7346         {
7347             dim      = dd->dim[d];
7348             npulse_d = (int)(1 + dd->nc[dim]*comm->cutoff
7349                              /(ddbox->box_size[dim]*ddbox->skew_fac[dim]*dlb_scale));
7350             npulse_d_max = max(npulse_d_max, npulse_d);
7351         }
7352         npulse = min(npulse, npulse_d_max);
7353     }
7354
7355     /* This env var can override npulse */
7356     d = dd_getenv(debug, "GMX_DD_NPULSE", 0);
7357     if (d > 0)
7358     {
7359         npulse = d;
7360     }
7361
7362     comm->maxpulse       = 1;
7363     comm->bVacDLBNoLimit = (ir->ePBC == epbcNONE);
7364     for (d = 0; d < dd->ndim; d++)
7365     {
7366         comm->cd[d].np_dlb    = min(npulse, dd->nc[dd->dim[d]]-1);
7367         comm->cd[d].np_nalloc = comm->cd[d].np_dlb;
7368         snew(comm->cd[d].ind, comm->cd[d].np_nalloc);
7369         comm->maxpulse = max(comm->maxpulse, comm->cd[d].np_dlb);
7370         if (comm->cd[d].np_dlb < dd->nc[dd->dim[d]]-1)
7371         {
7372             comm->bVacDLBNoLimit = FALSE;
7373         }
7374     }
7375
7376     /* cellsize_limit is set for LINCS in init_domain_decomposition */
7377     if (!comm->bVacDLBNoLimit)
7378     {
7379         comm->cellsize_limit = max(comm->cellsize_limit,
7380                                    comm->cutoff/comm->maxpulse);
7381     }
7382     comm->cellsize_limit = max(comm->cellsize_limit, comm->cutoff_mbody);
7383     /* Set the minimum cell size for each DD dimension */
7384     for (d = 0; d < dd->ndim; d++)
7385     {
7386         if (comm->bVacDLBNoLimit ||
7387             comm->cd[d].np_dlb*comm->cellsize_limit >= comm->cutoff)
7388         {
7389             comm->cellsize_min_dlb[dd->dim[d]] = comm->cellsize_limit;
7390         }
7391         else
7392         {
7393             comm->cellsize_min_dlb[dd->dim[d]] =
7394                 comm->cutoff/comm->cd[d].np_dlb;
7395         }
7396     }
7397     if (comm->cutoff_mbody <= 0)
7398     {
7399         comm->cutoff_mbody = min(comm->cutoff, comm->cellsize_limit);
7400     }
7401     if (comm->bDynLoadBal)
7402     {
7403         set_dlb_limits(dd);
7404     }
7405 }
7406
7407 gmx_bool dd_bonded_molpbc(gmx_domdec_t *dd, int ePBC)
7408 {
7409     /* If each molecule is a single charge group
7410      * or we use domain decomposition for each periodic dimension,
7411      * we do not need to take pbc into account for the bonded interactions.
7412      */
7413     return (ePBC != epbcNONE && dd->comm->bInterCGBondeds &&
7414             !(dd->nc[XX] > 1 &&
7415               dd->nc[YY] > 1 &&
7416               (dd->nc[ZZ] > 1 || ePBC == epbcXY)));
7417 }
7418
7419 void set_dd_parameters(FILE *fplog, gmx_domdec_t *dd, real dlb_scale,
7420                        t_inputrec *ir, gmx_ddbox_t *ddbox)
7421 {
7422     gmx_domdec_comm_t *comm;
7423     int                natoms_tot;
7424     real               vol_frac;
7425
7426     comm = dd->comm;
7427
7428     /* Initialize the thread data.
7429      * This can not be done in init_domain_decomposition,
7430      * as the numbers of threads is determined later.
7431      */
7432     comm->nth = gmx_omp_nthreads_get(emntDomdec);
7433     if (comm->nth > 1)
7434     {
7435         snew(comm->dth, comm->nth);
7436     }
7437
7438     if (EEL_PME(ir->coulombtype) || EVDW_PME(ir->vdwtype))
7439     {
7440         init_ddpme(dd, &comm->ddpme[0], 0);
7441         if (comm->npmedecompdim >= 2)
7442         {
7443             init_ddpme(dd, &comm->ddpme[1], 1);
7444         }
7445     }
7446     else
7447     {
7448         comm->npmenodes = 0;
7449         if (dd->pme_nodeid >= 0)
7450         {
7451             gmx_fatal_collective(FARGS, NULL, dd,
7452                                  "Can not have separate PME ranks without PME electrostatics");
7453         }
7454     }
7455
7456     if (debug)
7457     {
7458         fprintf(debug, "The DD cut-off is %f\n", comm->cutoff);
7459     }
7460     if (comm->eDLB != edlbNO)
7461     {
7462         set_cell_limits_dlb(dd, dlb_scale, ir, ddbox);
7463     }
7464
7465     print_dd_settings(fplog, dd, ir, comm->bDynLoadBal, dlb_scale, ddbox);
7466     if (comm->eDLB == edlbAUTO)
7467     {
7468         if (fplog)
7469         {
7470             fprintf(fplog, "When dynamic load balancing gets turned on, these settings will change to:\n");
7471         }
7472         print_dd_settings(fplog, dd, ir, TRUE, dlb_scale, ddbox);
7473     }
7474
7475     if (ir->ePBC == epbcNONE)
7476     {
7477         vol_frac = 1 - 1/(double)dd->nnodes;
7478     }
7479     else
7480     {
7481         vol_frac =
7482             (1 + comm_box_frac(dd->nc, comm->cutoff, ddbox))/(double)dd->nnodes;
7483     }
7484     if (debug)
7485     {
7486         fprintf(debug, "Volume fraction for all DD zones: %f\n", vol_frac);
7487     }
7488     natoms_tot = comm->cgs_gl.index[comm->cgs_gl.nr];
7489
7490     dd->ga2la = ga2la_init(natoms_tot, vol_frac*natoms_tot);
7491 }
7492
7493 static gmx_bool test_dd_cutoff(t_commrec *cr,
7494                                t_state *state, t_inputrec *ir,
7495                                real cutoff_req)
7496 {
7497     gmx_domdec_t *dd;
7498     gmx_ddbox_t   ddbox;
7499     int           d, dim, np;
7500     real          inv_cell_size;
7501     int           LocallyLimited;
7502
7503     dd = cr->dd;
7504
7505     set_ddbox(dd, FALSE, cr, ir, state->box,
7506               TRUE, &dd->comm->cgs_gl, state->x, &ddbox);
7507
7508     LocallyLimited = 0;
7509
7510     for (d = 0; d < dd->ndim; d++)
7511     {
7512         dim = dd->dim[d];
7513
7514         inv_cell_size = DD_CELL_MARGIN*dd->nc[dim]/ddbox.box_size[dim];
7515         if (dynamic_dd_box(&ddbox, ir))
7516         {
7517             inv_cell_size *= DD_PRES_SCALE_MARGIN;
7518         }
7519
7520         np = 1 + (int)(cutoff_req*inv_cell_size*ddbox.skew_fac[dim]);
7521
7522         if (dd->comm->eDLB != edlbNO && dim < ddbox.npbcdim &&
7523             dd->comm->cd[d].np_dlb > 0)
7524         {
7525             if (np > dd->comm->cd[d].np_dlb)
7526             {
7527                 return FALSE;
7528             }
7529
7530             /* If a current local cell size is smaller than the requested
7531              * cut-off, we could still fix it, but this gets very complicated.
7532              * Without fixing here, we might actually need more checks.
7533              */
7534             if ((dd->comm->cell_x1[dim] - dd->comm->cell_x0[dim])*ddbox.skew_fac[dim]*dd->comm->cd[d].np_dlb < cutoff_req)
7535             {
7536                 LocallyLimited = 1;
7537             }
7538         }
7539     }
7540
7541     if (dd->comm->eDLB != edlbNO)
7542     {
7543         /* If DLB is not active yet, we don't need to check the grid jumps.
7544          * Actually we shouldn't, because then the grid jump data is not set.
7545          */
7546         if (dd->comm->bDynLoadBal &&
7547             check_grid_jump(0, dd, cutoff_req, &ddbox, FALSE))
7548         {
7549             LocallyLimited = 1;
7550         }
7551
7552         gmx_sumi(1, &LocallyLimited, cr);
7553
7554         if (LocallyLimited > 0)
7555         {
7556             return FALSE;
7557         }
7558     }
7559
7560     return TRUE;
7561 }
7562
7563 gmx_bool change_dd_cutoff(t_commrec *cr, t_state *state, t_inputrec *ir,
7564                           real cutoff_req)
7565 {
7566     gmx_bool bCutoffAllowed;
7567
7568     bCutoffAllowed = test_dd_cutoff(cr, state, ir, cutoff_req);
7569
7570     if (bCutoffAllowed)
7571     {
7572         cr->dd->comm->cutoff = cutoff_req;
7573     }
7574
7575     return bCutoffAllowed;
7576 }
7577
7578 void change_dd_dlb_cutoff_limit(t_commrec *cr)
7579 {
7580     gmx_domdec_comm_t *comm;
7581
7582     comm = cr->dd->comm;
7583
7584     /* Turn on the DLB limiting (might have been on already) */
7585     comm->bPMELoadBalDLBLimits = TRUE;
7586
7587     /* Change the cut-off limit */
7588     comm->PMELoadBal_max_cutoff = comm->cutoff;
7589 }
7590
7591 gmx_bool dd_dlb_is_locked(const gmx_domdec_t *dd)
7592 {
7593     return dd->comm->bDLB_locked;
7594 }
7595
7596 void dd_dlb_set_lock(gmx_domdec_t *dd, gmx_bool bValue)
7597 {
7598     /* We can only lock the DLB when it is set to auto, otherwise don't lock */
7599     if (dd->comm->eDLB == edlbAUTO)
7600     {
7601         dd->comm->bDLB_locked = bValue;
7602     }
7603 }
7604
/* Merge the charge groups received in communication pulse `pulse`
 * into the local per-cell charge group arrays.
 *
 * This happens in two passes:
 * 1. The data already stored for each cell (index_gl, cg_cm, cgindex,
 *    cginfo) is moved up in place to make room for the newly received
 *    charge groups, and the send indices stored in cd->ind[1..pulse]
 *    are corrected for this shift.
 * 2. The received global indices (recv_i) and vectors (recv_vr) are
 *    copied in behind the existing entries of each cell; cginfo and
 *    cgindex are filled in for them and the upper cell boundaries in
 *    ncg_cell[ncell+cell+1] are updated.
 *
 * ncell     number of cells handled; ncg_cell is indexed with an
 *           offset of ncell
 * cd, pulse communication data and the pulse whose receive counts
 *           (cd->ind[pulse].nrecv) are used
 * cginfo_mb used with ddcginfo() to look up the cginfo of a received
 *           charge group from its global index
 */
static void merge_cg_buffers(int ncell,
                             gmx_domdec_comm_dim_t *cd, int pulse,
                             int  *ncg_cell,
                             int  *index_gl, int  *recv_i,
                             rvec *cg_cm,    rvec *recv_vr,
                             int *cgindex,
                             cginfo_mb_t *cginfo_mb, int *cginfo)
{
    gmx_domdec_ind_t *ind, *ind_p;
    int               p, cell, c, cg, cg0, cg1, cg_gl, nat;
    int               shift, shift_at;

    ind = &cd->ind[pulse];

    /* First correct the already stored data */
    /* NOTE(review): nrecv[ncell] is presumably the total number of
     * received charge groups over all cells - confirm against the caller.
     */
    shift = ind->nrecv[ncell];
    /* Loop from the last cell down, so data is never overwritten
     * before it has been moved.
     */
    for (cell = ncell-1; cell >= 0; cell--)
    {
        shift -= ind->nrecv[cell];
        if (shift > 0)
        {
            /* Move the cg's present from previous grid pulses */
            cg0                = ncg_cell[ncell+cell];
            cg1                = ncg_cell[ncell+cell+1];
            cgindex[cg1+shift] = cgindex[cg1];
            for (cg = cg1-1; cg >= cg0; cg--)
            {
                index_gl[cg+shift] = index_gl[cg];
                copy_rvec(cg_cm[cg], cg_cm[cg+shift]);
                cgindex[cg+shift] = cgindex[cg];
                cginfo[cg+shift]  = cginfo[cg];
            }
            /* Correct the already stored send indices for the shift */
            for (p = 1; p <= pulse; p++)
            {
                ind_p = &cd->ind[p];
                /* Find the start of this cell in the send index list */
                cg0   = 0;
                for (c = 0; c < cell; c++)
                {
                    cg0 += ind_p->nsend[c];
                }
                cg1 = cg0 + ind_p->nsend[cell];
                for (cg = cg0; cg < cg1; cg++)
                {
                    ind_p->index[cg] += shift;
                }
            }
        }
    }

    /* Merge in the communicated buffers */
    shift    = 0; /* number of charge groups merged in so far */
    shift_at = 0; /* number of atoms merged in so far */
    cg0      = 0; /* read position in the receive buffers */
    for (cell = 0; cell < ncell; cell++)
    {
        cg1 = ncg_cell[ncell+cell+1] + shift;
        if (shift_at > 0)
        {
            /* Correct the old cg indices */
            for (cg = ncg_cell[ncell+cell]; cg < cg1; cg++)
            {
                cgindex[cg+1] += shift_at;
            }
        }
        for (cg = 0; cg < ind->nrecv[cell]; cg++)
        {
            /* Copy this charge group from the buffer */
            index_gl[cg1] = recv_i[cg0];
            copy_rvec(recv_vr[cg0], cg_cm[cg1]);
            /* Add it to the cgindex */
            cg_gl          = index_gl[cg1];
            cginfo[cg1]    = ddcginfo(cginfo_mb, cg_gl);
            nat            = GET_CGINFO_NATOMS(cginfo[cg1]);
            cgindex[cg1+1] = cgindex[cg1] + nat;
            cg0++;
            cg1++;
            shift_at += nat;
        }
        shift                 += ind->nrecv[cell];
        /* Store the new upper boundary of this cell */
        ncg_cell[ncell+cell+1] = cg1;
    }
}
7688
7689 static void make_cell2at_index(gmx_domdec_comm_dim_t *cd,
7690                                int nzone, int cg0, const int *cgindex)
7691 {
7692     int cg, zone, p;
7693
7694     /* Store the atom block boundaries for easy copying of communication buffers
7695      */
7696     cg = cg0;
7697     for (zone = 0; zone < nzone; zone++)
7698     {
7699         for (p = 0; p < cd->np; p++)
7700         {
7701             cd->ind[p].cell2at0[zone] = cgindex[cg];
7702             cg += cd->ind[p].nrecv[zone];
7703             cd->ind[p].cell2at1[zone] = cgindex[cg];
7704         }
7705     }
7706 }
7707
7708 static gmx_bool missing_link(t_blocka *link, int cg_gl, char *bLocalCG)
7709 {
7710     int      i;
7711     gmx_bool bMiss;
7712
7713     bMiss = FALSE;
7714     for (i = link->index[cg_gl]; i < link->index[cg_gl+1]; i++)
7715     {
7716         if (!bLocalCG[link->a[i]])
7717         {
7718             bMiss = TRUE;
7719         }
7720     }
7721
7722     return bMiss;
7723 }
7724
/* Domain corners for communication, a maximum of 4 i-zones see a j domain.
 * The first index of c and the index of bc is the DD dimension index
 * (0, 1, 2), not the Cartesian dimension.
 */
typedef struct {
    real c[DIM][4]; /* the corners for the non-bonded communication */
    real cr0;       /* corner along the first DD dimension for rounding */
    real cr1[4];    /* corners along the second DD dimension for rounding */
    real bc[DIM];   /* corners for bonded (multi-body) communication */
    real bcr1;      /* corner for rounding for bonded communication */
} dd_corners_t;
7733
/* Determine the corners of the domain(s) we are communicating with.
 *
 * The results are stored in *c, indexed by DD dimension index.
 * dim0 is the Cartesian dimension of the first DD dimension; the
 * values passed in for dim1 and dim2 are not used: they are
 * overwritten with dd->dim[1] and dd->dim[2] when dd->ndim >= 2 and
 * dd->ndim >= 3, respectively.
 * With bDistMB set, the corners for the multi-body bonded distance
 * (c->bc, c->bcr1) are set as well.
 *
 * NOTE(review): c->bc[1], c->bc[2] and c->bcr1 (apart from its zero
 * initialization) are only assigned when dd->bGridJump is set;
 * presumably callers only pass bDistMB together with grid jumps -
 * confirm. Only entries 0 and 3 of c->cr1 are assigned here.
 */
static void
set_dd_corners(const gmx_domdec_t *dd,
               int dim0, int dim1, int dim2,
               gmx_bool bDistMB,
               dd_corners_t *c)
{
    const gmx_domdec_comm_t  *comm;
    const gmx_domdec_zones_t *zones;
    int i, j;

    comm = dd->comm;

    zones = &comm->zones;

    /* Keep the compiler happy */
    c->cr0  = 0;
    c->bcr1 = 0;

    /* The first dimension is equal for all cells */
    c->c[0][0] = comm->cell_x0[dim0];
    if (bDistMB)
    {
        c->bc[0] = c->c[0][0];
    }
    if (dd->ndim >= 2)
    {
        dim1 = dd->dim[1];
        /* This cell row is only seen from the first row */
        c->c[1][0] = comm->cell_x0[dim1];
        /* All rows can see this row */
        c->c[1][1] = comm->cell_x0[dim1];
        if (dd->bGridJump)
        {
            /* With staggered cells, take the neighbor cell boundaries
             * stored in zone_d1 into account.
             */
            c->c[1][1] = max(comm->cell_x0[dim1], comm->zone_d1[1].mch0);
            if (bDistMB)
            {
                /* For the multi-body distance we need the maximum */
                c->bc[1] = max(comm->cell_x0[dim1], comm->zone_d1[1].p1_0);
            }
        }
        /* Set the upper-right corner for rounding */
        c->cr0 = comm->cell_x1[dim0];

        if (dd->ndim >= 3)
        {
            dim2 = dd->dim[2];
            /* Start from the lower boundary of our own cell */
            for (j = 0; j < 4; j++)
            {
                c->c[2][j] = comm->cell_x0[dim2];
            }
            if (dd->bGridJump)
            {
                /* Use the maximum of the i-cells that see a j-cell */
                for (i = 0; i < zones->nizone; i++)
                {
                    for (j = zones->izone[i].j0; j < zones->izone[i].j1; j++)
                    {
                        /* Only j-zones 4 and up are stored, at index j-4 */
                        if (j >= 4)
                        {
                            c->c[2][j-4] =
                                max(c->c[2][j-4],
                                    comm->zone_d2[zones->shift[i][dim0]][zones->shift[i][dim1]].mch0);
                        }
                    }
                }
                if (bDistMB)
                {
                    /* For the multi-body distance we need the maximum */
                    c->bc[2] = comm->cell_x0[dim2];
                    for (i = 0; i < 2; i++)
                    {
                        for (j = 0; j < 2; j++)
                        {
                            c->bc[2] = max(c->bc[2], comm->zone_d2[i][j].p1_0);
                        }
                    }
                }
            }

            /* Set the upper-right corner for rounding */
            /* Cell (0,0,0) and cell (1,0,0) can see cell 4 (0,1,1)
             * Only cell (0,0,0) can see cell 7 (1,1,1)
             */
            c->cr1[0] = comm->cell_x1[dim1];
            c->cr1[3] = comm->cell_x1[dim1];
            if (dd->bGridJump)
            {
                c->cr1[0] = max(comm->cell_x1[dim1], comm->zone_d1[1].mch1);
                if (bDistMB)
                {
                    /* For the multi-body distance we need the maximum */
                    c->bcr1 = max(comm->cell_x1[dim1], comm->zone_d1[1].p1_1);
                }
            }
        }
    }
}
7832
/* Determine which cg's we need to send in this pulse from this zone.
 *
 * For every charge group cg in [cg0, cg1) a squared distance r2 to the
 * corners in *c is computed and, when multi-body bonded distances are
 * checked in this pulse (bDistMB && bDistBonded), a second squared
 * distance rb2 to the bonded corners. A charge group is selected when
 * r2 < r_comm2, or when bonded distance checking applies and the
 * bonded criterion (rb2 or r2 < r_bcomm2) is met; with bBondComm the
 * latter additionally requires the charge group to have inter-cg
 * bondeds with a link that is missing locally.
 *
 * Input:
 * zonei, zone       zonei selects which rounding terms apply; zone
 *                   indexes the corners c->c[dim_ind][zone] and
 *                   c->cr1[zone]
 * dim, dim_ind      Cartesian dimension and DD dimension index of the
 *                   communication direction
 * dim0, dim1, dim2  Cartesian dimensions used for the rounding terms
 * tric_dist         per DD dimension index: non-zero selects the
 *                   triclinic distance calculation
 * normal, v_d, v_0, v_1, skew_fac2_d, skew_fac_01, sf2_round
 *                   triclinic correction data, used only in the
 *                   triclinic branch
 * box               box[dim] is added to the sent cg_cm when
 *                   dd->ci[dim] == 0 (pbc correction)
 *
 * Output:
 * ind->index        local indices of the selected charge groups
 * *ibuf             global indices of the selected charge groups
 * vbuf              (pbc corrected) centers of the selected charge groups
 * *nsend_ptr, *nat_ptr  running totals of sent charge groups and atoms,
 *                   read on entry and updated on exit
 * *nsend_z_ptr      number of charge groups selected by this call
 *
 * ind->index, *ibuf and vbuf are grown as needed.
 */
static void
get_zone_pulse_cgs(gmx_domdec_t *dd,
                   int zonei, int zone,
                   int cg0, int cg1,
                   const int *index_gl,
                   const int *cgindex,
                   int dim, int dim_ind,
                   int dim0, int dim1, int dim2,
                   real r_comm2, real r_bcomm2,
                   matrix box,
                   ivec tric_dist,
                   rvec *normal,
                   real skew_fac2_d, real skew_fac_01,
                   rvec *v_d, rvec *v_0, rvec *v_1,
                   const dd_corners_t *c,
                   rvec sf2_round,
                   gmx_bool bDistBonded,
                   gmx_bool bBondComm,
                   gmx_bool bDist2B,
                   gmx_bool bDistMB,
                   rvec *cg_cm,
                   int *cginfo,
                   gmx_domdec_ind_t *ind,
                   int **ibuf, int *ibuf_nalloc,
                   vec_rvec_t *vbuf,
                   int *nsend_ptr,
                   int *nat_ptr,
                   int *nsend_z_ptr)
{
    gmx_domdec_comm_t *comm;
    gmx_bool           bScrew;
    gmx_bool           bDistMB_pulse;
    int                cg, i;
    real               r2, rb2, r, tric_sh;
    rvec               rn, rb;
    int                dimd;
    int                nsend_z, nsend, nat;

    comm = dd->comm;

    /* Screw pbc only affects communication along x */
    bScrew = (dd->bScrewPBC && dim == XX);

    /* Only compute the multi-body distance rb2 when it is actually used */
    bDistMB_pulse = (bDistMB && bDistBonded);

    nsend_z = 0;
    nsend   = *nsend_ptr;
    nat     = *nat_ptr;

    for (cg = cg0; cg < cg1; cg++)
    {
        /* r2: squared distance for the non-bonded (and two-body) check,
         * rb2: squared distance for the multi-body bonded check.
         */
        r2  = 0;
        rb2 = 0;
        if (tric_dist[dim_ind] == 0)
        {
            /* Rectangular direction, easy */
            r = cg_cm[cg][dim] - c->c[dim_ind][zone];
            if (r > 0)
            {
                r2 += r*r;
            }
            if (bDistMB_pulse)
            {
                r = cg_cm[cg][dim] - c->bc[dim_ind];
                if (r > 0)
                {
                    rb2 += r*r;
                }
            }
            /* Rounding gives at most a 16% reduction
             * in communicated atoms
             */
            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
            {
                r = cg_cm[cg][dim0] - c->cr0;
                /* This is the first dimension, so always r >= 0 */
                r2 += r*r;
                if (bDistMB_pulse)
                {
                    rb2 += r*r;
                }
            }
            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
            {
                r = cg_cm[cg][dim1] - c->cr1[zone];
                if (r > 0)
                {
                    r2 += r*r;
                }
                if (bDistMB_pulse)
                {
                    r = cg_cm[cg][dim1] - c->bcr1;
                    if (r > 0)
                    {
                        rb2 += r*r;
                    }
                }
            }
        }
        else
        {
            /* Triclinic direction, more complicated */
            clear_rvec(rn);
            clear_rvec(rb);
            /* Rounding, conservative as the skew_fac multiplication
             * will slightly underestimate the distance.
             */
            if (dim_ind >= 1 && (zonei == 1 || zonei == 2))
            {
                rn[dim0] = cg_cm[cg][dim0] - c->cr0;
                for (i = dim0+1; i < DIM; i++)
                {
                    rn[dim0] -= cg_cm[cg][i]*v_0[i][dim0];
                }
                r2 = rn[dim0]*rn[dim0]*sf2_round[dim0];
                if (bDistMB_pulse)
                {
                    rb[dim0] = rn[dim0];
                    rb2      = r2;
                }
                /* Take care that the cell planes along dim0 might not
                 * be orthogonal to those along dim1 and dim2.
                 */
                for (i = 1; i <= dim_ind; i++)
                {
                    dimd = dd->dim[i];
                    if (normal[dim0][dimd] > 0)
                    {
                        rn[dimd] -= rn[dim0]*normal[dim0][dimd];
                        if (bDistMB_pulse)
                        {
                            rb[dimd] -= rb[dim0]*normal[dim0][dimd];
                        }
                    }
                }
            }
            if (dim_ind == 2 && (zonei == 2 || zonei == 3))
            {
                rn[dim1] += cg_cm[cg][dim1] - c->cr1[zone];
                tric_sh   = 0;
                for (i = dim1+1; i < DIM; i++)
                {
                    tric_sh -= cg_cm[cg][i]*v_1[i][dim1];
                }
                rn[dim1] += tric_sh;
                if (rn[dim1] > 0)
                {
                    r2 += rn[dim1]*rn[dim1]*sf2_round[dim1];
                    /* Take care of coupling of the distances
                     * to the planes along dim0 and dim1 through dim2.
                     */
                    r2 -= rn[dim0]*rn[dim1]*skew_fac_01;
                    /* Take care that the cell planes along dim1
                     * might not be orthogonal to that along dim2.
                     */
                    if (normal[dim1][dim2] > 0)
                    {
                        rn[dim2] -= rn[dim1]*normal[dim1][dim2];
                    }
                }
                if (bDistMB_pulse)
                {
                    rb[dim1] +=
                        cg_cm[cg][dim1] - c->bcr1 + tric_sh;
                    if (rb[dim1] > 0)
                    {
                        rb2 += rb[dim1]*rb[dim1]*sf2_round[dim1];
                        /* Take care of coupling of the distances
                         * to the planes along dim0 and dim1 through dim2.
                         */
                        rb2 -= rb[dim0]*rb[dim1]*skew_fac_01;
                        /* Take care that the cell planes along dim1
                         * might not be orthogonal to that along dim2.
                         */
                        if (normal[dim1][dim2] > 0)
                        {
                            rb[dim2] -= rb[dim1]*normal[dim1][dim2];
                        }
                    }
                }
            }
            /* The distance along the communication direction */
            rn[dim] += cg_cm[cg][dim] - c->c[dim_ind][zone];
            tric_sh  = 0;
            for (i = dim+1; i < DIM; i++)
            {
                tric_sh -= cg_cm[cg][i]*v_d[i][dim];
            }
            rn[dim] += tric_sh;
            if (rn[dim] > 0)
            {
                r2 += rn[dim]*rn[dim]*skew_fac2_d;
                /* Take care of coupling of the distances
                 * to the planes along dim0 and dim1 through dim2.
                 */
                if (dim_ind == 1 && zonei == 1)
                {
                    r2 -= rn[dim0]*rn[dim]*skew_fac_01;
                }
            }
            if (bDistMB_pulse)
            {
                /* NOTE(review): rb is cleared here, which discards the
                 * rb components accumulated by the rounding code above
                 * (rb2 itself is kept) - confirm this is intended.
                 */
                clear_rvec(rb);
                rb[dim] += cg_cm[cg][dim] - c->bc[dim_ind] + tric_sh;
                if (rb[dim] > 0)
                {
                    rb2 += rb[dim]*rb[dim]*skew_fac2_d;
                    /* Take care of coupling of the distances
                     * to the planes along dim0 and dim1 through dim2.
                     */
                    if (dim_ind == 1 && zonei == 1)
                    {
                        rb2 -= rb[dim0]*rb[dim]*skew_fac_01;
                    }
                }
            }
        }

        /* Select this cg when it is within the non-bonded distance, or
         * when it is within the bonded distance and, with bBondComm,
         * it has inter-cg bondeds with a link that is not local.
         */
        if (r2 < r_comm2 ||
            (bDistBonded &&
             ((bDistMB && rb2 < r_bcomm2) ||
              (bDist2B && r2  < r_bcomm2)) &&
             (!bBondComm ||
              (GET_CGINFO_BOND_INTER(cginfo[cg]) &&
               missing_link(comm->cglink, index_gl[cg],
                            comm->bLocalCG)))))
        {
            /* Make an index to the local charge groups */
            if (nsend+1 > ind->nalloc)
            {
                ind->nalloc = over_alloc_large(nsend+1);
                srenew(ind->index, ind->nalloc);
            }
            if (nsend+1 > *ibuf_nalloc)
            {
                *ibuf_nalloc = over_alloc_large(nsend+1);
                srenew(*ibuf, *ibuf_nalloc);
            }
            ind->index[nsend] = cg;
            (*ibuf)[nsend]    = index_gl[cg];
            nsend_z++;
            vec_rvec_check_alloc(vbuf, nsend+1);

            if (dd->ci[dim] == 0)
            {
                /* Correct cg_cm for pbc */
                rvec_add(cg_cm[cg], box[dim], vbuf->v[nsend]);
                if (bScrew)
                {
                    /* Screw pbc: mirror the y and z components in the box */
                    vbuf->v[nsend][YY] = box[YY][YY] - vbuf->v[nsend][YY];
                    vbuf->v[nsend][ZZ] = box[ZZ][ZZ] - vbuf->v[nsend][ZZ];
                }
            }
            else
            {
                copy_rvec(cg_cm[cg], vbuf->v[nsend]);
            }
            nsend++;
            /* Count the atoms of this charge group */
            nat += cgindex[cg+1] - cgindex[cg];
        }
    }

    *nsend_ptr   = nsend;
    *nat_ptr     = nat;
    *nsend_z_ptr = nsend_z;
}
8099
8100 static void setup_dd_communication(gmx_domdec_t *dd,
8101                                    matrix box, gmx_ddbox_t *ddbox,
8102                                    t_forcerec *fr, t_state *state, rvec **f)
8103 {
8104     int                    dim_ind, dim, dim0, dim1, dim2, dimd, p, nat_tot;
8105     int                    nzone, nzone_send, zone, zonei, cg0, cg1;
8106     int                    c, i, j, cg, cg_gl, nrcg;
8107     int                   *zone_cg_range, pos_cg, *index_gl, *cgindex, *recv_i;
8108     gmx_domdec_comm_t     *comm;
8109     gmx_domdec_zones_t    *zones;
8110     gmx_domdec_comm_dim_t *cd;
8111     gmx_domdec_ind_t      *ind;
8112     cginfo_mb_t           *cginfo_mb;
8113     gmx_bool               bBondComm, bDist2B, bDistMB, bDistBonded;
8114     real                   r_mb, r_comm2, r_scomm2, r_bcomm2, r_0, r_1, r2inc, inv_ncg;
8115     dd_corners_t           corners;
8116     ivec                   tric_dist;
8117     rvec                  *cg_cm, *normal, *v_d, *v_0 = NULL, *v_1 = NULL, *recv_vr;
8118     real                   skew_fac2_d, skew_fac_01;
8119     rvec                   sf2_round;
8120     int                    nsend, nat;
8121     int                    th;
8122
8123     if (debug)
8124     {
8125         fprintf(debug, "Setting up DD communication\n");
8126     }
8127
8128     comm  = dd->comm;
8129
8130     switch (fr->cutoff_scheme)
8131     {
8132         case ecutsGROUP:
8133             cg_cm = fr->cg_cm;
8134             break;
8135         case ecutsVERLET:
8136             cg_cm = state->x;
8137             break;
8138         default:
8139             gmx_incons("unimplemented");
8140             cg_cm = NULL;
8141     }
8142
8143     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8144     {
8145         dim = dd->dim[dim_ind];
8146
8147         /* Check if we need to use triclinic distances */
8148         tric_dist[dim_ind] = 0;
8149         for (i = 0; i <= dim_ind; i++)
8150         {
8151             if (ddbox->tric_dir[dd->dim[i]])
8152             {
8153                 tric_dist[dim_ind] = 1;
8154             }
8155         }
8156     }
8157
8158     bBondComm = comm->bBondComm;
8159
8160     /* Do we need to determine extra distances for multi-body bondeds? */
8161     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8162
8163     /* Do we need to determine extra distances for only two-body bondeds? */
8164     bDist2B = (bBondComm && !bDistMB);
8165
8166     r_comm2  = sqr(comm->cutoff);
8167     r_bcomm2 = sqr(comm->cutoff_mbody);
8168
8169     if (debug)
8170     {
8171         fprintf(debug, "bBondComm %d, r_bc %f\n", bBondComm, sqrt(r_bcomm2));
8172     }
8173
8174     zones = &comm->zones;
8175
8176     dim0 = dd->dim[0];
8177     dim1 = (dd->ndim >= 2 ? dd->dim[1] : -1);
8178     dim2 = (dd->ndim >= 3 ? dd->dim[2] : -1);
8179
8180     set_dd_corners(dd, dim0, dim1, dim2, bDistMB, &corners);
8181
8182     /* Triclinic stuff */
8183     normal      = ddbox->normal;
8184     skew_fac_01 = 0;
8185     if (dd->ndim >= 2)
8186     {
8187         v_0 = ddbox->v[dim0];
8188         if (ddbox->tric_dir[dim0] && ddbox->tric_dir[dim1])
8189         {
8190             /* Determine the coupling coefficient for the distances
8191              * to the cell planes along dim0 and dim1 through dim2.
8192              * This is required for correct rounding.
8193              */
8194             skew_fac_01 =
8195                 ddbox->v[dim0][dim1+1][dim0]*ddbox->v[dim1][dim1+1][dim1];
8196             if (debug)
8197             {
8198                 fprintf(debug, "\nskew_fac_01 %f\n", skew_fac_01);
8199             }
8200         }
8201     }
8202     if (dd->ndim >= 3)
8203     {
8204         v_1 = ddbox->v[dim1];
8205     }
8206
8207     zone_cg_range = zones->cg_range;
8208     index_gl      = dd->index_gl;
8209     cgindex       = dd->cgindex;
8210     cginfo_mb     = fr->cginfo_mb;
8211
8212     zone_cg_range[0]   = 0;
8213     zone_cg_range[1]   = dd->ncg_home;
8214     comm->zone_ncg1[0] = dd->ncg_home;
8215     pos_cg             = dd->ncg_home;
8216
8217     nat_tot = dd->nat_home;
8218     nzone   = 1;
8219     for (dim_ind = 0; dim_ind < dd->ndim; dim_ind++)
8220     {
8221         dim = dd->dim[dim_ind];
8222         cd  = &comm->cd[dim_ind];
8223
8224         if (dim >= ddbox->npbcdim && dd->ci[dim] == 0)
8225         {
8226             /* No pbc in this dimension, the first node should not comm. */
8227             nzone_send = 0;
8228         }
8229         else
8230         {
8231             nzone_send = nzone;
8232         }
8233
8234         v_d         = ddbox->v[dim];
8235         skew_fac2_d = sqr(ddbox->skew_fac[dim]);
8236
8237         cd->bInPlace = TRUE;
8238         for (p = 0; p < cd->np; p++)
8239         {
8240             /* Only atoms communicated in the first pulse are used
8241              * for multi-body bonded interactions or for bBondComm.
8242              */
8243             bDistBonded = ((bDistMB || bDist2B) && p == 0);
8244
8245             ind   = &cd->ind[p];
8246             nsend = 0;
8247             nat   = 0;
8248             for (zone = 0; zone < nzone_send; zone++)
8249             {
8250                 if (tric_dist[dim_ind] && dim_ind > 0)
8251                 {
8252                     /* Determine slightly more optimized skew_fac's
8253                      * for rounding.
8254                      * This reduces the number of communicated atoms
8255                      * by about 10% for 3D DD of rhombic dodecahedra.
8256                      */
8257                     for (dimd = 0; dimd < dim; dimd++)
8258                     {
8259                         sf2_round[dimd] = 1;
8260                         if (ddbox->tric_dir[dimd])
8261                         {
8262                             for (i = dd->dim[dimd]+1; i < DIM; i++)
8263                             {
8264                                 /* If we are shifted in dimension i
8265                                  * and the cell plane is tilted forward
8266                                  * in dimension i, skip this coupling.
8267                                  */
8268                                 if (!(zones->shift[nzone+zone][i] &&
8269                                       ddbox->v[dimd][i][dimd] >= 0))
8270                                 {
8271                                     sf2_round[dimd] +=
8272                                         sqr(ddbox->v[dimd][i][dimd]);
8273                                 }
8274                             }
8275                             sf2_round[dimd] = 1/sf2_round[dimd];
8276                         }
8277                     }
8278                 }
8279
8280                 zonei = zone_perm[dim_ind][zone];
8281                 if (p == 0)
8282                 {
8283                     /* Here we permutate the zones to obtain a convenient order
8284                      * for neighbor searching
8285                      */
8286                     cg0 = zone_cg_range[zonei];
8287                     cg1 = zone_cg_range[zonei+1];
8288                 }
8289                 else
8290                 {
8291                     /* Look only at the cg's received in the previous grid pulse
8292                      */
8293                     cg1 = zone_cg_range[nzone+zone+1];
8294                     cg0 = cg1 - cd->ind[p-1].nrecv[zone];
8295                 }
8296
8297 #pragma omp parallel for num_threads(comm->nth) schedule(static)
8298                 for (th = 0; th < comm->nth; th++)
8299                 {
8300                     gmx_domdec_ind_t *ind_p;
8301                     int             **ibuf_p, *ibuf_nalloc_p;
8302                     vec_rvec_t       *vbuf_p;
8303                     int              *nsend_p, *nat_p;
8304                     int              *nsend_zone_p;
8305                     int               cg0_th, cg1_th;
8306
8307                     if (th == 0)
8308                     {
8309                         /* Thread 0 writes in the comm buffers */
8310                         ind_p         = ind;
8311                         ibuf_p        = &comm->buf_int;
8312                         ibuf_nalloc_p = &comm->nalloc_int;
8313                         vbuf_p        = &comm->vbuf;
8314                         nsend_p       = &nsend;
8315                         nat_p         = &nat;
8316                         nsend_zone_p  = &ind->nsend[zone];
8317                     }
8318                     else
8319                     {
8320                         /* Other threads write into temp buffers */
8321                         ind_p         = &comm->dth[th].ind;
8322                         ibuf_p        = &comm->dth[th].ibuf;
8323                         ibuf_nalloc_p = &comm->dth[th].ibuf_nalloc;
8324                         vbuf_p        = &comm->dth[th].vbuf;
8325                         nsend_p       = &comm->dth[th].nsend;
8326                         nat_p         = &comm->dth[th].nat;
8327                         nsend_zone_p  = &comm->dth[th].nsend_zone;
8328
8329                         comm->dth[th].nsend      = 0;
8330                         comm->dth[th].nat        = 0;
8331                         comm->dth[th].nsend_zone = 0;
8332                     }
8333
8334                     if (comm->nth == 1)
8335                     {
8336                         cg0_th = cg0;
8337                         cg1_th = cg1;
8338                     }
8339                     else
8340                     {
8341                         cg0_th = cg0 + ((cg1 - cg0)* th   )/comm->nth;
8342                         cg1_th = cg0 + ((cg1 - cg0)*(th+1))/comm->nth;
8343                     }
8344
8345                     /* Get the cg's for this pulse in this zone */
8346                     get_zone_pulse_cgs(dd, zonei, zone, cg0_th, cg1_th,
8347                                        index_gl, cgindex,
8348                                        dim, dim_ind, dim0, dim1, dim2,
8349                                        r_comm2, r_bcomm2,
8350                                        box, tric_dist,
8351                                        normal, skew_fac2_d, skew_fac_01,
8352                                        v_d, v_0, v_1, &corners, sf2_round,
8353                                        bDistBonded, bBondComm,
8354                                        bDist2B, bDistMB,
8355                                        cg_cm, fr->cginfo,
8356                                        ind_p,
8357                                        ibuf_p, ibuf_nalloc_p,
8358                                        vbuf_p,
8359                                        nsend_p, nat_p,
8360                                        nsend_zone_p);
8361                 }
8362
8363                 /* Append data of threads>=1 to the communication buffers */
8364                 for (th = 1; th < comm->nth; th++)
8365                 {
8366                     dd_comm_setup_work_t *dth;
8367                     int                   i, ns1;
8368
8369                     dth = &comm->dth[th];
8370
8371                     ns1 = nsend + dth->nsend_zone;
8372                     if (ns1 > ind->nalloc)
8373                     {
8374                         ind->nalloc = over_alloc_dd(ns1);
8375                         srenew(ind->index, ind->nalloc);
8376                     }
8377                     if (ns1 > comm->nalloc_int)
8378                     {
8379                         comm->nalloc_int = over_alloc_dd(ns1);
8380                         srenew(comm->buf_int, comm->nalloc_int);
8381                     }
8382                     if (ns1 > comm->vbuf.nalloc)
8383                     {
8384                         comm->vbuf.nalloc = over_alloc_dd(ns1);
8385                         srenew(comm->vbuf.v, comm->vbuf.nalloc);
8386                     }
8387
8388                     for (i = 0; i < dth->nsend_zone; i++)
8389                     {
8390                         ind->index[nsend]    = dth->ind.index[i];
8391                         comm->buf_int[nsend] = dth->ibuf[i];
8392                         copy_rvec(dth->vbuf.v[i],
8393                                   comm->vbuf.v[nsend]);
8394                         nsend++;
8395                     }
8396                     nat              += dth->nat;
8397                     ind->nsend[zone] += dth->nsend_zone;
8398                 }
8399             }
8400             /* Clear the counts in case we do not have pbc */
8401             for (zone = nzone_send; zone < nzone; zone++)
8402             {
8403                 ind->nsend[zone] = 0;
8404             }
8405             ind->nsend[nzone]   = nsend;
8406             ind->nsend[nzone+1] = nat;
8407             /* Communicate the number of cg's and atoms to receive */
8408             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8409                             ind->nsend, nzone+2,
8410                             ind->nrecv, nzone+2);
8411
8412             /* The rvec buffer is also required for atom buffers of size nsend
8413              * in dd_move_x and dd_move_f.
8414              */
8415             vec_rvec_check_alloc(&comm->vbuf, ind->nsend[nzone+1]);
8416
8417             if (p > 0)
8418             {
8419                 /* We can receive in place if only the last zone is not empty */
8420                 for (zone = 0; zone < nzone-1; zone++)
8421                 {
8422                     if (ind->nrecv[zone] > 0)
8423                     {
8424                         cd->bInPlace = FALSE;
8425                     }
8426                 }
8427                 if (!cd->bInPlace)
8428                 {
8429                     /* The int buffer is only required here for the cg indices */
8430                     if (ind->nrecv[nzone] > comm->nalloc_int2)
8431                     {
8432                         comm->nalloc_int2 = over_alloc_dd(ind->nrecv[nzone]);
8433                         srenew(comm->buf_int2, comm->nalloc_int2);
8434                     }
8435                     /* The rvec buffer is also required for atom buffers
8436                      * of size nrecv in dd_move_x and dd_move_f.
8437                      */
8438                     i = max(cd->ind[0].nrecv[nzone+1], ind->nrecv[nzone+1]);
8439                     vec_rvec_check_alloc(&comm->vbuf2, i);
8440                 }
8441             }
8442
8443             /* Make space for the global cg indices */
8444             if (pos_cg + ind->nrecv[nzone] > dd->cg_nalloc
8445                 || dd->cg_nalloc == 0)
8446             {
8447                 dd->cg_nalloc = over_alloc_dd(pos_cg + ind->nrecv[nzone]);
8448                 srenew(index_gl, dd->cg_nalloc);
8449                 srenew(cgindex, dd->cg_nalloc+1);
8450             }
8451             /* Communicate the global cg indices */
8452             if (cd->bInPlace)
8453             {
8454                 recv_i = index_gl + pos_cg;
8455             }
8456             else
8457             {
8458                 recv_i = comm->buf_int2;
8459             }
8460             dd_sendrecv_int(dd, dim_ind, dddirBackward,
8461                             comm->buf_int, nsend,
8462                             recv_i,        ind->nrecv[nzone]);
8463
8464             /* Make space for cg_cm */
8465             dd_check_alloc_ncg(fr, state, f, pos_cg + ind->nrecv[nzone]);
8466             if (fr->cutoff_scheme == ecutsGROUP)
8467             {
8468                 cg_cm = fr->cg_cm;
8469             }
8470             else
8471             {
8472                 cg_cm = state->x;
8473             }
8474             /* Communicate cg_cm */
8475             if (cd->bInPlace)
8476             {
8477                 recv_vr = cg_cm + pos_cg;
8478             }
8479             else
8480             {
8481                 recv_vr = comm->vbuf2.v;
8482             }
8483             dd_sendrecv_rvec(dd, dim_ind, dddirBackward,
8484                              comm->vbuf.v, nsend,
8485                              recv_vr,      ind->nrecv[nzone]);
8486
8487             /* Make the charge group index */
8488             if (cd->bInPlace)
8489             {
8490                 zone = (p == 0 ? 0 : nzone - 1);
8491                 while (zone < nzone)
8492                 {
8493                     for (cg = 0; cg < ind->nrecv[zone]; cg++)
8494                     {
8495                         cg_gl              = index_gl[pos_cg];
8496                         fr->cginfo[pos_cg] = ddcginfo(cginfo_mb, cg_gl);
8497                         nrcg               = GET_CGINFO_NATOMS(fr->cginfo[pos_cg]);
8498                         cgindex[pos_cg+1]  = cgindex[pos_cg] + nrcg;
8499                         if (bBondComm)
8500                         {
8501                             /* Update the charge group presence,
8502                              * so we can use it in the next pass of the loop.
8503                              */
8504                             comm->bLocalCG[cg_gl] = TRUE;
8505                         }
8506                         pos_cg++;
8507                     }
8508                     if (p == 0)
8509                     {
8510                         comm->zone_ncg1[nzone+zone] = ind->nrecv[zone];
8511                     }
8512                     zone++;
8513                     zone_cg_range[nzone+zone] = pos_cg;
8514                 }
8515             }
8516             else
8517             {
8518                 /* This part of the code is never executed with bBondComm. */
8519                 merge_cg_buffers(nzone, cd, p, zone_cg_range,
8520                                  index_gl, recv_i, cg_cm, recv_vr,
8521                                  cgindex, fr->cginfo_mb, fr->cginfo);
8522                 pos_cg += ind->nrecv[nzone];
8523             }
8524             nat_tot += ind->nrecv[nzone+1];
8525         }
8526         if (!cd->bInPlace)
8527         {
8528             /* Store the atom block for easy copying of communication buffers */
8529             make_cell2at_index(cd, nzone, zone_cg_range[nzone], cgindex);
8530         }
8531         nzone += nzone;
8532     }
8533     dd->index_gl = index_gl;
8534     dd->cgindex  = cgindex;
8535
8536     dd->ncg_tot          = zone_cg_range[zones->n];
8537     dd->nat_tot          = nat_tot;
8538     comm->nat[ddnatHOME] = dd->nat_home;
8539     for (i = ddnatZONE; i < ddnatNR; i++)
8540     {
8541         comm->nat[i] = dd->nat_tot;
8542     }
8543
8544     if (!bBondComm)
8545     {
8546         /* We don't need to update cginfo, since that was alrady done above.
8547          * So we pass NULL for the forcerec.
8548          */
8549         dd_set_cginfo(dd->index_gl, dd->ncg_home, dd->ncg_tot,
8550                       NULL, comm->bLocalCG);
8551     }
8552
8553     if (debug)
8554     {
8555         fprintf(debug, "Finished setting up DD communication, zones:");
8556         for (c = 0; c < zones->n; c++)
8557         {
8558             fprintf(debug, " %d", zones->cg_range[c+1]-zones->cg_range[c]);
8559         }
8560         fprintf(debug, "\n");
8561     }
8562 }
8563
8564 static void set_cg_boundaries(gmx_domdec_zones_t *zones)
8565 {
8566     int c;
8567
8568     for (c = 0; c < zones->nizone; c++)
8569     {
8570         zones->izone[c].cg1  = zones->cg_range[c+1];
8571         zones->izone[c].jcg0 = zones->cg_range[zones->izone[c].j0];
8572         zones->izone[c].jcg1 = zones->cg_range[zones->izone[c].j1];
8573     }
8574 }
8575
8576 static void set_zones_size(gmx_domdec_t *dd,
8577                            matrix box, const gmx_ddbox_t *ddbox,
8578                            int zone_start, int zone_end)
8579 {
8580     gmx_domdec_comm_t  *comm;
8581     gmx_domdec_zones_t *zones;
8582     gmx_bool            bDistMB;
8583     int                 z, zi, zj0, zj1, d, dim;
8584     real                rcs, rcmbs;
8585     int                 i, j;
8586     real                size_j, add_tric;
8587     real                vol;
8588
8589     comm = dd->comm;
8590
8591     zones = &comm->zones;
8592
8593     /* Do we need to determine extra distances for multi-body bondeds? */
8594     bDistMB = (comm->bInterCGMultiBody && dd->bGridJump && dd->ndim > 1);
8595
8596     for (z = zone_start; z < zone_end; z++)
8597     {
8598         /* Copy cell limits to zone limits.
8599          * Valid for non-DD dims and non-shifted dims.
8600          */
8601         copy_rvec(comm->cell_x0, zones->size[z].x0);
8602         copy_rvec(comm->cell_x1, zones->size[z].x1);
8603     }
8604
8605     for (d = 0; d < dd->ndim; d++)
8606     {
8607         dim = dd->dim[d];
8608
8609         for (z = 0; z < zones->n; z++)
8610         {
8611             /* With a staggered grid we have different sizes
8612              * for non-shifted dimensions.
8613              */
8614             if (dd->bGridJump && zones->shift[z][dim] == 0)
8615             {
8616                 if (d == 1)
8617                 {
8618                     zones->size[z].x0[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min0;
8619                     zones->size[z].x1[dim] = comm->zone_d1[zones->shift[z][dd->dim[d-1]]].max1;
8620                 }
8621                 else if (d == 2)
8622                 {
8623                     zones->size[z].x0[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min0;
8624                     zones->size[z].x1[dim] = comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].max1;
8625                 }
8626             }
8627         }
8628
8629         rcs   = comm->cutoff;
8630         rcmbs = comm->cutoff_mbody;
8631         if (ddbox->tric_dir[dim])
8632         {
8633             rcs   /= ddbox->skew_fac[dim];
8634             rcmbs /= ddbox->skew_fac[dim];
8635         }
8636
8637         /* Set the lower limit for the shifted zone dimensions */
8638         for (z = zone_start; z < zone_end; z++)
8639         {
8640             if (zones->shift[z][dim] > 0)
8641             {
8642                 dim = dd->dim[d];
8643                 if (!dd->bGridJump || d == 0)
8644                 {
8645                     zones->size[z].x0[dim] = comm->cell_x1[dim];
8646                     zones->size[z].x1[dim] = comm->cell_x1[dim] + rcs;
8647                 }
8648                 else
8649                 {
8650                     /* Here we take the lower limit of the zone from
8651                      * the lowest domain of the zone below.
8652                      */
8653                     if (z < 4)
8654                     {
8655                         zones->size[z].x0[dim] =
8656                             comm->zone_d1[zones->shift[z][dd->dim[d-1]]].min1;
8657                     }
8658                     else
8659                     {
8660                         if (d == 1)
8661                         {
8662                             zones->size[z].x0[dim] =
8663                                 zones->size[zone_perm[2][z-4]].x0[dim];
8664                         }
8665                         else
8666                         {
8667                             zones->size[z].x0[dim] =
8668                                 comm->zone_d2[zones->shift[z][dd->dim[d-2]]][zones->shift[z][dd->dim[d-1]]].min1;
8669                         }
8670                     }
8671                     /* A temporary limit, is updated below */
8672                     zones->size[z].x1[dim] = zones->size[z].x0[dim];
8673
8674                     if (bDistMB)
8675                     {
8676                         for (zi = 0; zi < zones->nizone; zi++)
8677                         {
8678                             if (zones->shift[zi][dim] == 0)
8679                             {
8680                                 /* This takes the whole zone into account.
8681                                  * With multiple pulses this will lead
8682                                  * to a larger zone then strictly necessary.
8683                                  */
8684                                 zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8685                                                              zones->size[zi].x1[dim]+rcmbs);
8686                             }
8687                         }
8688                     }
8689                 }
8690             }
8691         }
8692
8693         /* Loop over the i-zones to set the upper limit of each
8694          * j-zone they see.
8695          */
8696         for (zi = 0; zi < zones->nizone; zi++)
8697         {
8698             if (zones->shift[zi][dim] == 0)
8699             {
8700                 for (z = zones->izone[zi].j0; z < zones->izone[zi].j1; z++)
8701                 {
8702                     if (zones->shift[z][dim] > 0)
8703                     {
8704                         zones->size[z].x1[dim] = max(zones->size[z].x1[dim],
8705                                                      zones->size[zi].x1[dim]+rcs);
8706                     }
8707                 }
8708             }
8709         }
8710     }
8711
8712     for (z = zone_start; z < zone_end; z++)
8713     {
8714         /* Initialization only required to keep the compiler happy */
8715         rvec corner_min = {0, 0, 0}, corner_max = {0, 0, 0}, corner;
8716         int  nc, c;
8717
8718         /* To determine the bounding box for a zone we need to find
8719          * the extreme corners of 4, 2 or 1 corners.
8720          */
8721         nc = 1 << (ddbox->npbcdim - 1);
8722
8723         for (c = 0; c < nc; c++)
8724         {
8725             /* Set up a zone corner at x=0, ignoring trilinic couplings */
8726             corner[XX] = 0;
8727             if ((c & 1) == 0)
8728             {
8729                 corner[YY] = zones->size[z].x0[YY];
8730             }
8731             else
8732             {
8733                 corner[YY] = zones->size[z].x1[YY];
8734             }
8735             if ((c & 2) == 0)
8736             {
8737                 corner[ZZ] = zones->size[z].x0[ZZ];
8738             }
8739             else
8740             {
8741                 corner[ZZ] = zones->size[z].x1[ZZ];
8742             }
8743             if (dd->ndim == 1 && box[ZZ][YY] != 0)
8744             {
8745                 /* With 1D domain decomposition the cg's are not in
8746                  * the triclinic box, but triclinic x-y and rectangular y-z.
8747                  * Shift y back, so it will later end up at 0.
8748                  */
8749                 corner[YY] -= corner[ZZ]*box[ZZ][YY]/box[ZZ][ZZ];
8750             }
8751             /* Apply the triclinic couplings */
8752             assert(ddbox->npbcdim <= DIM);
8753             for (i = YY; i < ddbox->npbcdim; i++)
8754             {
8755                 for (j = XX; j < i; j++)
8756                 {
8757                     corner[j] += corner[i]*box[i][j]/box[i][i];
8758                 }
8759             }
8760             if (c == 0)
8761             {
8762                 copy_rvec(corner, corner_min);
8763                 copy_rvec(corner, corner_max);
8764             }
8765             else
8766             {
8767                 for (i = 0; i < DIM; i++)
8768                 {
8769                     corner_min[i] = min(corner_min[i], corner[i]);
8770                     corner_max[i] = max(corner_max[i], corner[i]);
8771                 }
8772             }
8773         }
8774         /* Copy the extreme cornes without offset along x */
8775         for (i = 0; i < DIM; i++)
8776         {
8777             zones->size[z].bb_x0[i] = corner_min[i];
8778             zones->size[z].bb_x1[i] = corner_max[i];
8779         }
8780         /* Add the offset along x */
8781         zones->size[z].bb_x0[XX] += zones->size[z].x0[XX];
8782         zones->size[z].bb_x1[XX] += zones->size[z].x1[XX];
8783     }
8784
8785     if (zone_start == 0)
8786     {
8787         vol = 1;
8788         for (dim = 0; dim < DIM; dim++)
8789         {
8790             vol *= zones->size[0].x1[dim] - zones->size[0].x0[dim];
8791         }
8792         zones->dens_zone0 = (zones->cg_range[1] - zones->cg_range[0])/vol;
8793     }
8794
8795     if (debug)
8796     {
8797         for (z = zone_start; z < zone_end; z++)
8798         {
8799             fprintf(debug, "zone %d    %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8800                     z,
8801                     zones->size[z].x0[XX], zones->size[z].x1[XX],
8802                     zones->size[z].x0[YY], zones->size[z].x1[YY],
8803                     zones->size[z].x0[ZZ], zones->size[z].x1[ZZ]);
8804             fprintf(debug, "zone %d bb %6.3f - %6.3f  %6.3f - %6.3f  %6.3f - %6.3f\n",
8805                     z,
8806                     zones->size[z].bb_x0[XX], zones->size[z].bb_x1[XX],
8807                     zones->size[z].bb_x0[YY], zones->size[z].bb_x1[YY],
8808                     zones->size[z].bb_x0[ZZ], zones->size[z].bb_x1[ZZ]);
8809         }
8810     }
8811 }
8812
8813 static int comp_cgsort(const void *a, const void *b)
8814 {
8815     int           comp;
8816
8817     gmx_cgsort_t *cga, *cgb;
8818     cga = (gmx_cgsort_t *)a;
8819     cgb = (gmx_cgsort_t *)b;
8820
8821     comp = cga->nsc - cgb->nsc;
8822     if (comp == 0)
8823     {
8824         comp = cga->ind_gl - cgb->ind_gl;
8825     }
8826
8827     return comp;
8828 }
8829
8830 static void order_int_cg(int n, const gmx_cgsort_t *sort,
8831                          int *a, int *buf)
8832 {
8833     int i;
8834
8835     /* Order the data */
8836     for (i = 0; i < n; i++)
8837     {
8838         buf[i] = a[sort[i].ind];
8839     }
8840
8841     /* Copy back to the original array */
8842     for (i = 0; i < n; i++)
8843     {
8844         a[i] = buf[i];
8845     }
8846 }
8847
8848 static void order_vec_cg(int n, const gmx_cgsort_t *sort,
8849                          rvec *v, rvec *buf)
8850 {
8851     int i;
8852
8853     /* Order the data */
8854     for (i = 0; i < n; i++)
8855     {
8856         copy_rvec(v[sort[i].ind], buf[i]);
8857     }
8858
8859     /* Copy back to the original array */
8860     for (i = 0; i < n; i++)
8861     {
8862         copy_rvec(buf[i], v[i]);
8863     }
8864 }
8865
8866 static void order_vec_atom(int ncg, const int *cgindex, const gmx_cgsort_t *sort,
8867                            rvec *v, rvec *buf)
8868 {
8869     int a, atot, cg, cg0, cg1, i;
8870
8871     if (cgindex == NULL)
8872     {
8873         /* Avoid the useless loop of the atoms within a cg */
8874         order_vec_cg(ncg, sort, v, buf);
8875
8876         return;
8877     }
8878
8879     /* Order the data */
8880     a = 0;
8881     for (cg = 0; cg < ncg; cg++)
8882     {
8883         cg0 = cgindex[sort[cg].ind];
8884         cg1 = cgindex[sort[cg].ind+1];
8885         for (i = cg0; i < cg1; i++)
8886         {
8887             copy_rvec(v[i], buf[a]);
8888             a++;
8889         }
8890     }
8891     atot = a;
8892
8893     /* Copy back to the original array */
8894     for (a = 0; a < atot; a++)
8895     {
8896         copy_rvec(buf[a], v[a]);
8897     }
8898 }
8899
8900 static void ordered_sort(int nsort2, gmx_cgsort_t *sort2,
8901                          int nsort_new, gmx_cgsort_t *sort_new,
8902                          gmx_cgsort_t *sort1)
8903 {
8904     int i1, i2, i_new;
8905
8906     /* The new indices are not very ordered, so we qsort them */
8907     gmx_qsort_threadsafe(sort_new, nsort_new, sizeof(sort_new[0]), comp_cgsort);
8908
8909     /* sort2 is already ordered, so now we can merge the two arrays */
8910     i1    = 0;
8911     i2    = 0;
8912     i_new = 0;
8913     while (i2 < nsort2 || i_new < nsort_new)
8914     {
8915         if (i2 == nsort2)
8916         {
8917             sort1[i1++] = sort_new[i_new++];
8918         }
8919         else if (i_new == nsort_new)
8920         {
8921             sort1[i1++] = sort2[i2++];
8922         }
8923         else if (sort2[i2].nsc < sort_new[i_new].nsc ||
8924                  (sort2[i2].nsc == sort_new[i_new].nsc &&
8925                   sort2[i2].ind_gl < sort_new[i_new].ind_gl))
8926         {
8927             sort1[i1++] = sort2[i2++];
8928         }
8929         else
8930         {
8931             sort1[i1++] = sort_new[i_new++];
8932         }
8933     }
8934 }
8935
8936 static int dd_sort_order(gmx_domdec_t *dd, t_forcerec *fr, int ncg_home_old)
8937 {
8938     gmx_domdec_sort_t *sort;
8939     gmx_cgsort_t      *cgsort, *sort_i;
8940     int                ncg_new, nsort2, nsort_new, i, *a, moved, *ibuf;
8941     int                sort_last, sort_skip;
8942
8943     sort = dd->comm->sort;
8944
8945     a = fr->ns.grid->cell_index;
8946
8947     moved = NSGRID_SIGNAL_MOVED_FAC*fr->ns.grid->ncells;
8948
8949     if (ncg_home_old >= 0)
8950     {
8951         /* The charge groups that remained in the same ns grid cell
8952          * are completely ordered. So we can sort efficiently by sorting
8953          * the charge groups that did move into the stationary list.
8954          */
8955         ncg_new   = 0;
8956         nsort2    = 0;
8957         nsort_new = 0;
8958         for (i = 0; i < dd->ncg_home; i++)
8959         {
8960             /* Check if this cg did not move to another node */
8961             if (a[i] < moved)
8962             {
8963                 if (i >= ncg_home_old || a[i] != sort->sort[i].nsc)
8964                 {
8965                     /* This cg is new on this node or moved ns grid cell */
8966                     if (nsort_new >= sort->sort_new_nalloc)
8967                     {
8968                         sort->sort_new_nalloc = over_alloc_dd(nsort_new+1);
8969                         srenew(sort->sort_new, sort->sort_new_nalloc);
8970                     }
8971                     sort_i = &(sort->sort_new[nsort_new++]);
8972                 }
8973                 else
8974                 {
8975                     /* This cg did not move */
8976                     sort_i = &(sort->sort2[nsort2++]);
8977                 }
8978                 /* Sort on the ns grid cell indices
8979                  * and the global topology index.
8980                  * index_gl is irrelevant with cell ns,
8981                  * but we set it here anyhow to avoid a conditional.
8982                  */
8983                 sort_i->nsc    = a[i];
8984                 sort_i->ind_gl = dd->index_gl[i];
8985                 sort_i->ind    = i;
8986                 ncg_new++;
8987             }
8988         }
8989         if (debug)
8990         {
8991             fprintf(debug, "ordered sort cgs: stationary %d moved %d\n",
8992                     nsort2, nsort_new);
8993         }
8994         /* Sort efficiently */
8995         ordered_sort(nsort2, sort->sort2, nsort_new, sort->sort_new,
8996                      sort->sort);
8997     }
8998     else
8999     {
9000         cgsort  = sort->sort;
9001         ncg_new = 0;
9002         for (i = 0; i < dd->ncg_home; i++)
9003         {
9004             /* Sort on the ns grid cell indices
9005              * and the global topology index
9006              */
9007             cgsort[i].nsc    = a[i];
9008             cgsort[i].ind_gl = dd->index_gl[i];
9009             cgsort[i].ind    = i;
9010             if (cgsort[i].nsc < moved)
9011             {
9012                 ncg_new++;
9013             }
9014         }
9015         if (debug)
9016         {
9017             fprintf(debug, "qsort cgs: %d new home %d\n", dd->ncg_home, ncg_new);
9018         }
9019         /* Determine the order of the charge groups using qsort */
9020         gmx_qsort_threadsafe(cgsort, dd->ncg_home, sizeof(cgsort[0]), comp_cgsort);
9021     }
9022
9023     return ncg_new;
9024 }
9025
9026 static int dd_sort_order_nbnxn(gmx_domdec_t *dd, t_forcerec *fr)
9027 {
9028     gmx_cgsort_t *sort;
9029     int           ncg_new, i, *a, na;
9030
9031     sort = dd->comm->sort->sort;
9032
9033     nbnxn_get_atomorder(fr->nbv->nbs, &a, &na);
9034
9035     ncg_new = 0;
9036     for (i = 0; i < na; i++)
9037     {
9038         if (a[i] >= 0)
9039         {
9040             sort[ncg_new].ind = a[i];
9041             ncg_new++;
9042         }
9043     }
9044
9045     return ncg_new;
9046 }
9047
9048 static void dd_sort_state(gmx_domdec_t *dd, rvec *cgcm, t_forcerec *fr, t_state *state,
9049                           int ncg_home_old)
9050 {
9051     gmx_domdec_sort_t *sort;
9052     gmx_cgsort_t      *cgsort, *sort_i;
9053     int               *cgindex;
9054     int                ncg_new, i, *ibuf, cgsize;
9055     rvec              *vbuf;
9056
9057     sort = dd->comm->sort;
9058
9059     if (dd->ncg_home > sort->sort_nalloc)
9060     {
9061         sort->sort_nalloc = over_alloc_dd(dd->ncg_home);
9062         srenew(sort->sort, sort->sort_nalloc);
9063         srenew(sort->sort2, sort->sort_nalloc);
9064     }
9065     cgsort = sort->sort;
9066
9067     switch (fr->cutoff_scheme)
9068     {
9069         case ecutsGROUP:
9070             ncg_new = dd_sort_order(dd, fr, ncg_home_old);
9071             break;
9072         case ecutsVERLET:
9073             ncg_new = dd_sort_order_nbnxn(dd, fr);
9074             break;
9075         default:
9076             gmx_incons("unimplemented");
9077             ncg_new = 0;
9078     }
9079
9080     /* We alloc with the old size, since cgindex is still old */
9081     vec_rvec_check_alloc(&dd->comm->vbuf, dd->cgindex[dd->ncg_home]);
9082     vbuf = dd->comm->vbuf.v;
9083
9084     if (dd->comm->bCGs)
9085     {
9086         cgindex = dd->cgindex;
9087     }
9088     else
9089     {
9090         cgindex = NULL;
9091     }
9092
9093     /* Remove the charge groups which are no longer at home here */
9094     dd->ncg_home = ncg_new;
9095     if (debug)
9096     {
9097         fprintf(debug, "Set the new home charge group count to %d\n",
9098                 dd->ncg_home);
9099     }
9100
9101     /* Reorder the state */
9102     for (i = 0; i < estNR; i++)
9103     {
9104         if (EST_DISTR(i) && (state->flags & (1<<i)))
9105         {
9106             switch (i)
9107             {
9108                 case estX:
9109                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->x, vbuf);
9110                     break;
9111                 case estV:
9112                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->v, vbuf);
9113                     break;
9114                 case estSDX:
9115                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->sd_X, vbuf);
9116                     break;
9117                 case estCGP:
9118                     order_vec_atom(dd->ncg_home, cgindex, cgsort, state->cg_p, vbuf);
9119                     break;
9120                 case estLD_RNG:
9121                 case estLD_RNGI:
9122                 case estDISRE_INITF:
9123                 case estDISRE_RM3TAV:
9124                 case estORIRE_INITF:
9125                 case estORIRE_DTAV:
9126                     /* No ordering required */
9127                     break;
9128                 default:
9129                     gmx_incons("Unknown state entry encountered in dd_sort_state");
9130                     break;
9131             }
9132         }
9133     }
9134     if (fr->cutoff_scheme == ecutsGROUP)
9135     {
9136         /* Reorder cgcm */
9137         order_vec_cg(dd->ncg_home, cgsort, cgcm, vbuf);
9138     }
9139
9140     if (dd->ncg_home+1 > sort->ibuf_nalloc)
9141     {
9142         sort->ibuf_nalloc = over_alloc_dd(dd->ncg_home+1);
9143         srenew(sort->ibuf, sort->ibuf_nalloc);
9144     }
9145     ibuf = sort->ibuf;
9146     /* Reorder the global cg index */
9147     order_int_cg(dd->ncg_home, cgsort, dd->index_gl, ibuf);
9148     /* Reorder the cginfo */
9149     order_int_cg(dd->ncg_home, cgsort, fr->cginfo, ibuf);
9150     /* Rebuild the local cg index */
9151     if (dd->comm->bCGs)
9152     {
9153         ibuf[0] = 0;
9154         for (i = 0; i < dd->ncg_home; i++)
9155         {
9156             cgsize    = dd->cgindex[cgsort[i].ind+1] - dd->cgindex[cgsort[i].ind];
9157             ibuf[i+1] = ibuf[i] + cgsize;
9158         }
9159         for (i = 0; i < dd->ncg_home+1; i++)
9160         {
9161             dd->cgindex[i] = ibuf[i];
9162         }
9163     }
9164     else
9165     {
9166         for (i = 0; i < dd->ncg_home+1; i++)
9167         {
9168             dd->cgindex[i] = i;
9169         }
9170     }
9171     /* Set the home atom number */
9172     dd->nat_home = dd->cgindex[dd->ncg_home];
9173
9174     if (fr->cutoff_scheme == ecutsVERLET)
9175     {
9176         /* The atoms are now exactly in grid order, update the grid order */
9177         nbnxn_set_atomorder(fr->nbv->nbs);
9178     }
9179     else
9180     {
9181         /* Copy the sorted ns cell indices back to the ns grid struct */
9182         for (i = 0; i < dd->ncg_home; i++)
9183         {
9184             fr->ns.grid->cell_index[i] = cgsort[i].nsc;
9185         }
9186         fr->ns.grid->nr = dd->ncg_home;
9187     }
9188 }
9189
9190 static void add_dd_statistics(gmx_domdec_t *dd)
9191 {
9192     gmx_domdec_comm_t *comm;
9193     int                ddnat;
9194
9195     comm = dd->comm;
9196
9197     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9198     {
9199         comm->sum_nat[ddnat-ddnatZONE] +=
9200             comm->nat[ddnat] - comm->nat[ddnat-1];
9201     }
9202     comm->ndecomp++;
9203 }
9204
9205 void reset_dd_statistics_counters(gmx_domdec_t *dd)
9206 {
9207     gmx_domdec_comm_t *comm;
9208     int                ddnat;
9209
9210     comm = dd->comm;
9211
9212     /* Reset all the statistics and counters for total run counting */
9213     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9214     {
9215         comm->sum_nat[ddnat-ddnatZONE] = 0;
9216     }
9217     comm->ndecomp   = 0;
9218     comm->nload     = 0;
9219     comm->load_step = 0;
9220     comm->load_sum  = 0;
9221     comm->load_max  = 0;
9222     clear_ivec(comm->load_lim);
9223     comm->load_mdf = 0;
9224     comm->load_pme = 0;
9225 }
9226
9227 void print_dd_statistics(t_commrec *cr, t_inputrec *ir, FILE *fplog)
9228 {
9229     gmx_domdec_comm_t *comm;
9230     int                ddnat;
9231     double             av;
9232
9233     comm = cr->dd->comm;
9234
9235     gmx_sumd(ddnatNR-ddnatZONE, comm->sum_nat, cr);
9236
9237     if (fplog == NULL)
9238     {
9239         return;
9240     }
9241
9242     fprintf(fplog, "\n    D O M A I N   D E C O M P O S I T I O N   S T A T I S T I C S\n\n");
9243
9244     for (ddnat = ddnatZONE; ddnat < ddnatNR; ddnat++)
9245     {
9246         av = comm->sum_nat[ddnat-ddnatZONE]/comm->ndecomp;
9247         switch (ddnat)
9248         {
9249             case ddnatZONE:
9250                 fprintf(fplog,
9251                         " av. #atoms communicated per step for force:  %d x %.1f\n",
9252                         2, av);
9253                 break;
9254             case ddnatVSITE:
9255                 if (cr->dd->vsite_comm)
9256                 {
9257                     fprintf(fplog,
9258                             " av. #atoms communicated per step for vsites: %d x %.1f\n",
9259                             (EEL_PME(ir->coulombtype) || ir->coulombtype == eelEWALD) ? 3 : 2,
9260                             av);
9261                 }
9262                 break;
9263             case ddnatCON:
9264                 if (cr->dd->constraint_comm)
9265                 {
9266                     fprintf(fplog,
9267                             " av. #atoms communicated per step for LINCS:  %d x %.1f\n",
9268                             1 + ir->nLincsIter, av);
9269                 }
9270                 break;
9271             default:
9272                 gmx_incons(" Unknown type for DD statistics");
9273         }
9274     }
9275     fprintf(fplog, "\n");
9276
9277     if (comm->bRecordLoad && EI_DYNAMICS(ir->eI))
9278     {
9279         print_dd_load_av(fplog, cr->dd);
9280     }
9281 }
9282
9283 void dd_partition_system(FILE                *fplog,
9284                          gmx_int64_t          step,
9285                          t_commrec           *cr,
9286                          gmx_bool             bMasterState,
9287                          int                  nstglobalcomm,
9288                          t_state             *state_global,
9289                          gmx_mtop_t          *top_global,
9290                          t_inputrec          *ir,
9291                          t_state             *state_local,
9292                          rvec               **f,
9293                          t_mdatoms           *mdatoms,
9294                          gmx_localtop_t      *top_local,
9295                          t_forcerec          *fr,
9296                          gmx_vsite_t         *vsite,
9297                          gmx_shellfc_t        shellfc,
9298                          gmx_constr_t         constr,
9299                          t_nrnb              *nrnb,
9300                          gmx_wallcycle_t      wcycle,
9301                          gmx_bool             bVerbose)
9302 {
9303     gmx_domdec_t      *dd;
9304     gmx_domdec_comm_t *comm;
9305     gmx_ddbox_t        ddbox = {0};
9306     t_block           *cgs_gl;
9307     gmx_int64_t        step_pcoupl;
9308     rvec               cell_ns_x0, cell_ns_x1;
9309     int                i, j, n, ncgindex_set, ncg_home_old = -1, ncg_moved, nat_f_novirsum;
9310     gmx_bool           bBoxChanged, bNStGlobalComm, bDoDLB, bCheckDLB, bTurnOnDLB, bLogLoad;
9311     gmx_bool           bRedist, bSortCG, bResortAll;
9312     ivec               ncells_old = {0, 0, 0}, ncells_new = {0, 0, 0}, np;
9313     real               grid_density;
9314     char               sbuf[22];
9315
9316     dd   = cr->dd;
9317     comm = dd->comm;
9318
9319     bBoxChanged = (bMasterState || DEFORM(*ir));
9320     if (ir->epc != epcNO)
9321     {
        /* With nstpcouple > 1 pressure coupling happens
9323          * one step after calculating the pressure.
9324          * Box scaling happens at the end of the MD step,
9325          * after the DD partitioning.
9326          * We therefore have to do DLB in the first partitioning
         * after an MD step where P-coupling occurred.
9328          * We need to determine the last step in which p-coupling occurred.
9329          * MRS -- need to validate this for vv?
9330          */
9331         n = ir->nstpcouple;
9332         if (n == 1)
9333         {
9334             step_pcoupl = step - 1;
9335         }
9336         else
9337         {
9338             step_pcoupl = ((step - 1)/n)*n + 1;
9339         }
9340         if (step_pcoupl >= comm->partition_step)
9341         {
9342             bBoxChanged = TRUE;
9343         }
9344     }
9345
9346     bNStGlobalComm = (step % nstglobalcomm == 0);
9347
9348     if (!comm->bDynLoadBal)
9349     {
9350         bDoDLB = FALSE;
9351     }
9352     else
9353     {
        /* Should we do dynamic load balancing this step?
9355          * Since it requires (possibly expensive) global communication,
9356          * we might want to do DLB less frequently.
9357          */
9358         if (bBoxChanged || ir->epc != epcNO)
9359         {
9360             bDoDLB = bBoxChanged;
9361         }
9362         else
9363         {
9364             bDoDLB = bNStGlobalComm;
9365         }
9366     }
9367
9368     /* Check if we have recorded loads on the nodes */
9369     if (comm->bRecordLoad && dd_load_count(comm) > 0)
9370     {
9371         if (comm->eDLB == edlbAUTO && !comm->bDynLoadBal && !dd_dlb_is_locked(dd))
9372         {
9373             /* Check if we should use DLB at the second partitioning
9374              * and every 100 partitionings,
9375              * so the extra communication cost is negligible.
9376              */
9377             const int nddp_chk_dlb = 100;
9378
9379             bCheckDLB = (comm->n_load_collect == 0 ||
9380                          comm->n_load_have % nddp_chk_dlb == nddp_chk_dlb - 1);
9381         }
9382         else
9383         {
9384             bCheckDLB = FALSE;
9385         }
9386
9387         /* Print load every nstlog, first and last step to the log file */
9388         bLogLoad = ((ir->nstlog > 0 && step % ir->nstlog == 0) ||
9389                     comm->n_load_collect == 0 ||
9390                     (ir->nsteps >= 0 &&
9391                      (step + ir->nstlist > ir->init_step + ir->nsteps)));
9392
9393         /* Avoid extra communication due to verbose screen output
9394          * when nstglobalcomm is set.
9395          */
9396         if (bDoDLB || bLogLoad || bCheckDLB ||
9397             (bVerbose && (ir->nstlist == 0 || nstglobalcomm <= ir->nstlist)))
9398         {
9399             get_load_distribution(dd, wcycle);
9400             if (DDMASTER(dd))
9401             {
9402                 if (bLogLoad)
9403                 {
9404                     dd_print_load(fplog, dd, step-1);
9405                 }
9406                 if (bVerbose)
9407                 {
9408                     dd_print_load_verbose(dd);
9409                 }
9410             }
9411             comm->n_load_collect++;
9412
9413             if (bCheckDLB)
9414             {
9415                 /* Since the timings are node dependent, the master decides */
9416                 if (DDMASTER(dd))
9417                 {
9418                     /* Here we check if the max PME rank load is more than 0.98
9419                      * the max PP force load. If so, PP DLB will not help,
9420                      * since we are (almost) limited by PME. Furthermore,
9421                      * DLB will cause a significant extra x/f redistribution
9422                      * cost on the PME ranks, which will then surely result
9423                      * in lower total performance.
9424                      * This check might be fragile, since one measurement
9425                      * below 0.98 (although only done once every 100 DD part.)
9426                      * could turn on DLB for the rest of the run.
9427                      */
9428                     if (cr->npmenodes > 0 &&
9429                         dd_pme_f_ratio(dd) > 1 - DD_PERF_LOSS_DLB_ON)
9430                     {
9431                         bTurnOnDLB = FALSE;
9432                     }
9433                     else
9434                     {
9435                         bTurnOnDLB =
9436                             (dd_force_imb_perf_loss(dd) >= DD_PERF_LOSS_DLB_ON);
9437                     }
9438                     if (debug)
9439                     {
9440                         fprintf(debug, "step %s, imb loss %f\n",
9441                                 gmx_step_str(step, sbuf),
9442                                 dd_force_imb_perf_loss(dd));
9443                     }
9444                 }
9445                 dd_bcast(dd, sizeof(bTurnOnDLB), &bTurnOnDLB);
9446                 if (bTurnOnDLB)
9447                 {
9448                     turn_on_dlb(fplog, cr, step);
9449                     bDoDLB = TRUE;
9450                 }
9451             }
9452         }
9453         comm->n_load_have++;
9454     }
9455
9456     cgs_gl = &comm->cgs_gl;
9457
9458     bRedist = FALSE;
9459     if (bMasterState)
9460     {
9461         /* Clear the old state */
9462         clear_dd_indices(dd, 0, 0);
9463         ncgindex_set = 0;
9464
9465         set_ddbox(dd, bMasterState, cr, ir, state_global->box,
9466                   TRUE, cgs_gl, state_global->x, &ddbox);
9467
9468         get_cg_distribution(fplog, step, dd, cgs_gl,
9469                             state_global->box, &ddbox, state_global->x);
9470
9471         dd_distribute_state(dd, cgs_gl,
9472                             state_global, state_local, f);
9473
9474         dd_make_local_cgs(dd, &top_local->cgs);
9475
9476         /* Ensure that we have space for the new distribution */
9477         dd_check_alloc_ncg(fr, state_local, f, dd->ncg_home);
9478
9479         if (fr->cutoff_scheme == ecutsGROUP)
9480         {
9481             calc_cgcm(fplog, 0, dd->ncg_home,
9482                       &top_local->cgs, state_local->x, fr->cg_cm);
9483         }
9484
9485         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9486
9487         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9488     }
9489     else if (state_local->ddp_count != dd->ddp_count)
9490     {
9491         if (state_local->ddp_count > dd->ddp_count)
9492         {
9493             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count (%d) > dd->ddp_count (%d)", state_local->ddp_count, dd->ddp_count);
9494         }
9495
9496         if (state_local->ddp_count_cg_gl != state_local->ddp_count)
9497         {
9498             gmx_fatal(FARGS, "Internal inconsistency state_local->ddp_count_cg_gl (%d) != state_local->ddp_count (%d)", state_local->ddp_count_cg_gl, state_local->ddp_count);
9499         }
9500
9501         /* Clear the old state */
9502         clear_dd_indices(dd, 0, 0);
9503
9504         /* Build the new indices */
9505         rebuild_cgindex(dd, cgs_gl->index, state_local);
9506         make_dd_indices(dd, cgs_gl->index, 0);
9507         ncgindex_set = dd->ncg_home;
9508
9509         if (fr->cutoff_scheme == ecutsGROUP)
9510         {
9511             /* Redetermine the cg COMs */
9512             calc_cgcm(fplog, 0, dd->ncg_home,
9513                       &top_local->cgs, state_local->x, fr->cg_cm);
9514         }
9515
9516         inc_nrnb(nrnb, eNR_CGCM, dd->nat_home);
9517
9518         dd_set_cginfo(dd->index_gl, 0, dd->ncg_home, fr, comm->bLocalCG);
9519
9520         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9521                   TRUE, &top_local->cgs, state_local->x, &ddbox);
9522
9523         bRedist = comm->bDynLoadBal;
9524     }
9525     else
9526     {
9527         /* We have the full state, only redistribute the cgs */
9528
9529         /* Clear the non-home indices */
9530         clear_dd_indices(dd, dd->ncg_home, dd->nat_home);
9531         ncgindex_set = 0;
9532
9533         /* Avoid global communication for dim's without pbc and -gcom */
9534         if (!bNStGlobalComm)
9535         {
9536             copy_rvec(comm->box0, ddbox.box0    );
9537             copy_rvec(comm->box_size, ddbox.box_size);
9538         }
9539         set_ddbox(dd, bMasterState, cr, ir, state_local->box,
9540                   bNStGlobalComm, &top_local->cgs, state_local->x, &ddbox);
9541
9542         bBoxChanged = TRUE;
9543         bRedist     = TRUE;
9544     }
9545     /* For dim's without pbc and -gcom */
9546     copy_rvec(ddbox.box0, comm->box0    );
9547     copy_rvec(ddbox.box_size, comm->box_size);
9548
9549     set_dd_cell_sizes(dd, &ddbox, dynamic_dd_box(&ddbox, ir), bMasterState, bDoDLB,
9550                       step, wcycle);
9551
9552     if (comm->nstDDDumpGrid > 0 && step % comm->nstDDDumpGrid == 0)
9553     {
9554         write_dd_grid_pdb("dd_grid", step, dd, state_local->box, &ddbox);
9555     }
9556
9557     /* Check if we should sort the charge groups */
9558     if (comm->nstSortCG > 0)
9559     {
9560         bSortCG = (bMasterState ||
9561                    (bRedist && (step % comm->nstSortCG == 0)));
9562     }
9563     else
9564     {
9565         bSortCG = FALSE;
9566     }
9567
9568     ncg_home_old = dd->ncg_home;
9569
9570     ncg_moved = 0;
9571     if (bRedist)
9572     {
9573         wallcycle_sub_start(wcycle, ewcsDD_REDIST);
9574
9575         dd_redistribute_cg(fplog, step, dd, ddbox.tric_dir,
9576                            state_local, f, fr,
9577                            !bSortCG, nrnb, &ncgindex_set, &ncg_moved);
9578
9579         wallcycle_sub_stop(wcycle, ewcsDD_REDIST);
9580     }
9581
9582     get_nsgrid_boundaries(ddbox.nboundeddim, state_local->box,
9583                           dd, &ddbox,
9584                           &comm->cell_x0, &comm->cell_x1,
9585                           dd->ncg_home, fr->cg_cm,
9586                           cell_ns_x0, cell_ns_x1, &grid_density);
9587
9588     if (bBoxChanged)
9589     {
9590         comm_dd_ns_cell_sizes(dd, &ddbox, cell_ns_x0, cell_ns_x1, step);
9591     }
9592
9593     switch (fr->cutoff_scheme)
9594     {
9595         case ecutsGROUP:
9596             copy_ivec(fr->ns.grid->n, ncells_old);
9597             grid_first(fplog, fr->ns.grid, dd, &ddbox,
9598                        state_local->box, cell_ns_x0, cell_ns_x1,
9599                        fr->rlistlong, grid_density);
9600             break;
9601         case ecutsVERLET:
9602             nbnxn_get_ncells(fr->nbv->nbs, &ncells_old[XX], &ncells_old[YY]);
9603             break;
9604         default:
9605             gmx_incons("unimplemented");
9606     }
9607     /* We need to store tric_dir for dd_get_ns_ranges called from ns.c */
9608     copy_ivec(ddbox.tric_dir, comm->tric_dir);
9609
9610     if (bSortCG)
9611     {
9612         wallcycle_sub_start(wcycle, ewcsDD_GRID);
9613
9614         /* Sort the state on charge group position.
9615          * This enables exact restarts from this step.
9616          * It also improves performance by about 15% with larger numbers
9617          * of atoms per node.
9618          */
9619
9620         /* Fill the ns grid with the home cell,
9621          * so we can sort with the indices.
9622          */
9623         set_zones_ncg_home(dd);
9624
9625         switch (fr->cutoff_scheme)
9626         {
9627             case ecutsVERLET:
9628                 set_zones_size(dd, state_local->box, &ddbox, 0, 1);
9629
9630                 nbnxn_put_on_grid(fr->nbv->nbs, fr->ePBC, state_local->box,
9631                                   0,
9632                                   comm->zones.size[0].bb_x0,
9633                                   comm->zones.size[0].bb_x1,
9634                                   0, dd->ncg_home,
9635                                   comm->zones.dens_zone0,
9636                                   fr->cginfo,
9637                                   state_local->x,
9638                                   ncg_moved, bRedist ? comm->moved : NULL,
9639                                   fr->nbv->grp[eintLocal].kernel_type,
9640                                   fr->nbv->grp[eintLocal].nbat);
9641
9642                 nbnxn_get_ncells(fr->nbv->nbs, &ncells_new[XX], &ncells_new[YY]);
9643                 break;
9644             case ecutsGROUP:
9645                 fill_grid(&comm->zones, fr->ns.grid, dd->ncg_home,
9646                           0, dd->ncg_home, fr->cg_cm);
9647
9648                 copy_ivec(fr->ns.grid->n, ncells_new);
9649                 break;
9650             default:
9651                 gmx_incons("unimplemented");
9652         }
9653
9654         bResortAll = bMasterState;
9655
        /* Check if we can use the old order and ns grid cell indices
9657          * of the charge groups to sort the charge groups efficiently.
9658          */
9659         if (ncells_new[XX] != ncells_old[XX] ||
9660             ncells_new[YY] != ncells_old[YY] ||
9661             ncells_new[ZZ] != ncells_old[ZZ])
9662         {
9663             bResortAll = TRUE;
9664         }
9665
9666         if (debug)
9667         {
9668             fprintf(debug, "Step %s, sorting the %d home charge groups\n",
9669                     gmx_step_str(step, sbuf), dd->ncg_home);
9670         }
9671         dd_sort_state(dd, fr->cg_cm, fr, state_local,
9672                       bResortAll ? -1 : ncg_home_old);
9673         /* Rebuild all the indices */
9674         ga2la_clear(dd->ga2la);
9675         ncgindex_set = 0;
9676
9677         wallcycle_sub_stop(wcycle, ewcsDD_GRID);
9678     }
9679
9680     wallcycle_sub_start(wcycle, ewcsDD_SETUPCOMM);
9681
    /* Set up the communication and communicate the coordinates */
9683     setup_dd_communication(dd, state_local->box, &ddbox, fr, state_local, f);
9684
9685     /* Set the indices */
9686     make_dd_indices(dd, cgs_gl->index, ncgindex_set);
9687
9688     /* Set the charge group boundaries for neighbor searching */
9689     set_cg_boundaries(&comm->zones);
9690
9691     if (fr->cutoff_scheme == ecutsVERLET)
9692     {
9693         set_zones_size(dd, state_local->box, &ddbox,
9694                        bSortCG ? 1 : 0, comm->zones.n);
9695     }
9696
9697     wallcycle_sub_stop(wcycle, ewcsDD_SETUPCOMM);
9698
9699     /*
9700        write_dd_pdb("dd_home",step,"dump",top_global,cr,
9701                  -1,state_local->x,state_local->box);
9702      */
9703
9704     wallcycle_sub_start(wcycle, ewcsDD_MAKETOP);
9705
9706     /* Extract a local topology from the global topology */
9707     for (i = 0; i < dd->ndim; i++)
9708     {
9709         np[dd->dim[i]] = comm->cd[i].np;
9710     }
9711     dd_make_local_top(dd, &comm->zones, dd->npbcdim, state_local->box,
9712                       comm->cellsize_min, np,
9713                       fr,
9714                       fr->cutoff_scheme == ecutsGROUP ? fr->cg_cm : state_local->x,
9715                       vsite, top_global, top_local);
9716
9717     wallcycle_sub_stop(wcycle, ewcsDD_MAKETOP);
9718
9719     wallcycle_sub_start(wcycle, ewcsDD_MAKECONSTR);
9720
9721     /* Set up the special atom communication */
9722     n = comm->nat[ddnatZONE];
9723     for (i = ddnatZONE+1; i < ddnatNR; i++)
9724     {
9725         switch (i)
9726         {
9727             case ddnatVSITE:
9728                 if (vsite && vsite->n_intercg_vsite)
9729                 {
9730                     n = dd_make_local_vsites(dd, n, top_local->idef.il);
9731                 }
9732                 break;
9733             case ddnatCON:
9734                 if (dd->bInterCGcons || dd->bInterCGsettles)
9735                 {
9736                     /* Only for inter-cg constraints we need special code */
9737                     n = dd_make_local_constraints(dd, n, top_global, fr->cginfo,
9738                                                   constr, ir->nProjOrder,
9739                                                   top_local->idef.il);
9740                 }
9741                 break;
9742             default:
9743                 gmx_incons("Unknown special atom type setup");
9744         }
9745         comm->nat[i] = n;
9746     }
9747
9748     wallcycle_sub_stop(wcycle, ewcsDD_MAKECONSTR);
9749
9750     wallcycle_sub_start(wcycle, ewcsDD_TOPOTHER);
9751
9752     /* Make space for the extra coordinates for virtual site
9753      * or constraint communication.
9754      */
9755     state_local->natoms = comm->nat[ddnatNR-1];
9756     if (state_local->natoms > state_local->nalloc)
9757     {
9758         dd_realloc_state(state_local, f, state_local->natoms);
9759     }
9760
9761     if (fr->bF_NoVirSum)
9762     {
9763         if (vsite && vsite->n_intercg_vsite)
9764         {
9765             nat_f_novirsum = comm->nat[ddnatVSITE];
9766         }
9767         else
9768         {
9769             if (EEL_FULL(ir->coulombtype) && dd->n_intercg_excl > 0)
9770             {
9771                 nat_f_novirsum = dd->nat_tot;
9772             }
9773             else
9774             {
9775                 nat_f_novirsum = dd->nat_home;
9776             }
9777         }
9778     }
9779     else
9780     {
9781         nat_f_novirsum = 0;
9782     }
9783
9784     /* Set the number of atoms required for the force calculation.
9785      * Forces need to be constrained when using a twin-range setup
9786      * or with energy minimization. For simple simulations we could
9787      * avoid some allocation, zeroing and copying, but this is
9788      * probably not worth the complications and checking.
9789      */
9790     forcerec_set_ranges(fr, dd->ncg_home, dd->ncg_tot,
9791                         dd->nat_tot, comm->nat[ddnatCON], nat_f_novirsum);
9792
9793     /* We make all the mdatoms up to nat_tot_con.
9794      * We could save some work by only setting invmass
9795      * between nat_tot and nat_tot_con.
9796      */
9797     /* This call also sets the new number of home particles to dd->nat_home */
9798     atoms2md(top_global, ir,
9799              comm->nat[ddnatCON], dd->gatindex, dd->nat_home, mdatoms);
9800
9801     /* Now we have the charges we can sort the FE interactions */
9802     dd_sort_local_top(dd, mdatoms, top_local);
9803
9804     if (vsite != NULL)
9805     {
9806         /* Now we have updated mdatoms, we can do the last vsite bookkeeping */
9807         split_vsites_over_threads(top_local->idef.il, top_local->idef.iparams,
9808                                   mdatoms, FALSE, vsite);
9809     }
9810
9811     if (shellfc)
9812     {
9813         /* Make the local shell stuff, currently no communication is done */
9814         make_local_shells(cr, mdatoms, shellfc);
9815     }
9816
9817     if (ir->implicit_solvent)
9818     {
9819         make_local_gb(cr, fr->born, ir->gb_algorithm);
9820     }
9821
9822     setup_bonded_threading(fr, &top_local->idef);
9823
9824     if (!(cr->duty & DUTY_PME))
9825     {
9826         /* Send the charges and/or c6/sigmas to our PME only node */
9827         gmx_pme_send_parameters(cr,
9828                                 fr->ic,
9829                                 mdatoms->nChargePerturbed, mdatoms->nTypePerturbed,
9830                                 mdatoms->chargeA, mdatoms->chargeB,
9831                                 mdatoms->sqrt_c6A, mdatoms->sqrt_c6B,
9832                                 mdatoms->sigmaA, mdatoms->sigmaB,
9833                                 dd_pme_maxshift_x(dd), dd_pme_maxshift_y(dd));
9834     }
9835
9836     if (constr)
9837     {
9838         set_constraints(constr, top_local, ir, mdatoms, cr);
9839     }
9840
9841     if (ir->ePull != epullNO)
9842     {
9843         /* Update the local pull groups */
9844         dd_make_local_pull_groups(dd, ir->pull, mdatoms);
9845     }
9846
9847     if (ir->bRot)
9848     {
9849         /* Update the local rotation groups */
9850         dd_make_local_rotation_groups(dd, ir->rot);
9851     }
9852
9853     if (ir->eSwapCoords != eswapNO)
9854     {
9855         /* Update the local groups needed for ion swapping */
9856         dd_make_local_swap_groups(dd, ir->swap);
9857     }
9858
9859     /* Update the local atoms to be communicated via the IMD protocol if bIMD is TRUE. */
9860     dd_make_local_IMD_atoms(ir->bIMD, dd, ir->imd);
9861
9862     add_dd_statistics(dd);
9863
9864     /* Make sure we only count the cycles for this DD partitioning */
9865     clear_dd_cycle_counts(dd);
9866
9867     /* Because the order of the atoms might have changed since
9868      * the last vsite construction, we need to communicate the constructing
9869      * atom coordinates again (for spreading the forces this MD step).
9870      */
9871     dd_move_x_vsites(dd, state_local->box, state_local->x);
9872
9873     wallcycle_sub_stop(wcycle, ewcsDD_TOPOTHER);
9874
9875     if (comm->nstDDDump > 0 && step % comm->nstDDDump == 0)
9876     {
9877         dd_move_x(dd, state_local->box, state_local->x);
9878         write_dd_pdb("dd_dump", step, "dump", top_global, cr,
9879                      -1, state_local->x, state_local->box);
9880     }
9881
9882     /* Store the partitioning step */
9883     comm->partition_step = step;
9884
9885     /* Increase the DD partitioning counter */
9886     dd->ddp_count++;
9887     /* The state currently matches this DD partitioning count, store it */
9888     state_local->ddp_count = dd->ddp_count;
9889     if (bMasterState)
9890     {
9891         /* The DD master node knows the complete cg distribution,
9892          * store the count so we can possibly skip the cg info communication.
9893          */
9894         comm->master_cg_ddp_count = (bSortCG ? 0 : dd->ddp_count);
9895     }
9896
9897     if (comm->DD_debug > 0)
9898     {
9899         /* Set the env var GMX_DD_DEBUG if you suspect corrupted indices */
9900         check_index_consistency(dd, top_global->natoms, ncg_mtop(top_global),
9901                                 "after partitioning");
9902     }
9903 }